1 | //! Fused Multiply-Add instruction set (FMA) |
2 | //! |
//! The FMA instruction set is an extension to the 128-bit and 256-bit SSE
//! instructions in the x86 microprocessor instruction set, used to perform
//! fused multiply-add (FMA) operations.
6 | //! |
7 | //! The references are: |
8 | //! |
9 | //! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: |
10 | //! Instruction Set Reference, A-Z][intel64_ref]. |
11 | //! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and |
12 | //! System Instructions][amd64_ref]. |
13 | //! |
14 | //! Wikipedia's [FMA][wiki_fma] page provides a quick overview of the |
15 | //! instructions available. |
16 | //! |
17 | //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf |
18 | //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf |
19 | //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate |
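//!
//! # Examples
//!
//! A minimal usage sketch (not part of the upstream references): it assumes
//! the `fma` feature is detected at runtime with std's
//! `is_x86_feature_detected!` macro before any of the intrinsics below are
//! called through `std::arch`. The input values are arbitrary.
//!
//! ```
//! #[cfg(target_arch = "x86_64")]
//! {
//!     if is_x86_feature_detected!("fma") {
//!         // SAFETY: the `fma` feature was detected at runtime.
//!         unsafe {
//!             use std::arch::x86_64::*;
//!
//!             let a = _mm_set1_pd(2.0);
//!             let b = _mm_set1_pd(3.0);
//!             let c = _mm_set1_pd(1.0);
//!             // Each lane computes `a * b + c` with a single rounding step.
//!             let r = _mm_fmadd_pd(a, b, c);
//!             assert_eq!(_mm_cvtsd_f64(r), 7.0);
//!         }
//!     }
//! }
//! ```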
20 | |
21 | use crate::core_arch::simd_llvm::simd_fma; |
22 | use crate::core_arch::x86::*; |
23 | |
#[cfg(test)]
25 | use stdarch_test::assert_instr; |
26 | |
27 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and adds the intermediate result to packed elements in `c`.
29 | /// |
30 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_pd) |
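///
/// # Examples
///
/// An illustrative sketch (values are arbitrary, not taken from Intel's
/// documentation), assuming `std::arch` and runtime detection of the `fma`
/// feature:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("fma") {
///         // SAFETY: the `fma` feature was detected at runtime.
///         unsafe {
///             use std::arch::x86_64::*;
///             let a = _mm_setr_pd(1.0, 2.0);
///             let b = _mm_setr_pd(5.0, 3.0);
///             let c = _mm_setr_pd(4.0, 9.0);
///             // Lane-wise: [1.0 * 5.0 + 4.0, 2.0 * 3.0 + 9.0] = [9.0, 15.0]
///             let r = _mm_fmadd_pd(a, b, c);
///             let mut out = [0.0f64; 2];
///             _mm_storeu_pd(out.as_mut_ptr(), r);
///             assert_eq!(out, [9.0, 15.0]);
///         }
///     }
/// }
/// ```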
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
35 | pub unsafe fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
36 | simd_fma(a, b, c) |
37 | } |
38 | |
39 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and adds the intermediate result to packed elements in `c`.
41 | /// |
42 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_pd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
47 | pub unsafe fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
48 | simd_fma(a, b, c) |
49 | } |
50 | |
51 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and adds the intermediate result to packed elements in `c`.
53 | /// |
54 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
59 | pub unsafe fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
60 | simd_fma(a, b, c) |
61 | } |
62 | |
63 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and adds the intermediate result to packed elements in `c`.
65 | /// |
66 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmadd_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
71 | pub unsafe fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
72 | simd_fma(a, b, c) |
73 | } |
74 | |
75 | /// Multiplies the lower double-precision (64-bit) floating-point elements in |
/// `a` and `b`, and adds the intermediate result to the lower element in `c`.
/// Stores the result in the lower element of the returned value, and copies
/// the upper element from `a` to the upper element of the result.
79 | /// |
80 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_sd) |
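///
/// # Examples
///
/// An illustrative sketch (values are arbitrary), assuming `std::arch` and
/// runtime detection of the `fma` feature. Only the low lane is computed; the
/// high lane of the result is copied from `a`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("fma") {
///         // SAFETY: the `fma` feature was detected at runtime.
///         unsafe {
///             use std::arch::x86_64::*;
///             let a = _mm_setr_pd(1.0, 2.0);
///             let b = _mm_setr_pd(5.0, 3.0);
///             let c = _mm_setr_pd(4.0, 9.0);
///             // Low lane: 1.0 * 5.0 + 4.0 = 9.0; high lane: copied from `a` (2.0)
///             let r = _mm_fmadd_sd(a, b, c);
///             let mut out = [0.0f64; 2];
///             _mm_storeu_pd(out.as_mut_ptr(), r);
///             assert_eq!(out, [9.0, 2.0]);
///         }
///     }
/// }
/// ```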
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
85 | pub unsafe fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
86 | vfmaddsd(a, b, c) |
87 | } |
88 | |
89 | /// Multiplies the lower single-precision (32-bit) floating-point elements in |
/// `a` and `b`, and adds the intermediate result to the lower element in `c`.
/// Stores the result in the lower element of the returned value, and copies the
92 | /// 3 upper elements from `a` to the upper elements of the result. |
93 | /// |
94 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmadd_ss) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
99 | pub unsafe fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { |
100 | vfmaddss(a, b, c) |
101 | } |
102 | |
103 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
105 | /// the intermediate result. |
106 | /// |
107 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd) |
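///
/// # Examples
///
/// An illustrative sketch (values are arbitrary), assuming `std::arch` and
/// runtime detection of the `fma` feature. Even-indexed lanes compute
/// `a * b - c`, odd-indexed lanes compute `a * b + c`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("fma") {
///         // SAFETY: the `fma` feature was detected at runtime.
///         unsafe {
///             use std::arch::x86_64::*;
///             let a = _mm_setr_pd(1.0, 2.0);
///             let b = _mm_setr_pd(5.0, 3.0);
///             let c = _mm_setr_pd(4.0, 9.0);
///             // Lane 0: 1.0 * 5.0 - 4.0 = 1.0; lane 1: 2.0 * 3.0 + 9.0 = 15.0
///             let r = _mm_fmaddsub_pd(a, b, c);
///             let mut out = [0.0f64; 2];
///             _mm_storeu_pd(out.as_mut_ptr(), r);
///             assert_eq!(out, [1.0, 15.0]);
///         }
///     }
/// }
/// ```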
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
112 | pub unsafe fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
113 | vfmaddsubpd(a, b, c) |
114 | } |
115 | |
116 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
118 | /// the intermediate result. |
119 | /// |
120 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
125 | pub unsafe fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
126 | vfmaddsubpd256(a, b, c) |
127 | } |
128 | |
129 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
131 | /// the intermediate result. |
132 | /// |
133 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
138 | pub unsafe fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
139 | vfmaddsubps(a, b, c) |
140 | } |
141 | |
142 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and alternately adds and subtracts packed elements in `c` to/from
144 | /// the intermediate result. |
145 | /// |
146 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
151 | pub unsafe fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
152 | vfmaddsubps256(a, b, c) |
153 | } |
154 | |
155 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
157 | /// |
158 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_pd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
163 | pub unsafe fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
164 | vfmsubpd(a, b, c) |
165 | } |
166 | |
167 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
169 | /// |
170 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_pd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
175 | pub unsafe fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
176 | vfmsubpd256(a, b, c) |
177 | } |
178 | |
179 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
181 | /// |
182 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub213ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
187 | pub unsafe fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
188 | vfmsubps(a, b, c) |
189 | } |
190 | |
191 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and subtracts packed elements in `c` from the intermediate result.
193 | /// |
194 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsub_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub213ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
199 | pub unsafe fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
200 | vfmsubps256(a, b, c) |
201 | } |
202 | |
203 | /// Multiplies the lower double-precision (64-bit) floating-point elements in |
/// `a` and `b`, and subtracts the lower element in `c` from the intermediate
/// result. Stores the result in the lower element of the returned value, and
/// copies the upper element from `a` to the upper element of the result.
207 | /// |
208 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_sd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
213 | pub unsafe fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
214 | vfmsubsd(a, b, c) |
215 | } |
216 | |
217 | /// Multiplies the lower single-precision (32-bit) floating-point elements in |
/// `a` and `b`, and subtracts the lower element in `c` from the intermediate
/// result. Stores the result in the lower element of the returned value, and
/// copies the 3 upper elements from `a` to the upper elements of the result.
221 | /// |
222 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsub_ss) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
227 | pub unsafe fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { |
228 | vfmsubss(a, b, c) |
229 | } |
230 | |
231 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
233 | /// the intermediate result. |
234 | /// |
235 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd) |
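///
/// # Examples
///
/// An illustrative sketch (values are arbitrary), assuming `std::arch` and
/// runtime detection of the `fma` feature. This is the mirror image of
/// `_mm_fmaddsub_pd`: even-indexed lanes compute `a * b + c`, odd-indexed
/// lanes compute `a * b - c`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("fma") {
///         // SAFETY: the `fma` feature was detected at runtime.
///         unsafe {
///             use std::arch::x86_64::*;
///             let a = _mm_setr_pd(1.0, 2.0);
///             let b = _mm_setr_pd(5.0, 3.0);
///             let c = _mm_setr_pd(4.0, 9.0);
///             // Lane 0: 1.0 * 5.0 + 4.0 = 9.0; lane 1: 2.0 * 3.0 - 9.0 = -3.0
///             let r = _mm_fmsubadd_pd(a, b, c);
///             let mut out = [0.0f64; 2];
///             _mm_storeu_pd(out.as_mut_ptr(), r);
///             assert_eq!(out, [9.0, -3.0]);
///         }
///     }
/// }
/// ```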
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
240 | pub unsafe fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
241 | vfmsubaddpd(a, b, c) |
242 | } |
243 | |
244 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
246 | /// the intermediate result. |
247 | /// |
248 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
253 | pub unsafe fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
254 | vfmsubaddpd256(a, b, c) |
255 | } |
256 | |
257 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
259 | /// the intermediate result. |
260 | /// |
261 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
266 | pub unsafe fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
267 | vfmsubaddps(a, b, c) |
268 | } |
269 | |
270 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and alternately subtracts and adds packed elements in `c` from/to
272 | /// the intermediate result. |
273 | /// |
274 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
279 | pub unsafe fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
280 | vfmsubaddps256(a, b, c) |
281 | } |
282 | |
283 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
285 | /// |
286 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd) |
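///
/// # Examples
///
/// An illustrative sketch (values are arbitrary), assuming `std::arch` and
/// runtime detection of the `fma` feature. Each lane computes `-(a * b) + c`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     if is_x86_feature_detected!("fma") {
///         // SAFETY: the `fma` feature was detected at runtime.
///         unsafe {
///             use std::arch::x86_64::*;
///             let a = _mm_setr_pd(1.0, 2.0);
///             let b = _mm_setr_pd(5.0, 3.0);
///             let c = _mm_setr_pd(4.0, 9.0);
///             // Lane 0: -(1.0 * 5.0) + 4.0 = -1.0; lane 1: -(2.0 * 3.0) + 9.0 = 3.0
///             let r = _mm_fnmadd_pd(a, b, c);
///             let mut out = [0.0f64; 2];
///             _mm_storeu_pd(out.as_mut_ptr(), r);
///             assert_eq!(out, [-1.0, 3.0]);
///         }
///     }
/// }
/// ```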
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
291 | pub unsafe fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
292 | vfnmaddpd(a, b, c) |
293 | } |
294 | |
295 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
297 | /// |
298 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
303 | pub unsafe fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
304 | vfnmaddpd256(a, b, c) |
305 | } |
306 | |
307 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
309 | /// |
310 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
315 | pub unsafe fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
316 | vfnmaddps(a, b, c) |
317 | } |
318 | |
319 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and adds the negated intermediate result to packed elements in `c`.
321 | /// |
322 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
327 | pub unsafe fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
328 | vfnmaddps256(a, b, c) |
329 | } |
330 | |
331 | /// Multiplies the lower double-precision (64-bit) floating-point elements in |
/// `a` and `b`, and adds the negated intermediate result to the lower element
/// in `c`. Stores the result in the lower element of the returned value, and
/// copies the upper element from `a` to the upper element of the result.
335 | /// |
336 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_sd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
341 | pub unsafe fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
342 | vfnmaddsd(a, b, c) |
343 | } |
344 | |
345 | /// Multiplies the lower single-precision (32-bit) floating-point elements in |
/// `a` and `b`, and adds the negated intermediate result to the lower element
/// in `c`. Stores the result in the lower element of the returned value, and
/// copies the 3 upper elements from `a` to the upper elements of the result.
349 | /// |
350 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ss) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
355 | pub unsafe fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128 { |
356 | vfnmaddss(a, b, c) |
357 | } |
358 | |
359 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
361 | /// result. |
362 | /// |
363 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_pd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
368 | pub unsafe fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
369 | vfnmsubpd(a, b, c) |
370 | } |
371 | |
372 | /// Multiplies packed double-precision (64-bit) floating-point elements in `a` |
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
374 | /// result. |
375 | /// |
376 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_pd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
381 | pub unsafe fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { |
382 | vfnmsubpd256(a, b, c) |
383 | } |
384 | |
385 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
387 | /// result. |
388 | /// |
389 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
394 | pub unsafe fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128 { |
395 | vfnmsubps(a, b, c) |
396 | } |
397 | |
398 | /// Multiplies packed single-precision (32-bit) floating-point elements in `a` |
/// and `b`, and subtracts packed elements in `c` from the negated intermediate
400 | /// result. |
401 | /// |
402 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmsub_ps) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
407 | pub unsafe fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256 { |
408 | vfnmsubps256(a, b, c) |
409 | } |
410 | |
411 | /// Multiplies the lower double-precision (64-bit) floating-point elements in |
/// `a` and `b`, and subtracts the lower element in `c` from the negated
/// intermediate result. Stores the result in the lower element of the returned
/// value, and copies the upper element from `a` to the upper element of the
415 | /// result. |
416 | /// |
417 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_sd) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
422 | pub unsafe fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { |
423 | vfnmsubsd(a, b, c) |
424 | } |
425 | |
426 | /// Multiplies the lower single-precision (32-bit) floating-point elements in |
/// `a` and `b`, and subtracts the lower element in `c` from the negated
/// intermediate result. Stores the result in the lower element of the
/// returned value, and copies the 3 upper elements from `a` to the upper
430 | /// elements of the result. |
431 | /// |
432 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmsub_ss) |
#[inline]
#[target_feature(enable = "fma")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
437 | pub unsafe fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128 { |
438 | vfnmsubss(a, b, c) |
439 | } |
440 | |
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.fma.vfmadd.sd"]
    fn vfmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfmadd.ss"]
    fn vfmaddss(a: __m128, b: __m128, c: __m128) -> __m128;
    #[link_name = "llvm.x86.fma.vfmaddsub.pd"]
    fn vfmaddsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfmaddsub.pd.256"]
    fn vfmaddsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
    #[link_name = "llvm.x86.fma.vfmaddsub.ps"]
    fn vfmaddsubps(a: __m128, b: __m128, c: __m128) -> __m128;
    #[link_name = "llvm.x86.fma.vfmaddsub.ps.256"]
    fn vfmaddsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
    #[link_name = "llvm.x86.fma.vfmsub.pd"]
    fn vfmsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfmsub.pd.256"]
    fn vfmsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
    #[link_name = "llvm.x86.fma.vfmsub.ps"]
    fn vfmsubps(a: __m128, b: __m128, c: __m128) -> __m128;
    #[link_name = "llvm.x86.fma.vfmsub.ps.256"]
    fn vfmsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
    #[link_name = "llvm.x86.fma.vfmsub.sd"]
    fn vfmsubsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfmsub.ss"]
    fn vfmsubss(a: __m128, b: __m128, c: __m128) -> __m128;
    #[link_name = "llvm.x86.fma.vfmsubadd.pd"]
    fn vfmsubaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfmsubadd.pd.256"]
    fn vfmsubaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
    #[link_name = "llvm.x86.fma.vfmsubadd.ps"]
    fn vfmsubaddps(a: __m128, b: __m128, c: __m128) -> __m128;
    #[link_name = "llvm.x86.fma.vfmsubadd.ps.256"]
    fn vfmsubaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
    #[link_name = "llvm.x86.fma.vfnmadd.pd"]
    fn vfnmaddpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfnmadd.pd.256"]
    fn vfnmaddpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
    #[link_name = "llvm.x86.fma.vfnmadd.ps"]
    fn vfnmaddps(a: __m128, b: __m128, c: __m128) -> __m128;
    #[link_name = "llvm.x86.fma.vfnmadd.ps.256"]
    fn vfnmaddps256(a: __m256, b: __m256, c: __m256) -> __m256;
    #[link_name = "llvm.x86.fma.vfnmadd.sd"]
    fn vfnmaddsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfnmadd.ss"]
    fn vfnmaddss(a: __m128, b: __m128, c: __m128) -> __m128;
    #[link_name = "llvm.x86.fma.vfnmsub.pd"]
    fn vfnmsubpd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfnmsub.pd.256"]
    fn vfnmsubpd256(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
    #[link_name = "llvm.x86.fma.vfnmsub.ps"]
    fn vfnmsubps(a: __m128, b: __m128, c: __m128) -> __m128;
    #[link_name = "llvm.x86.fma.vfnmsub.ps.256"]
    fn vfnmsubps256(a: __m256, b: __m256, c: __m256) -> __m256;
    #[link_name = "llvm.x86.fma.vfnmsub.sd"]
    fn vfnmsubsd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
    #[link_name = "llvm.x86.fma.vfnmsub.ss"]
    fn vfnmsubss(a: __m128, b: __m128, c: __m128) -> __m128;
499 | } |
500 | |
#[cfg(test)]
502 | mod tests { |
503 | |
504 | use stdarch_test::simd_test; |
505 | |
506 | use crate::core_arch::x86::*; |
507 | |
    #[simd_test(enable = "fma")]
509 | unsafe fn test_mm_fmadd_pd() { |
510 | let a = _mm_setr_pd(1., 2.); |
511 | let b = _mm_setr_pd(5., 3.); |
512 | let c = _mm_setr_pd(4., 9.); |
513 | let r = _mm_setr_pd(9., 15.); |
514 | assert_eq_m128d(_mm_fmadd_pd(a, b, c), r); |
515 | } |
516 | |
    #[simd_test(enable = "fma")]
518 | unsafe fn test_mm256_fmadd_pd() { |
519 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
520 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
521 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
522 | let r = _mm256_setr_pd(9., 15., 22., 15.); |
523 | assert_eq_m256d(_mm256_fmadd_pd(a, b, c), r); |
524 | } |
525 | |
    #[simd_test(enable = "fma")]
527 | unsafe fn test_mm_fmadd_ps() { |
528 | let a = _mm_setr_ps(1., 2., 3., 4.); |
529 | let b = _mm_setr_ps(5., 3., 7., 2.); |
530 | let c = _mm_setr_ps(4., 9., 1., 7.); |
531 | let r = _mm_setr_ps(9., 15., 22., 15.); |
532 | assert_eq_m128(_mm_fmadd_ps(a, b, c), r); |
533 | } |
534 | |
    #[simd_test(enable = "fma")]
536 | unsafe fn test_mm256_fmadd_ps() { |
537 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
538 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
539 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
540 | let r = _mm256_setr_ps(9., 15., 22., 15., -5., -49., -2., -31.); |
541 | assert_eq_m256(_mm256_fmadd_ps(a, b, c), r); |
542 | } |
543 | |
    #[simd_test(enable = "fma")]
545 | unsafe fn test_mm_fmadd_sd() { |
546 | let a = _mm_setr_pd(1., 2.); |
547 | let b = _mm_setr_pd(5., 3.); |
548 | let c = _mm_setr_pd(4., 9.); |
549 | let r = _mm_setr_pd(9., 2.); |
550 | assert_eq_m128d(_mm_fmadd_sd(a, b, c), r); |
551 | } |
552 | |
    #[simd_test(enable = "fma")]
554 | unsafe fn test_mm_fmadd_ss() { |
555 | let a = _mm_setr_ps(1., 2., 3., 4.); |
556 | let b = _mm_setr_ps(5., 3., 7., 2.); |
557 | let c = _mm_setr_ps(4., 9., 1., 7.); |
558 | let r = _mm_setr_ps(9., 2., 3., 4.); |
559 | assert_eq_m128(_mm_fmadd_ss(a, b, c), r); |
560 | } |
561 | |
    #[simd_test(enable = "fma")]
563 | unsafe fn test_mm_fmaddsub_pd() { |
564 | let a = _mm_setr_pd(1., 2.); |
565 | let b = _mm_setr_pd(5., 3.); |
566 | let c = _mm_setr_pd(4., 9.); |
567 | let r = _mm_setr_pd(1., 15.); |
568 | assert_eq_m128d(_mm_fmaddsub_pd(a, b, c), r); |
569 | } |
570 | |
    #[simd_test(enable = "fma")]
572 | unsafe fn test_mm256_fmaddsub_pd() { |
573 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
574 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
575 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
576 | let r = _mm256_setr_pd(1., 15., 20., 15.); |
577 | assert_eq_m256d(_mm256_fmaddsub_pd(a, b, c), r); |
578 | } |
579 | |
    #[simd_test(enable = "fma")]
581 | unsafe fn test_mm_fmaddsub_ps() { |
582 | let a = _mm_setr_ps(1., 2., 3., 4.); |
583 | let b = _mm_setr_ps(5., 3., 7., 2.); |
584 | let c = _mm_setr_ps(4., 9., 1., 7.); |
585 | let r = _mm_setr_ps(1., 15., 20., 15.); |
586 | assert_eq_m128(_mm_fmaddsub_ps(a, b, c), r); |
587 | } |
588 | |
    #[simd_test(enable = "fma")]
590 | unsafe fn test_mm256_fmaddsub_ps() { |
591 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
592 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
593 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
594 | let r = _mm256_setr_ps(1., 15., 20., 15., 5., -49., 2., -31.); |
595 | assert_eq_m256(_mm256_fmaddsub_ps(a, b, c), r); |
596 | } |
597 | |
    #[simd_test(enable = "fma")]
599 | unsafe fn test_mm_fmsub_pd() { |
600 | let a = _mm_setr_pd(1., 2.); |
601 | let b = _mm_setr_pd(5., 3.); |
602 | let c = _mm_setr_pd(4., 9.); |
603 | let r = _mm_setr_pd(1., -3.); |
604 | assert_eq_m128d(_mm_fmsub_pd(a, b, c), r); |
605 | } |
606 | |
    #[simd_test(enable = "fma")]
608 | unsafe fn test_mm256_fmsub_pd() { |
609 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
610 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
611 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
612 | let r = _mm256_setr_pd(1., -3., 20., 1.); |
613 | assert_eq_m256d(_mm256_fmsub_pd(a, b, c), r); |
614 | } |
615 | |
    #[simd_test(enable = "fma")]
617 | unsafe fn test_mm_fmsub_ps() { |
618 | let a = _mm_setr_ps(1., 2., 3., 4.); |
619 | let b = _mm_setr_ps(5., 3., 7., 2.); |
620 | let c = _mm_setr_ps(4., 9., 1., 7.); |
621 | let r = _mm_setr_ps(1., -3., 20., 1.); |
622 | assert_eq_m128(_mm_fmsub_ps(a, b, c), r); |
623 | } |
624 | |
    #[simd_test(enable = "fma")]
626 | unsafe fn test_mm256_fmsub_ps() { |
627 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
628 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
629 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
630 | let r = _mm256_setr_ps(1., -3., 20., 1., 5., -71., 2., -25.); |
631 | assert_eq_m256(_mm256_fmsub_ps(a, b, c), r); |
632 | } |
633 | |
    #[simd_test(enable = "fma")]
635 | unsafe fn test_mm_fmsub_sd() { |
636 | let a = _mm_setr_pd(1., 2.); |
637 | let b = _mm_setr_pd(5., 3.); |
638 | let c = _mm_setr_pd(4., 9.); |
639 | let r = _mm_setr_pd(1., 2.); |
640 | assert_eq_m128d(_mm_fmsub_sd(a, b, c), r); |
641 | } |
642 | |
    #[simd_test(enable = "fma")]
644 | unsafe fn test_mm_fmsub_ss() { |
645 | let a = _mm_setr_ps(1., 2., 3., 4.); |
646 | let b = _mm_setr_ps(5., 3., 7., 2.); |
647 | let c = _mm_setr_ps(4., 9., 1., 7.); |
648 | let r = _mm_setr_ps(1., 2., 3., 4.); |
649 | assert_eq_m128(_mm_fmsub_ss(a, b, c), r); |
650 | } |
651 | |
    #[simd_test(enable = "fma")]
653 | unsafe fn test_mm_fmsubadd_pd() { |
654 | let a = _mm_setr_pd(1., 2.); |
655 | let b = _mm_setr_pd(5., 3.); |
656 | let c = _mm_setr_pd(4., 9.); |
657 | let r = _mm_setr_pd(9., -3.); |
658 | assert_eq_m128d(_mm_fmsubadd_pd(a, b, c), r); |
659 | } |
660 | |
    #[simd_test(enable = "fma")]
662 | unsafe fn test_mm256_fmsubadd_pd() { |
663 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
664 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
665 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
666 | let r = _mm256_setr_pd(9., -3., 22., 1.); |
667 | assert_eq_m256d(_mm256_fmsubadd_pd(a, b, c), r); |
668 | } |
669 | |
    #[simd_test(enable = "fma")]
671 | unsafe fn test_mm_fmsubadd_ps() { |
672 | let a = _mm_setr_ps(1., 2., 3., 4.); |
673 | let b = _mm_setr_ps(5., 3., 7., 2.); |
674 | let c = _mm_setr_ps(4., 9., 1., 7.); |
675 | let r = _mm_setr_ps(9., -3., 22., 1.); |
676 | assert_eq_m128(_mm_fmsubadd_ps(a, b, c), r); |
677 | } |
678 | |
    #[simd_test(enable = "fma")]
680 | unsafe fn test_mm256_fmsubadd_ps() { |
681 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
682 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
683 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
684 | let r = _mm256_setr_ps(9., -3., 22., 1., -5., -71., -2., -25.); |
685 | assert_eq_m256(_mm256_fmsubadd_ps(a, b, c), r); |
686 | } |
687 | |
    #[simd_test(enable = "fma")]
689 | unsafe fn test_mm_fnmadd_pd() { |
690 | let a = _mm_setr_pd(1., 2.); |
691 | let b = _mm_setr_pd(5., 3.); |
692 | let c = _mm_setr_pd(4., 9.); |
693 | let r = _mm_setr_pd(-1., 3.); |
694 | assert_eq_m128d(_mm_fnmadd_pd(a, b, c), r); |
695 | } |
696 | |
    #[simd_test(enable = "fma")]
698 | unsafe fn test_mm256_fnmadd_pd() { |
699 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
700 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
701 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
702 | let r = _mm256_setr_pd(-1., 3., -20., -1.); |
703 | assert_eq_m256d(_mm256_fnmadd_pd(a, b, c), r); |
704 | } |
705 | |
    #[simd_test(enable = "fma")]
707 | unsafe fn test_mm_fnmadd_ps() { |
708 | let a = _mm_setr_ps(1., 2., 3., 4.); |
709 | let b = _mm_setr_ps(5., 3., 7., 2.); |
710 | let c = _mm_setr_ps(4., 9., 1., 7.); |
711 | let r = _mm_setr_ps(-1., 3., -20., -1.); |
712 | assert_eq_m128(_mm_fnmadd_ps(a, b, c), r); |
713 | } |
714 | |
    #[simd_test(enable = "fma")]
716 | unsafe fn test_mm256_fnmadd_ps() { |
717 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
718 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
719 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
720 | let r = _mm256_setr_ps(-1., 3., -20., -1., -5., 71., -2., 25.); |
721 | assert_eq_m256(_mm256_fnmadd_ps(a, b, c), r); |
722 | } |
723 | |
    #[simd_test(enable = "fma")]
725 | unsafe fn test_mm_fnmadd_sd() { |
726 | let a = _mm_setr_pd(1., 2.); |
727 | let b = _mm_setr_pd(5., 3.); |
728 | let c = _mm_setr_pd(4., 9.); |
729 | let r = _mm_setr_pd(-1., 2.); |
730 | assert_eq_m128d(_mm_fnmadd_sd(a, b, c), r); |
731 | } |
732 | |
    #[simd_test(enable = "fma")]
734 | unsafe fn test_mm_fnmadd_ss() { |
735 | let a = _mm_setr_ps(1., 2., 3., 4.); |
736 | let b = _mm_setr_ps(5., 3., 7., 2.); |
737 | let c = _mm_setr_ps(4., 9., 1., 7.); |
738 | let r = _mm_setr_ps(-1., 2., 3., 4.); |
739 | assert_eq_m128(_mm_fnmadd_ss(a, b, c), r); |
740 | } |
741 | |
    #[simd_test(enable = "fma")]
743 | unsafe fn test_mm_fnmsub_pd() { |
744 | let a = _mm_setr_pd(1., 2.); |
745 | let b = _mm_setr_pd(5., 3.); |
746 | let c = _mm_setr_pd(4., 9.); |
747 | let r = _mm_setr_pd(-9., -15.); |
748 | assert_eq_m128d(_mm_fnmsub_pd(a, b, c), r); |
749 | } |
750 | |
    #[simd_test(enable = "fma")]
752 | unsafe fn test_mm256_fnmsub_pd() { |
753 | let a = _mm256_setr_pd(1., 2., 3., 4.); |
754 | let b = _mm256_setr_pd(5., 3., 7., 2.); |
755 | let c = _mm256_setr_pd(4., 9., 1., 7.); |
756 | let r = _mm256_setr_pd(-9., -15., -22., -15.); |
757 | assert_eq_m256d(_mm256_fnmsub_pd(a, b, c), r); |
758 | } |
759 | |
    #[simd_test(enable = "fma")]
761 | unsafe fn test_mm_fnmsub_ps() { |
762 | let a = _mm_setr_ps(1., 2., 3., 4.); |
763 | let b = _mm_setr_ps(5., 3., 7., 2.); |
764 | let c = _mm_setr_ps(4., 9., 1., 7.); |
765 | let r = _mm_setr_ps(-9., -15., -22., -15.); |
766 | assert_eq_m128(_mm_fnmsub_ps(a, b, c), r); |
767 | } |
768 | |
    #[simd_test(enable = "fma")]
770 | unsafe fn test_mm256_fnmsub_ps() { |
771 | let a = _mm256_setr_ps(1., 2., 3., 4., 0., 10., -1., -2.); |
772 | let b = _mm256_setr_ps(5., 3., 7., 2., 4., -6., 0., 14.); |
773 | let c = _mm256_setr_ps(4., 9., 1., 7., -5., 11., -2., -3.); |
774 | let r = _mm256_setr_ps(-9., -15., -22., -15., 5., 49., 2., 31.); |
775 | assert_eq_m256(_mm256_fnmsub_ps(a, b, c), r); |
776 | } |
777 | |
    #[simd_test(enable = "fma")]
779 | unsafe fn test_mm_fnmsub_sd() { |
780 | let a = _mm_setr_pd(1., 2.); |
781 | let b = _mm_setr_pd(5., 3.); |
782 | let c = _mm_setr_pd(4., 9.); |
783 | let r = _mm_setr_pd(-9., 2.); |
784 | assert_eq_m128d(_mm_fnmsub_sd(a, b, c), r); |
785 | } |
786 | |
    #[simd_test(enable = "fma")]
788 | unsafe fn test_mm_fnmsub_ss() { |
789 | let a = _mm_setr_ps(1., 2., 3., 4.); |
790 | let b = _mm_setr_ps(5., 3., 7., 2.); |
791 | let c = _mm_setr_ps(4., 9., 1., 7.); |
792 | let r = _mm_setr_ps(-9., 2., 3., 4.); |
793 | assert_eq_m128(_mm_fnmsub_ss(a, b, c), r); |
794 | } |
795 | } |
796 | |