1 | use crate::core_arch::x86::*; |
2 | use crate::intrinsics::simd::simd_select_bitmask; |
3 | |
#[cfg(test)]
5 | use stdarch_test::assert_instr; |
6 | |
7 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
8 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
9 | /// unsigned integer from the intermediate result with the |
10 | /// corresponding unsigned 64-bit integer in `a`, and store the |
11 | /// results in `dst`. |
12 | /// |
13 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52hi_epu64) |
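///
/// Informally, each 64-bit lane computes `a + (prod >> 52)`, where
/// `prod = (b & m) * (c & m)` is a 104-bit product and `m = (1 << 52) - 1`;
/// the addition wraps modulo 2^64. See Intel's pseudocode for the
/// authoritative definition.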
#[inline]
#[target_feature(enable = "avx512ifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
18 | pub fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { |
    unsafe { vpmadd52huq_512(a, b, c) }
20 | } |
21 | |
22 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
23 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
24 | /// unsigned integer from the intermediate result with the |
25 | /// corresponding unsigned 64-bit integer in `a`, and store the |
26 | /// results in `dst` using writemask `k` (elements are copied |
/// from `a` when the corresponding mask bit is not set).
28 | /// |
29 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52hi_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
34 | pub fn _mm512_mask_madd52hi_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { |
    unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a) }
36 | } |
37 | |
38 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
39 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
40 | /// unsigned integer from the intermediate result with the |
41 | /// corresponding unsigned 64-bit integer in `a`, and store the |
42 | /// results in `dst` using writemask `k` (elements are zeroed |
43 | /// out when the corresponding mask bit is not set). |
44 | /// |
45 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52hi_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
50 | pub fn _mm512_maskz_madd52hi_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { |
    unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512()) }
52 | } |
53 | |
54 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
55 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
56 | /// unsigned integer from the intermediate result with the |
57 | /// corresponding unsigned 64-bit integer in `a`, and store the |
58 | /// results in `dst`. |
59 | /// |
60 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52lo_epu64) |
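///
/// Informally, each 64-bit lane computes `a + (prod & m)`, where
/// `prod = (b & m) * (c & m)` is a 104-bit product and `m = (1 << 52) - 1`;
/// the addition wraps modulo 2^64. See Intel's pseudocode for the
/// authoritative definition.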
#[inline]
#[target_feature(enable = "avx512ifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
65 | pub fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { |
    unsafe { vpmadd52luq_512(a, b, c) }
67 | } |
68 | |
69 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
70 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
71 | /// unsigned integer from the intermediate result with the |
72 | /// corresponding unsigned 64-bit integer in `a`, and store the |
73 | /// results in `dst` using writemask `k` (elements are copied |
/// from `a` when the corresponding mask bit is not set).
75 | /// |
76 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52lo_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
81 | pub fn _mm512_mask_madd52lo_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { |
    unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a) }
83 | } |
84 | |
85 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
86 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
87 | /// unsigned integer from the intermediate result with the |
88 | /// corresponding unsigned 64-bit integer in `a`, and store the |
89 | /// results in `dst` using writemask `k` (elements are zeroed |
90 | /// out when the corresponding mask bit is not set). |
91 | /// |
92 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52lo_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
97 | pub fn _mm512_maskz_madd52lo_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { |
    unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) }
99 | } |
100 | |
101 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
102 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
103 | /// unsigned integer from the intermediate result with the |
104 | /// corresponding unsigned 64-bit integer in `a`, and store the |
105 | /// results in `dst`. |
106 | /// |
107 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64) |
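///
/// Unlike [`_mm256_madd52hi_epu64`], this intrinsic requires only the
/// VEX-encoded AVX-IFMA feature rather than AVX512IFMA plus AVX512VL.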
#[inline]
#[target_feature(enable = "avxifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
112 | pub fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { |
    unsafe { vpmadd52huq_256(a, b, c) }
114 | } |
115 | |
116 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
117 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
118 | /// unsigned integer from the intermediate result with the |
119 | /// corresponding unsigned 64-bit integer in `a`, and store the |
120 | /// results in `dst`. |
121 | /// |
122 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52hi_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
127 | pub fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { |
    unsafe { vpmadd52huq_256(a, b, c) }
129 | } |
130 | |
131 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
132 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
133 | /// unsigned integer from the intermediate result with the |
134 | /// corresponding unsigned 64-bit integer in `a`, and store the |
135 | /// results in `dst` using writemask `k` (elements are copied |
/// from `a` when the corresponding mask bit is not set).
137 | /// |
138 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52hi_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
143 | pub fn _mm256_mask_madd52hi_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { |
    unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a) }
145 | } |
146 | |
147 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
148 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
149 | /// unsigned integer from the intermediate result with the |
150 | /// corresponding unsigned 64-bit integer in `a`, and store the |
151 | /// results in `dst` using writemask `k` (elements are zeroed |
152 | /// out when the corresponding mask bit is not set). |
153 | /// |
154 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52hi_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
159 | pub fn _mm256_maskz_madd52hi_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { |
    unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) }
161 | } |
162 | |
163 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
164 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
165 | /// unsigned integer from the intermediate result with the |
166 | /// corresponding unsigned 64-bit integer in `a`, and store the |
167 | /// results in `dst`. |
168 | /// |
169 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64) |
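///
/// Unlike [`_mm256_madd52lo_epu64`], this intrinsic requires only the
/// VEX-encoded AVX-IFMA feature rather than AVX512IFMA plus AVX512VL.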
#[inline]
#[target_feature(enable = "avxifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
174 | pub fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { |
    unsafe { vpmadd52luq_256(a, b, c) }
176 | } |
177 | |
178 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
179 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
180 | /// unsigned integer from the intermediate result with the |
181 | /// corresponding unsigned 64-bit integer in `a`, and store the |
182 | /// results in `dst`. |
183 | /// |
184 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52lo_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
189 | pub fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { |
    unsafe { vpmadd52luq_256(a, b, c) }
191 | } |
192 | |
193 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
194 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
195 | /// unsigned integer from the intermediate result with the |
196 | /// corresponding unsigned 64-bit integer in `a`, and store the |
197 | /// results in `dst` using writemask `k` (elements are copied |
/// from `a` when the corresponding mask bit is not set).
199 | /// |
200 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52lo_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
205 | pub fn _mm256_mask_madd52lo_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { |
    unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a) }
207 | } |
208 | |
209 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
210 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
211 | /// unsigned integer from the intermediate result with the |
212 | /// corresponding unsigned 64-bit integer in `a`, and store the |
213 | /// results in `dst` using writemask `k` (elements are zeroed |
214 | /// out when the corresponding mask bit is not set). |
215 | /// |
216 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52lo_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
221 | pub fn _mm256_maskz_madd52lo_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { |
    unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) }
223 | } |
224 | |
225 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
226 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
227 | /// unsigned integer from the intermediate result with the |
228 | /// corresponding unsigned 64-bit integer in `a`, and store the |
229 | /// results in `dst`. |
230 | /// |
231 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64) |
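///
/// Unlike [`_mm_madd52hi_epu64`], this intrinsic requires only the
/// VEX-encoded AVX-IFMA feature rather than AVX512IFMA plus AVX512VL.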
#[inline]
#[target_feature(enable = "avxifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
236 | pub fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { |
    unsafe { vpmadd52huq_128(a, b, c) }
238 | } |
239 | |
240 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
241 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
242 | /// unsigned integer from the intermediate result with the |
243 | /// corresponding unsigned 64-bit integer in `a`, and store the |
244 | /// results in `dst`. |
245 | /// |
246 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52hi_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
251 | pub fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { |
    unsafe { vpmadd52huq_128(a, b, c) }
253 | } |
254 | |
255 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
256 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
257 | /// unsigned integer from the intermediate result with the |
258 | /// corresponding unsigned 64-bit integer in `a`, and store the |
259 | /// results in `dst` using writemask `k` (elements are copied |
/// from `a` when the corresponding mask bit is not set).
261 | /// |
262 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52hi_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
267 | pub fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { |
    unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a) }
269 | } |
270 | |
271 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
272 | /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit |
273 | /// unsigned integer from the intermediate result with the |
274 | /// corresponding unsigned 64-bit integer in `a`, and store the |
275 | /// results in `dst` using writemask `k` (elements are zeroed |
276 | /// out when the corresponding mask bit is not set). |
277 | /// |
278 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52hi_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
283 | pub fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { |
    unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) }
285 | } |
286 | |
287 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
288 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
289 | /// unsigned integer from the intermediate result with the |
290 | /// corresponding unsigned 64-bit integer in `a`, and store the |
291 | /// results in `dst`. |
292 | /// |
293 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64) |
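///
/// Unlike [`_mm_madd52lo_epu64`], this intrinsic requires only the
/// VEX-encoded AVX-IFMA feature rather than AVX512IFMA plus AVX512VL.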
#[inline]
#[target_feature(enable = "avxifma")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
298 | pub fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { |
    unsafe { vpmadd52luq_128(a, b, c) }
300 | } |
301 | |
302 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
303 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
304 | /// unsigned integer from the intermediate result with the |
305 | /// corresponding unsigned 64-bit integer in `a`, and store the |
306 | /// results in `dst`. |
307 | /// |
308 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52lo_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
313 | pub fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { |
    unsafe { vpmadd52luq_128(a, b, c) }
315 | } |
316 | |
317 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
318 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
319 | /// unsigned integer from the intermediate result with the |
320 | /// corresponding unsigned 64-bit integer in `a`, and store the |
321 | /// results in `dst` using writemask `k` (elements are copied |
/// from `a` when the corresponding mask bit is not set).
323 | /// |
324 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52lo_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
329 | pub fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { |
    unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a) }
331 | } |
332 | |
333 | /// Multiply packed unsigned 52-bit integers in each 64-bit element of |
334 | /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit |
335 | /// unsigned integer from the intermediate result with the |
336 | /// corresponding unsigned 64-bit integer in `a`, and store the |
337 | /// results in `dst` using writemask `k` (elements are zeroed |
338 | /// out when the corresponding mask bit is not set). |
339 | /// |
340 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52lo_epu64) |
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
345 | pub fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { |
    unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128()) }
347 | } |
348 | |
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
    unsafe fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"]
    unsafe fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"]
    unsafe fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"]
    unsafe fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"]
    unsafe fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"]
    unsafe fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
363 | } |
364 | |
#[cfg(test)]
366 | mod tests { |
367 | |
368 | use stdarch_test::simd_test; |
369 | |
370 | use crate::core_arch::x86::*; |
371 | |
372 | const K: __mmask8 = 0b01101101; |
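
    // A minimal scalar model of the madd52 operation, used purely as a sanity
    // check for the hard-coded constants in the tests below. This is our own
    // sketch of the per-lane semantics, not part of Intel's API: multiply the
    // low 52 bits of `b` and `c` into a 104-bit product, then add either its
    // high or its low 52 bits to `a` (wrapping at 64 bits).
    const M52: u64 = (1 << 52) - 1;

    fn madd52hi_scalar(a: u64, b: u64, c: u64) -> u64 {
        let prod = u128::from(b & M52) * u128::from(c & M52);
        a.wrapping_add((prod >> 52) as u64)
    }

    fn madd52lo_scalar(a: u64, b: u64, c: u64) -> u64 {
        let prod = u128::from(b & M52) * u128::from(c & M52);
        a.wrapping_add((prod as u64) & M52)
    }

    #[test]
    fn scalar_model_matches_test_constants() {
        let (a, b, c) = (10u64 << 40, (11u64 << 40) + 4, (12u64 << 40) + 3);
        assert_eq!(madd52hi_scalar(a, b, c), 11030549757952);
        assert_eq!(madd52lo_scalar(a, b, c), 100055558127628);
    }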
373 | |
    #[simd_test(enable = "avx512ifma")]
375 | unsafe fn test_mm512_madd52hi_epu64() { |
376 | let a = _mm512_set1_epi64(10 << 40); |
377 | let b = _mm512_set1_epi64((11 << 40) + 4); |
378 | let c = _mm512_set1_epi64((12 << 40) + 3); |
379 | |
380 | let actual = _mm512_madd52hi_epu64(a, b, c); |
381 | |
382 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
383 | let expected = _mm512_set1_epi64(11030549757952); |
384 | |
385 | assert_eq_m512i(expected, actual); |
386 | } |
387 | |
    #[simd_test(enable = "avx512ifma")]
389 | unsafe fn test_mm512_mask_madd52hi_epu64() { |
390 | let a = _mm512_set1_epi64(10 << 40); |
391 | let b = _mm512_set1_epi64((11 << 40) + 4); |
392 | let c = _mm512_set1_epi64((12 << 40) + 3); |
393 | |
394 | let actual = _mm512_mask_madd52hi_epu64(a, K, b, c); |
395 | |
396 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
397 | let mut expected = _mm512_set1_epi64(11030549757952); |
398 | expected = _mm512_mask_blend_epi64(K, a, expected); |
399 | |
400 | assert_eq_m512i(expected, actual); |
401 | } |
402 | |
    #[simd_test(enable = "avx512ifma")]
404 | unsafe fn test_mm512_maskz_madd52hi_epu64() { |
405 | let a = _mm512_set1_epi64(10 << 40); |
406 | let b = _mm512_set1_epi64((11 << 40) + 4); |
407 | let c = _mm512_set1_epi64((12 << 40) + 3); |
408 | |
409 | let actual = _mm512_maskz_madd52hi_epu64(K, a, b, c); |
410 | |
411 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
412 | let mut expected = _mm512_set1_epi64(11030549757952); |
413 | expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected); |
414 | |
415 | assert_eq_m512i(expected, actual); |
416 | } |
417 | |
    #[simd_test(enable = "avx512ifma")]
419 | unsafe fn test_mm512_madd52lo_epu64() { |
420 | let a = _mm512_set1_epi64(10 << 40); |
421 | let b = _mm512_set1_epi64((11 << 40) + 4); |
422 | let c = _mm512_set1_epi64((12 << 40) + 3); |
423 | |
424 | let actual = _mm512_madd52lo_epu64(a, b, c); |
425 | |
426 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
427 | let expected = _mm512_set1_epi64(100055558127628); |
428 | |
429 | assert_eq_m512i(expected, actual); |
430 | } |
431 | |
    #[simd_test(enable = "avx512ifma")]
433 | unsafe fn test_mm512_mask_madd52lo_epu64() { |
434 | let a = _mm512_set1_epi64(10 << 40); |
435 | let b = _mm512_set1_epi64((11 << 40) + 4); |
436 | let c = _mm512_set1_epi64((12 << 40) + 3); |
437 | |
438 | let actual = _mm512_mask_madd52lo_epu64(a, K, b, c); |
439 | |
440 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
441 | let mut expected = _mm512_set1_epi64(100055558127628); |
442 | expected = _mm512_mask_blend_epi64(K, a, expected); |
443 | |
444 | assert_eq_m512i(expected, actual); |
445 | } |
446 | |
    #[simd_test(enable = "avx512ifma")]
448 | unsafe fn test_mm512_maskz_madd52lo_epu64() { |
449 | let a = _mm512_set1_epi64(10 << 40); |
450 | let b = _mm512_set1_epi64((11 << 40) + 4); |
451 | let c = _mm512_set1_epi64((12 << 40) + 3); |
452 | |
453 | let actual = _mm512_maskz_madd52lo_epu64(K, a, b, c); |
454 | |
455 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
456 | let mut expected = _mm512_set1_epi64(100055558127628); |
457 | expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected); |
458 | |
459 | assert_eq_m512i(expected, actual); |
460 | } |
461 | |
    #[simd_test(enable = "avxifma")]
463 | unsafe fn test_mm256_madd52hi_avx_epu64() { |
464 | let a = _mm256_set1_epi64x(10 << 40); |
465 | let b = _mm256_set1_epi64x((11 << 40) + 4); |
466 | let c = _mm256_set1_epi64x((12 << 40) + 3); |
467 | |
468 | let actual = _mm256_madd52hi_avx_epu64(a, b, c); |
469 | |
470 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
471 | let expected = _mm256_set1_epi64x(11030549757952); |
472 | |
473 | assert_eq_m256i(expected, actual); |
474 | } |
475 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
477 | unsafe fn test_mm256_madd52hi_epu64() { |
478 | let a = _mm256_set1_epi64x(10 << 40); |
479 | let b = _mm256_set1_epi64x((11 << 40) + 4); |
480 | let c = _mm256_set1_epi64x((12 << 40) + 3); |
481 | |
482 | let actual = _mm256_madd52hi_epu64(a, b, c); |
483 | |
484 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
485 | let expected = _mm256_set1_epi64x(11030549757952); |
486 | |
487 | assert_eq_m256i(expected, actual); |
488 | } |
489 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
491 | unsafe fn test_mm256_mask_madd52hi_epu64() { |
492 | let a = _mm256_set1_epi64x(10 << 40); |
493 | let b = _mm256_set1_epi64x((11 << 40) + 4); |
494 | let c = _mm256_set1_epi64x((12 << 40) + 3); |
495 | |
496 | let actual = _mm256_mask_madd52hi_epu64(a, K, b, c); |
497 | |
498 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
499 | let mut expected = _mm256_set1_epi64x(11030549757952); |
500 | expected = _mm256_mask_blend_epi64(K, a, expected); |
501 | |
502 | assert_eq_m256i(expected, actual); |
503 | } |
504 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
506 | unsafe fn test_mm256_maskz_madd52hi_epu64() { |
507 | let a = _mm256_set1_epi64x(10 << 40); |
508 | let b = _mm256_set1_epi64x((11 << 40) + 4); |
509 | let c = _mm256_set1_epi64x((12 << 40) + 3); |
510 | |
511 | let actual = _mm256_maskz_madd52hi_epu64(K, a, b, c); |
512 | |
513 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
514 | let mut expected = _mm256_set1_epi64x(11030549757952); |
515 | expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected); |
516 | |
517 | assert_eq_m256i(expected, actual); |
518 | } |
519 | |
    #[simd_test(enable = "avxifma")]
521 | unsafe fn test_mm256_madd52lo_avx_epu64() { |
522 | let a = _mm256_set1_epi64x(10 << 40); |
523 | let b = _mm256_set1_epi64x((11 << 40) + 4); |
524 | let c = _mm256_set1_epi64x((12 << 40) + 3); |
525 | |
526 | let actual = _mm256_madd52lo_avx_epu64(a, b, c); |
527 | |
528 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
529 | let expected = _mm256_set1_epi64x(100055558127628); |
530 | |
531 | assert_eq_m256i(expected, actual); |
532 | } |
533 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
535 | unsafe fn test_mm256_madd52lo_epu64() { |
536 | let a = _mm256_set1_epi64x(10 << 40); |
537 | let b = _mm256_set1_epi64x((11 << 40) + 4); |
538 | let c = _mm256_set1_epi64x((12 << 40) + 3); |
539 | |
540 | let actual = _mm256_madd52lo_epu64(a, b, c); |
541 | |
542 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
543 | let expected = _mm256_set1_epi64x(100055558127628); |
544 | |
545 | assert_eq_m256i(expected, actual); |
546 | } |
547 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
549 | unsafe fn test_mm256_mask_madd52lo_epu64() { |
550 | let a = _mm256_set1_epi64x(10 << 40); |
551 | let b = _mm256_set1_epi64x((11 << 40) + 4); |
552 | let c = _mm256_set1_epi64x((12 << 40) + 3); |
553 | |
554 | let actual = _mm256_mask_madd52lo_epu64(a, K, b, c); |
555 | |
556 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
557 | let mut expected = _mm256_set1_epi64x(100055558127628); |
558 | expected = _mm256_mask_blend_epi64(K, a, expected); |
559 | |
560 | assert_eq_m256i(expected, actual); |
561 | } |
562 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
564 | unsafe fn test_mm256_maskz_madd52lo_epu64() { |
565 | let a = _mm256_set1_epi64x(10 << 40); |
566 | let b = _mm256_set1_epi64x((11 << 40) + 4); |
567 | let c = _mm256_set1_epi64x((12 << 40) + 3); |
568 | |
569 | let actual = _mm256_maskz_madd52lo_epu64(K, a, b, c); |
570 | |
571 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
572 | let mut expected = _mm256_set1_epi64x(100055558127628); |
573 | expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected); |
574 | |
575 | assert_eq_m256i(expected, actual); |
576 | } |
577 | |
    #[simd_test(enable = "avxifma")]
579 | unsafe fn test_mm_madd52hi_avx_epu64() { |
580 | let a = _mm_set1_epi64x(10 << 40); |
581 | let b = _mm_set1_epi64x((11 << 40) + 4); |
582 | let c = _mm_set1_epi64x((12 << 40) + 3); |
583 | |
584 | let actual = _mm_madd52hi_avx_epu64(a, b, c); |
585 | |
586 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
587 | let expected = _mm_set1_epi64x(11030549757952); |
588 | |
589 | assert_eq_m128i(expected, actual); |
590 | } |
591 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
593 | unsafe fn test_mm_madd52hi_epu64() { |
594 | let a = _mm_set1_epi64x(10 << 40); |
595 | let b = _mm_set1_epi64x((11 << 40) + 4); |
596 | let c = _mm_set1_epi64x((12 << 40) + 3); |
597 | |
598 | let actual = _mm_madd52hi_epu64(a, b, c); |
599 | |
600 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
601 | let expected = _mm_set1_epi64x(11030549757952); |
602 | |
603 | assert_eq_m128i(expected, actual); |
604 | } |
605 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
607 | unsafe fn test_mm_mask_madd52hi_epu64() { |
608 | let a = _mm_set1_epi64x(10 << 40); |
609 | let b = _mm_set1_epi64x((11 << 40) + 4); |
610 | let c = _mm_set1_epi64x((12 << 40) + 3); |
611 | |
612 | let actual = _mm_mask_madd52hi_epu64(a, K, b, c); |
613 | |
614 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
615 | let mut expected = _mm_set1_epi64x(11030549757952); |
616 | expected = _mm_mask_blend_epi64(K, a, expected); |
617 | |
618 | assert_eq_m128i(expected, actual); |
619 | } |
620 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
622 | unsafe fn test_mm_maskz_madd52hi_epu64() { |
623 | let a = _mm_set1_epi64x(10 << 40); |
624 | let b = _mm_set1_epi64x((11 << 40) + 4); |
625 | let c = _mm_set1_epi64x((12 << 40) + 3); |
626 | |
627 | let actual = _mm_maskz_madd52hi_epu64(K, a, b, c); |
628 | |
629 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) |
630 | let mut expected = _mm_set1_epi64x(11030549757952); |
631 | expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected); |
632 | |
633 | assert_eq_m128i(expected, actual); |
634 | } |
635 | |
    #[simd_test(enable = "avxifma")]
637 | unsafe fn test_mm_madd52lo_avx_epu64() { |
638 | let a = _mm_set1_epi64x(10 << 40); |
639 | let b = _mm_set1_epi64x((11 << 40) + 4); |
640 | let c = _mm_set1_epi64x((12 << 40) + 3); |
641 | |
642 | let actual = _mm_madd52lo_avx_epu64(a, b, c); |
643 | |
644 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
645 | let expected = _mm_set1_epi64x(100055558127628); |
646 | |
647 | assert_eq_m128i(expected, actual); |
648 | } |
649 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
651 | unsafe fn test_mm_madd52lo_epu64() { |
652 | let a = _mm_set1_epi64x(10 << 40); |
653 | let b = _mm_set1_epi64x((11 << 40) + 4); |
654 | let c = _mm_set1_epi64x((12 << 40) + 3); |
655 | |
656 | let actual = _mm_madd52lo_epu64(a, b, c); |
657 | |
658 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
659 | let expected = _mm_set1_epi64x(100055558127628); |
660 | |
661 | assert_eq_m128i(expected, actual); |
662 | } |
663 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
665 | unsafe fn test_mm_mask_madd52lo_epu64() { |
666 | let a = _mm_set1_epi64x(10 << 40); |
667 | let b = _mm_set1_epi64x((11 << 40) + 4); |
668 | let c = _mm_set1_epi64x((12 << 40) + 3); |
669 | |
670 | let actual = _mm_mask_madd52lo_epu64(a, K, b, c); |
671 | |
672 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
673 | let mut expected = _mm_set1_epi64x(100055558127628); |
674 | expected = _mm_mask_blend_epi64(K, a, expected); |
675 | |
676 | assert_eq_m128i(expected, actual); |
677 | } |
678 | |
    #[simd_test(enable = "avx512ifma,avx512vl")]
680 | unsafe fn test_mm_maskz_madd52lo_epu64() { |
681 | let a = _mm_set1_epi64x(10 << 40); |
682 | let b = _mm_set1_epi64x((11 << 40) + 4); |
683 | let c = _mm_set1_epi64x((12 << 40) + 3); |
684 | |
685 | let actual = _mm_maskz_madd52lo_epu64(K, a, b, c); |
686 | |
687 | // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) |
688 | let mut expected = _mm_set1_epi64x(100055558127628); |
689 | expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected); |
690 | |
691 | assert_eq_m128i(expected, actual); |
692 | } |
693 | } |
694 | |