use crate::core_arch::x86::*;
use crate::intrinsics::simd::simd_select_bitmask;

#[cfg(test)]
use stdarch_test::assert_instr;

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
    unsafe { vpmadd52huq_512(a, b, c) }
}

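// Lane-wise, the madd52hi intrinsics compute the equivalent of the following
// scalar sketch (illustrative only, not the actual implementation): with
// `m52 = (1u128 << 52) - 1`, each 64-bit element becomes
// `a.wrapping_add((((b as u128 & m52) * (c as u128 & m52)) >> 52) as u64)`.
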
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using writemask `k` (elements are copied
/// from `a` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm512_mask_madd52hi_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
    unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using zeromask `k` (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm512_maskz_madd52hi_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
    unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512()) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
    unsafe { vpmadd52luq_512(a, b, c) }
}

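// Lane-wise sketch for the madd52lo intrinsics, as above but keeping the low
// 52 bits of the 104-bit product:
// `a.wrapping_add((((b as u128 & m52) * (c as u128 & m52)) & m52) as u64)`.
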
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using writemask `k` (elements are copied
/// from `a` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_mask_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm512_mask_madd52lo_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
    unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using zeromask `k` (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm512_maskz_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm512_maskz_madd52lo_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
    unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64)
#[inline]
#[target_feature(enable = "avxifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpmadd52huq)
)]
pub fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    unsafe { vpmadd52huq_256(a, b, c) }
}

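// Note: the `_avx_`-suffixed variants require only the AVX-IFMA feature,
// while the variants below need AVX512IFMA plus AVX512VL (see the
// `target_feature` attributes); both lower to the same `vpmadd52huq` /
// `vpmadd52luq` instructions.
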
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    unsafe { vpmadd52huq_256(a, b, c) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using writemask `k` (elements are copied
/// from `a` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm256_mask_madd52hi_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
    unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using zeromask `k` (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm256_maskz_madd52hi_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64)
#[inline]
#[target_feature(enable = "avxifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpmadd52luq)
)]
pub fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    unsafe { vpmadd52luq_256(a, b, c) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    unsafe { vpmadd52luq_256(a, b, c) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using writemask `k` (elements are copied
/// from `a` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_mask_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm256_mask_madd52lo_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
    unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using zeromask `k` (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm256_maskz_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm256_maskz_madd52lo_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
    unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64)
#[inline]
#[target_feature(enable = "avxifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpmadd52huq)
)]
pub fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    unsafe { vpmadd52huq_128(a, b, c) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    unsafe { vpmadd52huq_128(a, b, c) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using writemask `k` (elements are copied
/// from `a` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
    unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using zeromask `k` (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52hi_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52huq))]
pub fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64)
#[inline]
#[target_feature(enable = "avxifma")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vpmadd52luq)
)]
pub fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    unsafe { vpmadd52luq_128(a, b, c) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    unsafe { vpmadd52luq_128(a, b, c) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using writemask `k` (elements are copied
/// from `a` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_mask_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
    unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a) }
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the
/// corresponding unsigned 64-bit integer in `a`, and store the
/// results in `dst` using zeromask `k` (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avx512techs=AVX512IFMA52&text=_mm_maskz_madd52lo_epu64)
#[inline]
#[target_feature(enable = "avx512ifma,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vpmadd52luq))]
pub fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
    unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128()) }
}

#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
    unsafe fn vpmadd52luq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.128"]
    unsafe fn vpmadd52huq_128(z: __m128i, x: __m128i, y: __m128i) -> __m128i;
    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.256"]
    unsafe fn vpmadd52luq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.256"]
    unsafe fn vpmadd52huq_256(z: __m256i, x: __m256i, y: __m256i) -> __m256i;
    #[link_name = "llvm.x86.avx512.vpmadd52l.uq.512"]
    unsafe fn vpmadd52luq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
    #[link_name = "llvm.x86.avx512.vpmadd52h.uq.512"]
    unsafe fn vpmadd52huq_512(z: __m512i, x: __m512i, y: __m512i) -> __m512i;
}

#[cfg(test)]
mod tests {

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    const K: __mmask8 = 0b01101101;

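    // Scalar models of one 52-bit multiply-add lane, kept as sketches to
    // document how the `expected` constants in the tests below were derived.
    // These helpers are illustrative and not part of the public API.
    #[allow(dead_code)]
    fn madd52hi_scalar(a: u64, b: u64, c: u64) -> u64 {
        const M52: u128 = (1 << 52) - 1;
        // High 52 bits of the 104-bit product, added to the accumulator.
        a.wrapping_add((((b as u128 & M52) * (c as u128 & M52)) >> 52) as u64)
    }

    #[allow(dead_code)]
    fn madd52lo_scalar(a: u64, b: u64, c: u64) -> u64 {
        const M52: u128 = (1 << 52) - 1;
        // Low 52 bits of the 104-bit product, added to the accumulator.
        a.wrapping_add((((b as u128 & M52) * (c as u128 & M52)) & M52) as u64)
    }
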
    #[simd_test(enable = "avx512ifma")]
    unsafe fn test_mm512_madd52hi_epu64() {
        let a = _mm512_set1_epi64(10 << 40);
        let b = _mm512_set1_epi64((11 << 40) + 4);
        let c = _mm512_set1_epi64((12 << 40) + 3);

        let actual = _mm512_madd52hi_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let expected = _mm512_set1_epi64(11030549757952);

        assert_eq_m512i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma")]
    unsafe fn test_mm512_mask_madd52hi_epu64() {
        let a = _mm512_set1_epi64(10 << 40);
        let b = _mm512_set1_epi64((11 << 40) + 4);
        let c = _mm512_set1_epi64((12 << 40) + 3);

        let actual = _mm512_mask_madd52hi_epu64(a, K, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let mut expected = _mm512_set1_epi64(11030549757952);
        expected = _mm512_mask_blend_epi64(K, a, expected);

        assert_eq_m512i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma")]
    unsafe fn test_mm512_maskz_madd52hi_epu64() {
        let a = _mm512_set1_epi64(10 << 40);
        let b = _mm512_set1_epi64((11 << 40) + 4);
        let c = _mm512_set1_epi64((12 << 40) + 3);

        let actual = _mm512_maskz_madd52hi_epu64(K, a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let mut expected = _mm512_set1_epi64(11030549757952);
        expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected);

        assert_eq_m512i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma")]
    unsafe fn test_mm512_madd52lo_epu64() {
        let a = _mm512_set1_epi64(10 << 40);
        let b = _mm512_set1_epi64((11 << 40) + 4);
        let c = _mm512_set1_epi64((12 << 40) + 3);

        let actual = _mm512_madd52lo_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let expected = _mm512_set1_epi64(100055558127628);

        assert_eq_m512i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma")]
    unsafe fn test_mm512_mask_madd52lo_epu64() {
        let a = _mm512_set1_epi64(10 << 40);
        let b = _mm512_set1_epi64((11 << 40) + 4);
        let c = _mm512_set1_epi64((12 << 40) + 3);

        let actual = _mm512_mask_madd52lo_epu64(a, K, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let mut expected = _mm512_set1_epi64(100055558127628);
        expected = _mm512_mask_blend_epi64(K, a, expected);

        assert_eq_m512i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma")]
    unsafe fn test_mm512_maskz_madd52lo_epu64() {
        let a = _mm512_set1_epi64(10 << 40);
        let b = _mm512_set1_epi64((11 << 40) + 4);
        let c = _mm512_set1_epi64((12 << 40) + 3);

        let actual = _mm512_maskz_madd52lo_epu64(K, a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let mut expected = _mm512_set1_epi64(100055558127628);
        expected = _mm512_mask_blend_epi64(K, _mm512_setzero_si512(), expected);

        assert_eq_m512i(expected, actual);
    }

    #[simd_test(enable = "avxifma")]
    unsafe fn test_mm256_madd52hi_avx_epu64() {
        let a = _mm256_set1_epi64x(10 << 40);
        let b = _mm256_set1_epi64x((11 << 40) + 4);
        let c = _mm256_set1_epi64x((12 << 40) + 3);

        let actual = _mm256_madd52hi_avx_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let expected = _mm256_set1_epi64x(11030549757952);

        assert_eq_m256i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm256_madd52hi_epu64() {
        let a = _mm256_set1_epi64x(10 << 40);
        let b = _mm256_set1_epi64x((11 << 40) + 4);
        let c = _mm256_set1_epi64x((12 << 40) + 3);

        let actual = _mm256_madd52hi_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let expected = _mm256_set1_epi64x(11030549757952);

        assert_eq_m256i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm256_mask_madd52hi_epu64() {
        let a = _mm256_set1_epi64x(10 << 40);
        let b = _mm256_set1_epi64x((11 << 40) + 4);
        let c = _mm256_set1_epi64x((12 << 40) + 3);

        let actual = _mm256_mask_madd52hi_epu64(a, K, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let mut expected = _mm256_set1_epi64x(11030549757952);
        expected = _mm256_mask_blend_epi64(K, a, expected);

        assert_eq_m256i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm256_maskz_madd52hi_epu64() {
        let a = _mm256_set1_epi64x(10 << 40);
        let b = _mm256_set1_epi64x((11 << 40) + 4);
        let c = _mm256_set1_epi64x((12 << 40) + 3);

        let actual = _mm256_maskz_madd52hi_epu64(K, a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let mut expected = _mm256_set1_epi64x(11030549757952);
        expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected);

        assert_eq_m256i(expected, actual);
    }

    #[simd_test(enable = "avxifma")]
    unsafe fn test_mm256_madd52lo_avx_epu64() {
        let a = _mm256_set1_epi64x(10 << 40);
        let b = _mm256_set1_epi64x((11 << 40) + 4);
        let c = _mm256_set1_epi64x((12 << 40) + 3);

        let actual = _mm256_madd52lo_avx_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let expected = _mm256_set1_epi64x(100055558127628);

        assert_eq_m256i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm256_madd52lo_epu64() {
        let a = _mm256_set1_epi64x(10 << 40);
        let b = _mm256_set1_epi64x((11 << 40) + 4);
        let c = _mm256_set1_epi64x((12 << 40) + 3);

        let actual = _mm256_madd52lo_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let expected = _mm256_set1_epi64x(100055558127628);

        assert_eq_m256i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm256_mask_madd52lo_epu64() {
        let a = _mm256_set1_epi64x(10 << 40);
        let b = _mm256_set1_epi64x((11 << 40) + 4);
        let c = _mm256_set1_epi64x((12 << 40) + 3);

        let actual = _mm256_mask_madd52lo_epu64(a, K, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let mut expected = _mm256_set1_epi64x(100055558127628);
        expected = _mm256_mask_blend_epi64(K, a, expected);

        assert_eq_m256i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm256_maskz_madd52lo_epu64() {
        let a = _mm256_set1_epi64x(10 << 40);
        let b = _mm256_set1_epi64x((11 << 40) + 4);
        let c = _mm256_set1_epi64x((12 << 40) + 3);

        let actual = _mm256_maskz_madd52lo_epu64(K, a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let mut expected = _mm256_set1_epi64x(100055558127628);
        expected = _mm256_mask_blend_epi64(K, _mm256_setzero_si256(), expected);

        assert_eq_m256i(expected, actual);
    }

    #[simd_test(enable = "avxifma")]
    unsafe fn test_mm_madd52hi_avx_epu64() {
        let a = _mm_set1_epi64x(10 << 40);
        let b = _mm_set1_epi64x((11 << 40) + 4);
        let c = _mm_set1_epi64x((12 << 40) + 3);

        let actual = _mm_madd52hi_avx_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let expected = _mm_set1_epi64x(11030549757952);

        assert_eq_m128i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm_madd52hi_epu64() {
        let a = _mm_set1_epi64x(10 << 40);
        let b = _mm_set1_epi64x((11 << 40) + 4);
        let c = _mm_set1_epi64x((12 << 40) + 3);

        let actual = _mm_madd52hi_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let expected = _mm_set1_epi64x(11030549757952);

        assert_eq_m128i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm_mask_madd52hi_epu64() {
        let a = _mm_set1_epi64x(10 << 40);
        let b = _mm_set1_epi64x((11 << 40) + 4);
        let c = _mm_set1_epi64x((12 << 40) + 3);

        let actual = _mm_mask_madd52hi_epu64(a, K, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let mut expected = _mm_set1_epi64x(11030549757952);
        expected = _mm_mask_blend_epi64(K, a, expected);

        assert_eq_m128i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm_maskz_madd52hi_epu64() {
        let a = _mm_set1_epi64x(10 << 40);
        let b = _mm_set1_epi64x((11 << 40) + 4);
        let c = _mm_set1_epi64x((12 << 40) + 3);

        let actual = _mm_maskz_madd52hi_epu64(K, a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
        let mut expected = _mm_set1_epi64x(11030549757952);
        expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected);

        assert_eq_m128i(expected, actual);
    }

    #[simd_test(enable = "avxifma")]
    unsafe fn test_mm_madd52lo_avx_epu64() {
        let a = _mm_set1_epi64x(10 << 40);
        let b = _mm_set1_epi64x((11 << 40) + 4);
        let c = _mm_set1_epi64x((12 << 40) + 3);

        let actual = _mm_madd52lo_avx_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let expected = _mm_set1_epi64x(100055558127628);

        assert_eq_m128i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm_madd52lo_epu64() {
        let a = _mm_set1_epi64x(10 << 40);
        let b = _mm_set1_epi64x((11 << 40) + 4);
        let c = _mm_set1_epi64x((12 << 40) + 3);

        let actual = _mm_madd52lo_epu64(a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let expected = _mm_set1_epi64x(100055558127628);

        assert_eq_m128i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm_mask_madd52lo_epu64() {
        let a = _mm_set1_epi64x(10 << 40);
        let b = _mm_set1_epi64x((11 << 40) + 4);
        let c = _mm_set1_epi64x((12 << 40) + 3);

        let actual = _mm_mask_madd52lo_epu64(a, K, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let mut expected = _mm_set1_epi64x(100055558127628);
        expected = _mm_mask_blend_epi64(K, a, expected);

        assert_eq_m128i(expected, actual);
    }

    #[simd_test(enable = "avx512ifma,avx512vl")]
    unsafe fn test_mm_maskz_madd52lo_epu64() {
        let a = _mm_set1_epi64x(10 << 40);
        let b = _mm_set1_epi64x((11 << 40) + 4);
        let c = _mm_set1_epi64x((12 << 40) + 3);

        let actual = _mm_maskz_madd52lo_epu64(K, a, b, c);

        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
        let mut expected = _mm_set1_epi64x(100055558127628);
        expected = _mm_mask_blend_epi64(K, _mm_setzero_si128(), expected);

        assert_eq_m128i(expected, actual);
    }
}