use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;
6 | |
7 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. |
8 | /// |
9 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssd_epi32&expand=2219) |
10 | #[inline ] |
11 | #[target_feature (enable = "avx512vnni" )] |
12 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
13 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
14 | pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { |
15 | transmute(src:vpdpwssd(src:src.as_i32x16(), a:a.as_i32x16(), b:b.as_i32x16())) |
16 | } |
17 | |
18 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
19 | /// |
20 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssd_epi32&expand=2220) |
21 | #[inline ] |
22 | #[target_feature (enable = "avx512vnni" )] |
23 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
24 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
25 | pub unsafe fn _mm512_mask_dpwssd_epi32( |
26 | src: __m512i, |
27 | k: __mmask16, |
28 | a: __m512i, |
29 | b: __m512i, |
30 | ) -> __m512i { |
31 | let r: i32x16 = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); |
32 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x16())) |
33 | } |
34 | |
35 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
36 | /// |
37 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssd_epi32&expand=2221) |
38 | #[inline ] |
39 | #[target_feature (enable = "avx512vnni" )] |
40 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
41 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
42 | pub unsafe fn _mm512_maskz_dpwssd_epi32( |
43 | k: __mmask16, |
44 | src: __m512i, |
45 | a: __m512i, |
46 | b: __m512i, |
47 | ) -> __m512i { |
48 | let r: i32x16 = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); |
49 | let zero: i32x16 = _mm512_setzero_si512().as_i32x16(); |
50 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
51 | } |
52 | |
53 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. |
54 | /// |
55 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216) |
56 | #[inline ] |
57 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
58 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
59 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
60 | pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { |
61 | transmute(src:vpdpwssd256(src:src.as_i32x8(), a:a.as_i32x8(), b:b.as_i32x8())) |
62 | } |
63 | |
64 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
65 | /// |
66 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssd_epi32&expand=2217) |
67 | #[inline ] |
68 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
69 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
70 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
71 | pub unsafe fn _mm256_mask_dpwssd_epi32( |
72 | src: __m256i, |
73 | k: __mmask8, |
74 | a: __m256i, |
75 | b: __m256i, |
76 | ) -> __m256i { |
77 | let r: i32x8 = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); |
78 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x8())) |
79 | } |
80 | |
81 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
82 | /// |
83 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssd_epi32&expand=2218) |
84 | #[inline ] |
85 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
86 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
87 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
88 | pub unsafe fn _mm256_maskz_dpwssd_epi32( |
89 | k: __mmask8, |
90 | src: __m256i, |
91 | a: __m256i, |
92 | b: __m256i, |
93 | ) -> __m256i { |
94 | let r: i32x8 = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); |
95 | let zero: i32x8 = _mm256_setzero_si256().as_i32x8(); |
96 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
97 | } |
98 | |
99 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. |
100 | /// |
101 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213) |
102 | #[inline ] |
103 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
104 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
105 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
106 | pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { |
107 | transmute(src:vpdpwssd128(src:src.as_i32x4(), a:a.as_i32x4(), b:b.as_i32x4())) |
108 | } |
109 | |
110 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
111 | /// |
112 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssd_epi32&expand=2214) |
113 | #[inline ] |
114 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
115 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
116 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
117 | pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { |
118 | let r: i32x4 = _mm_dpwssd_epi32(src, a, b).as_i32x4(); |
119 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x4())) |
120 | } |
121 | |
122 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
123 | /// |
124 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssd_epi32&expand=2215) |
125 | #[inline ] |
126 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
127 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
128 | #[cfg_attr (test, assert_instr(vpdpwssd))] |
129 | pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { |
130 | let r: i32x4 = _mm_dpwssd_epi32(src, a, b).as_i32x4(); |
131 | let zero: i32x4 = _mm_setzero_si128().as_i32x4(); |
132 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
133 | } |
134 | |
135 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. |
136 | /// |
137 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssds_epi32&expand=2228) |
138 | #[inline ] |
139 | #[target_feature (enable = "avx512vnni" )] |
140 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
141 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
142 | pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { |
143 | transmute(src:vpdpwssds(src:src.as_i32x16(), a:a.as_i32x16(), b:b.as_i32x16())) |
144 | } |
145 | |
146 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
147 | /// |
148 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssds_epi32&expand=2229) |
149 | #[inline ] |
150 | #[target_feature (enable = "avx512vnni" )] |
151 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
152 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
153 | pub unsafe fn _mm512_mask_dpwssds_epi32( |
154 | src: __m512i, |
155 | k: __mmask16, |
156 | a: __m512i, |
157 | b: __m512i, |
158 | ) -> __m512i { |
159 | let r: i32x16 = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); |
160 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x16())) |
161 | } |
162 | |
163 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
164 | /// |
165 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssds_epi32&expand=2230) |
166 | #[inline ] |
167 | #[target_feature (enable = "avx512vnni" )] |
168 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
169 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
170 | pub unsafe fn _mm512_maskz_dpwssds_epi32( |
171 | k: __mmask16, |
172 | src: __m512i, |
173 | a: __m512i, |
174 | b: __m512i, |
175 | ) -> __m512i { |
176 | let r: i32x16 = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); |
177 | let zero: i32x16 = _mm512_setzero_si512().as_i32x16(); |
178 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
179 | } |
180 | |
181 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. |
182 | /// |
183 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225) |
184 | #[inline ] |
185 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
186 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
187 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
188 | pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { |
189 | transmute(src:vpdpwssds256(src:src.as_i32x8(), a:a.as_i32x8(), b:b.as_i32x8())) |
190 | } |
191 | |
192 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
193 | /// |
194 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssds_epi32&expand=2226) |
195 | #[inline ] |
196 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
197 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
198 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
199 | pub unsafe fn _mm256_mask_dpwssds_epi32( |
200 | src: __m256i, |
201 | k: __mmask8, |
202 | a: __m256i, |
203 | b: __m256i, |
204 | ) -> __m256i { |
205 | let r: i32x8 = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); |
206 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x8())) |
207 | } |
208 | |
209 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
210 | /// |
211 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssds_epi32&expand=2227) |
212 | #[inline ] |
213 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
214 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
215 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
216 | pub unsafe fn _mm256_maskz_dpwssds_epi32( |
217 | k: __mmask8, |
218 | src: __m256i, |
219 | a: __m256i, |
220 | b: __m256i, |
221 | ) -> __m256i { |
222 | let r: i32x8 = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); |
223 | let zero: i32x8 = _mm256_setzero_si256().as_i32x8(); |
224 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
225 | } |
226 | |
227 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. |
228 | /// |
229 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222) |
230 | #[inline ] |
231 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
232 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
233 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
234 | pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { |
235 | transmute(src:vpdpwssds128(src:src.as_i32x4(), a:a.as_i32x4(), b:b.as_i32x4())) |
236 | } |
237 | |
238 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
239 | /// |
240 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssds_epi32&expand=2223) |
241 | #[inline ] |
242 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
243 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
244 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
245 | pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { |
246 | let r: i32x4 = _mm_dpwssds_epi32(src, a, b).as_i32x4(); |
247 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x4())) |
248 | } |
249 | |
250 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
251 | /// |
252 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssds_epi32&expand=2224) |
253 | #[inline ] |
254 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
255 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
256 | #[cfg_attr (test, assert_instr(vpdpwssds))] |
257 | pub unsafe fn _mm_maskz_dpwssds_epi32( |
258 | k: __mmask8, |
259 | src: __m128i, |
260 | a: __m128i, |
261 | b: __m128i, |
262 | ) -> __m128i { |
263 | let r: i32x4 = _mm_dpwssds_epi32(src, a, b).as_i32x4(); |
264 | let zero: i32x4 = _mm_setzero_si128().as_i32x4(); |
265 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
266 | } |
267 | |
268 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. |
269 | /// |
270 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusd_epi32&expand=2201) |
271 | #[inline ] |
272 | #[target_feature (enable = "avx512vnni" )] |
273 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
274 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
275 | pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { |
276 | transmute(src:vpdpbusd(src:src.as_i32x16(), a:a.as_i32x16(), b:b.as_i32x16())) |
277 | } |
278 | |
279 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
280 | /// |
281 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusd_epi32&expand=2202) |
282 | #[inline ] |
283 | #[target_feature (enable = "avx512vnni" )] |
284 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
285 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
286 | pub unsafe fn _mm512_mask_dpbusd_epi32( |
287 | src: __m512i, |
288 | k: __mmask16, |
289 | a: __m512i, |
290 | b: __m512i, |
291 | ) -> __m512i { |
292 | let r: i32x16 = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); |
293 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x16())) |
294 | } |
295 | |
296 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
297 | /// |
298 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusd_epi32&expand=2203) |
299 | #[inline ] |
300 | #[target_feature (enable = "avx512vnni" )] |
301 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
302 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
303 | pub unsafe fn _mm512_maskz_dpbusd_epi32( |
304 | k: __mmask16, |
305 | src: __m512i, |
306 | a: __m512i, |
307 | b: __m512i, |
308 | ) -> __m512i { |
309 | let r: i32x16 = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); |
310 | let zero: i32x16 = _mm512_setzero_si512().as_i32x16(); |
311 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
312 | } |
313 | |
314 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. |
315 | /// |
316 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198) |
317 | #[inline ] |
318 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
319 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
320 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
321 | pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { |
322 | transmute(src:vpdpbusd256(src:src.as_i32x8(), a:a.as_i32x8(), b:b.as_i32x8())) |
323 | } |
324 | |
325 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
326 | /// |
327 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusd_epi32&expand=2199) |
328 | #[inline ] |
329 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
330 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
331 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
332 | pub unsafe fn _mm256_mask_dpbusd_epi32( |
333 | src: __m256i, |
334 | k: __mmask8, |
335 | a: __m256i, |
336 | b: __m256i, |
337 | ) -> __m256i { |
338 | let r: i32x8 = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); |
339 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x8())) |
340 | } |
341 | |
342 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
343 | /// |
344 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusd_epi32&expand=2200) |
345 | #[inline ] |
346 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
347 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
348 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
349 | pub unsafe fn _mm256_maskz_dpbusd_epi32( |
350 | k: __mmask8, |
351 | src: __m256i, |
352 | a: __m256i, |
353 | b: __m256i, |
354 | ) -> __m256i { |
355 | let r: i32x8 = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); |
356 | let zero: i32x8 = _mm256_setzero_si256().as_i32x8(); |
357 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
358 | } |
359 | |
360 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. |
361 | /// |
362 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195) |
363 | #[inline ] |
364 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
365 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
366 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
367 | pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { |
368 | transmute(src:vpdpbusd128(src:src.as_i32x4(), a:a.as_i32x4(), b:b.as_i32x4())) |
369 | } |
370 | |
371 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
372 | /// |
373 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusd_epi32&expand=2196) |
374 | #[inline ] |
375 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
376 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
377 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
378 | pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { |
379 | let r: i32x4 = _mm_dpbusd_epi32(src, a, b).as_i32x4(); |
380 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x4())) |
381 | } |
382 | |
383 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
384 | /// |
385 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusd_epi32&expand=2197) |
386 | #[inline ] |
387 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
388 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
389 | #[cfg_attr (test, assert_instr(vpdpbusd))] |
390 | pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { |
391 | let r: i32x4 = _mm_dpbusd_epi32(src, a, b).as_i32x4(); |
392 | let zero: i32x4 = _mm_setzero_si128().as_i32x4(); |
393 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
394 | } |
395 | |
396 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. |
397 | /// |
398 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusds_epi32&expand=2210) |
399 | #[inline ] |
400 | #[target_feature (enable = "avx512vnni" )] |
401 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
402 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
403 | pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { |
404 | transmute(src:vpdpbusds(src:src.as_i32x16(), a:a.as_i32x16(), b:b.as_i32x16())) |
405 | } |
406 | |
407 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
408 | /// |
409 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusds_epi32&expand=2211) |
410 | #[inline ] |
411 | #[target_feature (enable = "avx512vnni" )] |
412 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
413 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
414 | pub unsafe fn _mm512_mask_dpbusds_epi32( |
415 | src: __m512i, |
416 | k: __mmask16, |
417 | a: __m512i, |
418 | b: __m512i, |
419 | ) -> __m512i { |
420 | let r: i32x16 = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); |
421 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x16())) |
422 | } |
423 | |
424 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
425 | /// |
426 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusds_epi32&expand=2212) |
427 | #[inline ] |
428 | #[target_feature (enable = "avx512vnni" )] |
429 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
430 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
431 | pub unsafe fn _mm512_maskz_dpbusds_epi32( |
432 | k: __mmask16, |
433 | src: __m512i, |
434 | a: __m512i, |
435 | b: __m512i, |
436 | ) -> __m512i { |
437 | let r: i32x16 = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); |
438 | let zero: i32x16 = _mm512_setzero_si512().as_i32x16(); |
439 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
440 | } |
441 | |
442 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. |
443 | /// |
444 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207) |
445 | #[inline ] |
446 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
447 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
448 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
449 | pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { |
450 | transmute(src:vpdpbusds256(src:src.as_i32x8(), a:a.as_i32x8(), b:b.as_i32x8())) |
451 | } |
452 | |
453 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
454 | /// |
455 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusds_epi32&expand=2208) |
456 | #[inline ] |
457 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
458 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
459 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
460 | pub unsafe fn _mm256_mask_dpbusds_epi32( |
461 | src: __m256i, |
462 | k: __mmask8, |
463 | a: __m256i, |
464 | b: __m256i, |
465 | ) -> __m256i { |
466 | let r: i32x8 = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); |
467 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x8())) |
468 | } |
469 | |
470 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
471 | /// |
472 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusds_epi32&expand=2209) |
473 | #[inline ] |
474 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
475 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
476 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
477 | pub unsafe fn _mm256_maskz_dpbusds_epi32( |
478 | k: __mmask8, |
479 | src: __m256i, |
480 | a: __m256i, |
481 | b: __m256i, |
482 | ) -> __m256i { |
483 | let r: i32x8 = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); |
484 | let zero: i32x8 = _mm256_setzero_si256().as_i32x8(); |
485 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
486 | } |
487 | |
488 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. |
489 | /// |
490 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204) |
491 | #[inline ] |
492 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
493 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
494 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
495 | pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { |
496 | transmute(src:vpdpbusds128(src:src.as_i32x4(), a:a.as_i32x4(), b:b.as_i32x4())) |
497 | } |
498 | |
499 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). |
500 | /// |
501 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusds_epi32&expand=2205) |
502 | #[inline ] |
503 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
504 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
505 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
506 | pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { |
507 | let r: i32x4 = _mm_dpbusds_epi32(src, a, b).as_i32x4(); |
508 | transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i32x4())) |
509 | } |
510 | |
511 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). |
512 | /// |
513 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusds_epi32&expand=2206) |
514 | #[inline ] |
515 | #[target_feature (enable = "avx512vnni,avx512vl" )] |
516 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
517 | #[cfg_attr (test, assert_instr(vpdpbusds))] |
518 | pub unsafe fn _mm_maskz_dpbusds_epi32( |
519 | k: __mmask8, |
520 | src: __m128i, |
521 | a: __m128i, |
522 | b: __m128i, |
523 | ) -> __m128i { |
524 | let r: i32x4 = _mm_dpbusds_epi32(src, a, b).as_i32x4(); |
525 | let zero: i32x4 = _mm_setzero_si128().as_i32x4(); |
526 | transmute(src:simd_select_bitmask(m:k, yes:r, no:zero)) |
527 | } |
528 | |
// Raw LLVM intrinsic declarations backing the VNNI intrinsics in this file.
// Naming scheme: `vpdpwssd*` = word (16-bit) pairs, `vpdpbusd*` = unsigned-byte
// by signed-byte quads; a trailing `s` (e.g. `vpdpwssds`) means the i32
// accumulation saturates instead of wrapping. The `.512`/`.256`/`.128` suffix
// in the link name selects the vector width.
#[allow (improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.avx512.vpdpwssd.512" ]
    fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpdpwssd.256" ]
    fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpdpwssd.128" ]
    fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.avx512.vpdpwssds.512" ]
    fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpdpwssds.256" ]
    fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpdpwssds.128" ]
    fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.avx512.vpdpbusd.512" ]
    fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpdpbusd.256" ]
    fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpdpbusd.128" ]
    fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.avx512.vpdpbusds.512" ]
    fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpdpbusds.256" ]
    fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpdpbusds.128" ]
    fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
}
559 | |
#[cfg (test)]
mod tests {
    // Expected values: `dpwssd*` tests use lanes of 0x0001_0001 (two i16 words
    // equal to 1), so each lane accumulates 1*1 + 1*1 = 2 onto src = 1, giving 3.
    // `dpbusd*` tests use lanes of 0x01010101 (four bytes equal to 1), so each
    // lane accumulates 4 onto src = 1, giving 5. Mask tests check both the
    // all-zeros mask (src preserved / zeroed) and the all-ones mask (full result).

    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    // ---- vpdpwssd (word pairs, wrapping accumulate) ----

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_dpwssd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_dpwssd_epi32(src, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_mask_dpwssd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        // Zero mask: every lane copied from src.
        let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b);
        assert_eq_m512i(r, src);
        // Full mask: every lane holds the accumulated result.
        let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_maskz_dpwssd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        // Zero mask: every lane zeroed.
        let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b);
        assert_eq_m512i(r, _mm512_setzero_si512());
        let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_dpwssd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_dpwssd_epi32(src, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_mask_dpwssd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b);
        assert_eq_m256i(r, src);
        let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_maskz_dpwssd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_dpwssd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_dpwssd_epi32(src, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_mask_dpwssd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b);
        assert_eq_m128i(r, src);
        // Only the low 4 mask bits are meaningful for a 128-bit vector.
        let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_maskz_dpwssd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    // ---- vpdpwssds (word pairs, saturating accumulate) ----
    // Small operands never saturate here, so expected values match vpdpwssd.

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_dpwssds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_dpwssds_epi32(src, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_mask_dpwssds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b);
        assert_eq_m512i(r, src);
        let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_maskz_dpwssds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b);
        assert_eq_m512i(r, _mm512_setzero_si512());
        let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_dpwssds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_dpwssds_epi32(src, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_mask_dpwssds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b);
        assert_eq_m256i(r, src);
        let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_maskz_dpwssds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_dpwssds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_dpwssds_epi32(src, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_mask_dpwssds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b);
        assert_eq_m128i(r, src);
        let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_maskz_dpwssds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    // ---- vpdpbusd (byte quads, wrapping accumulate) ----

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_dpbusd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_dpbusd_epi32(src, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_mask_dpbusd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b);
        assert_eq_m512i(r, src);
        let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_maskz_dpbusd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b);
        assert_eq_m512i(r, _mm512_setzero_si512());
        let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_dpbusd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_dpbusd_epi32(src, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_mask_dpbusd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b);
        assert_eq_m256i(r, src);
        let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_maskz_dpbusd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_dpbusd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_dpbusd_epi32(src, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_mask_dpbusd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b);
        assert_eq_m128i(r, src);
        let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_maskz_dpbusd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    // ---- vpdpbusds (byte quads, saturating accumulate) ----
    // Small operands never saturate here, so expected values match vpdpbusd.

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_dpbusds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_dpbusds_epi32(src, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_mask_dpbusds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b);
        assert_eq_m512i(r, src);
        let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni" )]
    unsafe fn test_mm512_maskz_dpbusds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b);
        assert_eq_m512i(r, _mm512_setzero_si512());
        let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_dpbusds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_dpbusds_epi32(src, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_mask_dpbusds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b);
        assert_eq_m256i(r, src);
        let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm256_maskz_dpbusds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_dpbusds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_dpbusds_epi32(src, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_mask_dpbusds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b);
        assert_eq_m128i(r, src);
        let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl" )]
    unsafe fn test_mm_maskz_dpbusds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }
}
974 | |