use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssd_epi32&expand=2219)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
    transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
}
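
// A scalar model of the per-lane arithmetic performed by `vpdpwssd`, as a
// hedged sketch (illustrative only, not part of this module's API): each
// 32-bit lane of `a` and `b` holds two signed 16-bit values; the two widened
// products are accumulated into the matching lane of `src` with wrapping
// 32-bit addition.
//
//     fn dpwssd_lane(src: i32, a: i32, b: i32) -> i32 {
//         let (a0, a1) = (a as i16 as i32, (a >> 16) as i16 as i32);
//         let (b0, b1) = (b as i16 as i32, (b >> 16) as i16 as i32);
//         src.wrapping_add(a0 * b0).wrapping_add(a1 * b1)
//     }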

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssd_epi32&expand=2220)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm512_mask_dpwssd_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let r: i32x16 = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
}
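
// The writemask/zeromask pattern used by every masked intrinsic below, as a
// scalar sketch (an illustrative model of `simd_select_bitmask`, not its
// actual implementation): bit j of `k` picks between the computed lane and a
// fallback lane, which is `src` for the `mask_` variants and zero for the
// `maskz_` variants.
//
//     fn select_lane(k: u16, j: u32, computed: i32, fallback: i32) -> i32 {
//         if (k >> j) & 1 == 1 { computed } else { fallback }
//     }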

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssd_epi32&expand=2221)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm512_maskz_dpwssd_epi32(
    k: __mmask16,
    src: __m512i,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let r: i32x16 = _mm512_dpwssd_epi32(src, a, b).as_i32x16();
    let zero: i32x16 = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
    transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssd_epi32&expand=2217)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm256_mask_dpwssd_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let r: i32x8 = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssd_epi32&expand=2218)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm256_maskz_dpwssd_epi32(
    k: __mmask8,
    src: __m256i,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let r: i32x8 = _mm256_dpwssd_epi32(src, a, b).as_i32x8();
    let zero: i32x8 = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
    transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssd_epi32&expand=2214)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let r: i32x4 = _mm_dpwssd_epi32(src, a, b).as_i32x4();
    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssd_epi32&expand=2215)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssd))]
pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
    let r: i32x4 = _mm_dpwssd_epi32(src, a, b).as_i32x4();
    let zero: i32x4 = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpwssds_epi32&expand=2228)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
    transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
}
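
// A scalar model of the saturating form, as a hedged sketch (illustrative
// only): the same two 16-bit products per lane as `vpdpwssd`, but the full
// sum saturates to the signed 32-bit range instead of wrapping.
//
//     fn dpwssds_lane(src: i32, a: i32, b: i32) -> i32 {
//         let p = (a as i16 as i64) * (b as i16 as i64)
//             + ((a >> 16) as i16 as i64) * ((b >> 16) as i16 as i64);
//         (src as i64 + p).clamp(i32::MIN as i64, i32::MAX as i64) as i32
//     }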

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpwssds_epi32&expand=2229)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm512_mask_dpwssds_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let r: i32x16 = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpwssds_epi32&expand=2230)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm512_maskz_dpwssds_epi32(
    k: __mmask16,
    src: __m512i,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let r: i32x16 = _mm512_dpwssds_epi32(src, a, b).as_i32x16();
    let zero: i32x16 = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
    transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpwssds_epi32&expand=2226)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm256_mask_dpwssds_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let r: i32x8 = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpwssds_epi32&expand=2227)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm256_maskz_dpwssds_epi32(
    k: __mmask8,
    src: __m256i,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let r: i32x8 = _mm256_dpwssds_epi32(src, a, b).as_i32x8();
    let zero: i32x8 = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
    transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpwssds_epi32&expand=2223)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let r: i32x4 = _mm_dpwssds_epi32(src, a, b).as_i32x4();
    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpwssds_epi32&expand=2224)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpwssds))]
pub unsafe fn _mm_maskz_dpwssds_epi32(
    k: __mmask8,
    src: __m128i,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    let r: i32x4 = _mm_dpwssds_epi32(src, a, b).as_i32x4();
    let zero: i32x4 = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusd_epi32&expand=2201)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
    transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
}
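
// A scalar model of the per-lane byte dot product, as a hedged sketch
// (illustrative only): each 32-bit lane holds four bytes, taken as unsigned
// from `a` and signed from `b`; the four products are accumulated into the
// matching lane of `src` with wrapping 32-bit addition.
//
//     fn dpbusd_lane(src: i32, a: u32, b: u32) -> i32 {
//         let mut acc = src;
//         for i in 0..4 {
//             let au = ((a >> (8 * i)) & 0xff) as i32; // zero-extended
//             let bs = ((b >> (8 * i)) & 0xff) as u8 as i8 as i32; // sign-extended
//             acc = acc.wrapping_add(au * bs);
//         }
//         acc
//     }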

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusd_epi32&expand=2202)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm512_mask_dpbusd_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let r: i32x16 = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusd_epi32&expand=2203)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm512_maskz_dpbusd_epi32(
    k: __mmask16,
    src: __m512i,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let r: i32x16 = _mm512_dpbusd_epi32(src, a, b).as_i32x16();
    let zero: i32x16 = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
    transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusd_epi32&expand=2199)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm256_mask_dpbusd_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let r: i32x8 = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusd_epi32&expand=2200)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm256_maskz_dpbusd_epi32(
    k: __mmask8,
    src: __m256i,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let r: i32x8 = _mm256_dpbusd_epi32(src, a, b).as_i32x8();
    let zero: i32x8 = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
    transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusd_epi32&expand=2196)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let r: i32x4 = _mm_dpbusd_epi32(src, a, b).as_i32x4();
    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusd_epi32&expand=2197)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusd))]
pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
    let r: i32x4 = _mm_dpbusd_epi32(src, a, b).as_i32x4();
    let zero: i32x4 = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_dpbusds_epi32&expand=2210)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
    transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16()))
}
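
// A scalar model of the saturating byte form, as a hedged sketch
// (illustrative only): the same four unsigned-by-signed byte products per
// lane as `vpdpbusd`, with the full sum saturating to the signed 32-bit
// range instead of wrapping.
//
//     fn dpbusds_lane(src: i32, a: u32, b: u32) -> i32 {
//         let mut p = 0i64;
//         for i in 0..4 {
//             let au = ((a >> (8 * i)) & 0xff) as i64;
//             let bs = ((b >> (8 * i)) & 0xff) as u8 as i8 as i64;
//             p += au * bs;
//         }
//         (src as i64 + p).clamp(i32::MIN as i64, i32::MAX as i64) as i32
//     }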

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_dpbusds_epi32&expand=2211)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm512_mask_dpbusds_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let r: i32x16 = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
    transmute(simd_select_bitmask(k, r, src.as_i32x16()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_dpbusds_epi32&expand=2212)
#[inline]
#[target_feature(enable = "avx512vnni")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm512_maskz_dpbusds_epi32(
    k: __mmask16,
    src: __m512i,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let r: i32x16 = _mm512_dpbusds_epi32(src, a, b).as_i32x16();
    let zero: i32x16 = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
    transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_dpbusds_epi32&expand=2208)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm256_mask_dpbusds_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let r: i32x8 = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
    transmute(simd_select_bitmask(k, r, src.as_i32x8()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_dpbusds_epi32&expand=2209)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm256_maskz_dpbusds_epi32(
    k: __mmask8,
    src: __m256i,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let r: i32x8 = _mm256_dpbusds_epi32(src, a, b).as_i32x8();
    let zero: i32x8 = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
    transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_dpbusds_epi32&expand=2205)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let r: i32x4 = _mm_dpbusds_epi32(src, a, b).as_i32x4();
    transmute(simd_select_bitmask(k, r, src.as_i32x4()))
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_dpbusds_epi32&expand=2206)
#[inline]
#[target_feature(enable = "avx512vnni,avx512vl")]
#[cfg_attr(test, assert_instr(vpdpbusds))]
pub unsafe fn _mm_maskz_dpbusds_epi32(
    k: __mmask8,
    src: __m128i,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    let r: i32x4 = _mm_dpbusds_epi32(src, a, b).as_i32x4();
    let zero: i32x4 = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, r, zero))
}

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.avx512.vpdpwssd.512"]
    fn vpdpwssd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpdpwssd.256"]
    fn vpdpwssd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpdpwssd.128"]
    fn vpdpwssd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.avx512.vpdpwssds.512"]
    fn vpdpwssds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpdpwssds.256"]
    fn vpdpwssds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpdpwssds.128"]
    fn vpdpwssds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.avx512.vpdpbusd.512"]
    fn vpdpbusd(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpdpbusd.256"]
    fn vpdpbusd256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpdpbusd.128"]
    fn vpdpbusd128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;

    #[link_name = "llvm.x86.avx512.vpdpbusds.512"]
    fn vpdpbusds(src: i32x16, a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpdpbusds.256"]
    fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpdpbusds.128"]
    fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
}

#[cfg(test)]
mod tests {

    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_dpwssd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_dpwssd_epi32(src, a, b);
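        // Each i32 lane accumulates 1 + (1 * 1) + (1 * 1) = 3.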
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_mask_dpwssd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_mask_dpwssd_epi32(src, 0b00000000_00000000, a, b);
        assert_eq_m512i(r, src);
        let r = _mm512_mask_dpwssd_epi32(src, 0b11111111_11111111, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_maskz_dpwssd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_maskz_dpwssd_epi32(0b00000000_00000000, src, a, b);
        assert_eq_m512i(r, _mm512_setzero_si512());
        let r = _mm512_maskz_dpwssd_epi32(0b11111111_11111111, src, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_dpwssd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_dpwssd_epi32(src, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_mask_dpwssd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_mask_dpwssd_epi32(src, 0b00000000, a, b);
        assert_eq_m256i(r, src);
        let r = _mm256_mask_dpwssd_epi32(src, 0b11111111, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_maskz_dpwssd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_maskz_dpwssd_epi32(0b00000000, src, a, b);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm256_maskz_dpwssd_epi32(0b11111111, src, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_dpwssd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_dpwssd_epi32(src, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_mask_dpwssd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_mask_dpwssd_epi32(src, 0b00000000, a, b);
        assert_eq_m128i(r, src);
        let r = _mm_mask_dpwssd_epi32(src, 0b00001111, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_maskz_dpwssd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_maskz_dpwssd_epi32(0b00000000, src, a, b);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_dpwssd_epi32(0b00001111, src, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_dpwssds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_dpwssds_epi32(src, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_mask_dpwssds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_mask_dpwssds_epi32(src, 0b00000000_00000000, a, b);
        assert_eq_m512i(r, src);
        let r = _mm512_mask_dpwssds_epi32(src, 0b11111111_11111111, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_maskz_dpwssds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm512_maskz_dpwssds_epi32(0b00000000_00000000, src, a, b);
        assert_eq_m512i(r, _mm512_setzero_si512());
        let r = _mm512_maskz_dpwssds_epi32(0b11111111_11111111, src, a, b);
        let e = _mm512_set1_epi32(3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_dpwssds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_dpwssds_epi32(src, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_mask_dpwssds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_mask_dpwssds_epi32(src, 0b00000000, a, b);
        assert_eq_m256i(r, src);
        let r = _mm256_mask_dpwssds_epi32(src, 0b11111111, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_maskz_dpwssds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm256_maskz_dpwssds_epi32(0b00000000, src, a, b);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm256_maskz_dpwssds_epi32(0b11111111, src, a, b);
        let e = _mm256_set1_epi32(3);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_dpwssds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_dpwssds_epi32(src, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_mask_dpwssds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_mask_dpwssds_epi32(src, 0b00000000, a, b);
        assert_eq_m128i(r, src);
        let r = _mm_mask_dpwssds_epi32(src, 0b00001111, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_maskz_dpwssds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
        let r = _mm_maskz_dpwssds_epi32(0b00000000, src, a, b);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_dpwssds_epi32(0b00001111, src, a, b);
        let e = _mm_set1_epi32(3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_dpbusd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_dpbusd_epi32(src, a, b);
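        // Each i32 lane accumulates 1 + four 1 * 1 byte products = 5.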
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_mask_dpbusd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_mask_dpbusd_epi32(src, 0b00000000_00000000, a, b);
        assert_eq_m512i(r, src);
        let r = _mm512_mask_dpbusd_epi32(src, 0b11111111_11111111, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_maskz_dpbusd_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_maskz_dpbusd_epi32(0b00000000_00000000, src, a, b);
        assert_eq_m512i(r, _mm512_setzero_si512());
        let r = _mm512_maskz_dpbusd_epi32(0b11111111_11111111, src, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_dpbusd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_dpbusd_epi32(src, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_mask_dpbusd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_mask_dpbusd_epi32(src, 0b00000000, a, b);
        assert_eq_m256i(r, src);
        let r = _mm256_mask_dpbusd_epi32(src, 0b11111111, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_maskz_dpbusd_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_maskz_dpbusd_epi32(0b00000000, src, a, b);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm256_maskz_dpbusd_epi32(0b11111111, src, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_dpbusd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_dpbusd_epi32(src, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_mask_dpbusd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_mask_dpbusd_epi32(src, 0b00000000, a, b);
        assert_eq_m128i(r, src);
        let r = _mm_mask_dpbusd_epi32(src, 0b00001111, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_maskz_dpbusd_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_maskz_dpbusd_epi32(0b00000000, src, a, b);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_dpbusd_epi32(0b00001111, src, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_dpbusds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_dpbusds_epi32(src, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_mask_dpbusds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_mask_dpbusds_epi32(src, 0b00000000_00000000, a, b);
        assert_eq_m512i(r, src);
        let r = _mm512_mask_dpbusds_epi32(src, 0b11111111_11111111, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni")]
    unsafe fn test_mm512_maskz_dpbusds_epi32() {
        let src = _mm512_set1_epi32(1);
        let a = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm512_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm512_maskz_dpbusds_epi32(0b00000000_00000000, src, a, b);
        assert_eq_m512i(r, _mm512_setzero_si512());
        let r = _mm512_maskz_dpbusds_epi32(0b11111111_11111111, src, a, b);
        let e = _mm512_set1_epi32(5);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_dpbusds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_dpbusds_epi32(src, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_mask_dpbusds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_mask_dpbusds_epi32(src, 0b00000000, a, b);
        assert_eq_m256i(r, src);
        let r = _mm256_mask_dpbusds_epi32(src, 0b11111111, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm256_maskz_dpbusds_epi32() {
        let src = _mm256_set1_epi32(1);
        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm256_maskz_dpbusds_epi32(0b00000000, src, a, b);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm256_maskz_dpbusds_epi32(0b11111111, src, a, b);
        let e = _mm256_set1_epi32(5);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_dpbusds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_dpbusds_epi32(src, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_mask_dpbusds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_mask_dpbusds_epi32(src, 0b00000000, a, b);
        assert_eq_m128i(r, src);
        let r = _mm_mask_dpbusds_epi32(src, 0b00001111, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512vnni,avx512vl")]
    unsafe fn test_mm_maskz_dpbusds_epi32() {
        let src = _mm_set1_epi32(1);
        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
        let r = _mm_maskz_dpbusds_epi32(0b00000000, src, a, b);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_dpbusds_epi32(0b00001111, src, a, b);
        let e = _mm_set1_epi32(5);
        assert_eq_m128i(r, e);
    }
}