1 | //! Galois Field New Instructions (GFNI) |
2 | //! |
3 | //! The intrinsics here correspond to those in the `immintrin.h` C header. |
4 | //! |
5 | //! The reference is [Intel 64 and IA-32 Architectures Software Developer's |
6 | //! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. |
7 | //! |
8 | //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf |
9 | |
10 | use crate::core_arch::simd::i8x16; |
11 | use crate::core_arch::simd::i8x32; |
12 | use crate::core_arch::simd::i8x64; |
13 | use crate::core_arch::simd_llvm::simd_select_bitmask; |
14 | use crate::core_arch::x86::__m128i; |
15 | use crate::core_arch::x86::__m256i; |
16 | use crate::core_arch::x86::__m512i; |
17 | use crate::core_arch::x86::__mmask16; |
18 | use crate::core_arch::x86::__mmask32; |
19 | use crate::core_arch::x86::__mmask64; |
20 | use crate::core_arch::x86::_mm256_setzero_si256; |
21 | use crate::core_arch::x86::_mm512_setzero_si512; |
22 | use crate::core_arch::x86::_mm_setzero_si128; |
23 | use crate::core_arch::x86::m128iExt; |
24 | use crate::core_arch::x86::m256iExt; |
25 | use crate::core_arch::x86::m512iExt; |
26 | use crate::mem::transmute; |
27 | |
28 | #[cfg (test)] |
29 | use stdarch_test::assert_instr; |
30 | |
31 | #[allow (improper_ctypes)] |
32 | extern "C" { |
33 | #[link_name = "llvm.x86.vgf2p8affineinvqb.512" ] |
34 | fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64; |
35 | #[link_name = "llvm.x86.vgf2p8affineinvqb.256" ] |
36 | fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32; |
37 | #[link_name = "llvm.x86.vgf2p8affineinvqb.128" ] |
38 | fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16; |
39 | #[link_name = "llvm.x86.vgf2p8affineqb.512" ] |
40 | fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64; |
41 | #[link_name = "llvm.x86.vgf2p8affineqb.256" ] |
42 | fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32; |
43 | #[link_name = "llvm.x86.vgf2p8affineqb.128" ] |
44 | fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16; |
45 | #[link_name = "llvm.x86.vgf2p8mulb.512" ] |
46 | fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64; |
47 | #[link_name = "llvm.x86.vgf2p8mulb.256" ] |
48 | fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32; |
49 | #[link_name = "llvm.x86.vgf2p8mulb.128" ] |
50 | fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16; |
51 | } |
52 | |
53 | // LLVM requires AVX512BW for a lot of these instructions, see |
54 | // https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457 |
55 | // however our tests also require the target feature list to match Intel's |
56 | // which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F |
57 | // requirement (for now) |
58 | // also see |
59 | // https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h |
60 | // for forcing GFNI, BW and optionally VL extension |
61 | |
62 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
63 | /// The field is in polynomial representation with the reduction polynomial |
64 | /// x^8 + x^4 + x^3 + x + 1. |
65 | /// |
66 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8) |
67 | #[inline ] |
68 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
69 | #[cfg_attr (test, assert_instr(vgf2p8mulb))] |
70 | pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i { |
71 | transmute(src:vgf2p8mulb_512(a:a.as_i8x64(), b:b.as_i8x64())) |
72 | } |
73 | |
74 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
75 | /// The field is in polynomial representation with the reduction polynomial |
76 | /// x^8 + x^4 + x^3 + x + 1. |
77 | /// |
78 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
79 | /// Otherwise the computation result is written into the result. |
80 | /// |
81 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8) |
82 | #[inline ] |
83 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
84 | #[cfg_attr (test, assert_instr(vgf2p8mulb))] |
85 | pub unsafe fn _mm512_mask_gf2p8mul_epi8( |
86 | src: __m512i, |
87 | k: __mmask64, |
88 | a: __m512i, |
89 | b: __m512i, |
90 | ) -> __m512i { |
91 | transmute(src:simd_select_bitmask( |
92 | m:k, |
93 | a:vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), |
94 | b:src.as_i8x64(), |
95 | )) |
96 | } |
97 | |
98 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
99 | /// The field is in polynomial representation with the reduction polynomial |
100 | /// x^8 + x^4 + x^3 + x + 1. |
101 | /// |
102 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
103 | /// Otherwise the computation result is written into the result. |
104 | /// |
105 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8) |
106 | #[inline ] |
107 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
108 | #[cfg_attr (test, assert_instr(vgf2p8mulb))] |
109 | pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { |
110 | let zero: i8x64 = _mm512_setzero_si512().as_i8x64(); |
111 | transmute(src:simd_select_bitmask( |
112 | m:k, |
113 | a:vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), |
114 | b:zero, |
115 | )) |
116 | } |
117 | |
118 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
119 | /// The field is in polynomial representation with the reduction polynomial |
120 | /// x^8 + x^4 + x^3 + x + 1. |
121 | /// |
122 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8) |
123 | #[inline ] |
124 | #[target_feature (enable = "gfni,avx" )] |
125 | #[cfg_attr (test, assert_instr(vgf2p8mulb))] |
126 | pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i { |
127 | transmute(src:vgf2p8mulb_256(a:a.as_i8x32(), b:b.as_i8x32())) |
128 | } |
129 | |
130 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
131 | /// The field is in polynomial representation with the reduction polynomial |
132 | /// x^8 + x^4 + x^3 + x + 1. |
133 | /// |
134 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
135 | /// Otherwise the computation result is written into the result. |
136 | /// |
137 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8) |
138 | #[inline ] |
139 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
140 | #[cfg_attr (test, assert_instr(vgf2p8mulb))] |
141 | pub unsafe fn _mm256_mask_gf2p8mul_epi8( |
142 | src: __m256i, |
143 | k: __mmask32, |
144 | a: __m256i, |
145 | b: __m256i, |
146 | ) -> __m256i { |
147 | transmute(src:simd_select_bitmask( |
148 | m:k, |
149 | a:vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), |
150 | b:src.as_i8x32(), |
151 | )) |
152 | } |
153 | |
154 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
155 | /// The field is in polynomial representation with the reduction polynomial |
156 | /// x^8 + x^4 + x^3 + x + 1. |
157 | /// |
158 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
159 | /// Otherwise the computation result is written into the result. |
160 | /// |
161 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8) |
162 | #[inline ] |
163 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
164 | #[cfg_attr (test, assert_instr(vgf2p8mulb))] |
165 | pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { |
166 | let zero: i8x32 = _mm256_setzero_si256().as_i8x32(); |
167 | transmute(src:simd_select_bitmask( |
168 | m:k, |
169 | a:vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), |
170 | b:zero, |
171 | )) |
172 | } |
173 | |
174 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
175 | /// The field is in polynomial representation with the reduction polynomial |
176 | /// x^8 + x^4 + x^3 + x + 1. |
177 | /// |
178 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8) |
179 | #[inline ] |
180 | #[target_feature (enable = "gfni" )] |
181 | #[cfg_attr (test, assert_instr(gf2p8mulb))] |
182 | pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i { |
183 | transmute(src:vgf2p8mulb_128(a:a.as_i8x16(), b:b.as_i8x16())) |
184 | } |
185 | |
186 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
187 | /// The field is in polynomial representation with the reduction polynomial |
188 | /// x^8 + x^4 + x^3 + x + 1. |
189 | /// |
190 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
191 | /// Otherwise the computation result is written into the result. |
192 | /// |
193 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8) |
194 | #[inline ] |
195 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
196 | #[cfg_attr (test, assert_instr(vgf2p8mulb))] |
197 | pub unsafe fn _mm_mask_gf2p8mul_epi8( |
198 | src: __m128i, |
199 | k: __mmask16, |
200 | a: __m128i, |
201 | b: __m128i, |
202 | ) -> __m128i { |
203 | transmute(src:simd_select_bitmask( |
204 | m:k, |
205 | a:vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), |
206 | b:src.as_i8x16(), |
207 | )) |
208 | } |
209 | |
210 | /// Performs a multiplication in GF(2^8) on the packed bytes. |
211 | /// The field is in polynomial representation with the reduction polynomial |
212 | /// x^8 + x^4 + x^3 + x + 1. |
213 | /// |
214 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
215 | /// Otherwise the computation result is written into the result. |
216 | /// |
217 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8) |
218 | #[inline ] |
219 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
220 | #[cfg_attr (test, assert_instr(vgf2p8mulb))] |
221 | pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { |
222 | let zero: i8x16 = _mm_setzero_si128().as_i8x16(); |
223 | transmute(src:simd_select_bitmask( |
224 | m:k, |
225 | a:vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), |
226 | b:zero, |
227 | )) |
228 | } |
229 | |
230 | /// Performs an affine transformation on the packed bytes in x. |
231 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
232 | /// and b being a constant 8-bit immediate value. |
233 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
234 | /// |
235 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi8) |
236 | #[inline ] |
237 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
238 | #[cfg_attr (test, assert_instr(vgf2p8affineqb, B = 0))] |
239 | #[rustc_legacy_const_generics (2)] |
240 | pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i { |
241 | static_assert_uimm_bits!(B, 8); |
242 | let b: u8 = B as u8; |
243 | let x: i8x64 = x.as_i8x64(); |
244 | let a: i8x64 = a.as_i8x64(); |
245 | let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b); |
246 | transmute(src:r) |
247 | } |
248 | |
249 | /// Performs an affine transformation on the packed bytes in x. |
250 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
251 | /// and b being a constant 8-bit immediate value. |
252 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
253 | /// |
254 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
255 | /// Otherwise the computation result is written into the result. |
256 | /// |
257 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi8) |
258 | #[inline ] |
259 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
260 | #[cfg_attr (test, assert_instr(vgf2p8affineqb, B = 0))] |
261 | #[rustc_legacy_const_generics (3)] |
262 | pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>( |
263 | k: __mmask64, |
264 | x: __m512i, |
265 | a: __m512i, |
266 | ) -> __m512i { |
267 | static_assert_uimm_bits!(B, 8); |
268 | let b: u8 = B as u8; |
269 | let zero: i8x64 = _mm512_setzero_si512().as_i8x64(); |
270 | let x: i8x64 = x.as_i8x64(); |
271 | let a: i8x64 = a.as_i8x64(); |
272 | let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b); |
273 | transmute(src:simd_select_bitmask(m:k, a:r, b:zero)) |
274 | } |
275 | |
276 | /// Performs an affine transformation on the packed bytes in x. |
277 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
278 | /// and b being a constant 8-bit immediate value. |
279 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
280 | /// |
281 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
282 | /// Otherwise the computation result is written into the result. |
283 | /// |
284 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi8) |
285 | #[inline ] |
286 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
287 | #[cfg_attr (test, assert_instr(vgf2p8affineqb, B = 0))] |
288 | #[rustc_legacy_const_generics (4)] |
289 | pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>( |
290 | src: __m512i, |
291 | k: __mmask64, |
292 | x: __m512i, |
293 | a: __m512i, |
294 | ) -> __m512i { |
295 | static_assert_uimm_bits!(B, 8); |
296 | let b: u8 = B as u8; |
297 | let x: i8x64 = x.as_i8x64(); |
298 | let a: i8x64 = a.as_i8x64(); |
299 | let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b); |
300 | transmute(src:simd_select_bitmask(m:k, a:r, b:src.as_i8x64())) |
301 | } |
302 | |
303 | /// Performs an affine transformation on the packed bytes in x. |
304 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
305 | /// and b being a constant 8-bit immediate value. |
306 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
307 | /// |
308 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi8) |
309 | #[inline ] |
310 | #[target_feature (enable = "gfni,avx" )] |
311 | #[cfg_attr (test, assert_instr(vgf2p8affineqb, B = 0))] |
312 | #[rustc_legacy_const_generics (2)] |
313 | pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i { |
314 | static_assert_uimm_bits!(B, 8); |
315 | let b: u8 = B as u8; |
316 | let x: i8x32 = x.as_i8x32(); |
317 | let a: i8x32 = a.as_i8x32(); |
318 | let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b); |
319 | transmute(src:r) |
320 | } |
321 | |
322 | /// Performs an affine transformation on the packed bytes in x. |
323 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
324 | /// and b being a constant 8-bit immediate value. |
325 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
326 | /// |
327 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
328 | /// Otherwise the computation result is written into the result. |
329 | /// |
330 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi8) |
331 | #[inline ] |
332 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
333 | #[cfg_attr (test, assert_instr(vgf2p8affineqb, B = 0))] |
334 | #[rustc_legacy_const_generics (3)] |
335 | pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>( |
336 | k: __mmask32, |
337 | x: __m256i, |
338 | a: __m256i, |
339 | ) -> __m256i { |
340 | static_assert_uimm_bits!(B, 8); |
341 | let b: u8 = B as u8; |
342 | let zero: i8x32 = _mm256_setzero_si256().as_i8x32(); |
343 | let x: i8x32 = x.as_i8x32(); |
344 | let a: i8x32 = a.as_i8x32(); |
345 | let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b); |
346 | transmute(src:simd_select_bitmask(m:k, a:r, b:zero)) |
347 | } |
348 | |
349 | /// Performs an affine transformation on the packed bytes in x. |
350 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
351 | /// and b being a constant 8-bit immediate value. |
352 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
353 | /// |
354 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
355 | /// Otherwise the computation result is written into the result. |
356 | /// |
357 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi8) |
358 | #[inline ] |
359 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
360 | #[cfg_attr (test, assert_instr(vgf2p8affineqb, B = 0))] |
361 | #[rustc_legacy_const_generics (4)] |
362 | pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>( |
363 | src: __m256i, |
364 | k: __mmask32, |
365 | x: __m256i, |
366 | a: __m256i, |
367 | ) -> __m256i { |
368 | static_assert_uimm_bits!(B, 8); |
369 | let b: u8 = B as u8; |
370 | let x: i8x32 = x.as_i8x32(); |
371 | let a: i8x32 = a.as_i8x32(); |
372 | let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b); |
373 | transmute(src:simd_select_bitmask(m:k, a:r, b:src.as_i8x32())) |
374 | } |
375 | |
376 | /// Performs an affine transformation on the packed bytes in x. |
377 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
378 | /// and b being a constant 8-bit immediate value. |
379 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
380 | /// |
381 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi8) |
382 | #[inline ] |
383 | #[target_feature (enable = "gfni" )] |
384 | #[cfg_attr (test, assert_instr(gf2p8affineqb, B = 0))] |
385 | #[rustc_legacy_const_generics (2)] |
386 | pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i { |
387 | static_assert_uimm_bits!(B, 8); |
388 | let b: u8 = B as u8; |
389 | let x: i8x16 = x.as_i8x16(); |
390 | let a: i8x16 = a.as_i8x16(); |
391 | let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b); |
392 | transmute(src:r) |
393 | } |
394 | |
395 | /// Performs an affine transformation on the packed bytes in x. |
396 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
397 | /// and b being a constant 8-bit immediate value. |
398 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
399 | /// |
400 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
401 | /// Otherwise the computation result is written into the result. |
402 | /// |
403 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi8) |
404 | #[inline ] |
405 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
406 | #[cfg_attr (test, assert_instr(vgf2p8affineqb, B = 0))] |
407 | #[rustc_legacy_const_generics (3)] |
408 | pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>( |
409 | k: __mmask16, |
410 | x: __m128i, |
411 | a: __m128i, |
412 | ) -> __m128i { |
413 | static_assert_uimm_bits!(B, 8); |
414 | let b: u8 = B as u8; |
415 | let zero: i8x16 = _mm_setzero_si128().as_i8x16(); |
416 | let x: i8x16 = x.as_i8x16(); |
417 | let a: i8x16 = a.as_i8x16(); |
418 | let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b); |
419 | transmute(src:simd_select_bitmask(m:k, a:r, b:zero)) |
420 | } |
421 | |
422 | /// Performs an affine transformation on the packed bytes in x. |
423 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
424 | /// and b being a constant 8-bit immediate value. |
425 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
426 | /// |
427 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
428 | /// Otherwise the computation result is written into the result. |
429 | /// |
430 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi8) |
431 | #[inline ] |
432 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
433 | #[cfg_attr (test, assert_instr(vgf2p8affineqb, B = 0))] |
434 | #[rustc_legacy_const_generics (4)] |
435 | pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>( |
436 | src: __m128i, |
437 | k: __mmask16, |
438 | x: __m128i, |
439 | a: __m128i, |
440 | ) -> __m128i { |
441 | static_assert_uimm_bits!(B, 8); |
442 | let b: u8 = B as u8; |
443 | let x: i8x16 = x.as_i8x16(); |
444 | let a: i8x16 = a.as_i8x16(); |
445 | let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b); |
446 | transmute(src:simd_select_bitmask(m:k, a:r, b:src.as_i8x16())) |
447 | } |
448 | |
449 | /// Performs an affine transformation on the inverted packed bytes in x. |
450 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
451 | /// and b being a constant 8-bit immediate value. |
452 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
453 | /// The inverse of 0 is 0. |
454 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
455 | /// |
456 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8) |
457 | #[inline ] |
458 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
459 | #[cfg_attr (test, assert_instr(vgf2p8affineinvqb, B = 0))] |
460 | #[rustc_legacy_const_generics (2)] |
461 | pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i { |
462 | static_assert_uimm_bits!(B, 8); |
463 | let b: u8 = B as u8; |
464 | let x: i8x64 = x.as_i8x64(); |
465 | let a: i8x64 = a.as_i8x64(); |
466 | let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b); |
467 | transmute(src:r) |
468 | } |
469 | |
470 | /// Performs an affine transformation on the inverted packed bytes in x. |
471 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
472 | /// and b being a constant 8-bit immediate value. |
473 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
474 | /// The inverse of 0 is 0. |
475 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
476 | /// |
477 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
478 | /// Otherwise the computation result is written into the result. |
479 | /// |
480 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8) |
481 | #[inline ] |
482 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
483 | #[cfg_attr (test, assert_instr(vgf2p8affineinvqb, B = 0))] |
484 | #[rustc_legacy_const_generics (3)] |
485 | pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>( |
486 | k: __mmask64, |
487 | x: __m512i, |
488 | a: __m512i, |
489 | ) -> __m512i { |
490 | static_assert_uimm_bits!(B, 8); |
491 | let b: u8 = B as u8; |
492 | let zero: i8x64 = _mm512_setzero_si512().as_i8x64(); |
493 | let x: i8x64 = x.as_i8x64(); |
494 | let a: i8x64 = a.as_i8x64(); |
495 | let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b); |
496 | transmute(src:simd_select_bitmask(m:k, a:r, b:zero)) |
497 | } |
498 | |
499 | /// Performs an affine transformation on the inverted packed bytes in x. |
500 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
501 | /// and b being a constant 8-bit immediate value. |
502 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
503 | /// The inverse of 0 is 0. |
504 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
505 | /// |
506 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
507 | /// Otherwise the computation result is written into the result. |
508 | /// |
509 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8) |
510 | #[inline ] |
511 | #[target_feature (enable = "gfni,avx512bw,avx512f" )] |
512 | #[cfg_attr (test, assert_instr(vgf2p8affineinvqb, B = 0))] |
513 | #[rustc_legacy_const_generics (4)] |
514 | pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>( |
515 | src: __m512i, |
516 | k: __mmask64, |
517 | x: __m512i, |
518 | a: __m512i, |
519 | ) -> __m512i { |
520 | static_assert_uimm_bits!(B, 8); |
521 | let b: u8 = B as u8; |
522 | let x: i8x64 = x.as_i8x64(); |
523 | let a: i8x64 = a.as_i8x64(); |
524 | let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b); |
525 | transmute(src:simd_select_bitmask(m:k, a:r, b:src.as_i8x64())) |
526 | } |
527 | |
528 | /// Performs an affine transformation on the inverted packed bytes in x. |
529 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
530 | /// and b being a constant 8-bit immediate value. |
531 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
532 | /// The inverse of 0 is 0. |
533 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
534 | /// |
535 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8) |
536 | #[inline ] |
537 | #[target_feature (enable = "gfni,avx" )] |
538 | #[cfg_attr (test, assert_instr(vgf2p8affineinvqb, B = 0))] |
539 | #[rustc_legacy_const_generics (2)] |
540 | pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i { |
541 | static_assert_uimm_bits!(B, 8); |
542 | let b: u8 = B as u8; |
543 | let x: i8x32 = x.as_i8x32(); |
544 | let a: i8x32 = a.as_i8x32(); |
545 | let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b); |
546 | transmute(src:r) |
547 | } |
548 | |
549 | /// Performs an affine transformation on the inverted packed bytes in x. |
550 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
551 | /// and b being a constant 8-bit immediate value. |
552 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
553 | /// The inverse of 0 is 0. |
554 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
555 | /// |
556 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
557 | /// Otherwise the computation result is written into the result. |
558 | /// |
559 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8) |
560 | #[inline ] |
561 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
562 | #[cfg_attr (test, assert_instr(vgf2p8affineinvqb, B = 0))] |
563 | #[rustc_legacy_const_generics (3)] |
564 | pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>( |
565 | k: __mmask32, |
566 | x: __m256i, |
567 | a: __m256i, |
568 | ) -> __m256i { |
569 | static_assert_uimm_bits!(B, 8); |
570 | let b: u8 = B as u8; |
571 | let zero: i8x32 = _mm256_setzero_si256().as_i8x32(); |
572 | let x: i8x32 = x.as_i8x32(); |
573 | let a: i8x32 = a.as_i8x32(); |
574 | let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b); |
575 | transmute(src:simd_select_bitmask(m:k, a:r, b:zero)) |
576 | } |
577 | |
578 | /// Performs an affine transformation on the inverted packed bytes in x. |
579 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
580 | /// and b being a constant 8-bit immediate value. |
581 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
582 | /// The inverse of 0 is 0. |
583 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
584 | /// |
585 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
586 | /// Otherwise the computation result is written into the result. |
587 | /// |
588 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8) |
589 | #[inline ] |
590 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
591 | #[cfg_attr (test, assert_instr(vgf2p8affineinvqb, B = 0))] |
592 | #[rustc_legacy_const_generics (4)] |
593 | pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>( |
594 | src: __m256i, |
595 | k: __mmask32, |
596 | x: __m256i, |
597 | a: __m256i, |
598 | ) -> __m256i { |
599 | static_assert_uimm_bits!(B, 8); |
600 | let b: u8 = B as u8; |
601 | let x: i8x32 = x.as_i8x32(); |
602 | let a: i8x32 = a.as_i8x32(); |
603 | let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b); |
604 | transmute(src:simd_select_bitmask(m:k, a:r, b:src.as_i8x32())) |
605 | } |
606 | |
607 | /// Performs an affine transformation on the inverted packed bytes in x. |
608 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
609 | /// and b being a constant 8-bit immediate value. |
610 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
611 | /// The inverse of 0 is 0. |
612 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
613 | /// |
614 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8) |
615 | #[inline ] |
616 | #[target_feature (enable = "gfni" )] |
617 | #[cfg_attr (test, assert_instr(gf2p8affineinvqb, B = 0))] |
618 | #[rustc_legacy_const_generics (2)] |
619 | pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i { |
620 | static_assert_uimm_bits!(B, 8); |
621 | let b: u8 = B as u8; |
622 | let x: i8x16 = x.as_i8x16(); |
623 | let a: i8x16 = a.as_i8x16(); |
624 | let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b); |
625 | transmute(src:r) |
626 | } |
627 | |
628 | /// Performs an affine transformation on the inverted packed bytes in x. |
629 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
630 | /// and b being a constant 8-bit immediate value. |
631 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
632 | /// The inverse of 0 is 0. |
633 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
634 | /// |
635 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. |
636 | /// Otherwise the computation result is written into the result. |
637 | /// |
638 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8) |
639 | #[inline ] |
640 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
641 | #[cfg_attr (test, assert_instr(vgf2p8affineinvqb, B = 0))] |
642 | #[rustc_legacy_const_generics (3)] |
643 | pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>( |
644 | k: __mmask16, |
645 | x: __m128i, |
646 | a: __m128i, |
647 | ) -> __m128i { |
648 | static_assert_uimm_bits!(B, 8); |
649 | let b: u8 = B as u8; |
650 | let zero: i8x16 = _mm_setzero_si128().as_i8x16(); |
651 | let x: i8x16 = x.as_i8x16(); |
652 | let a: i8x16 = a.as_i8x16(); |
653 | let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b); |
654 | transmute(src:simd_select_bitmask(m:k, a:r, b:zero)) |
655 | } |
656 | |
657 | /// Performs an affine transformation on the inverted packed bytes in x. |
658 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix |
659 | /// and b being a constant 8-bit immediate value. |
660 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. |
661 | /// The inverse of 0 is 0. |
662 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. |
663 | /// |
664 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. |
665 | /// Otherwise the computation result is written into the result. |
666 | /// |
667 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8) |
668 | #[inline ] |
669 | #[target_feature (enable = "gfni,avx512bw,avx512vl" )] |
670 | #[cfg_attr (test, assert_instr(vgf2p8affineinvqb, B = 0))] |
671 | #[rustc_legacy_const_generics (4)] |
672 | pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>( |
673 | src: __m128i, |
674 | k: __mmask16, |
675 | x: __m128i, |
676 | a: __m128i, |
677 | ) -> __m128i { |
678 | static_assert_uimm_bits!(B, 8); |
679 | let b: u8 = B as u8; |
680 | let x: i8x16 = x.as_i8x16(); |
681 | let a: i8x16 = a.as_i8x16(); |
682 | let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b); |
683 | transmute(src:simd_select_bitmask(m:k, a:r, b:src.as_i8x16())) |
684 | } |
685 | |
686 | #[cfg (test)] |
687 | mod tests { |
688 | // The constants in the tests below are just bit patterns. They should not |
689 | // be interpreted as integers; signedness does not make sense for them, but |
690 | // __mXXXi happens to be defined in terms of signed integers. |
691 | #![allow (overflowing_literals)] |
692 | |
693 | use core::hint::black_box; |
694 | use core::intrinsics::size_of; |
695 | use stdarch_test::simd_test; |
696 | |
697 | use crate::core_arch::x86::*; |
698 | |
699 | fn mulbyte(left: u8, right: u8) -> u8 { |
700 | // this implementation follows the description in |
701 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8 |
702 | const REDUCTION_POLYNOMIAL: u16 = 0x11b; |
703 | let left: u16 = left.into(); |
704 | let right: u16 = right.into(); |
705 | let mut carryless_product: u16 = 0; |
706 | |
707 | // Carryless multiplication |
708 | for i in 0..8 { |
709 | if ((left >> i) & 0x01) != 0 { |
710 | carryless_product ^= right << i; |
711 | } |
712 | } |
713 | |
714 | // reduction, adding in "0" where appropriate to clear out high bits |
715 | // note that REDUCTION_POLYNOMIAL is zero in this context |
716 | for i in (8..=14).rev() { |
717 | if ((carryless_product >> i) & 0x01) != 0 { |
718 | carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8); |
719 | } |
720 | } |
721 | |
722 | carryless_product as u8 |
723 | } |
724 | |
725 | const NUM_TEST_WORDS_512: usize = 4; |
726 | const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2; |
727 | const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2; |
728 | const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64; |
729 | const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2; |
730 | const NUM_BYTES: usize = 256; |
731 | const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16; |
732 | const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2; |
733 | const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2; |
734 | |
735 | fn parity(input: u8) -> u8 { |
736 | let mut accumulator = 0; |
737 | for i in 0..8 { |
738 | accumulator ^= (input >> i) & 0x01; |
739 | } |
740 | accumulator |
741 | } |
742 | |
743 | fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 { |
744 | // this implementation follows the description in |
745 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8 |
746 | let mut accumulator = 0; |
747 | |
748 | for bit in 0..8 { |
749 | accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit); |
750 | } |
751 | |
752 | accumulator ^ b |
753 | } |
754 | |
755 | fn generate_affine_mul_test_data( |
756 | immediate: u8, |
757 | ) -> ( |
758 | [u64; NUM_TEST_WORDS_64], |
759 | [u8; NUM_TEST_ENTRIES], |
760 | [u8; NUM_TEST_ENTRIES], |
761 | ) { |
762 | let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64]; |
763 | let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; |
764 | let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; |
765 | |
766 | for i in 0..NUM_TEST_WORDS_64 { |
767 | left[i] = (i as u64) * 103 * 101; |
768 | for j in 0..8 { |
769 | let j64 = j as u64; |
770 | right[i * 8 + j] = ((left[i] + j64) % 256) as u8; |
771 | result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate); |
772 | } |
773 | } |
774 | |
775 | (left, right, result) |
776 | } |
777 | |
778 | fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) { |
779 | let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES]; |
780 | let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES]; |
781 | |
782 | for i in 0..NUM_BYTES { |
783 | input[i] = (i % 256) as u8; |
784 | result[i] = if i == 0 { 0 } else { 1 }; |
785 | } |
786 | |
787 | (input, result) |
788 | } |
789 | |
790 | const AES_S_BOX: [u8; NUM_BYTES] = [ |
791 | 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, |
792 | 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, |
793 | 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, |
794 | 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, |
795 | 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, |
796 | 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, |
797 | 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, |
798 | 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, |
799 | 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, |
800 | 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, |
801 | 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, |
802 | 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, |
803 | 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, |
804 | 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, |
805 | 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, |
806 | 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, |
807 | 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, |
808 | 0x16, |
809 | ]; |
810 | |
811 | fn generate_byte_mul_test_data() -> ( |
812 | [u8; NUM_TEST_ENTRIES], |
813 | [u8; NUM_TEST_ENTRIES], |
814 | [u8; NUM_TEST_ENTRIES], |
815 | ) { |
816 | let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; |
817 | let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; |
818 | let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; |
819 | |
820 | for i in 0..NUM_TEST_ENTRIES { |
821 | left[i] = (i % 256) as u8; |
822 | right[i] = left[i].wrapping_mul(101); |
823 | result[i] = mulbyte(left[i], right[i]); |
824 | } |
825 | |
826 | (left, right, result) |
827 | } |
828 | |
829 | #[target_feature (enable = "sse2" )] |
830 | unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i { |
831 | let byte_offset = word_index * 16 / size_of::<T>(); |
832 | let pointer = data.as_ptr().add(byte_offset) as *const __m128i; |
833 | _mm_loadu_si128(black_box(pointer)) |
834 | } |
835 | |
836 | #[target_feature (enable = "avx" )] |
837 | unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i { |
838 | let byte_offset = word_index * 32 / size_of::<T>(); |
839 | let pointer = data.as_ptr().add(byte_offset) as *const __m256i; |
840 | _mm256_loadu_si256(black_box(pointer)) |
841 | } |
842 | |
843 | #[target_feature (enable = "avx512f" )] |
844 | unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i { |
845 | let byte_offset = word_index * 64 / size_of::<T>(); |
846 | let pointer = data.as_ptr().add(byte_offset) as *const i32; |
847 | _mm512_loadu_si512(black_box(pointer)) |
848 | } |
849 | |
850 | #[simd_test(enable = "gfni,avx512bw" )] |
851 | unsafe fn test_mm512_gf2p8mul_epi8() { |
852 | let (left, right, expected) = generate_byte_mul_test_data(); |
853 | |
854 | for i in 0..NUM_TEST_WORDS_512 { |
855 | let left = load_m512i_word(&left, i); |
856 | let right = load_m512i_word(&right, i); |
857 | let expected = load_m512i_word(&expected, i); |
858 | let result = _mm512_gf2p8mul_epi8(left, right); |
859 | assert_eq_m512i(result, expected); |
860 | } |
861 | } |
862 | |
863 | #[simd_test(enable = "gfni,avx512bw" )] |
864 | unsafe fn test_mm512_maskz_gf2p8mul_epi8() { |
865 | let (left, right, _expected) = generate_byte_mul_test_data(); |
866 | |
867 | for i in 0..NUM_TEST_WORDS_512 { |
868 | let left = load_m512i_word(&left, i); |
869 | let right = load_m512i_word(&right, i); |
870 | let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right); |
871 | assert_eq_m512i(result_zero, _mm512_setzero_si512()); |
872 | let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; |
873 | let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; |
874 | let expected_result = _mm512_gf2p8mul_epi8(left, right); |
875 | let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right); |
876 | let expected_masked = |
877 | _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); |
878 | assert_eq_m512i(result_masked, expected_masked); |
879 | } |
880 | } |
881 | |
882 | #[simd_test(enable = "gfni,avx512bw" )] |
883 | unsafe fn test_mm512_mask_gf2p8mul_epi8() { |
884 | let (left, right, _expected) = generate_byte_mul_test_data(); |
885 | |
886 | for i in 0..NUM_TEST_WORDS_512 { |
887 | let left = load_m512i_word(&left, i); |
888 | let right = load_m512i_word(&right, i); |
889 | let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right); |
890 | assert_eq_m512i(result_left, left); |
891 | let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; |
892 | let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; |
893 | let expected_result = _mm512_gf2p8mul_epi8(left, right); |
894 | let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right); |
895 | let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); |
896 | assert_eq_m512i(result_masked, expected_masked); |
897 | } |
898 | } |
899 | |
900 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
901 | unsafe fn test_mm256_gf2p8mul_epi8() { |
902 | let (left, right, expected) = generate_byte_mul_test_data(); |
903 | |
904 | for i in 0..NUM_TEST_WORDS_256 { |
905 | let left = load_m256i_word(&left, i); |
906 | let right = load_m256i_word(&right, i); |
907 | let expected = load_m256i_word(&expected, i); |
908 | let result = _mm256_gf2p8mul_epi8(left, right); |
909 | assert_eq_m256i(result, expected); |
910 | } |
911 | } |
912 | |
913 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
914 | unsafe fn test_mm256_maskz_gf2p8mul_epi8() { |
915 | let (left, right, _expected) = generate_byte_mul_test_data(); |
916 | |
917 | for i in 0..NUM_TEST_WORDS_256 { |
918 | let left = load_m256i_word(&left, i); |
919 | let right = load_m256i_word(&right, i); |
920 | let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right); |
921 | assert_eq_m256i(result_zero, _mm256_setzero_si256()); |
922 | let mask_bytes: __mmask32 = 0x0F_F0_FF_00; |
923 | const MASK_WORDS: i32 = 0b01_10_11_00; |
924 | let expected_result = _mm256_gf2p8mul_epi8(left, right); |
925 | let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right); |
926 | let expected_masked = |
927 | _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result); |
928 | assert_eq_m256i(result_masked, expected_masked); |
929 | } |
930 | } |
931 | |
932 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
933 | unsafe fn test_mm256_mask_gf2p8mul_epi8() { |
934 | let (left, right, _expected) = generate_byte_mul_test_data(); |
935 | |
936 | for i in 0..NUM_TEST_WORDS_256 { |
937 | let left = load_m256i_word(&left, i); |
938 | let right = load_m256i_word(&right, i); |
939 | let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right); |
940 | assert_eq_m256i(result_left, left); |
941 | let mask_bytes: __mmask32 = 0x0F_F0_FF_00; |
942 | const MASK_WORDS: i32 = 0b01_10_11_00; |
943 | let expected_result = _mm256_gf2p8mul_epi8(left, right); |
944 | let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right); |
945 | let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result); |
946 | assert_eq_m256i(result_masked, expected_masked); |
947 | } |
948 | } |
949 | |
950 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
951 | unsafe fn test_mm_gf2p8mul_epi8() { |
952 | let (left, right, expected) = generate_byte_mul_test_data(); |
953 | |
954 | for i in 0..NUM_TEST_WORDS_128 { |
955 | let left = load_m128i_word(&left, i); |
956 | let right = load_m128i_word(&right, i); |
957 | let expected = load_m128i_word(&expected, i); |
958 | let result = _mm_gf2p8mul_epi8(left, right); |
959 | assert_eq_m128i(result, expected); |
960 | } |
961 | } |
962 | |
963 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
964 | unsafe fn test_mm_maskz_gf2p8mul_epi8() { |
965 | let (left, right, _expected) = generate_byte_mul_test_data(); |
966 | |
967 | for i in 0..NUM_TEST_WORDS_128 { |
968 | let left = load_m128i_word(&left, i); |
969 | let right = load_m128i_word(&right, i); |
970 | let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right); |
971 | assert_eq_m128i(result_zero, _mm_setzero_si128()); |
972 | let mask_bytes: __mmask16 = 0x0F_F0; |
973 | const MASK_WORDS: i32 = 0b01_10; |
974 | let expected_result = _mm_gf2p8mul_epi8(left, right); |
975 | let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right); |
976 | let expected_masked = |
977 | _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result); |
978 | assert_eq_m128i(result_masked, expected_masked); |
979 | } |
980 | } |
981 | |
982 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
983 | unsafe fn test_mm_mask_gf2p8mul_epi8() { |
984 | let (left, right, _expected) = generate_byte_mul_test_data(); |
985 | |
986 | for i in 0..NUM_TEST_WORDS_128 { |
987 | let left = load_m128i_word(&left, i); |
988 | let right = load_m128i_word(&right, i); |
989 | let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right); |
990 | assert_eq_m128i(result_left, left); |
991 | let mask_bytes: __mmask16 = 0x0F_F0; |
992 | const MASK_WORDS: i32 = 0b01_10; |
993 | let expected_result = _mm_gf2p8mul_epi8(left, right); |
994 | let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right); |
995 | let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result); |
996 | assert_eq_m128i(result_masked, expected_masked); |
997 | } |
998 | } |
999 | |
1000 | #[simd_test(enable = "gfni,avx512bw" )] |
1001 | unsafe fn test_mm512_gf2p8affine_epi64_epi8() { |
1002 | let identity: i64 = 0x01_02_04_08_10_20_40_80; |
1003 | const IDENTITY_BYTE: i32 = 0; |
1004 | let constant: i64 = 0; |
1005 | const CONSTANT_BYTE: i32 = 0x63; |
1006 | let identity = _mm512_set1_epi64(identity); |
1007 | let constant = _mm512_set1_epi64(constant); |
1008 | let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8); |
1009 | |
1010 | let (bytes, more_bytes, _) = generate_byte_mul_test_data(); |
1011 | let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); |
1012 | |
1013 | for i in 0..NUM_TEST_WORDS_512 { |
1014 | let data = load_m512i_word(&bytes, i); |
1015 | let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); |
1016 | assert_eq_m512i(result, data); |
1017 | let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); |
1018 | assert_eq_m512i(result, constant_reference); |
1019 | let data = load_m512i_word(&more_bytes, i); |
1020 | let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); |
1021 | assert_eq_m512i(result, data); |
1022 | let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); |
1023 | assert_eq_m512i(result, constant_reference); |
1024 | |
1025 | let matrix = load_m512i_word(&matrices, i); |
1026 | let vector = load_m512i_word(&vectors, i); |
1027 | let reference = load_m512i_word(&references, i); |
1028 | |
1029 | let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix); |
1030 | assert_eq_m512i(result, reference); |
1031 | } |
1032 | } |
1033 | |
1034 | #[simd_test(enable = "gfni,avx512bw" )] |
1035 | unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() { |
1036 | const CONSTANT_BYTE: i32 = 0x63; |
1037 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1038 | |
1039 | for i in 0..NUM_TEST_WORDS_512 { |
1040 | let matrix = load_m512i_word(&matrices, i); |
1041 | let vector = load_m512i_word(&vectors, i); |
1042 | let result_zero = |
1043 | _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); |
1044 | assert_eq_m512i(result_zero, _mm512_setzero_si512()); |
1045 | let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; |
1046 | let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; |
1047 | let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1048 | let result_masked = |
1049 | _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
1050 | let expected_masked = |
1051 | _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); |
1052 | assert_eq_m512i(result_masked, expected_masked); |
1053 | } |
1054 | } |
1055 | |
1056 | #[simd_test(enable = "gfni,avx512bw" )] |
1057 | unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() { |
1058 | const CONSTANT_BYTE: i32 = 0x63; |
1059 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1060 | |
1061 | for i in 0..NUM_TEST_WORDS_512 { |
1062 | let left = load_m512i_word(&vectors, i); |
1063 | let right = load_m512i_word(&matrices, i); |
1064 | let result_left = |
1065 | _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
1066 | assert_eq_m512i(result_left, left); |
1067 | let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; |
1068 | let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; |
1069 | let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right); |
1070 | let result_masked = |
1071 | _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right); |
1072 | let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); |
1073 | assert_eq_m512i(result_masked, expected_masked); |
1074 | } |
1075 | } |
1076 | |
1077 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1078 | unsafe fn test_mm256_gf2p8affine_epi64_epi8() { |
1079 | let identity: i64 = 0x01_02_04_08_10_20_40_80; |
1080 | const IDENTITY_BYTE: i32 = 0; |
1081 | let constant: i64 = 0; |
1082 | const CONSTANT_BYTE: i32 = 0x63; |
1083 | let identity = _mm256_set1_epi64x(identity); |
1084 | let constant = _mm256_set1_epi64x(constant); |
1085 | let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8); |
1086 | |
1087 | let (bytes, more_bytes, _) = generate_byte_mul_test_data(); |
1088 | let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); |
1089 | |
1090 | for i in 0..NUM_TEST_WORDS_256 { |
1091 | let data = load_m256i_word(&bytes, i); |
1092 | let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); |
1093 | assert_eq_m256i(result, data); |
1094 | let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); |
1095 | assert_eq_m256i(result, constant_reference); |
1096 | let data = load_m256i_word(&more_bytes, i); |
1097 | let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); |
1098 | assert_eq_m256i(result, data); |
1099 | let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); |
1100 | assert_eq_m256i(result, constant_reference); |
1101 | |
1102 | let matrix = load_m256i_word(&matrices, i); |
1103 | let vector = load_m256i_word(&vectors, i); |
1104 | let reference = load_m256i_word(&references, i); |
1105 | |
1106 | let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix); |
1107 | assert_eq_m256i(result, reference); |
1108 | } |
1109 | } |
1110 | |
1111 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1112 | unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() { |
1113 | const CONSTANT_BYTE: i32 = 0x63; |
1114 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1115 | |
1116 | for i in 0..NUM_TEST_WORDS_256 { |
1117 | let matrix = load_m256i_word(&matrices, i); |
1118 | let vector = load_m256i_word(&vectors, i); |
1119 | let result_zero = |
1120 | _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); |
1121 | assert_eq_m256i(result_zero, _mm256_setzero_si256()); |
1122 | let mask_bytes: __mmask32 = 0xFF_0F_F0_00; |
1123 | const MASK_WORDS: i32 = 0b11_01_10_00; |
1124 | let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1125 | let result_masked = |
1126 | _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
1127 | let expected_masked = |
1128 | _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result); |
1129 | assert_eq_m256i(result_masked, expected_masked); |
1130 | } |
1131 | } |
1132 | |
1133 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1134 | unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() { |
1135 | const CONSTANT_BYTE: i32 = 0x63; |
1136 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1137 | |
1138 | for i in 0..NUM_TEST_WORDS_256 { |
1139 | let left = load_m256i_word(&vectors, i); |
1140 | let right = load_m256i_word(&matrices, i); |
1141 | let result_left = |
1142 | _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
1143 | assert_eq_m256i(result_left, left); |
1144 | let mask_bytes: __mmask32 = 0xFF_0F_F0_00; |
1145 | const MASK_WORDS: i32 = 0b11_01_10_00; |
1146 | let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right); |
1147 | let result_masked = |
1148 | _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right); |
1149 | let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result); |
1150 | assert_eq_m256i(result_masked, expected_masked); |
1151 | } |
1152 | } |
1153 | |
1154 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1155 | unsafe fn test_mm_gf2p8affine_epi64_epi8() { |
1156 | let identity: i64 = 0x01_02_04_08_10_20_40_80; |
1157 | const IDENTITY_BYTE: i32 = 0; |
1158 | let constant: i64 = 0; |
1159 | const CONSTANT_BYTE: i32 = 0x63; |
1160 | let identity = _mm_set1_epi64x(identity); |
1161 | let constant = _mm_set1_epi64x(constant); |
1162 | let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8); |
1163 | |
1164 | let (bytes, more_bytes, _) = generate_byte_mul_test_data(); |
1165 | let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8); |
1166 | |
1167 | for i in 0..NUM_TEST_WORDS_128 { |
1168 | let data = load_m128i_word(&bytes, i); |
1169 | let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); |
1170 | assert_eq_m128i(result, data); |
1171 | let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); |
1172 | assert_eq_m128i(result, constant_reference); |
1173 | let data = load_m128i_word(&more_bytes, i); |
1174 | let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity); |
1175 | assert_eq_m128i(result, data); |
1176 | let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant); |
1177 | assert_eq_m128i(result, constant_reference); |
1178 | |
1179 | let matrix = load_m128i_word(&matrices, i); |
1180 | let vector = load_m128i_word(&vectors, i); |
1181 | let reference = load_m128i_word(&references, i); |
1182 | |
1183 | let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix); |
1184 | assert_eq_m128i(result, reference); |
1185 | } |
1186 | } |
1187 | |
1188 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1189 | unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() { |
1190 | const CONSTANT_BYTE: i32 = 0x63; |
1191 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1192 | |
1193 | for i in 0..NUM_TEST_WORDS_128 { |
1194 | let matrix = load_m128i_word(&matrices, i); |
1195 | let vector = load_m128i_word(&vectors, i); |
1196 | let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); |
1197 | assert_eq_m128i(result_zero, _mm_setzero_si128()); |
1198 | let mask_bytes: __mmask16 = 0x0F_F0; |
1199 | const MASK_WORDS: i32 = 0b01_10; |
1200 | let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1201 | let result_masked = |
1202 | _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
1203 | let expected_masked = |
1204 | _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result); |
1205 | assert_eq_m128i(result_masked, expected_masked); |
1206 | } |
1207 | } |
1208 | |
1209 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1210 | unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() { |
1211 | const CONSTANT_BYTE: i32 = 0x63; |
1212 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1213 | |
1214 | for i in 0..NUM_TEST_WORDS_128 { |
1215 | let left = load_m128i_word(&vectors, i); |
1216 | let right = load_m128i_word(&matrices, i); |
1217 | let result_left = |
1218 | _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
1219 | assert_eq_m128i(result_left, left); |
1220 | let mask_bytes: __mmask16 = 0x0F_F0; |
1221 | const MASK_WORDS: i32 = 0b01_10; |
1222 | let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right); |
1223 | let result_masked = |
1224 | _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right); |
1225 | let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result); |
1226 | assert_eq_m128i(result_masked, expected_masked); |
1227 | } |
1228 | } |
1229 | |
1230 | #[simd_test(enable = "gfni,avx512bw" )] |
1231 | unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() { |
1232 | let identity: i64 = 0x01_02_04_08_10_20_40_80; |
1233 | const IDENTITY_BYTE: i32 = 0; |
1234 | const CONSTANT_BYTE: i32 = 0x63; |
1235 | let identity = _mm512_set1_epi64(identity); |
1236 | |
1237 | // validate inversion |
1238 | let (inputs, results) = generate_inv_tests_data(); |
1239 | |
1240 | for i in 0..NUM_BYTES_WORDS_512 { |
1241 | let input = load_m512i_word(&inputs, i); |
1242 | let reference = load_m512i_word(&results, i); |
1243 | let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity); |
1244 | let remultiplied = _mm512_gf2p8mul_epi8(result, input); |
1245 | assert_eq_m512i(remultiplied, reference); |
1246 | } |
1247 | |
1248 | // validate subsequent affine operation |
1249 | let (matrices, vectors, _affine_expected) = |
1250 | generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1251 | |
1252 | for i in 0..NUM_TEST_WORDS_512 { |
1253 | let vector = load_m512i_word(&vectors, i); |
1254 | let matrix = load_m512i_word(&matrices, i); |
1255 | |
1256 | let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity); |
1257 | let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix); |
1258 | let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1259 | assert_eq_m512i(result, reference); |
1260 | } |
1261 | |
1262 | // validate everything by virtue of checking against the AES SBox |
1263 | const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; |
1264 | let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX); |
1265 | |
1266 | for i in 0..NUM_BYTES_WORDS_512 { |
1267 | let reference = load_m512i_word(&AES_S_BOX, i); |
1268 | let input = load_m512i_word(&inputs, i); |
1269 | let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix); |
1270 | assert_eq_m512i(result, reference); |
1271 | } |
1272 | } |
1273 | |
1274 | #[simd_test(enable = "gfni,avx512bw" )] |
1275 | unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() { |
1276 | const CONSTANT_BYTE: i32 = 0x63; |
1277 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1278 | |
1279 | for i in 0..NUM_TEST_WORDS_512 { |
1280 | let matrix = load_m512i_word(&matrices, i); |
1281 | let vector = load_m512i_word(&vectors, i); |
1282 | let result_zero = |
1283 | _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); |
1284 | assert_eq_m512i(result_zero, _mm512_setzero_si512()); |
1285 | let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; |
1286 | let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; |
1287 | let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1288 | let result_masked = |
1289 | _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
1290 | let expected_masked = |
1291 | _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); |
1292 | assert_eq_m512i(result_masked, expected_masked); |
1293 | } |
1294 | } |
1295 | |
1296 | #[simd_test(enable = "gfni,avx512bw" )] |
1297 | unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() { |
1298 | const CONSTANT_BYTE: i32 = 0x63; |
1299 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1300 | |
1301 | for i in 0..NUM_TEST_WORDS_512 { |
1302 | let left = load_m512i_word(&vectors, i); |
1303 | let right = load_m512i_word(&matrices, i); |
1304 | let result_left = |
1305 | _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
1306 | assert_eq_m512i(result_left, left); |
1307 | let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; |
1308 | let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; |
1309 | let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right); |
1310 | let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>( |
1311 | left, mask_bytes, left, right, |
1312 | ); |
1313 | let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); |
1314 | assert_eq_m512i(result_masked, expected_masked); |
1315 | } |
1316 | } |
1317 | |
1318 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1319 | unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() { |
1320 | let identity: i64 = 0x01_02_04_08_10_20_40_80; |
1321 | const IDENTITY_BYTE: i32 = 0; |
1322 | const CONSTANT_BYTE: i32 = 0x63; |
1323 | let identity = _mm256_set1_epi64x(identity); |
1324 | |
1325 | // validate inversion |
1326 | let (inputs, results) = generate_inv_tests_data(); |
1327 | |
1328 | for i in 0..NUM_BYTES_WORDS_256 { |
1329 | let input = load_m256i_word(&inputs, i); |
1330 | let reference = load_m256i_word(&results, i); |
1331 | let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity); |
1332 | let remultiplied = _mm256_gf2p8mul_epi8(result, input); |
1333 | assert_eq_m256i(remultiplied, reference); |
1334 | } |
1335 | |
1336 | // validate subsequent affine operation |
1337 | let (matrices, vectors, _affine_expected) = |
1338 | generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1339 | |
1340 | for i in 0..NUM_TEST_WORDS_256 { |
1341 | let vector = load_m256i_word(&vectors, i); |
1342 | let matrix = load_m256i_word(&matrices, i); |
1343 | |
1344 | let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity); |
1345 | let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix); |
1346 | let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1347 | assert_eq_m256i(result, reference); |
1348 | } |
1349 | |
1350 | // validate everything by virtue of checking against the AES SBox |
1351 | const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; |
1352 | let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX); |
1353 | |
1354 | for i in 0..NUM_BYTES_WORDS_256 { |
1355 | let reference = load_m256i_word(&AES_S_BOX, i); |
1356 | let input = load_m256i_word(&inputs, i); |
1357 | let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix); |
1358 | assert_eq_m256i(result, reference); |
1359 | } |
1360 | } |
1361 | |
1362 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1363 | unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() { |
1364 | const CONSTANT_BYTE: i32 = 0x63; |
1365 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1366 | |
1367 | for i in 0..NUM_TEST_WORDS_256 { |
1368 | let matrix = load_m256i_word(&matrices, i); |
1369 | let vector = load_m256i_word(&vectors, i); |
1370 | let result_zero = |
1371 | _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); |
1372 | assert_eq_m256i(result_zero, _mm256_setzero_si256()); |
1373 | let mask_bytes: __mmask32 = 0xFF_0F_F0_00; |
1374 | const MASK_WORDS: i32 = 0b11_01_10_00; |
1375 | let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1376 | let result_masked = |
1377 | _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
1378 | let expected_masked = |
1379 | _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result); |
1380 | assert_eq_m256i(result_masked, expected_masked); |
1381 | } |
1382 | } |
1383 | |
1384 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1385 | unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() { |
1386 | const CONSTANT_BYTE: i32 = 0x63; |
1387 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1388 | |
1389 | for i in 0..NUM_TEST_WORDS_256 { |
1390 | let left = load_m256i_word(&vectors, i); |
1391 | let right = load_m256i_word(&matrices, i); |
1392 | let result_left = |
1393 | _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
1394 | assert_eq_m256i(result_left, left); |
1395 | let mask_bytes: __mmask32 = 0xFF_0F_F0_00; |
1396 | const MASK_WORDS: i32 = 0b11_01_10_00; |
1397 | let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right); |
1398 | let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>( |
1399 | left, mask_bytes, left, right, |
1400 | ); |
1401 | let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result); |
1402 | assert_eq_m256i(result_masked, expected_masked); |
1403 | } |
1404 | } |
1405 | |
1406 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1407 | unsafe fn test_mm_gf2p8affineinv_epi64_epi8() { |
1408 | let identity: i64 = 0x01_02_04_08_10_20_40_80; |
1409 | const IDENTITY_BYTE: i32 = 0; |
1410 | const CONSTANT_BYTE: i32 = 0x63; |
1411 | let identity = _mm_set1_epi64x(identity); |
1412 | |
1413 | // validate inversion |
1414 | let (inputs, results) = generate_inv_tests_data(); |
1415 | |
1416 | for i in 0..NUM_BYTES_WORDS_128 { |
1417 | let input = load_m128i_word(&inputs, i); |
1418 | let reference = load_m128i_word(&results, i); |
1419 | let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity); |
1420 | let remultiplied = _mm_gf2p8mul_epi8(result, input); |
1421 | assert_eq_m128i(remultiplied, reference); |
1422 | } |
1423 | |
1424 | // validate subsequent affine operation |
1425 | let (matrices, vectors, _affine_expected) = |
1426 | generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1427 | |
1428 | for i in 0..NUM_TEST_WORDS_128 { |
1429 | let vector = load_m128i_word(&vectors, i); |
1430 | let matrix = load_m128i_word(&matrices, i); |
1431 | |
1432 | let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity); |
1433 | let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix); |
1434 | let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1435 | assert_eq_m128i(result, reference); |
1436 | } |
1437 | |
1438 | // validate everything by virtue of checking against the AES SBox |
1439 | const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; |
1440 | let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX); |
1441 | |
1442 | for i in 0..NUM_BYTES_WORDS_128 { |
1443 | let reference = load_m128i_word(&AES_S_BOX, i); |
1444 | let input = load_m128i_word(&inputs, i); |
1445 | let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix); |
1446 | assert_eq_m128i(result, reference); |
1447 | } |
1448 | } |
1449 | |
1450 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1451 | unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() { |
1452 | const CONSTANT_BYTE: i32 = 0x63; |
1453 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1454 | |
1455 | for i in 0..NUM_TEST_WORDS_128 { |
1456 | let matrix = load_m128i_word(&matrices, i); |
1457 | let vector = load_m128i_word(&vectors, i); |
1458 | let result_zero = |
1459 | _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); |
1460 | assert_eq_m128i(result_zero, _mm_setzero_si128()); |
1461 | let mask_bytes: __mmask16 = 0x0F_F0; |
1462 | const MASK_WORDS: i32 = 0b01_10; |
1463 | let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
1464 | let result_masked = |
1465 | _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
1466 | let expected_masked = |
1467 | _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result); |
1468 | assert_eq_m128i(result_masked, expected_masked); |
1469 | } |
1470 | } |
1471 | |
1472 | #[simd_test(enable = "gfni,avx512bw,avx512vl" )] |
1473 | unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() { |
1474 | const CONSTANT_BYTE: i32 = 0x63; |
1475 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); |
1476 | |
1477 | for i in 0..NUM_TEST_WORDS_128 { |
1478 | let left = load_m128i_word(&vectors, i); |
1479 | let right = load_m128i_word(&matrices, i); |
1480 | let result_left = |
1481 | _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
1482 | assert_eq_m128i(result_left, left); |
1483 | let mask_bytes: __mmask16 = 0x0F_F0; |
1484 | const MASK_WORDS: i32 = 0b01_10; |
1485 | let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right); |
1486 | let result_masked = |
1487 | _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right); |
1488 | let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result); |
1489 | assert_eq_m128i(result_masked, expected_masked); |
1490 | } |
1491 | } |
1492 | } |
1493 | |