//! Galois Field New Instructions (GFNI)
//!
//! The intrinsics here correspond to those in the `immintrin.h` C header.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
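//!
//! A minimal usage sketch added for illustration (not from the manual). It
//! assumes a `std` environment, since `is_x86_feature_detected!` lives in
//! `std`:
//!
//! ```ignore
//! if is_x86_feature_detected!("gfni") {
//!     // The base `gfni` intrinsics below, e.g. `_mm_gf2p8mul_epi8`,
//!     // may be called on this CPU.
//! }
//! ```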

use crate::core_arch::simd::i8x16;
use crate::core_arch::simd::i8x32;
use crate::core_arch::simd::i8x64;
use crate::core_arch::simd_llvm::simd_select_bitmask;
use crate::core_arch::x86::__m128i;
use crate::core_arch::x86::__m256i;
use crate::core_arch::x86::__m512i;
use crate::core_arch::x86::__mmask16;
use crate::core_arch::x86::__mmask32;
use crate::core_arch::x86::__mmask64;
use crate::core_arch::x86::_mm256_setzero_si256;
use crate::core_arch::x86::_mm512_setzero_si512;
use crate::core_arch::x86::_mm_setzero_si128;
use crate::core_arch::x86::m128iExt;
use crate::core_arch::x86::m256iExt;
use crate::core_arch::x86::m512iExt;
use crate::mem::transmute;

#[cfg(test)]
use stdarch_test::assert_instr;

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    #[link_name = "llvm.x86.vgf2p8mulb.512"]
    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8mulb.256"]
    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8mulb.128"]
    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
}

// LLVM requires AVX512BW for a lot of these instructions, see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
// however our tests also require the target feature list to match Intel's
// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F
// requirement (for now)
// also see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
// for forcing GFNI, BW and optionally VL extension

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8)
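///
/// # Example
///
/// A minimal sketch added for illustration (not from Intel's documentation);
/// it assumes the listed target features are available:
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_epi8(0x57);
///     let b = _mm512_set1_epi8(0x83u8 as i8);
///     // In GF(2^8) mod x^8 + x^4 + x^3 + x + 1, 0x57 * 0x83 == 0xc1,
///     // so every byte of `r` equals 0xc1.
///     let r = _mm512_gf2p8mul_epi8(a, b);
/// }
/// ```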
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
    transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8)
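///
/// # Example
///
/// A minimal sketch of the merge-masking behaviour, added for illustration
/// (not from Intel's documentation):
///
/// ```ignore
/// unsafe {
///     let src = _mm512_set1_epi8(0x11);
///     let a = _mm512_set1_epi8(0x57);
///     let b = _mm512_set1_epi8(0x02);
///     // Bit i of the mask selects byte i: only the lowest byte is computed;
///     // the other 63 bytes are copied from `src`.
///     let r = _mm512_mask_gf2p8mul_epi8(src, 0b1, a, b);
/// }
/// ```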
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_mask_gf2p8mul_epi8(
    src: __m512i,
    k: __mmask64,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
        src.as_i8x64(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8)
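///
/// # Example
///
/// A minimal sketch of the zero-masking behaviour, added for illustration
/// (not from Intel's documentation):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_epi8(0x57);
///     let b = _mm512_set1_epi8(0x02);
///     // Only the lowest byte is computed; the other 63 bytes are zeroed.
///     let r = _mm512_maskz_gf2p8mul_epi8(0b1, a, b);
/// }
/// ```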
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
    let zero = _mm512_setzero_si512().as_i8x64();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
        zero,
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
    transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_mask_gf2p8mul_epi8(
    src: __m256i,
    k: __mmask32,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
        src.as_i8x32(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
    let zero = _mm256_setzero_si256().as_i8x32();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
        zero,
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni")]
#[cfg_attr(test, assert_instr(gf2p8mulb))]
pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_mask_gf2p8mul_epi8(
    src: __m128i,
    k: __mmask16,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
        src.as_i8x16(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
    let zero = _mm_setzero_si128().as_i8x16();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
        zero,
    ))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi8)
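///
/// # Example
///
/// A minimal sketch added for illustration (not from Intel's documentation).
/// The bit pattern 0x01_02_04_08_10_20_40_80 is the identity matrix of this
/// transformation; the tests below rely on the same constant:
///
/// ```ignore
/// unsafe {
///     let identity = _mm512_set1_epi64(0x01_02_04_08_10_20_40_80);
///     let x = _mm512_set1_epi8(0x1b);
///     // With the identity matrix and B = 0, x passes through unchanged.
///     let r = _mm512_gf2p8affine_epi64_epi8::<0>(x, identity);
/// }
/// ```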
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm512_setzero_si512().as_i8x64();
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m512i,
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi8)
#[inline]
#[target_feature(enable = "gfni,avx")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm256_setzero_si256().as_i8x32();
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m256i,
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi8)
#[inline]
#[target_feature(enable = "gfni")]
#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8)
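///
/// # Example
///
/// A minimal sketch added for illustration (not from Intel's documentation).
/// With this matrix and the constant 0x63 the operation computes the AES
/// S-box; the tests below verify the same pairing against `AES_S_BOX`:
///
/// ```ignore
/// unsafe {
///     let sbox_matrix = _mm512_set1_epi64(0xF1_E3_C7_8F_1F_3E_7C_F8u64 as i64);
///     let x = _mm512_set1_epi8(0x00);
///     // inv(0) is defined as 0, so every byte of `r` is 0x63 = AES_S_BOX[0].
///     let r = _mm512_gf2p8affineinv_epi64_epi8::<0x63>(x, sbox_matrix);
/// }
/// ```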
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm512_setzero_si512().as_i8x64();
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m512i,
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm256_setzero_si256().as_i8x32();
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m256i,
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni")]
#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field GF(2^8) for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}

#[cfg(test)]
mod tests {
    // The constants in the tests below are just bit patterns. They should not
    // be interpreted as integers; signedness does not make sense for them, but
    // __mXXXi happens to be defined in terms of signed integers.
    #![allow(overflowing_literals)]

    use core::hint::black_box;
    use core::intrinsics::size_of;
    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    fn mulbyte(left: u8, right: u8) -> u8 {
        // this implementation follows the description in
        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8
        const REDUCTION_POLYNOMIAL: u16 = 0x11b;
        let left: u16 = left.into();
        let right: u16 = right.into();
        let mut carryless_product: u16 = 0;

        // Carryless multiplication
        for i in 0..8 {
            if ((left >> i) & 0x01) != 0 {
                carryless_product ^= right << i;
            }
        }

        // reduction, adding in "0" where appropriate to clear out high bits
        // note that REDUCTION_POLYNOMIAL is zero in this context
        for i in (8..=14).rev() {
            if ((carryless_product >> i) & 0x01) != 0 {
                carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
            }
        }

        carryless_product as u8
    }

    const NUM_TEST_WORDS_512: usize = 4;
    const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
    const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
    const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
    const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
    const NUM_BYTES: usize = 256;
    const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
    const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
    const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;

    fn parity(input: u8) -> u8 {
        let mut accumulator = 0;
        for i in 0..8 {
            accumulator ^= (input >> i) & 0x01;
        }
        accumulator
    }

    fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
        // this implementation follows the description in
        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8
        let mut accumulator = 0;

        for bit in 0..8 {
            accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
        }

        accumulator ^ b
    }

    fn generate_affine_mul_test_data(
        immediate: u8,
    ) -> (
        [u64; NUM_TEST_WORDS_64],
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
    ) {
        let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];

        for i in 0..NUM_TEST_WORDS_64 {
            left[i] = (i as u64) * 103 * 101;
            for j in 0..8 {
                let j64 = j as u64;
                right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
                result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
            }
        }

        (left, right, result)
    }

    fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
        let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
        let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];

        for i in 0..NUM_BYTES {
            input[i] = (i % 256) as u8;
            result[i] = if i == 0 { 0 } else { 1 };
        }

        (input, result)
    }

    const AES_S_BOX: [u8; NUM_BYTES] = [
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
        0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
        0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
        0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
        0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
        0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
        0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
        0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
        0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
        0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
        0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
        0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
        0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
        0x16,
    ];

    fn generate_byte_mul_test_data() -> (
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
    ) {
        let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];

        for i in 0..NUM_TEST_ENTRIES {
            left[i] = (i % 256) as u8;
            right[i] = left[i].wrapping_mul(101);
            result[i] = mulbyte(left[i], right[i]);
        }

        (left, right, result)
    }

    #[target_feature(enable = "sse2")]
    unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
        let byte_offset = word_index * 16 / size_of::<T>();
        let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
        _mm_loadu_si128(black_box(pointer))
    }

    #[target_feature(enable = "avx")]
    unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
        let byte_offset = word_index * 32 / size_of::<T>();
        let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
        _mm256_loadu_si256(black_box(pointer))
    }

    #[target_feature(enable = "avx512f")]
    unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
        let byte_offset = word_index * 64 / size_of::<T>();
        let pointer = data.as_ptr().add(byte_offset) as *const i32;
        _mm512_loadu_si512(black_box(pointer))
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let expected = load_m512i_word(&expected, i);
            let result = _mm512_gf2p8mul_epi8(left, right);
            assert_eq_m512i(result, expected);
        }
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let expected = load_m256i_word(&expected, i);
            let result = _mm256_gf2p8mul_epi8(left, right);
            assert_eq_m256i(result, expected);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let expected = load_m128i_word(&expected, i);
            let result = _mm_gf2p8mul_epi8(left, right);
            assert_eq_m128i(result, expected);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);
        let constant = _mm512_set1_epi64(constant);
        let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let data = load_m512i_word(&bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);
            let data = load_m512i_word(&more_bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);

            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let reference = load_m512i_word(&references, i);

            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let result_zero =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            let result_left =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);
        let constant = _mm256_set1_epi64x(constant);
        let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let data = load_m256i_word(&bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);
            let data = load_m256i_word(&more_bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);

            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let reference = load_m256i_word(&references, i);

            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let result_zero =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            let result_left =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8affine_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);
        let constant = _mm_set1_epi64x(constant);
        let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let data = load_m128i_word(&bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);
            let data = load_m128i_word(&more_bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);

            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let reference = load_m128i_word(&references, i);

            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            let result_left =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

1230 #[simd_test(enable = "gfni,avx512bw")]
1231 unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
1232 let identity: i64 = 0x01_02_04_08_10_20_40_80;
1233 const IDENTITY_BYTE: i32 = 0;
1234 const CONSTANT_BYTE: i32 = 0x63;
1235 let identity = _mm512_set1_epi64(identity);
1236
1237 // validate inversion
1238 let (inputs, results) = generate_inv_tests_data();
1239
1240 for i in 0..NUM_BYTES_WORDS_512 {
1241 let input = load_m512i_word(&inputs, i);
1242 let reference = load_m512i_word(&results, i);
1243 let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1244 let remultiplied = _mm512_gf2p8mul_epi8(result, input);
1245 assert_eq_m512i(remultiplied, reference);
1246 }
1247
1248 // validate subsequent affine operation
1249 let (matrices, vectors, _affine_expected) =
1250 generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1251
1252 for i in 0..NUM_TEST_WORDS_512 {
1253 let vector = load_m512i_word(&vectors, i);
1254 let matrix = load_m512i_word(&matrices, i);
1255
1256 let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1257 let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1258 let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1259 assert_eq_m512i(result, reference);
1260 }
1261
1262 // validate everything by virtue of checking against the AES SBox
1263 const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
1264 let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);
1265
1266 for i in 0..NUM_BYTES_WORDS_512 {
1267 let reference = load_m512i_word(&AES_S_BOX, i);
1268 let input = load_m512i_word(&inputs, i);
1269 let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1270 assert_eq_m512i(result, reference);
1271 }
1272 }
1273
    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let result_zero =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            let result_left =
                _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
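    // The 256-bit and 128-bit variants below mirror the 512-bit tests. One
    // difference worth noting: the AVX2 blends used to build the expected
    // values (_mm256_blend_epi32 and _mm_blend_epi32) take their lane selector
    // as a const immediate, whereas _mm512_mask_blend_epi32 above takes a
    // runtime mask register; hence `const MASK_WORDS` here versus
    // `let mask_words` in the 512-bit tests.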
    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_256 {
            let input = load_m256i_word(&inputs, i);
            let reference = load_m256i_word(&results, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm256_gf2p8mul_epi8(result, input);
            assert_eq_m256i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let vector = load_m256i_word(&vectors, i);
            let matrix = load_m256i_word(&matrices, i);

            let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8_u64 as i64;
        let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_256 {
            let reference = load_m256i_word(&AES_S_BOX, i);
            let input = load_m256i_word(&inputs, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m256i(result, reference);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let result_zero =
                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            let result_left =
                _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_128 {
            let input = load_m128i_word(&inputs, i);
            let reference = load_m128i_word(&results, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm_gf2p8mul_epi8(result, input);
            assert_eq_m128i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let vector = load_m128i_word(&vectors, i);
            let matrix = load_m128i_word(&matrices, i);

            let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8_u64 as i64;
        let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_128 {
            let reference = load_m128i_word(&AES_S_BOX, i);
            let input = load_m128i_word(&inputs, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m128i(result, reference);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let result_zero =
                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            let result_left =
                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
}