//! Galois Field New Instructions (GFNI)
//!
//! The intrinsics here correspond to those in the `immintrin.h` C header.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf

use crate::core_arch::simd::i8x16;
use crate::core_arch::simd::i8x32;
use crate::core_arch::simd::i8x64;
use crate::core_arch::x86::__m128i;
use crate::core_arch::x86::__m256i;
use crate::core_arch::x86::__m512i;
use crate::core_arch::x86::__mmask16;
use crate::core_arch::x86::__mmask32;
use crate::core_arch::x86::__mmask64;
use crate::core_arch::x86::_mm256_setzero_si256;
use crate::core_arch::x86::_mm512_setzero_si512;
use crate::core_arch::x86::_mm_setzero_si128;
use crate::core_arch::x86::m128iExt;
use crate::core_arch::x86::m256iExt;
use crate::core_arch::x86::m512iExt;
use crate::intrinsics::simd::simd_select_bitmask;
use crate::mem::transmute;

#[cfg(test)]
use stdarch_test::assert_instr;
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    #[link_name = "llvm.x86.vgf2p8mulb.512"]
    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8mulb.256"]
    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8mulb.128"]
    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
}

// LLVM requires AVX512BW for a lot of these instructions, see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
// however our tests also require the target feature list to match Intel's,
// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F
// requirement (for now)
// also see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
// for forcing GFNI, BW and optionally VL extension

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8)
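///
/// # Examples
///
/// A minimal sketch, not compiled as a doctest (it assumes a nightly
/// toolchain and a CPU supporting GFNI and AVX-512BW). The bytes `0x53` and
/// `0xca` are multiplicative inverses in this field (the classic AES example
/// pair), so every lane of the product is `0x01`:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm512_set1_epi8(0x53);
///     let b = _mm512_set1_epi8(0xca_u8 as i8);
///     // Every byte of `product` equals 0x01.
///     let product = _mm512_gf2p8mul_epi8(a, b);
/// }
/// ```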
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
    transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8)
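///
/// # Examples
///
/// A minimal sketch of the writemask semantics, not compiled as a doctest;
/// `src`, `a` and `b` are assumed to be `__m512i` values already in scope:
///
/// ```ignore
/// // An all-zero mask leaves every lane untouched: the result equals `src`.
/// let untouched = _mm512_mask_gf2p8mul_epi8(src, 0, a, b);
/// // An all-ones mask computes every lane, matching the unmasked intrinsic.
/// let full = _mm512_mask_gf2p8mul_epi8(src, !0, a, b);
/// ```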
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_mask_gf2p8mul_epi8(
    src: __m512i,
    k: __mmask64,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
        src.as_i8x64(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
    let zero = _mm512_setzero_si512().as_i8x64();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
        zero,
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
    transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_mask_gf2p8mul_epi8(
    src: __m256i,
    k: __mmask32,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
        src.as_i8x32(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
    let zero = _mm256_setzero_si256().as_i8x32();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
        zero,
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(gf2p8mulb))]
pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_mask_gf2p8mul_epi8(
    src: __m128i,
    k: __mmask16,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
        src.as_i8x16(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
    let zero = _mm_setzero_si128().as_i8x16();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
        zero,
    ))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8)
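///
/// # Examples
///
/// A minimal sketch, not compiled as a doctest; `x` is assumed to be a
/// `__m512i` value in scope. The bit matrix `0x01_02_04_08_10_20_40_80` is
/// the identity for this transform (see the tests below), so with `B = 0`
/// every byte passes through unchanged:
///
/// ```ignore
/// let identity = _mm512_set1_epi64(0x01_02_04_08_10_20_40_80);
/// // `unchanged` equals `x`.
/// let unchanged = _mm512_gf2p8affine_epi64_epi8::<0>(x, identity);
/// ```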
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm512_setzero_si512().as_i8x64();
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m512i,
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm256_setzero_si256().as_i8x32();
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m256i,
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(gf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8)
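///
/// # Examples
///
/// A minimal sketch, not compiled as a doctest; `x` is assumed to be a
/// `__m512i` value in scope. With the affine matrix and constant used by
/// AES, this intrinsic computes the AES S-box of every byte (the tests
/// below validate exactly this combination):
///
/// ```ignore
/// const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
/// let matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);
/// // Each byte of `sbox_of_x` is the AES S-box of the corresponding byte of `x`.
/// let sbox_of_x = _mm512_gf2p8affineinv_epi64_epi8::<0x63>(x, matrix);
/// ```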
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm512_setzero_si512().as_i8x64();
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512f")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m512i,
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm256_setzero_si256().as_i8x32();
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m256i,
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "gfni,avx512bw,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_uimm_bits!(B, 8);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}

713#[cfg(test)]
714mod tests {
715 // The constants in the tests below are just bit patterns. They should not
716 // be interpreted as integers; signedness does not make sense for them, but
717 // __mXXXi happens to be defined in terms of signed integers.
718 #![allow(overflowing_literals)]
719
720 use core::hint::black_box;
721 use core::intrinsics::size_of;
722 use stdarch_test::simd_test;
723
724 use crate::core_arch::x86::*;
725
726 fn mulbyte(left: u8, right: u8) -> u8 {
727 // this implementation follows the description in
728 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8
729 const REDUCTION_POLYNOMIAL: u16 = 0x11b;
730 let left: u16 = left.into();
731 let right: u16 = right.into();
732 let mut carryless_product: u16 = 0;
733
734 // Carryless multiplication
735 for i in 0..8 {
736 if ((left >> i) & 0x01) != 0 {
737 carryless_product ^= right << i;
738 }
739 }
740
741 // reduction, adding in "0" where appropriate to clear out high bits
742 // note that REDUCTION_POLYNOMIAL is zero in this context
743 for i in (8..=14).rev() {
744 if ((carryless_product >> i) & 0x01) != 0 {
745 carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
746 }
747 }
748
749 carryless_product as u8
750 }
751
752 const NUM_TEST_WORDS_512: usize = 4;
753 const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
754 const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
755 const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
756 const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
757 const NUM_BYTES: usize = 256;
758 const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
759 const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
760 const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;
761
762 fn parity(input: u8) -> u8 {
763 let mut accumulator = 0;
764 for i in 0..8 {
765 accumulator ^= (input >> i) & 0x01;
766 }
767 accumulator
768 }
769
770 fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
771 // this implementation follows the description in
772 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8
773 let mut accumulator = 0;
774
775 for bit in 0..8 {
776 accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
777 }
778
779 accumulator ^ b
780 }
781
782 fn generate_affine_mul_test_data(
783 immediate: u8,
784 ) -> (
785 [u64; NUM_TEST_WORDS_64],
786 [u8; NUM_TEST_ENTRIES],
787 [u8; NUM_TEST_ENTRIES],
788 ) {
789 let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
790 let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
791 let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
792
793 for i in 0..NUM_TEST_WORDS_64 {
794 left[i] = (i as u64) * 103 * 101;
795 for j in 0..8 {
796 let j64 = j as u64;
797 right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
798 result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
799 }
800 }
801
802 (left, right, result)
803 }
804
805 fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
806 let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
807 let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];
808
809 for i in 0..NUM_BYTES {
810 input[i] = (i % 256) as u8;
811 result[i] = if i == 0 { 0 } else { 1 };
812 }
813
814 (input, result)
815 }
816
817 const AES_S_BOX: [u8; NUM_BYTES] = [
818 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
819 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
820 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
821 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
822 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
823 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
824 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
825 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
826 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
827 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
828 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
829 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
830 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
831 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
832 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
833 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
834 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
835 0x16,
836 ];
837
838 fn generate_byte_mul_test_data() -> (
839 [u8; NUM_TEST_ENTRIES],
840 [u8; NUM_TEST_ENTRIES],
841 [u8; NUM_TEST_ENTRIES],
842 ) {
843 let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
844 let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
845 let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
846
847 for i in 0..NUM_TEST_ENTRIES {
848 left[i] = (i % 256) as u8;
849 right[i] = left[i].wrapping_mul(101);
850 result[i] = mulbyte(left[i], right[i]);
851 }
852
853 (left, right, result)
854 }
855
856 #[target_feature(enable = "sse2")]
857 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
858 unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
859 let byte_offset = word_index * 16 / size_of::<T>();
860 let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
861 _mm_loadu_si128(black_box(pointer))
862 }
863
864 #[target_feature(enable = "avx")]
865 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
866 unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
867 let byte_offset = word_index * 32 / size_of::<T>();
868 let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
869 _mm256_loadu_si256(black_box(pointer))
870 }
871
872 #[target_feature(enable = "avx512f")]
873 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
874 unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
875 let byte_offset = word_index * 64 / size_of::<T>();
876 let pointer = data.as_ptr().add(byte_offset) as *const i32;
877 _mm512_loadu_si512(black_box(pointer))
878 }
879
880 #[simd_test(enable = "gfni,avx512bw")]
881 unsafe fn test_mm512_gf2p8mul_epi8() {
882 let (left, right, expected) = generate_byte_mul_test_data();
883
884 for i in 0..NUM_TEST_WORDS_512 {
885 let left = load_m512i_word(&left, i);
886 let right = load_m512i_word(&right, i);
887 let expected = load_m512i_word(&expected, i);
888 let result = _mm512_gf2p8mul_epi8(left, right);
889 assert_eq_m512i(result, expected);
890 }
891 }
892
893 #[simd_test(enable = "gfni,avx512bw")]
894 unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
895 let (left, right, _expected) = generate_byte_mul_test_data();
896
897 for i in 0..NUM_TEST_WORDS_512 {
898 let left = load_m512i_word(&left, i);
899 let right = load_m512i_word(&right, i);
900 let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
901 assert_eq_m512i(result_zero, _mm512_setzero_si512());
902 let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
903 let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
904 let expected_result = _mm512_gf2p8mul_epi8(left, right);
905 let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
906 let expected_masked =
907 _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
908 assert_eq_m512i(result_masked, expected_masked);
909 }
910 }
911
912 #[simd_test(enable = "gfni,avx512bw")]
913 unsafe fn test_mm512_mask_gf2p8mul_epi8() {
914 let (left, right, _expected) = generate_byte_mul_test_data();
915
916 for i in 0..NUM_TEST_WORDS_512 {
917 let left = load_m512i_word(&left, i);
918 let right = load_m512i_word(&right, i);
919 let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
920 assert_eq_m512i(result_left, left);
921 let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
922 let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
923 let expected_result = _mm512_gf2p8mul_epi8(left, right);
924 let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
925 let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
926 assert_eq_m512i(result_masked, expected_masked);
927 }
928 }
929
930 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
931 unsafe fn test_mm256_gf2p8mul_epi8() {
932 let (left, right, expected) = generate_byte_mul_test_data();
933
934 for i in 0..NUM_TEST_WORDS_256 {
935 let left = load_m256i_word(&left, i);
936 let right = load_m256i_word(&right, i);
937 let expected = load_m256i_word(&expected, i);
938 let result = _mm256_gf2p8mul_epi8(left, right);
939 assert_eq_m256i(result, expected);
940 }
941 }
942
943 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
944 unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
945 let (left, right, _expected) = generate_byte_mul_test_data();
946
947 for i in 0..NUM_TEST_WORDS_256 {
948 let left = load_m256i_word(&left, i);
949 let right = load_m256i_word(&right, i);
950 let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
951 assert_eq_m256i(result_zero, _mm256_setzero_si256());
952 let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
953 const MASK_WORDS: i32 = 0b01_10_11_00;
954 let expected_result = _mm256_gf2p8mul_epi8(left, right);
955 let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
956 let expected_masked =
957 _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
958 assert_eq_m256i(result_masked, expected_masked);
959 }
960 }
961
962 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
963 unsafe fn test_mm256_mask_gf2p8mul_epi8() {
964 let (left, right, _expected) = generate_byte_mul_test_data();
965
966 for i in 0..NUM_TEST_WORDS_256 {
967 let left = load_m256i_word(&left, i);
968 let right = load_m256i_word(&right, i);
969 let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
970 assert_eq_m256i(result_left, left);
971 let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
972 const MASK_WORDS: i32 = 0b01_10_11_00;
973 let expected_result = _mm256_gf2p8mul_epi8(left, right);
974 let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
975 let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
976 assert_eq_m256i(result_masked, expected_masked);
977 }
978 }
979
980 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
981 unsafe fn test_mm_gf2p8mul_epi8() {
982 let (left, right, expected) = generate_byte_mul_test_data();
983
984 for i in 0..NUM_TEST_WORDS_128 {
985 let left = load_m128i_word(&left, i);
986 let right = load_m128i_word(&right, i);
987 let expected = load_m128i_word(&expected, i);
988 let result = _mm_gf2p8mul_epi8(left, right);
989 assert_eq_m128i(result, expected);
990 }
991 }
992
993 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
994 unsafe fn test_mm_maskz_gf2p8mul_epi8() {
995 let (left, right, _expected) = generate_byte_mul_test_data();
996
997 for i in 0..NUM_TEST_WORDS_128 {
998 let left = load_m128i_word(&left, i);
999 let right = load_m128i_word(&right, i);
1000 let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
1001 assert_eq_m128i(result_zero, _mm_setzero_si128());
1002 let mask_bytes: __mmask16 = 0x0F_F0;
1003 const MASK_WORDS: i32 = 0b01_10;
1004 let expected_result = _mm_gf2p8mul_epi8(left, right);
1005 let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
1006 let expected_masked =
1007 _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1008 assert_eq_m128i(result_masked, expected_masked);
1009 }
1010 }
1011
1012 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1013 unsafe fn test_mm_mask_gf2p8mul_epi8() {
1014 let (left, right, _expected) = generate_byte_mul_test_data();
1015
1016 for i in 0..NUM_TEST_WORDS_128 {
1017 let left = load_m128i_word(&left, i);
1018 let right = load_m128i_word(&right, i);
1019 let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
1020 assert_eq_m128i(result_left, left);
1021 let mask_bytes: __mmask16 = 0x0F_F0;
1022 const MASK_WORDS: i32 = 0b01_10;
1023 let expected_result = _mm_gf2p8mul_epi8(left, right);
1024 let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
1025 let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1026 assert_eq_m128i(result_masked, expected_masked);
1027 }
1028 }
1029
1030 #[simd_test(enable = "gfni,avx512bw")]
1031 unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
1032 let identity: i64 = 0x01_02_04_08_10_20_40_80;
1033 const IDENTITY_BYTE: i32 = 0;
1034 let constant: i64 = 0;
1035 const CONSTANT_BYTE: i32 = 0x63;
1036 let identity = _mm512_set1_epi64(identity);
1037 let constant = _mm512_set1_epi64(constant);
1038 let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);
1039
1040 let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1041 let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1042
1043 for i in 0..NUM_TEST_WORDS_512 {
1044 let data = load_m512i_word(&bytes, i);
1045 let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1046 assert_eq_m512i(result, data);
1047 let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1048 assert_eq_m512i(result, constant_reference);
1049 let data = load_m512i_word(&more_bytes, i);
1050 let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1051 assert_eq_m512i(result, data);
1052 let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1053 assert_eq_m512i(result, constant_reference);
1054
1055 let matrix = load_m512i_word(&matrices, i);
1056 let vector = load_m512i_word(&vectors, i);
1057 let reference = load_m512i_word(&references, i);
1058
1059 let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1060 assert_eq_m512i(result, reference);
1061 }
1062 }
1063
1064 #[simd_test(enable = "gfni,avx512bw")]
1065 unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
1066 const CONSTANT_BYTE: i32 = 0x63;
1067 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1068
1069 for i in 0..NUM_TEST_WORDS_512 {
1070 let matrix = load_m512i_word(&matrices, i);
1071 let vector = load_m512i_word(&vectors, i);
1072 let result_zero =
1073 _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1074 assert_eq_m512i(result_zero, _mm512_setzero_si512());
1075 let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1076 let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1077 let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1078 let result_masked =
1079 _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1080 let expected_masked =
1081 _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1082 assert_eq_m512i(result_masked, expected_masked);
1083 }
1084 }
1085
1086 #[simd_test(enable = "gfni,avx512bw")]
1087 unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
1088 const CONSTANT_BYTE: i32 = 0x63;
1089 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1090
1091 for i in 0..NUM_TEST_WORDS_512 {
1092 let left = load_m512i_word(&vectors, i);
1093 let right = load_m512i_word(&matrices, i);
1094 let result_left =
1095 _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1096 assert_eq_m512i(result_left, left);
1097 let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1098 let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
1099 let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1100 let result_masked =
1101 _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1102 let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1103 assert_eq_m512i(result_masked, expected_masked);
1104 }
1105 }
1106
1107 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1108 unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
1109 let identity: i64 = 0x01_02_04_08_10_20_40_80;
1110 const IDENTITY_BYTE: i32 = 0;
1111 let constant: i64 = 0;
1112 const CONSTANT_BYTE: i32 = 0x63;
1113 let identity = _mm256_set1_epi64x(identity);
1114 let constant = _mm256_set1_epi64x(constant);
1115 let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);
1116
1117 let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1118 let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1119
1120 for i in 0..NUM_TEST_WORDS_256 {
1121 let data = load_m256i_word(&bytes, i);
1122 let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1123 assert_eq_m256i(result, data);
1124 let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1125 assert_eq_m256i(result, constant_reference);
1126 let data = load_m256i_word(&more_bytes, i);
1127 let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1128 assert_eq_m256i(result, data);
1129 let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1130 assert_eq_m256i(result, constant_reference);
1131
1132 let matrix = load_m256i_word(&matrices, i);
1133 let vector = load_m256i_word(&vectors, i);
1134 let reference = load_m256i_word(&references, i);
1135
1136 let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1137 assert_eq_m256i(result, reference);
1138 }
1139 }
1140
1141 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1142 unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
1143 const CONSTANT_BYTE: i32 = 0x63;
1144 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1145
1146 for i in 0..NUM_TEST_WORDS_256 {
1147 let matrix = load_m256i_word(&matrices, i);
1148 let vector = load_m256i_word(&vectors, i);
1149 let result_zero =
1150 _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1151 assert_eq_m256i(result_zero, _mm256_setzero_si256());
1152 let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1153 const MASK_WORDS: i32 = 0b11_01_10_00;
1154 let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1155 let result_masked =
1156 _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1157 let expected_masked =
1158 _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1159 assert_eq_m256i(result_masked, expected_masked);
1160 }
1161 }
1162
1163 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1164 unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
1165 const CONSTANT_BYTE: i32 = 0x63;
1166 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1167
1168 for i in 0..NUM_TEST_WORDS_256 {
1169 let left = load_m256i_word(&vectors, i);
1170 let right = load_m256i_word(&matrices, i);
1171 let result_left =
1172 _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1173 assert_eq_m256i(result_left, left);
1174 let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1175 const MASK_WORDS: i32 = 0b11_01_10_00;
1176 let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1177 let result_masked =
1178 _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1179 let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1180 assert_eq_m256i(result_masked, expected_masked);
1181 }
1182 }
1183
1184 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1185 unsafe fn test_mm_gf2p8affine_epi64_epi8() {
1186 let identity: i64 = 0x01_02_04_08_10_20_40_80;
1187 const IDENTITY_BYTE: i32 = 0;
1188 let constant: i64 = 0;
1189 const CONSTANT_BYTE: i32 = 0x63;
1190 let identity = _mm_set1_epi64x(identity);
1191 let constant = _mm_set1_epi64x(constant);
1192 let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);
1193
1194 let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1195 let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1196
1197 for i in 0..NUM_TEST_WORDS_128 {
1198 let data = load_m128i_word(&bytes, i);
1199 let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1200 assert_eq_m128i(result, data);
1201 let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1202 assert_eq_m128i(result, constant_reference);
1203 let data = load_m128i_word(&more_bytes, i);
1204 let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1205 assert_eq_m128i(result, data);
1206 let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1207 assert_eq_m128i(result, constant_reference);
1208
1209 let matrix = load_m128i_word(&matrices, i);
1210 let vector = load_m128i_word(&vectors, i);
1211 let reference = load_m128i_word(&references, i);
1212
1213 let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1214 assert_eq_m128i(result, reference);
1215 }
1216 }
1217
1218 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1219 unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
1220 const CONSTANT_BYTE: i32 = 0x63;
1221 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1222
1223 for i in 0..NUM_TEST_WORDS_128 {
1224 let matrix = load_m128i_word(&matrices, i);
1225 let vector = load_m128i_word(&vectors, i);
1226 let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1227 assert_eq_m128i(result_zero, _mm_setzero_si128());
1228 let mask_bytes: __mmask16 = 0x0F_F0;
1229 const MASK_WORDS: i32 = 0b01_10;
1230 let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1231 let result_masked =
1232 _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1233 let expected_masked =
1234 _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1235 assert_eq_m128i(result_masked, expected_masked);
1236 }
1237 }
1238
1239 #[simd_test(enable = "gfni,avx512bw,avx512vl")]
1240 unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
1241 const CONSTANT_BYTE: i32 = 0x63;
1242 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1243
1244 for i in 0..NUM_TEST_WORDS_128 {
1245 let left = load_m128i_word(&vectors, i);
1246 let right = load_m128i_word(&matrices, i);
1247 let result_left =
1248 _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
1249 assert_eq_m128i(result_left, left);
1250 let mask_bytes: __mmask16 = 0x0F_F0;
1251 const MASK_WORDS: i32 = 0b01_10;
1252 let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1253 let result_masked =
1254 _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1255 let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1256 assert_eq_m128i(result_masked, expected_masked);
1257 }
1258 }
1259
1260 #[simd_test(enable = "gfni,avx512bw")]
1261 unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
1262 let identity: i64 = 0x01_02_04_08_10_20_40_80;
1263 const IDENTITY_BYTE: i32 = 0;
1264 const CONSTANT_BYTE: i32 = 0x63;
1265 let identity = _mm512_set1_epi64(identity);
1266
1267 // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_512 {
            let input = load_m512i_word(&inputs, i);
            let reference = load_m512i_word(&results, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm512_gf2p8mul_epi8(result, input);
            assert_eq_m512i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let vector = load_m512i_word(&vectors, i);
            let matrix = load_m512i_word(&matrices, i);

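            // affineinv(x, A, b) should equal affine(inv(x), A, b): inverting via the
            // identity matrix and then applying the affine transform separately
            // must reproduce the fused result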
            let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }

        // validate the whole operation by checking it against the AES S-box
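        // the AES S-box is affine(inv(x)) with constant byte 0x63; this matrix
        // encodes the AES affine transform in the instruction's bit order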
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_512 {
            let reference = load_m512i_word(&AES_S_BOX, i);
            let input = load_m512i_word(&inputs, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m512i(result, reference);
        }
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let result_zero =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
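            // four byte-mask bits per 32-bit blend lane: `mask_words` is the
            // nibble-wise summary of `mask_bytes`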
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            let result_left =
                _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_256 {
            let input = load_m256i_word(&inputs, i);
            let reference = load_m256i_word(&results, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm256_gf2p8mul_epi8(result, input);
            assert_eq_m256i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let vector = load_m256i_word(&vectors, i);
            let matrix = load_m256i_word(&matrices, i);

            let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }

        // validate the whole operation by checking it against the AES S-box
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_256 {
            let reference = load_m256i_word(&AES_S_BOX, i);
            let input = load_m256i_word(&inputs, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m256i(result, reference);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let result_zero =
                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
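            // four byte-mask bits per 32-bit blend lane: 0xFF_0F_F0_00 selects
            // lanes 3, 4, 6 and 7, i.e. the dword blend 0b11_01_10_00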
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            let result_left =
                _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_128 {
            let input = load_m128i_word(&inputs, i);
            let reference = load_m128i_word(&results, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm_gf2p8mul_epi8(result, input);
            assert_eq_m128i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let vector = load_m128i_word(&vectors, i);
            let matrix = load_m128i_word(&matrices, i);

            let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }

        // validate the whole operation by checking it against the AES S-box
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_128 {
            let reference = load_m128i_word(&AES_S_BOX, i);
            let input = load_m128i_word(&inputs, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m128i(result, reference);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let result_zero =
                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            let result_left =
                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
}