gfni.rs source code [crates/core_arch/src/x86/gfni.rs]

1	//! Galois Field New Instructions (GFNI)
2	//!
3	//! The intrinsics here correspond to those in the `immintrin.h` C header.
4	//!
5	//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
6	//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
7	//!
8	//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
9
10	use crate::core_arch::simd::i8x16;
11	use crate::core_arch::simd::i8x32;
12	use crate::core_arch::simd::i8x64;
13	use crate::core_arch::x86::__m128i;
14	use crate::core_arch::x86::__m256i;
15	use crate::core_arch::x86::__m512i;
16	use crate::core_arch::x86::__mmask16;
17	use crate::core_arch::x86::__mmask32;
18	use crate::core_arch::x86::__mmask64;
19	use crate::intrinsics::simd::simd_select_bitmask;
20	use crate::mem::transmute;
21
22	#[cfg(test)]
23	use stdarch_test::assert_instr;
24
25	#[allow(improper_ctypes)]
26	unsafe extern "C" {
27	#[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
28	unsafefn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
29	#[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
30	unsafefn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
31	#[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
32	unsafefn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
33	#[link_name = "llvm.x86.vgf2p8affineqb.512"]
34	unsafefn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
35	#[link_name = "llvm.x86.vgf2p8affineqb.256"]
36	unsafefn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
37	#[link_name = "llvm.x86.vgf2p8affineqb.128"]
38	unsafefn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
39	#[link_name = "llvm.x86.vgf2p8mulb.512"]
40	unsafefn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
41	#[link_name = "llvm.x86.vgf2p8mulb.256"]
42	unsafefn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
43	#[link_name = "llvm.x86.vgf2p8mulb.128"]
44	unsafefn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
45	}
46
47	// LLVM requires AVX512BW for a lot of these instructions, see
48	// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
49	// however our tests also require the target feature list to match Intel's
50	// which doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F*
51	// requirement (for now)
52	// also see
53	// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
54	// for forcing GFNI, BW and optionally VL extension
55
56	/// Performs a multiplication in GF(2^8) on the packed bytes.
57	/// The field is in polynomial representation with the reduction polynomial
58	/// x^8 + x^4 + x^3 + x + 1.
59	///
60	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8)
61	#[inline]
62	#[target_feature(enable = "gfni,avx512f")]
63	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
64	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
65	pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
66	transmute(src:vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
67	}
68
69	/// Performs a multiplication in GF(2^8) on the packed bytes.
70	/// The field is in polynomial representation with the reduction polynomial
71	/// x^8 + x^4 + x^3 + x + 1.
72	///
73	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
74	/// Otherwise the computation result is written into the result.
75	///
76	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8)
77	#[inline]
78	#[target_feature(enable = "gfni,avx512bw,avx512f")]
79	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
80	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
81	pub unsafe fn _mm512_mask_gf2p8mul_epi8(
82	src: __m512i,
83	k: __mmask64,
84	a: __m512i,
85	b: __m512i,
86	) -> __m512i {
87	transmute(src:simd_select_bitmask(
88	m:k,
89	yes:vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
90	no:src.as_i8x64(),
91	))
92	}
93
94	/// Performs a multiplication in GF(2^8) on the packed bytes.
95	/// The field is in polynomial representation with the reduction polynomial
96	/// x^8 + x^4 + x^3 + x + 1.
97	///
98	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
99	/// Otherwise the computation result is written into the result.
100	///
101	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8)
102	#[inline]
103	#[target_feature(enable = "gfni,avx512bw,avx512f")]
104	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
105	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
106	pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
107	let zero: i8x64 = i8x64::ZERO;
108	transmute(src:simd_select_bitmask(
109	m:k,
110	yes:vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
111	no:zero,
112	))
113	}
114
115	/// Performs a multiplication in GF(2^8) on the packed bytes.
116	/// The field is in polynomial representation with the reduction polynomial
117	/// x^8 + x^4 + x^3 + x + 1.
118	///
119	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8)
120	#[inline]
121	#[target_feature(enable = "gfni,avx")]
122	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
123	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
124	pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
125	transmute(src:vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
126	}
127
128	/// Performs a multiplication in GF(2^8) on the packed bytes.
129	/// The field is in polynomial representation with the reduction polynomial
130	/// x^8 + x^4 + x^3 + x + 1.
131	///
132	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
133	/// Otherwise the computation result is written into the result.
134	///
135	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8)
136	#[inline]
137	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
138	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
139	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
140	pub unsafe fn _mm256_mask_gf2p8mul_epi8(
141	src: __m256i,
142	k: __mmask32,
143	a: __m256i,
144	b: __m256i,
145	) -> __m256i {
146	transmute(src:simd_select_bitmask(
147	m:k,
148	yes:vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
149	no:src.as_i8x32(),
150	))
151	}
152
153	/// Performs a multiplication in GF(2^8) on the packed bytes.
154	/// The field is in polynomial representation with the reduction polynomial
155	/// x^8 + x^4 + x^3 + x + 1.
156	///
157	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
158	/// Otherwise the computation result is written into the result.
159	///
160	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8)
161	#[inline]
162	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
163	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
164	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
165	pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
166	let zero: i8x32 = i8x32::ZERO;
167	transmute(src:simd_select_bitmask(
168	m:k,
169	yes:vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
170	no:zero,
171	))
172	}
173
174	/// Performs a multiplication in GF(2^8) on the packed bytes.
175	/// The field is in polynomial representation with the reduction polynomial
176	/// x^8 + x^4 + x^3 + x + 1.
177	///
178	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8)
179	#[inline]
180	#[target_feature(enable = "gfni")]
181	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
182	#[cfg_attr(test, assert_instr(gf2p8mulb))]
183	pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
184	transmute(src:vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
185	}
186
187	/// Performs a multiplication in GF(2^8) on the packed bytes.
188	/// The field is in polynomial representation with the reduction polynomial
189	/// x^8 + x^4 + x^3 + x + 1.
190	///
191	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
192	/// Otherwise the computation result is written into the result.
193	///
194	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8)
195	#[inline]
196	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
197	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
198	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
199	pub unsafe fn _mm_mask_gf2p8mul_epi8(
200	src: __m128i,
201	k: __mmask16,
202	a: __m128i,
203	b: __m128i,
204	) -> __m128i {
205	transmute(src:simd_select_bitmask(
206	m:k,
207	yes:vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
208	no:src.as_i8x16(),
209	))
210	}
211
212	/// Performs a multiplication in GF(2^8) on the packed bytes.
213	/// The field is in polynomial representation with the reduction polynomial
214	/// x^8 + x^4 + x^3 + x + 1.
215	///
216	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
217	/// Otherwise the computation result is written into the result.
218	///
219	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8)
220	#[inline]
221	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
222	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
223	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
224	pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
225	let zero: i8x16 = i8x16::ZERO;
226	transmute(src:simd_select_bitmask(
227	m:k,
228	yes:vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
229	no:zero,
230	))
231	}
232
233	/// Performs an affine transformation on the packed bytes in x.
234	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
235	/// and b being a constant 8-bit immediate value.
236	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
237	///
238	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi64_epi8)
239	#[inline]
240	#[target_feature(enable = "gfni,avx512f")]
241	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
242	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
243	#[rustc_legacy_const_generics(`2`)]
244	pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
245	static_assert_uimm_bits!(B, `8`);
246	let b: u8 = B as u8;
247	let x: i8x64 = x.as_i8x64();
248	let a: i8x64 = a.as_i8x64();
249	let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b);
250	transmute(src:r)
251	}
252
253	/// Performs an affine transformation on the packed bytes in x.
254	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
255	/// and b being a constant 8-bit immediate value.
256	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
257	///
258	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
259	/// Otherwise the computation result is written into the result.
260	///
261	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi64_epi8)
262	#[inline]
263	#[target_feature(enable = "gfni,avx512bw,avx512f")]
264	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
265	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
266	#[rustc_legacy_const_generics(`3`)]
267	pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
268	k: __mmask64,
269	x: __m512i,
270	a: __m512i,
271	) -> __m512i {
272	static_assert_uimm_bits!(B, `8`);
273	let b: u8 = B as u8;
274	let zero: i8x64 = i8x64::ZERO;
275	let x: i8x64 = x.as_i8x64();
276	let a: i8x64 = a.as_i8x64();
277	let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b);
278	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
279	}
280
281	/// Performs an affine transformation on the packed bytes in x.
282	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
283	/// and b being a constant 8-bit immediate value.
284	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
285	///
286	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
287	/// Otherwise the computation result is written into the result.
288	///
289	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi64_epi8)
290	#[inline]
291	#[target_feature(enable = "gfni,avx512bw,avx512f")]
292	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
293	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
294	#[rustc_legacy_const_generics(`4`)]
295	pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
296	src: __m512i,
297	k: __mmask64,
298	x: __m512i,
299	a: __m512i,
300	) -> __m512i {
301	static_assert_uimm_bits!(B, `8`);
302	let b: u8 = B as u8;
303	let x: i8x64 = x.as_i8x64();
304	let a: i8x64 = a.as_i8x64();
305	let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b);
306	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x64()))
307	}
308
309	/// Performs an affine transformation on the packed bytes in x.
310	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
311	/// and b being a constant 8-bit immediate value.
312	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
313	///
314	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi64_epi8)
315	#[inline]
316	#[target_feature(enable = "gfni,avx")]
317	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
318	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
319	#[rustc_legacy_const_generics(`2`)]
320	pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
321	static_assert_uimm_bits!(B, `8`);
322	let b: u8 = B as u8;
323	let x: i8x32 = x.as_i8x32();
324	let a: i8x32 = a.as_i8x32();
325	let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b);
326	transmute(src:r)
327	}
328
329	/// Performs an affine transformation on the packed bytes in x.
330	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
331	/// and b being a constant 8-bit immediate value.
332	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
333	///
334	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
335	/// Otherwise the computation result is written into the result.
336	///
337	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi64_epi8)
338	#[inline]
339	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
340	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
341	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
342	#[rustc_legacy_const_generics(`3`)]
343	pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
344	k: __mmask32,
345	x: __m256i,
346	a: __m256i,
347	) -> __m256i {
348	static_assert_uimm_bits!(B, `8`);
349	let b: u8 = B as u8;
350	let zero: i8x32 = i8x32::ZERO;
351	let x: i8x32 = x.as_i8x32();
352	let a: i8x32 = a.as_i8x32();
353	let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b);
354	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
355	}
356
357	/// Performs an affine transformation on the packed bytes in x.
358	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
359	/// and b being a constant 8-bit immediate value.
360	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
361	///
362	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
363	/// Otherwise the computation result is written into the result.
364	///
365	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi64_epi8)
366	#[inline]
367	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
368	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
369	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
370	#[rustc_legacy_const_generics(`4`)]
371	pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
372	src: __m256i,
373	k: __mmask32,
374	x: __m256i,
375	a: __m256i,
376	) -> __m256i {
377	static_assert_uimm_bits!(B, `8`);
378	let b: u8 = B as u8;
379	let x: i8x32 = x.as_i8x32();
380	let a: i8x32 = a.as_i8x32();
381	let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b);
382	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x32()))
383	}
384
385	/// Performs an affine transformation on the packed bytes in x.
386	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
387	/// and b being a constant 8-bit immediate value.
388	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
389	///
390	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8)
391	#[inline]
392	#[target_feature(enable = "gfni")]
393	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
394	#[cfg_attr(test, assert_instr(gf2p8affineqb, B = `0`))]
395	#[rustc_legacy_const_generics(`2`)]
396	pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
397	static_assert_uimm_bits!(B, `8`);
398	let b: u8 = B as u8;
399	let x: i8x16 = x.as_i8x16();
400	let a: i8x16 = a.as_i8x16();
401	let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b);
402	transmute(src:r)
403	}
404
405	/// Performs an affine transformation on the packed bytes in x.
406	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
407	/// and b being a constant 8-bit immediate value.
408	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
409	///
410	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
411	/// Otherwise the computation result is written into the result.
412	///
413	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi64_epi8)
414	#[inline]
415	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
416	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
417	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
418	#[rustc_legacy_const_generics(`3`)]
419	pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
420	k: __mmask16,
421	x: __m128i,
422	a: __m128i,
423	) -> __m128i {
424	static_assert_uimm_bits!(B, `8`);
425	let b: u8 = B as u8;
426	let zero: i8x16 = i8x16::ZERO;
427	let x: i8x16 = x.as_i8x16();
428	let a: i8x16 = a.as_i8x16();
429	let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b);
430	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
431	}
432
433	/// Performs an affine transformation on the packed bytes in x.
434	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
435	/// and b being a constant 8-bit immediate value.
436	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
437	///
438	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
439	/// Otherwise the computation result is written into the result.
440	///
441	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi64_epi8)
442	#[inline]
443	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
444	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
445	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
446	#[rustc_legacy_const_generics(`4`)]
447	pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
448	src: __m128i,
449	k: __mmask16,
450	x: __m128i,
451	a: __m128i,
452	) -> __m128i {
453	static_assert_uimm_bits!(B, `8`);
454	let b: u8 = B as u8;
455	let x: i8x16 = x.as_i8x16();
456	let a: i8x16 = a.as_i8x16();
457	let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b);
458	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x16()))
459	}
460
461	/// Performs an affine transformation on the inverted packed bytes in x.
462	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
463	/// and b being a constant 8-bit immediate value.
464	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
465	/// The inverse of 0 is 0.
466	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
467	///
468	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8)
469	#[inline]
470	#[target_feature(enable = "gfni,avx512f")]
471	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
472	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
473	#[rustc_legacy_const_generics(`2`)]
474	pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
475	static_assert_uimm_bits!(B, `8`);
476	let b: u8 = B as u8;
477	let x: i8x64 = x.as_i8x64();
478	let a: i8x64 = a.as_i8x64();
479	let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b);
480	transmute(src:r)
481	}
482
483	/// Performs an affine transformation on the inverted packed bytes in x.
484	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
485	/// and b being a constant 8-bit immediate value.
486	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
487	/// The inverse of 0 is 0.
488	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
489	///
490	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
491	/// Otherwise the computation result is written into the result.
492	///
493	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
494	#[inline]
495	#[target_feature(enable = "gfni,avx512bw,avx512f")]
496	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
497	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
498	#[rustc_legacy_const_generics(`3`)]
499	pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
500	k: __mmask64,
501	x: __m512i,
502	a: __m512i,
503	) -> __m512i {
504	static_assert_uimm_bits!(B, `8`);
505	let b: u8 = B as u8;
506	let zero: i8x64 = i8x64::ZERO;
507	let x: i8x64 = x.as_i8x64();
508	let a: i8x64 = a.as_i8x64();
509	let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b);
510	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
511	}
512
513	/// Performs an affine transformation on the inverted packed bytes in x.
514	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
515	/// and b being a constant 8-bit immediate value.
516	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
517	/// The inverse of 0 is 0.
518	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
519	///
520	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
521	/// Otherwise the computation result is written into the result.
522	///
523	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
524	#[inline]
525	#[target_feature(enable = "gfni,avx512bw,avx512f")]
526	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
527	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
528	#[rustc_legacy_const_generics(`4`)]
529	pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
530	src: __m512i,
531	k: __mmask64,
532	x: __m512i,
533	a: __m512i,
534	) -> __m512i {
535	static_assert_uimm_bits!(B, `8`);
536	let b: u8 = B as u8;
537	let x: i8x64 = x.as_i8x64();
538	let a: i8x64 = a.as_i8x64();
539	let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b);
540	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x64()))
541	}
542
543	/// Performs an affine transformation on the inverted packed bytes in x.
544	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
545	/// and b being a constant 8-bit immediate value.
546	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
547	/// The inverse of 0 is 0.
548	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
549	///
550	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8)
551	#[inline]
552	#[target_feature(enable = "gfni,avx")]
553	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
554	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
555	#[rustc_legacy_const_generics(`2`)]
556	pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
557	static_assert_uimm_bits!(B, `8`);
558	let b: u8 = B as u8;
559	let x: i8x32 = x.as_i8x32();
560	let a: i8x32 = a.as_i8x32();
561	let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b);
562	transmute(src:r)
563	}
564
565	/// Performs an affine transformation on the inverted packed bytes in x.
566	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
567	/// and b being a constant 8-bit immediate value.
568	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
569	/// The inverse of 0 is 0.
570	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
571	///
572	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
573	/// Otherwise the computation result is written into the result.
574	///
575	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
576	#[inline]
577	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
578	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
579	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
580	#[rustc_legacy_const_generics(`3`)]
581	pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
582	k: __mmask32,
583	x: __m256i,
584	a: __m256i,
585	) -> __m256i {
586	static_assert_uimm_bits!(B, `8`);
587	let b: u8 = B as u8;
588	let zero: i8x32 = i8x32::ZERO;
589	let x: i8x32 = x.as_i8x32();
590	let a: i8x32 = a.as_i8x32();
591	let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b);
592	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
593	}
594
595	/// Performs an affine transformation on the inverted packed bytes in x.
596	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
597	/// and b being a constant 8-bit immediate value.
598	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
599	/// The inverse of 0 is 0.
600	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
601	///
602	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
603	/// Otherwise the computation result is written into the result.
604	///
605	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
606	#[inline]
607	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
608	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
609	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
610	#[rustc_legacy_const_generics(`4`)]
611	pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
612	src: __m256i,
613	k: __mmask32,
614	x: __m256i,
615	a: __m256i,
616	) -> __m256i {
617	static_assert_uimm_bits!(B, `8`);
618	let b: u8 = B as u8;
619	let x: i8x32 = x.as_i8x32();
620	let a: i8x32 = a.as_i8x32();
621	let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b);
622	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x32()))
623	}
624
625	/// Performs an affine transformation on the inverted packed bytes in x.
626	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
627	/// and b being a constant 8-bit immediate value.
628	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
629	/// The inverse of 0 is 0.
630	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
631	///
632	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8)
633	#[inline]
634	#[target_feature(enable = "gfni")]
635	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
636	#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = `0`))]
637	#[rustc_legacy_const_generics(`2`)]
638	pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
639	static_assert_uimm_bits!(B, `8`);
640	let b: u8 = B as u8;
641	let x: i8x16 = x.as_i8x16();
642	let a: i8x16 = a.as_i8x16();
643	let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b);
644	transmute(src:r)
645	}
646
647	/// Performs an affine transformation on the inverted packed bytes in x.
648	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
649	/// and b being a constant 8-bit immediate value.
650	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
651	/// The inverse of 0 is 0.
652	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
653	///
654	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
655	/// Otherwise the computation result is written into the result.
656	///
657	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
658	#[inline]
659	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
660	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
661	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
662	#[rustc_legacy_const_generics(`3`)]
663	pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
664	k: __mmask16,
665	x: __m128i,
666	a: __m128i,
667	) -> __m128i {
668	static_assert_uimm_bits!(B, `8`);
669	let b: u8 = B as u8;
670	let zero: i8x16 = i8x16::ZERO;
671	let x: i8x16 = x.as_i8x16();
672	let a: i8x16 = a.as_i8x16();
673	let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b);
674	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
675	}
676
677	/// Performs an affine transformation on the inverted packed bytes in x.
678	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
679	/// and b being a constant 8-bit immediate value.
680	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
681	/// The inverse of 0 is 0.
682	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
683	///
684	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
685	/// Otherwise the computation result is written into the result.
686	///
687	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8)
688	#[inline]
689	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
690	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
691	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
692	#[rustc_legacy_const_generics(`4`)]
693	pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
694	src: __m128i,
695	k: __mmask16,
696	x: __m128i,
697	a: __m128i,
698	) -> __m128i {
699	static_assert_uimm_bits!(B, `8`);
700	let b: u8 = B as u8;
701	let x: i8x16 = x.as_i8x16();
702	let a: i8x16 = a.as_i8x16();
703	let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b);
704	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x16()))
705	}
706
707	#[cfg(test)]
708	mod tests {
709	// The constants in the tests below are just bit patterns. They should not
710	// be interpreted as integers; signedness does not make sense for them, but
711	// __mXXXi happens to be defined in terms of signed integers.
712	#![allow(overflowing_literals)]
713
714	use core::hint::black_box;
715	use core::intrinsics::size_of;
716	use stdarch_test::simd_test;
717
718	use crate::core_arch::x86::*;
719
720	fn mulbyte(left: u8, right: u8) -> u8 {
721	// this implementation follows the description in
722	// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8
723	const REDUCTION_POLYNOMIAL: u16 = `0x11b`;
724	let left: u16 = left.into();
725	let right: u16 = right.into();
726	let mut carryless_product: u16 = `0`;
727
728	// Carryless multiplication
729	for i in `0`..`8` {
730	if ((left >> i) & `0x01`) != `0` {
731	carryless_product ^= right << i;
732	}
733	}
734
735	// reduction, adding in "0" where appropriate to clear out high bits
736	// note that REDUCTION_POLYNOMIAL is zero in this context
737	for i in (`8`..=`14`).rev() {
738	if ((carryless_product >> i) & `0x01`) != `0` {
739	carryless_product ^= REDUCTION_POLYNOMIAL << (i - `8`);
740	}
741	}
742
743	carryless_product as u8
744	}
745
746	const NUM_TEST_WORDS_512: usize = `4`;
747	const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * `2`;
748	const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * `2`;
749	const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * `64`;
750	const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * `2`;
751	const NUM_BYTES: usize = `256`;
752	const NUM_BYTES_WORDS_128: usize = NUM_BYTES / `16`;
753	const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / `2`;
754	const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / `2`;
755
756	fn parity(input: u8) -> u8 {
757	let mut accumulator = `0`;
758	for i in `0`..`8` {
759	accumulator ^= (input >> i) & `0x01`;
760	}
761	accumulator
762	}
763
764	fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
765	// this implementation follows the description in
766	// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8
767	let mut accumulator = `0`;
768
769	for bit in `0`..`8` {
770	accumulator \|= parity(x & matrix.to_le_bytes()[bit]) << (`7` - bit);
771	}
772
773	accumulator ^ b
774	}
775
776	fn generate_affine_mul_test_data(
777	immediate: u8,
778	) -> (
779	[u64; NUM_TEST_WORDS_64],
780	[u8; NUM_TEST_ENTRIES],
781	[u8; NUM_TEST_ENTRIES],
782	) {
783	let mut left: [u64; NUM_TEST_WORDS_64] = [`0`; NUM_TEST_WORDS_64];
784	let mut right: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
785	let mut result: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
786
787	for i in `0`..NUM_TEST_WORDS_64 {
788	left[i] = (i as u64) * `103` * `101`;
789	for j in `0`..`8` {
790	let j64 = j as u64;
791	right[i * `8` + j] = ((left[i] + j64) % `256`) as u8;
792	result[i * `8` + j] = mat_vec_multiply_affine(left[i], right[i * `8` + j], immediate);
793	}
794	}
795
796	(left, right, result)
797	}
798
799	fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
800	let mut input: [u8; NUM_BYTES] = [`0`; NUM_BYTES];
801	let mut result: [u8; NUM_BYTES] = [`0`; NUM_BYTES];
802
803	for i in `0`..NUM_BYTES {
804	input[i] = (i % `256`) as u8;
805	result[i] = if i == `0` { `0` } else { `1` };
806	}
807
808	(input, result)
809	}
810
811	const AES_S_BOX: [u8; NUM_BYTES] = [
812	`0x63`, `0x7c`, `0x77`, `0x7b`, `0xf2`, `0x6b`, `0x6f`, `0xc5`, `0x30`, `0x01`, `0x67`, `0x2b`, `0xfe`, `0xd7`, `0xab`,
813	`0x76`, `0xca`, `0x82`, `0xc9`, `0x7d`, `0xfa`, `0x59`, `0x47`, `0xf0`, `0xad`, `0xd4`, `0xa2`, `0xaf`, `0x9c`, `0xa4`,
814	`0x72`, `0xc0`, `0xb7`, `0xfd`, `0x93`, `0x26`, `0x36`, `0x3f`, `0xf7`, `0xcc`, `0x34`, `0xa5`, `0xe5`, `0xf1`, `0x71`,
815	`0xd8`, `0x31`, `0x15`, `0x04`, `0xc7`, `0x23`, `0xc3`, `0x18`, `0x96`, `0x05`, `0x9a`, `0x07`, `0x12`, `0x80`, `0xe2`,
816	`0xeb`, `0x27`, `0xb2`, `0x75`, `0x09`, `0x83`, `0x2c`, `0x1a`, `0x1b`, `0x6e`, `0x5a`, `0xa0`, `0x52`, `0x3b`, `0xd6`,
817	`0xb3`, `0x29`, `0xe3`, `0x2f`, `0x84`, `0x53`, `0xd1`, `0x00`, `0xed`, `0x20`, `0xfc`, `0xb1`, `0x5b`, `0x6a`, `0xcb`,
818	`0xbe`, `0x39`, `0x4a`, `0x4c`, `0x58`, `0xcf`, `0xd0`, `0xef`, `0xaa`, `0xfb`, `0x43`, `0x4d`, `0x33`, `0x85`, `0x45`,
819	`0xf9`, `0x02`, `0x7f`, `0x50`, `0x3c`, `0x9f`, `0xa8`, `0x51`, `0xa3`, `0x40`, `0x8f`, `0x92`, `0x9d`, `0x38`, `0xf5`,
820	`0xbc`, `0xb6`, `0xda`, `0x21`, `0x10`, `0xff`, `0xf3`, `0xd2`, `0xcd`, `0x0c`, `0x13`, `0xec`, `0x5f`, `0x97`, `0x44`,
821	`0x17`, `0xc4`, `0xa7`, `0x7e`, `0x3d`, `0x64`, `0x5d`, `0x19`, `0x73`, `0x60`, `0x81`, `0x4f`, `0xdc`, `0x22`, `0x2a`,
822	`0x90`, `0x88`, `0x46`, `0xee`, `0xb8`, `0x14`, `0xde`, `0x5e`, `0x0b`, `0xdb`, `0xe0`, `0x32`, `0x3a`, `0x0a`, `0x49`,
823	`0x06`, `0x24`, `0x5c`, `0xc2`, `0xd3`, `0xac`, `0x62`, `0x91`, `0x95`, `0xe4`, `0x79`, `0xe7`, `0xc8`, `0x37`, `0x6d`,
824	`0x8d`, `0xd5`, `0x4e`, `0xa9`, `0x6c`, `0x56`, `0xf4`, `0xea`, `0x65`, `0x7a`, `0xae`, `0x08`, `0xba`, `0x78`, `0x25`,
825	`0x2e`, `0x1c`, `0xa6`, `0xb4`, `0xc6`, `0xe8`, `0xdd`, `0x74`, `0x1f`, `0x4b`, `0xbd`, `0x8b`, `0x8a`, `0x70`, `0x3e`,
826	`0xb5`, `0x66`, `0x48`, `0x03`, `0xf6`, `0x0e`, `0x61`, `0x35`, `0x57`, `0xb9`, `0x86`, `0xc1`, `0x1d`, `0x9e`, `0xe1`,
827	`0xf8`, `0x98`, `0x11`, `0x69`, `0xd9`, `0x8e`, `0x94`, `0x9b`, `0x1e`, `0x87`, `0xe9`, `0xce`, `0x55`, `0x28`, `0xdf`,
828	`0x8c`, `0xa1`, `0x89`, `0x0d`, `0xbf`, `0xe6`, `0x42`, `0x68`, `0x41`, `0x99`, `0x2d`, `0x0f`, `0xb0`, `0x54`, `0xbb`,
829	`0x16`,
830	];
831
832	fn generate_byte_mul_test_data() -> (
833	[u8; NUM_TEST_ENTRIES],
834	[u8; NUM_TEST_ENTRIES],
835	[u8; NUM_TEST_ENTRIES],
836	) {
837	let mut left: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
838	let mut right: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
839	let mut result: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
840
841	for i in `0`..NUM_TEST_ENTRIES {
842	left[i] = (i % `256`) as u8;
843	right[i] = left[i].wrapping_mul(`101`);
844	result[i] = mulbyte(left[i], right[i]);
845	}
846
847	(left, right, result)
848	}
849
850	#[target_feature(enable = "sse2")]
851	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
852	unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
853	let byte_offset = word_index * `16` / size_of::<T>();
854	let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
855	_mm_loadu_si128(black_box(pointer))
856	}
857
858	#[target_feature(enable = "avx")]
859	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
860	unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
861	let byte_offset = word_index * `32` / size_of::<T>();
862	let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
863	_mm256_loadu_si256(black_box(pointer))
864	}
865
866	#[target_feature(enable = "avx512f")]
867	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
868	unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
869	let byte_offset = word_index * `64` / size_of::<T>();
870	let pointer = data.as_ptr().add(byte_offset) as *const i32;
871	_mm512_loadu_si512(black_box(pointer))
872	}
873
874	#[simd_test(enable = "gfni,avx512f")]
875	unsafe fn test_mm512_gf2p8mul_epi8() {
876	let (left, right, expected) = generate_byte_mul_test_data();
877
878	for i in `0`..NUM_TEST_WORDS_512 {
879	let left = load_m512i_word(&left, i);
880	let right = load_m512i_word(&right, i);
881	let expected = load_m512i_word(&expected, i);
882	let result = _mm512_gf2p8mul_epi8(left, right);
883	assert_eq_m512i(result, expected);
884	}
885	}
886
887	#[simd_test(enable = "gfni,avx512bw")]
888	unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
889	let (left, right, _expected) = generate_byte_mul_test_data();
890
891	for i in `0`..NUM_TEST_WORDS_512 {
892	let left = load_m512i_word(&left, i);
893	let right = load_m512i_word(&right, i);
894	let result_zero = _mm512_maskz_gf2p8mul_epi8(`0`, left, right);
895	assert_eq_m512i(result_zero, _mm512_setzero_si512());
896	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
897	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
898	let expected_result = _mm512_gf2p8mul_epi8(left, right);
899	let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
900	let expected_masked =
901	_mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
902	assert_eq_m512i(result_masked, expected_masked);
903	}
904	}
905
906	#[simd_test(enable = "gfni,avx512bw")]
907	unsafe fn test_mm512_mask_gf2p8mul_epi8() {
908	let (left, right, _expected) = generate_byte_mul_test_data();
909
910	for i in `0`..NUM_TEST_WORDS_512 {
911	let left = load_m512i_word(&left, i);
912	let right = load_m512i_word(&right, i);
913	let result_left = _mm512_mask_gf2p8mul_epi8(left, `0`, left, right);
914	assert_eq_m512i(result_left, left);
915	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
916	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
917	let expected_result = _mm512_gf2p8mul_epi8(left, right);
918	let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
919	let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
920	assert_eq_m512i(result_masked, expected_masked);
921	}
922	}
923
924	#[simd_test(enable = "gfni,avx")]
925	unsafe fn test_mm256_gf2p8mul_epi8() {
926	let (left, right, expected) = generate_byte_mul_test_data();
927
928	for i in `0`..NUM_TEST_WORDS_256 {
929	let left = load_m256i_word(&left, i);
930	let right = load_m256i_word(&right, i);
931	let expected = load_m256i_word(&expected, i);
932	let result = _mm256_gf2p8mul_epi8(left, right);
933	assert_eq_m256i(result, expected);
934	}
935	}
936
937	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
938	unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
939	let (left, right, _expected) = generate_byte_mul_test_data();
940
941	for i in `0`..NUM_TEST_WORDS_256 {
942	let left = load_m256i_word(&left, i);
943	let right = load_m256i_word(&right, i);
944	let result_zero = _mm256_maskz_gf2p8mul_epi8(`0`, left, right);
945	assert_eq_m256i(result_zero, _mm256_setzero_si256());
946	let mask_bytes: __mmask32 = `0x0F_F0_FF_00`;
947	const MASK_WORDS: i32 = `0b01_10_11_00`;
948	let expected_result = _mm256_gf2p8mul_epi8(left, right);
949	let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
950	let expected_masked =
951	_mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
952	assert_eq_m256i(result_masked, expected_masked);
953	}
954	}
955
956	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
957	unsafe fn test_mm256_mask_gf2p8mul_epi8() {
958	let (left, right, _expected) = generate_byte_mul_test_data();
959
960	for i in `0`..NUM_TEST_WORDS_256 {
961	let left = load_m256i_word(&left, i);
962	let right = load_m256i_word(&right, i);
963	let result_left = _mm256_mask_gf2p8mul_epi8(left, `0`, left, right);
964	assert_eq_m256i(result_left, left);
965	let mask_bytes: __mmask32 = `0x0F_F0_FF_00`;
966	const MASK_WORDS: i32 = `0b01_10_11_00`;
967	let expected_result = _mm256_gf2p8mul_epi8(left, right);
968	let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
969	let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
970	assert_eq_m256i(result_masked, expected_masked);
971	}
972	}
973
974	#[simd_test(enable = "gfni")]
975	unsafe fn test_mm_gf2p8mul_epi8() {
976	let (left, right, expected) = generate_byte_mul_test_data();
977
978	for i in `0`..NUM_TEST_WORDS_128 {
979	let left = load_m128i_word(&left, i);
980	let right = load_m128i_word(&right, i);
981	let expected = load_m128i_word(&expected, i);
982	let result = _mm_gf2p8mul_epi8(left, right);
983	assert_eq_m128i(result, expected);
984	}
985	}
986
987	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
988	unsafe fn test_mm_maskz_gf2p8mul_epi8() {
989	let (left, right, _expected) = generate_byte_mul_test_data();
990
991	for i in `0`..NUM_TEST_WORDS_128 {
992	let left = load_m128i_word(&left, i);
993	let right = load_m128i_word(&right, i);
994	let result_zero = _mm_maskz_gf2p8mul_epi8(`0`, left, right);
995	assert_eq_m128i(result_zero, _mm_setzero_si128());
996	let mask_bytes: __mmask16 = `0x0F_F0`;
997	const MASK_WORDS: i32 = `0b01_10`;
998	let expected_result = _mm_gf2p8mul_epi8(left, right);
999	let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
1000	let expected_masked =
1001	_mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1002	assert_eq_m128i(result_masked, expected_masked);
1003	}
1004	}
1005
1006	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1007	unsafe fn test_mm_mask_gf2p8mul_epi8() {
1008	let (left, right, _expected) = generate_byte_mul_test_data();
1009
1010	for i in `0`..NUM_TEST_WORDS_128 {
1011	let left = load_m128i_word(&left, i);
1012	let right = load_m128i_word(&right, i);
1013	let result_left = _mm_mask_gf2p8mul_epi8(left, `0`, left, right);
1014	assert_eq_m128i(result_left, left);
1015	let mask_bytes: __mmask16 = `0x0F_F0`;
1016	const MASK_WORDS: i32 = `0b01_10`;
1017	let expected_result = _mm_gf2p8mul_epi8(left, right);
1018	let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
1019	let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1020	assert_eq_m128i(result_masked, expected_masked);
1021	}
1022	}
1023
1024	#[simd_test(enable = "gfni,avx512f")]
1025	unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
1026	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1027	const IDENTITY_BYTE: i32 = `0`;
1028	let constant: i64 = `0`;
1029	const CONSTANT_BYTE: i32 = `0x63`;
1030	let identity = _mm512_set1_epi64(identity);
1031	let constant = _mm512_set1_epi64(constant);
1032	let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);
1033
1034	let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1035	let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1036
1037	for i in `0`..NUM_TEST_WORDS_512 {
1038	let data = load_m512i_word(&bytes, i);
1039	let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1040	assert_eq_m512i(result, data);
1041	let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1042	assert_eq_m512i(result, constant_reference);
1043	let data = load_m512i_word(&more_bytes, i);
1044	let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1045	assert_eq_m512i(result, data);
1046	let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1047	assert_eq_m512i(result, constant_reference);
1048
1049	let matrix = load_m512i_word(&matrices, i);
1050	let vector = load_m512i_word(&vectors, i);
1051	let reference = load_m512i_word(&references, i);
1052
1053	let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1054	assert_eq_m512i(result, reference);
1055	}
1056	}
1057
1058	#[simd_test(enable = "gfni,avx512bw")]
1059	unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
1060	const CONSTANT_BYTE: i32 = `0x63`;
1061	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1062
1063	for i in `0`..NUM_TEST_WORDS_512 {
1064	let matrix = load_m512i_word(&matrices, i);
1065	let vector = load_m512i_word(&vectors, i);
1066	let result_zero =
1067	_mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1068	assert_eq_m512i(result_zero, _mm512_setzero_si512());
1069	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
1070	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
1071	let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1072	let result_masked =
1073	_mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1074	let expected_masked =
1075	_mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1076	assert_eq_m512i(result_masked, expected_masked);
1077	}
1078	}
1079
1080	#[simd_test(enable = "gfni,avx512bw")]
1081	unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
1082	const CONSTANT_BYTE: i32 = `0x63`;
1083	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1084
1085	for i in `0`..NUM_TEST_WORDS_512 {
1086	let left = load_m512i_word(&vectors, i);
1087	let right = load_m512i_word(&matrices, i);
1088	let result_left =
1089	_mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1090	assert_eq_m512i(result_left, left);
1091	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
1092	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
1093	let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1094	let result_masked =
1095	_mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1096	let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1097	assert_eq_m512i(result_masked, expected_masked);
1098	}
1099	}
1100
1101	#[simd_test(enable = "gfni,avx")]
1102	unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
1103	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1104	const IDENTITY_BYTE: i32 = `0`;
1105	let constant: i64 = `0`;
1106	const CONSTANT_BYTE: i32 = `0x63`;
1107	let identity = _mm256_set1_epi64x(identity);
1108	let constant = _mm256_set1_epi64x(constant);
1109	let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);
1110
1111	let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1112	let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1113
1114	for i in `0`..NUM_TEST_WORDS_256 {
1115	let data = load_m256i_word(&bytes, i);
1116	let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1117	assert_eq_m256i(result, data);
1118	let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1119	assert_eq_m256i(result, constant_reference);
1120	let data = load_m256i_word(&more_bytes, i);
1121	let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1122	assert_eq_m256i(result, data);
1123	let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1124	assert_eq_m256i(result, constant_reference);
1125
1126	let matrix = load_m256i_word(&matrices, i);
1127	let vector = load_m256i_word(&vectors, i);
1128	let reference = load_m256i_word(&references, i);
1129
1130	let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1131	assert_eq_m256i(result, reference);
1132	}
1133	}
1134
1135	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1136	unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
1137	const CONSTANT_BYTE: i32 = `0x63`;
1138	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1139
1140	for i in `0`..NUM_TEST_WORDS_256 {
1141	let matrix = load_m256i_word(&matrices, i);
1142	let vector = load_m256i_word(&vectors, i);
1143	let result_zero =
1144	_mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1145	assert_eq_m256i(result_zero, _mm256_setzero_si256());
1146	let mask_bytes: __mmask32 = `0xFF_0F_F0_00`;
1147	const MASK_WORDS: i32 = `0b11_01_10_00`;
1148	let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1149	let result_masked =
1150	_mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1151	let expected_masked =
1152	_mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1153	assert_eq_m256i(result_masked, expected_masked);
1154	}
1155	}
1156
1157	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1158	unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
1159	const CONSTANT_BYTE: i32 = `0x63`;
1160	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1161
1162	for i in `0`..NUM_TEST_WORDS_256 {
1163	let left = load_m256i_word(&vectors, i);
1164	let right = load_m256i_word(&matrices, i);
1165	let result_left =
1166	_mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1167	assert_eq_m256i(result_left, left);
1168	let mask_bytes: __mmask32 = `0xFF_0F_F0_00`;
1169	const MASK_WORDS: i32 = `0b11_01_10_00`;
1170	let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1171	let result_masked =
1172	_mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1173	let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1174	assert_eq_m256i(result_masked, expected_masked);
1175	}
1176	}
1177
1178	#[simd_test(enable = "gfni")]
1179	unsafe fn test_mm_gf2p8affine_epi64_epi8() {
1180	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1181	const IDENTITY_BYTE: i32 = `0`;
1182	let constant: i64 = `0`;
1183	const CONSTANT_BYTE: i32 = `0x63`;
1184	let identity = _mm_set1_epi64x(identity);
1185	let constant = _mm_set1_epi64x(constant);
1186	let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);
1187
1188	let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1189	let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1190
1191	for i in `0`..NUM_TEST_WORDS_128 {
1192	let data = load_m128i_word(&bytes, i);
1193	let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1194	assert_eq_m128i(result, data);
1195	let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1196	assert_eq_m128i(result, constant_reference);
1197	let data = load_m128i_word(&more_bytes, i);
1198	let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1199	assert_eq_m128i(result, data);
1200	let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1201	assert_eq_m128i(result, constant_reference);
1202
1203	let matrix = load_m128i_word(&matrices, i);
1204	let vector = load_m128i_word(&vectors, i);
1205	let reference = load_m128i_word(&references, i);
1206
1207	let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1208	assert_eq_m128i(result, reference);
1209	}
1210	}
1211
1212	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1213	unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
1214	const CONSTANT_BYTE: i32 = `0x63`;
1215	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1216
1217	for i in `0`..NUM_TEST_WORDS_128 {
1218	let matrix = load_m128i_word(&matrices, i);
1219	let vector = load_m128i_word(&vectors, i);
1220	let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1221	assert_eq_m128i(result_zero, _mm_setzero_si128());
1222	let mask_bytes: __mmask16 = `0x0F_F0`;
1223	const MASK_WORDS: i32 = `0b01_10`;
1224	let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1225	let result_masked =
1226	_mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1227	let expected_masked =
1228	_mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1229	assert_eq_m128i(result_masked, expected_masked);
1230	}
1231	}
1232
1233	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1234	unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
1235	const CONSTANT_BYTE: i32 = `0x63`;
1236	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1237
1238	for i in `0`..NUM_TEST_WORDS_128 {
1239	let left = load_m128i_word(&vectors, i);
1240	let right = load_m128i_word(&matrices, i);
1241	let result_left =
1242	_mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1243	assert_eq_m128i(result_left, left);
1244	let mask_bytes: __mmask16 = `0x0F_F0`;
1245	const MASK_WORDS: i32 = `0b01_10`;
1246	let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1247	let result_masked =
1248	_mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1249	let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1250	assert_eq_m128i(result_masked, expected_masked);
1251	}
1252	}
1253
1254	#[simd_test(enable = "gfni,avx512f")]
1255	unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
1256	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1257	const IDENTITY_BYTE: i32 = `0`;
1258	const CONSTANT_BYTE: i32 = `0x63`;
1259	let identity = _mm512_set1_epi64(identity);
1260
1261	// validate inversion
1262	let (inputs, results) = generate_inv_tests_data();
1263
1264	for i in `0`..NUM_BYTES_WORDS_512 {
1265	let input = load_m512i_word(&inputs, i);
1266	let reference = load_m512i_word(&results, i);
1267	let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1268	let remultiplied = _mm512_gf2p8mul_epi8(result, input);
1269	assert_eq_m512i(remultiplied, reference);
1270	}
1271
1272	// validate subsequent affine operation
1273	let (matrices, vectors, _affine_expected) =
1274	generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1275
1276	for i in `0`..NUM_TEST_WORDS_512 {
1277	let vector = load_m512i_word(&vectors, i);
1278	let matrix = load_m512i_word(&matrices, i);
1279
1280	let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1281	let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1282	let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1283	assert_eq_m512i(result, reference);
1284	}
1285
1286	// validate everything by virtue of checking against the AES SBox
1287	const AES_S_BOX_MATRIX: i64 = `0xF1_E3_C7_8F_1F_3E_7C_F8`;
1288	let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);
1289
1290	for i in `0`..NUM_BYTES_WORDS_512 {
1291	let reference = load_m512i_word(&AES_S_BOX, i);
1292	let input = load_m512i_word(&inputs, i);
1293	let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1294	assert_eq_m512i(result, reference);
1295	}
1296	}
1297
1298	#[simd_test(enable = "gfni,avx512bw")]
1299	unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
1300	const CONSTANT_BYTE: i32 = `0x63`;
1301	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1302
1303	for i in `0`..NUM_TEST_WORDS_512 {
1304	let matrix = load_m512i_word(&matrices, i);
1305	let vector = load_m512i_word(&vectors, i);
1306	let result_zero =
1307	_mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1308	assert_eq_m512i(result_zero, _mm512_setzero_si512());
1309	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
1310	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
1311	let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1312	let result_masked =
1313	_mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1314	let expected_masked =
1315	_mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1316	assert_eq_m512i(result_masked, expected_masked);
1317	}
1318	}
1319
1320	#[simd_test(enable = "gfni,avx512bw")]
1321	unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
1322	const CONSTANT_BYTE: i32 = `0x63`;
1323	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1324
1325	for i in `0`..NUM_TEST_WORDS_512 {
1326	let left = load_m512i_word(&vectors, i);
1327	let right = load_m512i_word(&matrices, i);
1328	let result_left =
1329	_mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1330	assert_eq_m512i(result_left, left);
1331	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
1332	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
1333	let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1334	let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1335	left, mask_bytes, left, right,
1336	);
1337	let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1338	assert_eq_m512i(result_masked, expected_masked);
1339	}
1340	}
1341
1342	#[simd_test(enable = "gfni,avx")]
1343	unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
1344	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1345	const IDENTITY_BYTE: i32 = `0`;
1346	const CONSTANT_BYTE: i32 = `0x63`;
1347	let identity = _mm256_set1_epi64x(identity);
1348
1349	// validate inversion
1350	let (inputs, results) = generate_inv_tests_data();
1351
1352	for i in `0`..NUM_BYTES_WORDS_256 {
1353	let input = load_m256i_word(&inputs, i);
1354	let reference = load_m256i_word(&results, i);
1355	let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1356	let remultiplied = _mm256_gf2p8mul_epi8(result, input);
1357	assert_eq_m256i(remultiplied, reference);
1358	}
1359
1360	// validate subsequent affine operation
1361	let (matrices, vectors, _affine_expected) =
1362	generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1363
1364	for i in `0`..NUM_TEST_WORDS_256 {
1365	let vector = load_m256i_word(&vectors, i);
1366	let matrix = load_m256i_word(&matrices, i);
1367
1368	let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1369	let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1370	let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1371	assert_eq_m256i(result, reference);
1372	}
1373
1374	// validate everything by virtue of checking against the AES SBox
1375	const AES_S_BOX_MATRIX: i64 = `0xF1_E3_C7_8F_1F_3E_7C_F8`;
1376	let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);
1377
1378	for i in `0`..NUM_BYTES_WORDS_256 {
1379	let reference = load_m256i_word(&AES_S_BOX, i);
1380	let input = load_m256i_word(&inputs, i);
1381	let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1382	assert_eq_m256i(result, reference);
1383	}
1384	}
1385
1386	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1387	unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
1388	const CONSTANT_BYTE: i32 = `0x63`;
1389	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1390
1391	for i in `0`..NUM_TEST_WORDS_256 {
1392	let matrix = load_m256i_word(&matrices, i);
1393	let vector = load_m256i_word(&vectors, i);
1394	let result_zero =
1395	_mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1396	assert_eq_m256i(result_zero, _mm256_setzero_si256());
1397	let mask_bytes: __mmask32 = `0xFF_0F_F0_00`;
1398	const MASK_WORDS: i32 = `0b11_01_10_00`;
1399	let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1400	let result_masked =
1401	_mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1402	let expected_masked =
1403	_mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1404	assert_eq_m256i(result_masked, expected_masked);
1405	}
1406	}
1407
1408	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1409	unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
1410	const CONSTANT_BYTE: i32 = `0x63`;
1411	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1412
1413	for i in `0`..NUM_TEST_WORDS_256 {
1414	let left = load_m256i_word(&vectors, i);
1415	let right = load_m256i_word(&matrices, i);
1416	let result_left =
1417	_mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1418	assert_eq_m256i(result_left, left);
1419	let mask_bytes: __mmask32 = `0xFF_0F_F0_00`;
1420	const MASK_WORDS: i32 = `0b11_01_10_00`;
1421	let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1422	let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1423	left, mask_bytes, left, right,
1424	);
1425	let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1426	assert_eq_m256i(result_masked, expected_masked);
1427	}
1428	}
1429
1430	#[simd_test(enable = "gfni")]
1431	unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
1432	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1433	const IDENTITY_BYTE: i32 = `0`;
1434	const CONSTANT_BYTE: i32 = `0x63`;
1435	let identity = _mm_set1_epi64x(identity);
1436
1437	// validate inversion
1438	let (inputs, results) = generate_inv_tests_data();
1439
1440	for i in `0`..NUM_BYTES_WORDS_128 {
1441	let input = load_m128i_word(&inputs, i);
1442	let reference = load_m128i_word(&results, i);
1443	let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1444	let remultiplied = _mm_gf2p8mul_epi8(result, input);
1445	assert_eq_m128i(remultiplied, reference);
1446	}
1447
1448	// validate subsequent affine operation
1449	let (matrices, vectors, _affine_expected) =
1450	generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1451
1452	for i in `0`..NUM_TEST_WORDS_128 {
1453	let vector = load_m128i_word(&vectors, i);
1454	let matrix = load_m128i_word(&matrices, i);
1455
1456	let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1457	let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1458	let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1459	assert_eq_m128i(result, reference);
1460	}
1461
1462	// validate everything by virtue of checking against the AES SBox
1463	const AES_S_BOX_MATRIX: i64 = `0xF1_E3_C7_8F_1F_3E_7C_F8`;
1464	let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);
1465
1466	for i in `0`..NUM_BYTES_WORDS_128 {
1467	let reference = load_m128i_word(&AES_S_BOX, i);
1468	let input = load_m128i_word(&inputs, i);
1469	let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1470	assert_eq_m128i(result, reference);
1471	}
1472	}
1473
1474	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1475	unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
1476	const CONSTANT_BYTE: i32 = `0x63`;
1477	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1478
1479	for i in `0`..NUM_TEST_WORDS_128 {
1480	let matrix = load_m128i_word(&matrices, i);
1481	let vector = load_m128i_word(&vectors, i);
1482	let result_zero =
1483	_mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1484	assert_eq_m128i(result_zero, _mm_setzero_si128());
1485	let mask_bytes: __mmask16 = `0x0F_F0`;
1486	const MASK_WORDS: i32 = `0b01_10`;
1487	let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1488	let result_masked =
1489	_mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1490	let expected_masked =
1491	_mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1492	assert_eq_m128i(result_masked, expected_masked);
1493	}
1494	}
1495
1496	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1497	unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
1498	const CONSTANT_BYTE: i32 = `0x63`;
1499	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1500
1501	for i in `0`..NUM_TEST_WORDS_128 {
1502	let left = load_m128i_word(&vectors, i);
1503	let right = load_m128i_word(&matrices, i);
1504	let result_left =
1505	_mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1506	assert_eq_m128i(result_left, left);
1507	let mask_bytes: __mmask16 = `0x0F_F0`;
1508	const MASK_WORDS: i32 = `0b01_10`;
1509	let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1510	let result_masked =
1511	_mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1512	let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1513	assert_eq_m128i(result_masked, expected_masked);
1514	}
1515	}
1516	}
1517