gfni.rs source code [crates/core_arch/src/x86/gfni.rs]

1	//! Galois Field New Instructions (GFNI)
2	//!
3	//! The intrinsics here correspond to those in the `immintrin.h` C header.
4	//!
5	//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
6	//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
7	//!
8	//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
9
10	use crate::core_arch::simd::i8x16;
11	use crate::core_arch::simd::i8x32;
12	use crate::core_arch::simd::i8x64;
13	use crate::core_arch::x86::__m128i;
14	use crate::core_arch::x86::__m256i;
15	use crate::core_arch::x86::__m512i;
16	use crate::core_arch::x86::__mmask16;
17	use crate::core_arch::x86::__mmask32;
18	use crate::core_arch::x86::__mmask64;
19	use crate::core_arch::x86::_mm256_setzero_si256;
20	use crate::core_arch::x86::_mm512_setzero_si512;
21	use crate::core_arch::x86::_mm_setzero_si128;
22	use crate::core_arch::x86::m128iExt;
23	use crate::core_arch::x86::m256iExt;
24	use crate::core_arch::x86::m512iExt;
25	use crate::intrinsics::simd::simd_select_bitmask;
26	use crate::mem::transmute;
27
28	#[cfg(test)]
29	use stdarch_test::assert_instr;
30
31	#[allow(improper_ctypes)]
32	extern "C" {
33	#[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
34	fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
35	#[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
36	fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
37	#[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
38	fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
39	#[link_name = "llvm.x86.vgf2p8affineqb.512"]
40	fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
41	#[link_name = "llvm.x86.vgf2p8affineqb.256"]
42	fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
43	#[link_name = "llvm.x86.vgf2p8affineqb.128"]
44	fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
45	#[link_name = "llvm.x86.vgf2p8mulb.512"]
46	fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
47	#[link_name = "llvm.x86.vgf2p8mulb.256"]
48	fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
49	#[link_name = "llvm.x86.vgf2p8mulb.128"]
50	fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
51	}
52
53	// LLVM requires AVX512BW for a lot of these instructions, see
54	// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
55	// however our tests also require the target feature list to match Intel's
// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F
57	// requirement (for now)
58	// also see
59	// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
60	// for forcing GFNI, BW and optionally VL extension
61
62	/// Performs a multiplication in GF(2^8) on the packed bytes.
63	/// The field is in polynomial representation with the reduction polynomial
64	/// x^8 + x^4 + x^3 + x + 1.
65	///
66	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8)
67	#[inline]
68	#[target_feature(enable = "gfni,avx512bw,avx512f")]
69	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
70	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
71	pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
72	transmute(src:vgf2p8mulb_512(a:a.as_i8x64(), b:b.as_i8x64()))
73	}
74
75	/// Performs a multiplication in GF(2^8) on the packed bytes.
76	/// The field is in polynomial representation with the reduction polynomial
77	/// x^8 + x^4 + x^3 + x + 1.
78	///
79	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
80	/// Otherwise the computation result is written into the result.
81	///
82	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8mul_epi8)
83	#[inline]
84	#[target_feature(enable = "gfni,avx512bw,avx512f")]
85	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
86	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
87	pub unsafe fn _mm512_mask_gf2p8mul_epi8(
88	src: __m512i,
89	k: __mmask64,
90	a: __m512i,
91	b: __m512i,
92	) -> __m512i {
93	transmute(src:simd_select_bitmask(
94	m:k,
95	yes:vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
96	no:src.as_i8x64(),
97	))
98	}
99
100	/// Performs a multiplication in GF(2^8) on the packed bytes.
101	/// The field is in polynomial representation with the reduction polynomial
102	/// x^8 + x^4 + x^3 + x + 1.
103	///
104	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
105	/// Otherwise the computation result is written into the result.
106	///
107	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8mul_epi8)
108	#[inline]
109	#[target_feature(enable = "gfni,avx512bw,avx512f")]
110	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
111	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
112	pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
113	let zero: i8x64 = _mm512_setzero_si512().as_i8x64();
114	transmute(src:simd_select_bitmask(
115	m:k,
116	yes:vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
117	no:zero,
118	))
119	}
120
121	/// Performs a multiplication in GF(2^8) on the packed bytes.
122	/// The field is in polynomial representation with the reduction polynomial
123	/// x^8 + x^4 + x^3 + x + 1.
124	///
125	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8mul_epi8)
126	#[inline]
127	#[target_feature(enable = "gfni,avx")]
128	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
129	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
130	pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
131	transmute(src:vgf2p8mulb_256(a:a.as_i8x32(), b:b.as_i8x32()))
132	}
133
134	/// Performs a multiplication in GF(2^8) on the packed bytes.
135	/// The field is in polynomial representation with the reduction polynomial
136	/// x^8 + x^4 + x^3 + x + 1.
137	///
138	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
139	/// Otherwise the computation result is written into the result.
140	///
141	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8mul_epi8)
142	#[inline]
143	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
144	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
145	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
146	pub unsafe fn _mm256_mask_gf2p8mul_epi8(
147	src: __m256i,
148	k: __mmask32,
149	a: __m256i,
150	b: __m256i,
151	) -> __m256i {
152	transmute(src:simd_select_bitmask(
153	m:k,
154	yes:vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
155	no:src.as_i8x32(),
156	))
157	}
158
159	/// Performs a multiplication in GF(2^8) on the packed bytes.
160	/// The field is in polynomial representation with the reduction polynomial
161	/// x^8 + x^4 + x^3 + x + 1.
162	///
163	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
164	/// Otherwise the computation result is written into the result.
165	///
166	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8mul_epi8)
167	#[inline]
168	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
169	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
170	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
171	pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
172	let zero: i8x32 = _mm256_setzero_si256().as_i8x32();
173	transmute(src:simd_select_bitmask(
174	m:k,
175	yes:vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
176	no:zero,
177	))
178	}
179
180	/// Performs a multiplication in GF(2^8) on the packed bytes.
181	/// The field is in polynomial representation with the reduction polynomial
182	/// x^8 + x^4 + x^3 + x + 1.
183	///
184	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8mul_epi8)
185	#[inline]
186	#[target_feature(enable = "gfni")]
187	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
188	#[cfg_attr(test, assert_instr(gf2p8mulb))]
189	pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
190	transmute(src:vgf2p8mulb_128(a:a.as_i8x16(), b:b.as_i8x16()))
191	}
192
193	/// Performs a multiplication in GF(2^8) on the packed bytes.
194	/// The field is in polynomial representation with the reduction polynomial
195	/// x^8 + x^4 + x^3 + x + 1.
196	///
197	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
198	/// Otherwise the computation result is written into the result.
199	///
200	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8mul_epi8)
201	#[inline]
202	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
203	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
204	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
205	pub unsafe fn _mm_mask_gf2p8mul_epi8(
206	src: __m128i,
207	k: __mmask16,
208	a: __m128i,
209	b: __m128i,
210	) -> __m128i {
211	transmute(src:simd_select_bitmask(
212	m:k,
213	yes:vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
214	no:src.as_i8x16(),
215	))
216	}
217
218	/// Performs a multiplication in GF(2^8) on the packed bytes.
219	/// The field is in polynomial representation with the reduction polynomial
220	/// x^8 + x^4 + x^3 + x + 1.
221	///
222	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
223	/// Otherwise the computation result is written into the result.
224	///
225	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8mul_epi8)
226	#[inline]
227	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
228	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
229	#[cfg_attr(test, assert_instr(vgf2p8mulb))]
230	pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
231	let zero: i8x16 = _mm_setzero_si128().as_i8x16();
232	transmute(src:simd_select_bitmask(
233	m:k,
234	yes:vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
235	no:zero,
236	))
237	}
238
239	/// Performs an affine transformation on the packed bytes in x.
240	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
241	/// and b being a constant 8-bit immediate value.
242	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
243	///
244	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affine_epi8)
245	#[inline]
246	#[target_feature(enable = "gfni,avx512bw,avx512f")]
247	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
248	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
249	#[rustc_legacy_const_generics(`2`)]
250	pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
251	static_assert_uimm_bits!(B, `8`);
252	let b: u8 = B as u8;
253	let x: i8x64 = x.as_i8x64();
254	let a: i8x64 = a.as_i8x64();
255	let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b);
256	transmute(src:r)
257	}
258
259	/// Performs an affine transformation on the packed bytes in x.
260	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
261	/// and b being a constant 8-bit immediate value.
262	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
263	///
264	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
265	/// Otherwise the computation result is written into the result.
266	///
267	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affine_epi8)
268	#[inline]
269	#[target_feature(enable = "gfni,avx512bw,avx512f")]
270	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
271	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
272	#[rustc_legacy_const_generics(`3`)]
273	pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
274	k: __mmask64,
275	x: __m512i,
276	a: __m512i,
277	) -> __m512i {
278	static_assert_uimm_bits!(B, `8`);
279	let b: u8 = B as u8;
280	let zero: i8x64 = _mm512_setzero_si512().as_i8x64();
281	let x: i8x64 = x.as_i8x64();
282	let a: i8x64 = a.as_i8x64();
283	let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b);
284	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
285	}
286
287	/// Performs an affine transformation on the packed bytes in x.
288	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
289	/// and b being a constant 8-bit immediate value.
290	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
291	///
292	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
293	/// Otherwise the computation result is written into the result.
294	///
295	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affine_epi8)
296	#[inline]
297	#[target_feature(enable = "gfni,avx512bw,avx512f")]
298	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
299	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
300	#[rustc_legacy_const_generics(`4`)]
301	pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
302	src: __m512i,
303	k: __mmask64,
304	x: __m512i,
305	a: __m512i,
306	) -> __m512i {
307	static_assert_uimm_bits!(B, `8`);
308	let b: u8 = B as u8;
309	let x: i8x64 = x.as_i8x64();
310	let a: i8x64 = a.as_i8x64();
311	let r: i8x64 = vgf2p8affineqb_512(x, a, imm8:b);
312	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x64()))
313	}
314
315	/// Performs an affine transformation on the packed bytes in x.
316	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
317	/// and b being a constant 8-bit immediate value.
318	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
319	///
320	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affine_epi8)
321	#[inline]
322	#[target_feature(enable = "gfni,avx")]
323	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
324	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
325	#[rustc_legacy_const_generics(`2`)]
326	pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
327	static_assert_uimm_bits!(B, `8`);
328	let b: u8 = B as u8;
329	let x: i8x32 = x.as_i8x32();
330	let a: i8x32 = a.as_i8x32();
331	let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b);
332	transmute(src:r)
333	}
334
335	/// Performs an affine transformation on the packed bytes in x.
336	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
337	/// and b being a constant 8-bit immediate value.
338	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
339	///
340	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
341	/// Otherwise the computation result is written into the result.
342	///
343	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affine_epi8)
344	#[inline]
345	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
346	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
347	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
348	#[rustc_legacy_const_generics(`3`)]
349	pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
350	k: __mmask32,
351	x: __m256i,
352	a: __m256i,
353	) -> __m256i {
354	static_assert_uimm_bits!(B, `8`);
355	let b: u8 = B as u8;
356	let zero: i8x32 = _mm256_setzero_si256().as_i8x32();
357	let x: i8x32 = x.as_i8x32();
358	let a: i8x32 = a.as_i8x32();
359	let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b);
360	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
361	}
362
363	/// Performs an affine transformation on the packed bytes in x.
364	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
365	/// and b being a constant 8-bit immediate value.
366	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
367	///
368	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
369	/// Otherwise the computation result is written into the result.
370	///
371	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affine_epi8)
372	#[inline]
373	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
374	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
375	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
376	#[rustc_legacy_const_generics(`4`)]
377	pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
378	src: __m256i,
379	k: __mmask32,
380	x: __m256i,
381	a: __m256i,
382	) -> __m256i {
383	static_assert_uimm_bits!(B, `8`);
384	let b: u8 = B as u8;
385	let x: i8x32 = x.as_i8x32();
386	let a: i8x32 = a.as_i8x32();
387	let r: i8x32 = vgf2p8affineqb_256(x, a, imm8:b);
388	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x32()))
389	}
390
391	/// Performs an affine transformation on the packed bytes in x.
392	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
393	/// and b being a constant 8-bit immediate value.
394	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
395	///
396	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi8)
397	#[inline]
398	#[target_feature(enable = "gfni")]
399	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
400	#[cfg_attr(test, assert_instr(gf2p8affineqb, B = `0`))]
401	#[rustc_legacy_const_generics(`2`)]
402	pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
403	static_assert_uimm_bits!(B, `8`);
404	let b: u8 = B as u8;
405	let x: i8x16 = x.as_i8x16();
406	let a: i8x16 = a.as_i8x16();
407	let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b);
408	transmute(src:r)
409	}
410
411	/// Performs an affine transformation on the packed bytes in x.
412	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
413	/// and b being a constant 8-bit immediate value.
414	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
415	///
416	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
417	/// Otherwise the computation result is written into the result.
418	///
419	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affine_epi8)
420	#[inline]
421	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
422	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
423	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
424	#[rustc_legacy_const_generics(`3`)]
425	pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
426	k: __mmask16,
427	x: __m128i,
428	a: __m128i,
429	) -> __m128i {
430	static_assert_uimm_bits!(B, `8`);
431	let b: u8 = B as u8;
432	let zero: i8x16 = _mm_setzero_si128().as_i8x16();
433	let x: i8x16 = x.as_i8x16();
434	let a: i8x16 = a.as_i8x16();
435	let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b);
436	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
437	}
438
439	/// Performs an affine transformation on the packed bytes in x.
440	/// That is computes ax+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
441	/// and b being a constant 8-bit immediate value.
442	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
443	///
444	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
445	/// Otherwise the computation result is written into the result.
446	///
447	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affine_epi8)
448	#[inline]
449	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
450	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
451	#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = `0`))]
452	#[rustc_legacy_const_generics(`4`)]
453	pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
454	src: __m128i,
455	k: __mmask16,
456	x: __m128i,
457	a: __m128i,
458	) -> __m128i {
459	static_assert_uimm_bits!(B, `8`);
460	let b: u8 = B as u8;
461	let x: i8x16 = x.as_i8x16();
462	let a: i8x16 = a.as_i8x16();
463	let r: i8x16 = vgf2p8affineqb_128(x, a, imm8:b);
464	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x16()))
465	}
466
467	/// Performs an affine transformation on the inverted packed bytes in x.
468	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
469	/// and b being a constant 8-bit immediate value.
470	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
471	/// The inverse of 0 is 0.
472	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
473	///
474	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8affineinv_epi64_epi8)
475	#[inline]
476	#[target_feature(enable = "gfni,avx512bw,avx512f")]
477	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
478	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
479	#[rustc_legacy_const_generics(`2`)]
480	pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
481	static_assert_uimm_bits!(B, `8`);
482	let b: u8 = B as u8;
483	let x: i8x64 = x.as_i8x64();
484	let a: i8x64 = a.as_i8x64();
485	let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b);
486	transmute(src:r)
487	}
488
489	/// Performs an affine transformation on the inverted packed bytes in x.
490	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
491	/// and b being a constant 8-bit immediate value.
492	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
493	/// The inverse of 0 is 0.
494	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
495	///
496	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
497	/// Otherwise the computation result is written into the result.
498	///
499	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
500	#[inline]
501	#[target_feature(enable = "gfni,avx512bw,avx512f")]
502	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
503	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
504	#[rustc_legacy_const_generics(`3`)]
505	pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
506	k: __mmask64,
507	x: __m512i,
508	a: __m512i,
509	) -> __m512i {
510	static_assert_uimm_bits!(B, `8`);
511	let b: u8 = B as u8;
512	let zero: i8x64 = _mm512_setzero_si512().as_i8x64();
513	let x: i8x64 = x.as_i8x64();
514	let a: i8x64 = a.as_i8x64();
515	let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b);
516	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
517	}
518
519	/// Performs an affine transformation on the inverted packed bytes in x.
520	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
521	/// and b being a constant 8-bit immediate value.
522	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
523	/// The inverse of 0 is 0.
524	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
525	///
526	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
527	/// Otherwise the computation result is written into the result.
528	///
529	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
530	#[inline]
531	#[target_feature(enable = "gfni,avx512bw,avx512f")]
532	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
533	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
534	#[rustc_legacy_const_generics(`4`)]
535	pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
536	src: __m512i,
537	k: __mmask64,
538	x: __m512i,
539	a: __m512i,
540	) -> __m512i {
541	static_assert_uimm_bits!(B, `8`);
542	let b: u8 = B as u8;
543	let x: i8x64 = x.as_i8x64();
544	let a: i8x64 = a.as_i8x64();
545	let r: i8x64 = vgf2p8affineinvqb_512(x, a, imm8:b);
546	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x64()))
547	}
548
549	/// Performs an affine transformation on the inverted packed bytes in x.
550	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
551	/// and b being a constant 8-bit immediate value.
552	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
553	/// The inverse of 0 is 0.
554	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
555	///
556	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_gf2p8affineinv_epi64_epi8)
557	#[inline]
558	#[target_feature(enable = "gfni,avx")]
559	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
560	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
561	#[rustc_legacy_const_generics(`2`)]
562	pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
563	static_assert_uimm_bits!(B, `8`);
564	let b: u8 = B as u8;
565	let x: i8x32 = x.as_i8x32();
566	let a: i8x32 = a.as_i8x32();
567	let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b);
568	transmute(src:r)
569	}
570
571	/// Performs an affine transformation on the inverted packed bytes in x.
572	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
573	/// and b being a constant 8-bit immediate value.
574	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
575	/// The inverse of 0 is 0.
576	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
577	///
578	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
579	/// Otherwise the computation result is written into the result.
580	///
581	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
582	#[inline]
583	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
584	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
585	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
586	#[rustc_legacy_const_generics(`3`)]
587	pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
588	k: __mmask32,
589	x: __m256i,
590	a: __m256i,
591	) -> __m256i {
592	static_assert_uimm_bits!(B, `8`);
593	let b: u8 = B as u8;
594	let zero: i8x32 = _mm256_setzero_si256().as_i8x32();
595	let x: i8x32 = x.as_i8x32();
596	let a: i8x32 = a.as_i8x32();
597	let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b);
598	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
599	}
600
601	/// Performs an affine transformation on the inverted packed bytes in x.
602	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
603	/// and b being a constant 8-bit immediate value.
604	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
605	/// The inverse of 0 is 0.
606	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
607	///
608	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
609	/// Otherwise the computation result is written into the result.
610	///
611	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
612	#[inline]
613	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
614	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
615	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
616	#[rustc_legacy_const_generics(`4`)]
617	pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
618	src: __m256i,
619	k: __mmask32,
620	x: __m256i,
621	a: __m256i,
622	) -> __m256i {
623	static_assert_uimm_bits!(B, `8`);
624	let b: u8 = B as u8;
625	let x: i8x32 = x.as_i8x32();
626	let a: i8x32 = a.as_i8x32();
627	let r: i8x32 = vgf2p8affineinvqb_256(x, a, imm8:b);
628	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x32()))
629	}
630
631	/// Performs an affine transformation on the inverted packed bytes in x.
632	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
633	/// and b being a constant 8-bit immediate value.
634	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
635	/// The inverse of 0 is 0.
636	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
637	///
638	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affineinv_epi64_epi8)
639	#[inline]
640	#[target_feature(enable = "gfni")]
641	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
642	#[cfg_attr(test, assert_instr(gf2p8affineinvqb, B = `0`))]
643	#[rustc_legacy_const_generics(`2`)]
644	pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
645	static_assert_uimm_bits!(B, `8`);
646	let b: u8 = B as u8;
647	let x: i8x16 = x.as_i8x16();
648	let a: i8x16 = a.as_i8x16();
649	let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b);
650	transmute(src:r)
651	}
652
653	/// Performs an affine transformation on the inverted packed bytes in x.
654	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
655	/// and b being a constant 8-bit immediate value.
656	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
657	/// The inverse of 0 is 0.
658	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
659	///
660	/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
661	/// Otherwise the computation result is written into the result.
662	///
663	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
664	#[inline]
665	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
666	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
667	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
668	#[rustc_legacy_const_generics(`3`)]
669	pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
670	k: __mmask16,
671	x: __m128i,
672	a: __m128i,
673	) -> __m128i {
674	static_assert_uimm_bits!(B, `8`);
675	let b: u8 = B as u8;
676	let zero: i8x16 = _mm_setzero_si128().as_i8x16();
677	let x: i8x16 = x.as_i8x16();
678	let a: i8x16 = a.as_i8x16();
679	let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b);
680	transmute(src:simd_select_bitmask(m:k, yes:r, no:zero))
681	}
682
683	/// Performs an affine transformation on the inverted packed bytes in x.
684	/// That is computes ainv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix*
685	/// and b being a constant 8-bit immediate value.
686	/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
687	/// The inverse of 0 is 0.
688	/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
689	///
690	/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
691	/// Otherwise the computation result is written into the result.
692	///
693	/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_gf2p8affineinv_epi64_epi8)
694	#[inline]
695	#[target_feature(enable = "gfni,avx512bw,avx512vl")]
696	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
697	#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = `0`))]
698	#[rustc_legacy_const_generics(`4`)]
699	pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
700	src: __m128i,
701	k: __mmask16,
702	x: __m128i,
703	a: __m128i,
704	) -> __m128i {
705	static_assert_uimm_bits!(B, `8`);
706	let b: u8 = B as u8;
707	let x: i8x16 = x.as_i8x16();
708	let a: i8x16 = a.as_i8x16();
709	let r: i8x16 = vgf2p8affineinvqb_128(x, a, imm8:b);
710	transmute(src:simd_select_bitmask(m:k, yes:r, no:src.as_i8x16()))
711	}
712
713	#[cfg(test)]
714	mod tests {
715	// The constants in the tests below are just bit patterns. They should not
716	// be interpreted as integers; signedness does not make sense for them, but
717	// __mXXXi happens to be defined in terms of signed integers.
718	#![allow(overflowing_literals)]
719
720	use core::hint::black_box;
721	use core::intrinsics::size_of;
722	use stdarch_test::simd_test;
723
724	use crate::core_arch::x86::*;
725
/// Scalar reference for multiplying two bytes in GF(2^8), reduced by the
/// polynomial x^8+x^4+x^3+x+1 (0x11b).
fn mulbyte(left: u8, right: u8) -> u8 {
    // this implementation follows the description in
    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_gf2p8mul_epi8
    const REDUCTION_POLYNOMIAL: u16 = 0x11b;
    let multiplier = u16::from(left);
    let multiplicand = u16::from(right);

    // Carryless multiplication: XOR together one shifted copy of `right`
    // for every set bit of `left`.
    let product = (0..8)
        .filter(|&bit| (multiplier >> bit) & 0x01 != 0)
        .fold(0u16, |acc, bit| acc ^ (multiplicand << bit));

    // Reduction: clear bits 14 down to 8, highest first, by adding in a
    // shifted copy of the polynomial (which is congruent to zero in the field).
    let reduced = (8..=14).rev().fold(product, |acc, bit| {
        if (acc >> bit) & 0x01 != 0 {
            acc ^ (REDUCTION_POLYNOMIAL << (bit - 8))
        } else {
            acc
        }
    });

    reduced as u8
}
751
// Sizes of the generated multiplication / affine test vectors, expressed as
// the number of 512-, 256-, 128- and 64-bit words and as a byte count.
const NUM_TEST_WORDS_512: usize = 4;
const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;

// Sizes of the tables that hold one entry per possible byte value, again
// expressed as a byte count and as 128-, 256- and 512-bit word counts.
const NUM_BYTES: usize = 256;
const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;
761
/// Returns 1 when `input` has an odd number of set bits and 0 otherwise,
/// i.e. the XOR of all eight bits.
fn parity(input: u8) -> u8 {
    (input.count_ones() & 0x01) as u8
}
769
770	fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
771	// this implementation follows the description in
772	// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_gf2p8affine_epi64_epi8
773	let mut accumulator = `0`;
774
775	for bit in `0`..`8` {
776	accumulator \|= parity(x & matrix.to_le_bytes()[bit]) << (`7` - bit);
777	}
778
779	accumulator ^ b
780	}
781
782	fn generate_affine_mul_test_data(
783	immediate: u8,
784	) -> (
785	[u64; NUM_TEST_WORDS_64],
786	[u8; NUM_TEST_ENTRIES],
787	[u8; NUM_TEST_ENTRIES],
788	) {
789	let mut left: [u64; NUM_TEST_WORDS_64] = [`0`; NUM_TEST_WORDS_64];
790	let mut right: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
791	let mut result: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
792
793	for i in `0`..NUM_TEST_WORDS_64 {
794	left[i] = (i as u64) * `103` * `101`;
795	for j in `0`..`8` {
796	let j64 = j as u64;
797	right[i * `8` + j] = ((left[i] + j64) % `256`) as u8;
798	result[i * `8` + j] = mat_vec_multiply_affine(left[i], right[i * `8` + j], immediate);
799	}
800	}
801
802	(left, right, result)
803	}
804
805	fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
806	let mut input: [u8; NUM_BYTES] = [`0`; NUM_BYTES];
807	let mut result: [u8; NUM_BYTES] = [`0`; NUM_BYTES];
808
809	for i in `0`..NUM_BYTES {
810	input[i] = (i % `256`) as u8;
811	result[i] = if i == `0` { `0` } else { `1` };
812	}
813
814	(input, result)
815	}
816
817	const AES_S_BOX: [u8; NUM_BYTES] = [
818	`0x63`, `0x7c`, `0x77`, `0x7b`, `0xf2`, `0x6b`, `0x6f`, `0xc5`, `0x30`, `0x01`, `0x67`, `0x2b`, `0xfe`, `0xd7`, `0xab`,
819	`0x76`, `0xca`, `0x82`, `0xc9`, `0x7d`, `0xfa`, `0x59`, `0x47`, `0xf0`, `0xad`, `0xd4`, `0xa2`, `0xaf`, `0x9c`, `0xa4`,
820	`0x72`, `0xc0`, `0xb7`, `0xfd`, `0x93`, `0x26`, `0x36`, `0x3f`, `0xf7`, `0xcc`, `0x34`, `0xa5`, `0xe5`, `0xf1`, `0x71`,
821	`0xd8`, `0x31`, `0x15`, `0x04`, `0xc7`, `0x23`, `0xc3`, `0x18`, `0x96`, `0x05`, `0x9a`, `0x07`, `0x12`, `0x80`, `0xe2`,
822	`0xeb`, `0x27`, `0xb2`, `0x75`, `0x09`, `0x83`, `0x2c`, `0x1a`, `0x1b`, `0x6e`, `0x5a`, `0xa0`, `0x52`, `0x3b`, `0xd6`,
823	`0xb3`, `0x29`, `0xe3`, `0x2f`, `0x84`, `0x53`, `0xd1`, `0x00`, `0xed`, `0x20`, `0xfc`, `0xb1`, `0x5b`, `0x6a`, `0xcb`,
824	`0xbe`, `0x39`, `0x4a`, `0x4c`, `0x58`, `0xcf`, `0xd0`, `0xef`, `0xaa`, `0xfb`, `0x43`, `0x4d`, `0x33`, `0x85`, `0x45`,
825	`0xf9`, `0x02`, `0x7f`, `0x50`, `0x3c`, `0x9f`, `0xa8`, `0x51`, `0xa3`, `0x40`, `0x8f`, `0x92`, `0x9d`, `0x38`, `0xf5`,
826	`0xbc`, `0xb6`, `0xda`, `0x21`, `0x10`, `0xff`, `0xf3`, `0xd2`, `0xcd`, `0x0c`, `0x13`, `0xec`, `0x5f`, `0x97`, `0x44`,
827	`0x17`, `0xc4`, `0xa7`, `0x7e`, `0x3d`, `0x64`, `0x5d`, `0x19`, `0x73`, `0x60`, `0x81`, `0x4f`, `0xdc`, `0x22`, `0x2a`,
828	`0x90`, `0x88`, `0x46`, `0xee`, `0xb8`, `0x14`, `0xde`, `0x5e`, `0x0b`, `0xdb`, `0xe0`, `0x32`, `0x3a`, `0x0a`, `0x49`,
829	`0x06`, `0x24`, `0x5c`, `0xc2`, `0xd3`, `0xac`, `0x62`, `0x91`, `0x95`, `0xe4`, `0x79`, `0xe7`, `0xc8`, `0x37`, `0x6d`,
830	`0x8d`, `0xd5`, `0x4e`, `0xa9`, `0x6c`, `0x56`, `0xf4`, `0xea`, `0x65`, `0x7a`, `0xae`, `0x08`, `0xba`, `0x78`, `0x25`,
831	`0x2e`, `0x1c`, `0xa6`, `0xb4`, `0xc6`, `0xe8`, `0xdd`, `0x74`, `0x1f`, `0x4b`, `0xbd`, `0x8b`, `0x8a`, `0x70`, `0x3e`,
832	`0xb5`, `0x66`, `0x48`, `0x03`, `0xf6`, `0x0e`, `0x61`, `0x35`, `0x57`, `0xb9`, `0x86`, `0xc1`, `0x1d`, `0x9e`, `0xe1`,
833	`0xf8`, `0x98`, `0x11`, `0x69`, `0xd9`, `0x8e`, `0x94`, `0x9b`, `0x1e`, `0x87`, `0xe9`, `0xce`, `0x55`, `0x28`, `0xdf`,
834	`0x8c`, `0xa1`, `0x89`, `0x0d`, `0xbf`, `0xe6`, `0x42`, `0x68`, `0x41`, `0x99`, `0x2d`, `0x0f`, `0xb0`, `0x54`, `0xbb`,
835	`0x16`,
836	];
837
838	fn generate_byte_mul_test_data() -> (
839	[u8; NUM_TEST_ENTRIES],
840	[u8; NUM_TEST_ENTRIES],
841	[u8; NUM_TEST_ENTRIES],
842	) {
843	let mut left: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
844	let mut right: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
845	let mut result: [u8; NUM_TEST_ENTRIES] = [`0`; NUM_TEST_ENTRIES];
846
847	for i in `0`..NUM_TEST_ENTRIES {
848	left[i] = (i % `256`) as u8;
849	right[i] = left[i].wrapping_mul(`101`);
850	result[i] = mulbyte(left[i], right[i]);
851	}
852
853	(left, right, result)
854	}
855
856	#[target_feature(enable = "sse2")]
857	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
858	unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
859	let byte_offset = word_index * `16` / size_of::<T>();
860	let pointer = data.as_ptr().add(byte_offset) as *const __m128i;
861	_mm_loadu_si128(black_box(pointer))
862	}
863
864	#[target_feature(enable = "avx")]
865	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
866	unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
867	let byte_offset = word_index * `32` / size_of::<T>();
868	let pointer = data.as_ptr().add(byte_offset) as *const __m256i;
869	_mm256_loadu_si256(black_box(pointer))
870	}
871
872	#[target_feature(enable = "avx512f")]
873	#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
874	unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
875	let byte_offset = word_index * `64` / size_of::<T>();
876	let pointer = data.as_ptr().add(byte_offset) as *const i32;
877	_mm512_loadu_si512(black_box(pointer))
878	}
879
880	#[simd_test(enable = "gfni,avx512bw")]
881	unsafe fn test_mm512_gf2p8mul_epi8() {
882	let (left, right, expected) = generate_byte_mul_test_data();
883
884	for i in `0`..NUM_TEST_WORDS_512 {
885	let left = load_m512i_word(&left, i);
886	let right = load_m512i_word(&right, i);
887	let expected = load_m512i_word(&expected, i);
888	let result = _mm512_gf2p8mul_epi8(left, right);
889	assert_eq_m512i(result, expected);
890	}
891	}
892
893	#[simd_test(enable = "gfni,avx512bw")]
894	unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
895	let (left, right, _expected) = generate_byte_mul_test_data();
896
897	for i in `0`..NUM_TEST_WORDS_512 {
898	let left = load_m512i_word(&left, i);
899	let right = load_m512i_word(&right, i);
900	let result_zero = _mm512_maskz_gf2p8mul_epi8(`0`, left, right);
901	assert_eq_m512i(result_zero, _mm512_setzero_si512());
902	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
903	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
904	let expected_result = _mm512_gf2p8mul_epi8(left, right);
905	let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
906	let expected_masked =
907	_mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
908	assert_eq_m512i(result_masked, expected_masked);
909	}
910	}
911
912	#[simd_test(enable = "gfni,avx512bw")]
913	unsafe fn test_mm512_mask_gf2p8mul_epi8() {
914	let (left, right, _expected) = generate_byte_mul_test_data();
915
916	for i in `0`..NUM_TEST_WORDS_512 {
917	let left = load_m512i_word(&left, i);
918	let right = load_m512i_word(&right, i);
919	let result_left = _mm512_mask_gf2p8mul_epi8(left, `0`, left, right);
920	assert_eq_m512i(result_left, left);
921	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
922	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
923	let expected_result = _mm512_gf2p8mul_epi8(left, right);
924	let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
925	let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
926	assert_eq_m512i(result_masked, expected_masked);
927	}
928	}
929
930	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
931	unsafe fn test_mm256_gf2p8mul_epi8() {
932	let (left, right, expected) = generate_byte_mul_test_data();
933
934	for i in `0`..NUM_TEST_WORDS_256 {
935	let left = load_m256i_word(&left, i);
936	let right = load_m256i_word(&right, i);
937	let expected = load_m256i_word(&expected, i);
938	let result = _mm256_gf2p8mul_epi8(left, right);
939	assert_eq_m256i(result, expected);
940	}
941	}
942
943	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
944	unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
945	let (left, right, _expected) = generate_byte_mul_test_data();
946
947	for i in `0`..NUM_TEST_WORDS_256 {
948	let left = load_m256i_word(&left, i);
949	let right = load_m256i_word(&right, i);
950	let result_zero = _mm256_maskz_gf2p8mul_epi8(`0`, left, right);
951	assert_eq_m256i(result_zero, _mm256_setzero_si256());
952	let mask_bytes: __mmask32 = `0x0F_F0_FF_00`;
953	const MASK_WORDS: i32 = `0b01_10_11_00`;
954	let expected_result = _mm256_gf2p8mul_epi8(left, right);
955	let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
956	let expected_masked =
957	_mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
958	assert_eq_m256i(result_masked, expected_masked);
959	}
960	}
961
962	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
963	unsafe fn test_mm256_mask_gf2p8mul_epi8() {
964	let (left, right, _expected) = generate_byte_mul_test_data();
965
966	for i in `0`..NUM_TEST_WORDS_256 {
967	let left = load_m256i_word(&left, i);
968	let right = load_m256i_word(&right, i);
969	let result_left = _mm256_mask_gf2p8mul_epi8(left, `0`, left, right);
970	assert_eq_m256i(result_left, left);
971	let mask_bytes: __mmask32 = `0x0F_F0_FF_00`;
972	const MASK_WORDS: i32 = `0b01_10_11_00`;
973	let expected_result = _mm256_gf2p8mul_epi8(left, right);
974	let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
975	let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
976	assert_eq_m256i(result_masked, expected_masked);
977	}
978	}
979
980	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
981	unsafe fn test_mm_gf2p8mul_epi8() {
982	let (left, right, expected) = generate_byte_mul_test_data();
983
984	for i in `0`..NUM_TEST_WORDS_128 {
985	let left = load_m128i_word(&left, i);
986	let right = load_m128i_word(&right, i);
987	let expected = load_m128i_word(&expected, i);
988	let result = _mm_gf2p8mul_epi8(left, right);
989	assert_eq_m128i(result, expected);
990	}
991	}
992
993	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
994	unsafe fn test_mm_maskz_gf2p8mul_epi8() {
995	let (left, right, _expected) = generate_byte_mul_test_data();
996
997	for i in `0`..NUM_TEST_WORDS_128 {
998	let left = load_m128i_word(&left, i);
999	let right = load_m128i_word(&right, i);
1000	let result_zero = _mm_maskz_gf2p8mul_epi8(`0`, left, right);
1001	assert_eq_m128i(result_zero, _mm_setzero_si128());
1002	let mask_bytes: __mmask16 = `0x0F_F0`;
1003	const MASK_WORDS: i32 = `0b01_10`;
1004	let expected_result = _mm_gf2p8mul_epi8(left, right);
1005	let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
1006	let expected_masked =
1007	_mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1008	assert_eq_m128i(result_masked, expected_masked);
1009	}
1010	}
1011
1012	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1013	unsafe fn test_mm_mask_gf2p8mul_epi8() {
1014	let (left, right, _expected) = generate_byte_mul_test_data();
1015
1016	for i in `0`..NUM_TEST_WORDS_128 {
1017	let left = load_m128i_word(&left, i);
1018	let right = load_m128i_word(&right, i);
1019	let result_left = _mm_mask_gf2p8mul_epi8(left, `0`, left, right);
1020	assert_eq_m128i(result_left, left);
1021	let mask_bytes: __mmask16 = `0x0F_F0`;
1022	const MASK_WORDS: i32 = `0b01_10`;
1023	let expected_result = _mm_gf2p8mul_epi8(left, right);
1024	let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
1025	let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1026	assert_eq_m128i(result_masked, expected_masked);
1027	}
1028	}
1029
1030	#[simd_test(enable = "gfni,avx512bw")]
1031	unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
1032	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1033	const IDENTITY_BYTE: i32 = `0`;
1034	let constant: i64 = `0`;
1035	const CONSTANT_BYTE: i32 = `0x63`;
1036	let identity = _mm512_set1_epi64(identity);
1037	let constant = _mm512_set1_epi64(constant);
1038	let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);
1039
1040	let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1041	let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1042
1043	for i in `0`..NUM_TEST_WORDS_512 {
1044	let data = load_m512i_word(&bytes, i);
1045	let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1046	assert_eq_m512i(result, data);
1047	let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1048	assert_eq_m512i(result, constant_reference);
1049	let data = load_m512i_word(&more_bytes, i);
1050	let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1051	assert_eq_m512i(result, data);
1052	let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1053	assert_eq_m512i(result, constant_reference);
1054
1055	let matrix = load_m512i_word(&matrices, i);
1056	let vector = load_m512i_word(&vectors, i);
1057	let reference = load_m512i_word(&references, i);
1058
1059	let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1060	assert_eq_m512i(result, reference);
1061	}
1062	}
1063
1064	#[simd_test(enable = "gfni,avx512bw")]
1065	unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
1066	const CONSTANT_BYTE: i32 = `0x63`;
1067	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1068
1069	for i in `0`..NUM_TEST_WORDS_512 {
1070	let matrix = load_m512i_word(&matrices, i);
1071	let vector = load_m512i_word(&vectors, i);
1072	let result_zero =
1073	_mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1074	assert_eq_m512i(result_zero, _mm512_setzero_si512());
1075	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
1076	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
1077	let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1078	let result_masked =
1079	_mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1080	let expected_masked =
1081	_mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1082	assert_eq_m512i(result_masked, expected_masked);
1083	}
1084	}
1085
1086	#[simd_test(enable = "gfni,avx512bw")]
1087	unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
1088	const CONSTANT_BYTE: i32 = `0x63`;
1089	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1090
1091	for i in `0`..NUM_TEST_WORDS_512 {
1092	let left = load_m512i_word(&vectors, i);
1093	let right = load_m512i_word(&matrices, i);
1094	let result_left =
1095	_mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1096	assert_eq_m512i(result_left, left);
1097	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
1098	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
1099	let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1100	let result_masked =
1101	_mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1102	let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1103	assert_eq_m512i(result_masked, expected_masked);
1104	}
1105	}
1106
1107	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1108	unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
1109	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1110	const IDENTITY_BYTE: i32 = `0`;
1111	let constant: i64 = `0`;
1112	const CONSTANT_BYTE: i32 = `0x63`;
1113	let identity = _mm256_set1_epi64x(identity);
1114	let constant = _mm256_set1_epi64x(constant);
1115	let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);
1116
1117	let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1118	let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1119
1120	for i in `0`..NUM_TEST_WORDS_256 {
1121	let data = load_m256i_word(&bytes, i);
1122	let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1123	assert_eq_m256i(result, data);
1124	let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1125	assert_eq_m256i(result, constant_reference);
1126	let data = load_m256i_word(&more_bytes, i);
1127	let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1128	assert_eq_m256i(result, data);
1129	let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1130	assert_eq_m256i(result, constant_reference);
1131
1132	let matrix = load_m256i_word(&matrices, i);
1133	let vector = load_m256i_word(&vectors, i);
1134	let reference = load_m256i_word(&references, i);
1135
1136	let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1137	assert_eq_m256i(result, reference);
1138	}
1139	}
1140
1141	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1142	unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
1143	const CONSTANT_BYTE: i32 = `0x63`;
1144	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1145
1146	for i in `0`..NUM_TEST_WORDS_256 {
1147	let matrix = load_m256i_word(&matrices, i);
1148	let vector = load_m256i_word(&vectors, i);
1149	let result_zero =
1150	_mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1151	assert_eq_m256i(result_zero, _mm256_setzero_si256());
1152	let mask_bytes: __mmask32 = `0xFF_0F_F0_00`;
1153	const MASK_WORDS: i32 = `0b11_01_10_00`;
1154	let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1155	let result_masked =
1156	_mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1157	let expected_masked =
1158	_mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1159	assert_eq_m256i(result_masked, expected_masked);
1160	}
1161	}
1162
1163	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1164	unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
1165	const CONSTANT_BYTE: i32 = `0x63`;
1166	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1167
1168	for i in `0`..NUM_TEST_WORDS_256 {
1169	let left = load_m256i_word(&vectors, i);
1170	let right = load_m256i_word(&matrices, i);
1171	let result_left =
1172	_mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1173	assert_eq_m256i(result_left, left);
1174	let mask_bytes: __mmask32 = `0xFF_0F_F0_00`;
1175	const MASK_WORDS: i32 = `0b11_01_10_00`;
1176	let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1177	let result_masked =
1178	_mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1179	let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1180	assert_eq_m256i(result_masked, expected_masked);
1181	}
1182	}
1183
1184	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1185	unsafe fn test_mm_gf2p8affine_epi64_epi8() {
1186	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1187	const IDENTITY_BYTE: i32 = `0`;
1188	let constant: i64 = `0`;
1189	const CONSTANT_BYTE: i32 = `0x63`;
1190	let identity = _mm_set1_epi64x(identity);
1191	let constant = _mm_set1_epi64x(constant);
1192	let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);
1193
1194	let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1195	let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1196
1197	for i in `0`..NUM_TEST_WORDS_128 {
1198	let data = load_m128i_word(&bytes, i);
1199	let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1200	assert_eq_m128i(result, data);
1201	let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1202	assert_eq_m128i(result, constant_reference);
1203	let data = load_m128i_word(&more_bytes, i);
1204	let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
1205	assert_eq_m128i(result, data);
1206	let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
1207	assert_eq_m128i(result, constant_reference);
1208
1209	let matrix = load_m128i_word(&matrices, i);
1210	let vector = load_m128i_word(&vectors, i);
1211	let reference = load_m128i_word(&references, i);
1212
1213	let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
1214	assert_eq_m128i(result, reference);
1215	}
1216	}
1217
1218	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1219	unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
1220	const CONSTANT_BYTE: i32 = `0x63`;
1221	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1222
1223	for i in `0`..NUM_TEST_WORDS_128 {
1224	let matrix = load_m128i_word(&matrices, i);
1225	let vector = load_m128i_word(&vectors, i);
1226	let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1227	assert_eq_m128i(result_zero, _mm_setzero_si128());
1228	let mask_bytes: __mmask16 = `0x0F_F0`;
1229	const MASK_WORDS: i32 = `0b01_10`;
1230	let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1231	let result_masked =
1232	_mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1233	let expected_masked =
1234	_mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1235	assert_eq_m128i(result_masked, expected_masked);
1236	}
1237	}
1238
1239	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1240	unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
1241	const CONSTANT_BYTE: i32 = `0x63`;
1242	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1243
1244	for i in `0`..NUM_TEST_WORDS_128 {
1245	let left = load_m128i_word(&vectors, i);
1246	let right = load_m128i_word(&matrices, i);
1247	let result_left =
1248	_mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1249	assert_eq_m128i(result_left, left);
1250	let mask_bytes: __mmask16 = `0x0F_F0`;
1251	const MASK_WORDS: i32 = `0b01_10`;
1252	let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
1253	let result_masked =
1254	_mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1255	let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1256	assert_eq_m128i(result_masked, expected_masked);
1257	}
1258	}
1259
1260	#[simd_test(enable = "gfni,avx512bw")]
1261	unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
1262	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1263	const IDENTITY_BYTE: i32 = `0`;
1264	const CONSTANT_BYTE: i32 = `0x63`;
1265	let identity = _mm512_set1_epi64(identity);
1266
1267	// validate inversion
1268	let (inputs, results) = generate_inv_tests_data();
1269
1270	for i in `0`..NUM_BYTES_WORDS_512 {
1271	let input = load_m512i_word(&inputs, i);
1272	let reference = load_m512i_word(&results, i);
1273	let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1274	let remultiplied = _mm512_gf2p8mul_epi8(result, input);
1275	assert_eq_m512i(remultiplied, reference);
1276	}
1277
1278	// validate subsequent affine operation
1279	let (matrices, vectors, _affine_expected) =
1280	generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1281
1282	for i in `0`..NUM_TEST_WORDS_512 {
1283	let vector = load_m512i_word(&vectors, i);
1284	let matrix = load_m512i_word(&matrices, i);
1285
1286	let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1287	let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1288	let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1289	assert_eq_m512i(result, reference);
1290	}
1291
1292	// validate everything by virtue of checking against the AES SBox
1293	const AES_S_BOX_MATRIX: i64 = `0xF1_E3_C7_8F_1F_3E_7C_F8`;
1294	let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);
1295
1296	for i in `0`..NUM_BYTES_WORDS_512 {
1297	let reference = load_m512i_word(&AES_S_BOX, i);
1298	let input = load_m512i_word(&inputs, i);
1299	let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1300	assert_eq_m512i(result, reference);
1301	}
1302	}
1303
1304	#[simd_test(enable = "gfni,avx512bw")]
1305	unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
1306	const CONSTANT_BYTE: i32 = `0x63`;
1307	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1308
1309	for i in `0`..NUM_TEST_WORDS_512 {
1310	let matrix = load_m512i_word(&matrices, i);
1311	let vector = load_m512i_word(&vectors, i);
1312	let result_zero =
1313	_mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1314	assert_eq_m512i(result_zero, _mm512_setzero_si512());
1315	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
1316	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
1317	let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1318	let result_masked =
1319	_mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1320	let expected_masked =
1321	_mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1322	assert_eq_m512i(result_masked, expected_masked);
1323	}
1324	}
1325
1326	#[simd_test(enable = "gfni,avx512bw")]
1327	unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
1328	const CONSTANT_BYTE: i32 = `0x63`;
1329	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1330
1331	for i in `0`..NUM_TEST_WORDS_512 {
1332	let left = load_m512i_word(&vectors, i);
1333	let right = load_m512i_word(&matrices, i);
1334	let result_left =
1335	_mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1336	assert_eq_m512i(result_left, left);
1337	let mask_bytes: __mmask64 = `0x0F_0F_0F_0F_FF_FF_00_00`;
1338	let mask_words: __mmask16 = `0b01_01_01_01_11_11_00_00`;
1339	let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1340	let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1341	left, mask_bytes, left, right,
1342	);
1343	let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1344	assert_eq_m512i(result_masked, expected_masked);
1345	}
1346	}
1347
1348	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1349	unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
1350	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1351	const IDENTITY_BYTE: i32 = `0`;
1352	const CONSTANT_BYTE: i32 = `0x63`;
1353	let identity = _mm256_set1_epi64x(identity);
1354
1355	// validate inversion
1356	let (inputs, results) = generate_inv_tests_data();
1357
1358	for i in `0`..NUM_BYTES_WORDS_256 {
1359	let input = load_m256i_word(&inputs, i);
1360	let reference = load_m256i_word(&results, i);
1361	let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1362	let remultiplied = _mm256_gf2p8mul_epi8(result, input);
1363	assert_eq_m256i(remultiplied, reference);
1364	}
1365
1366	// validate subsequent affine operation
1367	let (matrices, vectors, _affine_expected) =
1368	generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1369
1370	for i in `0`..NUM_TEST_WORDS_256 {
1371	let vector = load_m256i_word(&vectors, i);
1372	let matrix = load_m256i_word(&matrices, i);
1373
1374	let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1375	let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1376	let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1377	assert_eq_m256i(result, reference);
1378	}
1379
1380	// validate everything by virtue of checking against the AES SBox
1381	const AES_S_BOX_MATRIX: i64 = `0xF1_E3_C7_8F_1F_3E_7C_F8`;
1382	let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);
1383
1384	for i in `0`..NUM_BYTES_WORDS_256 {
1385	let reference = load_m256i_word(&AES_S_BOX, i);
1386	let input = load_m256i_word(&inputs, i);
1387	let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1388	assert_eq_m256i(result, reference);
1389	}
1390	}
1391
1392	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1393	unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
1394	const CONSTANT_BYTE: i32 = `0x63`;
1395	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1396
1397	for i in `0`..NUM_TEST_WORDS_256 {
1398	let matrix = load_m256i_word(&matrices, i);
1399	let vector = load_m256i_word(&vectors, i);
1400	let result_zero =
1401	_mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1402	assert_eq_m256i(result_zero, _mm256_setzero_si256());
1403	let mask_bytes: __mmask32 = `0xFF_0F_F0_00`;
1404	const MASK_WORDS: i32 = `0b11_01_10_00`;
1405	let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1406	let result_masked =
1407	_mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1408	let expected_masked =
1409	_mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
1410	assert_eq_m256i(result_masked, expected_masked);
1411	}
1412	}
1413
1414	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1415	unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
1416	const CONSTANT_BYTE: i32 = `0x63`;
1417	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1418
1419	for i in `0`..NUM_TEST_WORDS_256 {
1420	let left = load_m256i_word(&vectors, i);
1421	let right = load_m256i_word(&matrices, i);
1422	let result_left =
1423	_mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1424	assert_eq_m256i(result_left, left);
1425	let mask_bytes: __mmask32 = `0xFF_0F_F0_00`;
1426	const MASK_WORDS: i32 = `0b11_01_10_00`;
1427	let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1428	let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
1429	left, mask_bytes, left, right,
1430	);
1431	let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
1432	assert_eq_m256i(result_masked, expected_masked);
1433	}
1434	}
1435
1436	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1437	unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
1438	let identity: i64 = `0x01_02_04_08_10_20_40_80`;
1439	const IDENTITY_BYTE: i32 = `0`;
1440	const CONSTANT_BYTE: i32 = `0x63`;
1441	let identity = _mm_set1_epi64x(identity);
1442
1443	// validate inversion
1444	let (inputs, results) = generate_inv_tests_data();
1445
1446	for i in `0`..NUM_BYTES_WORDS_128 {
1447	let input = load_m128i_word(&inputs, i);
1448	let reference = load_m128i_word(&results, i);
1449	let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
1450	let remultiplied = _mm_gf2p8mul_epi8(result, input);
1451	assert_eq_m128i(remultiplied, reference);
1452	}
1453
1454	// validate subsequent affine operation
1455	let (matrices, vectors, _affine_expected) =
1456	generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1457
1458	for i in `0`..NUM_TEST_WORDS_128 {
1459	let vector = load_m128i_word(&vectors, i);
1460	let matrix = load_m128i_word(&matrices, i);
1461
1462	let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
1463	let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
1464	let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1465	assert_eq_m128i(result, reference);
1466	}
1467
1468	// validate everything by virtue of checking against the AES SBox
1469	const AES_S_BOX_MATRIX: i64 = `0xF1_E3_C7_8F_1F_3E_7C_F8`;
1470	let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);
1471
1472	for i in `0`..NUM_BYTES_WORDS_128 {
1473	let reference = load_m128i_word(&AES_S_BOX, i);
1474	let input = load_m128i_word(&inputs, i);
1475	let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
1476	assert_eq_m128i(result, reference);
1477	}
1478	}
1479
1480	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1481	unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
1482	const CONSTANT_BYTE: i32 = `0x63`;
1483	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1484
1485	for i in `0`..NUM_TEST_WORDS_128 {
1486	let matrix = load_m128i_word(&matrices, i);
1487	let vector = load_m128i_word(&vectors, i);
1488	let result_zero =
1489	_mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(`0`, vector, matrix);
1490	assert_eq_m128i(result_zero, _mm_setzero_si128());
1491	let mask_bytes: __mmask16 = `0x0F_F0`;
1492	const MASK_WORDS: i32 = `0b01_10`;
1493	let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
1494	let result_masked =
1495	_mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1496	let expected_masked =
1497	_mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
1498	assert_eq_m128i(result_masked, expected_masked);
1499	}
1500	}
1501
1502	#[simd_test(enable = "gfni,avx512bw,avx512vl")]
1503	unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
1504	const CONSTANT_BYTE: i32 = `0x63`;
1505	let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1506
1507	for i in `0`..NUM_TEST_WORDS_128 {
1508	let left = load_m128i_word(&vectors, i);
1509	let right = load_m128i_word(&matrices, i);
1510	let result_left =
1511	_mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, `0`, left, right);
1512	assert_eq_m128i(result_left, left);
1513	let mask_bytes: __mmask16 = `0x0F_F0`;
1514	const MASK_WORDS: i32 = `0b01_10`;
1515	let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
1516	let result_masked =
1517	_mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1518	let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
1519	assert_eq_m128i(result_masked, expected_masked);
1520	}
1521	}
1522	}
1523