1 | //! Vectorized Carry-less Multiplication (VCLMUL) |
2 | //! |
3 | //! The reference is [Intel 64 and IA-32 Architectures Software Developer's |
4 | //! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). |
5 | //! |
6 | //! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf |
7 | |
8 | use crate::core_arch::x86::__m256i; |
9 | use crate::core_arch::x86::__m512i; |
10 | |
11 | #[cfg (test)] |
12 | use stdarch_test::assert_instr; |
13 | |
// Raw LLVM intrinsics backing the widened carry-less multiply.
// The second operand is named `round_key` only by convention (the declaration
// mirrors the AES intrinsic signatures); it is an ordinary polynomial operand.
#[allow(improper_ctypes)]
extern "C" {
    // 256-bit form: one VPCLMULQDQ across 2 x 128-bit lanes.
    #[link_name = "llvm.x86.pclmulqdq.256"]
    fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
    // 512-bit form: one VPCLMULQDQ across 4 x 128-bit lanes.
    #[link_name = "llvm.x86.pclmulqdq.512"]
    fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i;
}
21 | |
22 | // for some odd reason on x86_64 we generate the correct long name instructions |
23 | // but on i686 we generate the short name + imm8 |
24 | // so we need to special-case on that... |
25 | |
26 | /// Performs a carry-less multiplication of two 64-bit polynomials over the |
27 | /// finite field GF(2) - in each of the 4 128-bit lanes. |
28 | /// |
29 | /// The immediate byte is used for determining which halves of each lane `a` and `b` |
30 | /// should be used. Immediate bits other than 0 and 4 are ignored. |
31 | /// All lanes share immediate byte. |
32 | /// |
33 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_clmulepi64_epi128) |
34 | #[inline ] |
35 | #[target_feature (enable = "vpclmulqdq,avx512f" )] |
36 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
37 | // technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise |
38 | #[cfg_attr (test, assert_instr(vpclmul, IMM8 = 0))] |
39 | #[rustc_legacy_const_generics (2)] |
40 | pub unsafe fn _mm512_clmulepi64_epi128<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i { |
41 | static_assert_uimm_bits!(IMM8, 8); |
42 | pclmulqdq_512(a, round_key:b, IMM8 as u8) |
43 | } |
44 | |
45 | /// Performs a carry-less multiplication of two 64-bit polynomials over the |
46 | /// finite field GF(2) - in each of the 2 128-bit lanes. |
47 | /// |
48 | /// The immediate byte is used for determining which halves of each lane `a` and `b` |
49 | /// should be used. Immediate bits other than 0 and 4 are ignored. |
50 | /// All lanes share immediate byte. |
51 | /// |
52 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_clmulepi64_epi128) |
53 | #[inline ] |
54 | #[target_feature (enable = "vpclmulqdq" )] |
55 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
56 | #[cfg_attr (test, assert_instr(vpclmul, IMM8 = 0))] |
57 | #[rustc_legacy_const_generics (2)] |
58 | pub unsafe fn _mm256_clmulepi64_epi128<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i { |
59 | static_assert_uimm_bits!(IMM8, 8); |
60 | pclmulqdq_256(a, round_key:b, IMM8 as u8) |
61 | } |
62 | |
#[cfg(test)]
mod tests {
    // The constants in the tests below are just bit patterns. They should not
    // be interpreted as integers; signedness does not make sense for them, but
    // __mXXXi happens to be defined in terms of signed integers.
    #![allow(overflowing_literals)]

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    // Known-answer test for a widened clmul intrinsic. `$broadcast` splats a
    // __m128i into every 128-bit lane of the widened type, `$clmul` is the
    // intrinsic under test, and `$assert` compares vectors of the widened
    // width — so each expected 128-bit product is checked in all lanes at once.
    macro_rules! verify_kat_pclmul {
        ($broadcast:ident, $clmul:ident, $assert:ident) => {
            // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
            let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
            let a = $broadcast(a);
            let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
            let b = $broadcast(b);
            // rXY = expected product when the immediate selects half Y of `a`
            // and half X of `b` (the four possible half-selector immediates).
            let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
            let r00 = $broadcast(r00);
            let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
            let r01 = $broadcast(r01);
            let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
            let r10 = $broadcast(r10);
            let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
            let r11 = $broadcast(r11);

            $assert($clmul::<0x00>(a, b), r00);
            $assert($clmul::<0x10>(a, b), r01);
            $assert($clmul::<0x01>(a, b), r10);
            $assert($clmul::<0x11>(a, b), r11);

            // Extra sanity check: carry-less squaring of x^63 yields x^126,
            // i.e. bit 62 of the upper qword, with no carry into other bits.
            let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
            let a0 = $broadcast(a0);
            let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
            let r = $broadcast(r);
            $assert($clmul::<0x00>(a0, a0), r);
        }
    }

    // Expands per-lane extract/assert statements with const-generic lane
    // indices into straight-line code — a runtime `for` loop cannot drive a
    // const generic. The `[4]` arms peel indices 3 and 2, then recurse into
    // the matching `[2]` arm for indices 1 and 0.
    macro_rules! unroll {
        ($target:ident[4] = $op:ident::<4>($source:ident);) => {
            $target[3] = $op::<3>($source);
            $target[2] = $op::<2>($source);
            unroll! {$target[2] = $op::<2>($source);}
        };
        ($target:ident[2] = $op:ident::<2>($source:ident);) => {
            $target[1] = $op::<1>($source);
            $target[0] = $op::<0>($source);
        };
        (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => {
            assert_eq_m128i($op::<3>($vec_res), $lin_res[3]);
            assert_eq_m128i($op::<2>($vec_res), $lin_res[2]);
            unroll! {assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);}
        };
        (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => {
            assert_eq_m128i($op::<1>($vec_res), $lin_res[1]);
            assert_eq_m128i($op::<0>($vec_res), $lin_res[0]);
        };
    }

    // this function tests one of the possible 4 instances
    // with different inputs across lanes
    //
    // Decomposes the 512-bit operands into their four 128-bit lanes, applies
    // the 128-bit `linear` reference per lane, and checks each lane of the
    // `vectorized` result against it.
    #[target_feature(enable = "vpclmulqdq,avx512f")]
    unsafe fn verify_512_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
    ) {
        // Arbitrary bit patterns, distinct per lane so lane mix-ups are caught.
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Split both operands into their 128-bit lanes.
        let mut a_decomp = [_mm_setzero_si128(); 4];
        unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);}
        let mut b_decomp = [_mm_setzero_si128(); 4];
        unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);}

        let r = vectorized(a, b);
        // Expected result: the 128-bit reference applied lane by lane.
        let mut e_decomp = [_mm_setzero_si128(); 4];
        for i in 0..4 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);}
    }

    // this function tests one of the possible 4 instances
    // with different inputs across lanes for the VL version
    //
    // Same scheme as `verify_512_helper`, but only the low two 128-bit lanes
    // are used: the 256-bit intrinsic is fed the low 256 bits of `a`/`b`.
    #[target_feature(enable = "vpclmulqdq,avx512vl")]
    unsafe fn verify_256_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
    ) {
        // Arbitrary bit patterns, distinct per lane so lane mix-ups are caught.
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Low two 128-bit lanes of each operand.
        let mut a_decomp = [_mm_setzero_si128(); 2];
        unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);}
        let mut b_decomp = [_mm_setzero_si128(); 2];
        unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);}

        // Run the 256-bit intrinsic on the low halves of the 512-bit inputs.
        let r = vectorized(
            _mm512_extracti64x4_epi64::<0>(a),
            _mm512_extracti64x4_epi64::<0>(b),
        );
        let mut e_decomp = [_mm_setzero_si128(); 2];
        for i in 0..2 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);}
    }

    #[simd_test(enable = "vpclmulqdq,avx512f")]
    unsafe fn test_mm512_clmulepi64_epi128() {
        // Known-answer vectors, splatted across all 4 lanes.
        verify_kat_pclmul!(
            _mm512_broadcast_i32x4,
            _mm512_clmulepi64_epi128,
            assert_eq_m512i
        );

        // Per-lane agreement with the 128-bit intrinsic for all four
        // half-selector immediates.
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b),
        );
    }

    #[simd_test(enable = "vpclmulqdq,avx512vl")]
    unsafe fn test_mm256_clmulepi64_epi128() {
        // Known-answer vectors, splatted across both lanes.
        verify_kat_pclmul!(
            _mm256_broadcastsi128_si256,
            _mm256_clmulepi64_epi128,
            assert_eq_m256i
        );

        // Per-lane agreement with the 128-bit intrinsic for all four
        // half-selector immediates.
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b),
        );
    }
}
261 | |