1//! Vectorized Carry-less Multiplication (VCLMUL)
2//!
3//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
4//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
5//!
6//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
7
8use crate::core_arch::x86::__m256i;
9use crate::core_arch::x86::__m512i;
10
11#[cfg(test)]
12use stdarch_test::assert_instr;
13
14#[allow(improper_ctypes)]
15extern "C" {
16 #[link_name = "llvm.x86.pclmulqdq.256"]
17 fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
18 #[link_name = "llvm.x86.pclmulqdq.512"]
19 fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i;
20}
21
// NOTE: for the same intrinsic, LLVM emits the long-form instruction mnemonic
// on x86_64 but the short mnemonic plus an explicit imm8 operand on i686.
// The `assert_instr` checks in this module therefore match only on the
// `vpclmul` prefix, which both spellings share.
25
26/// Performs a carry-less multiplication of two 64-bit polynomials over the
27/// finite field GF(2) - in each of the 4 128-bit lanes.
28///
29/// The immediate byte is used for determining which halves of each lane `a` and `b`
30/// should be used. Immediate bits other than 0 and 4 are ignored.
31/// All lanes share immediate byte.
32///
33/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_clmulepi64_epi128)
34#[inline]
35#[target_feature(enable = "vpclmulqdq,avx512f")]
36// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
37#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
38#[rustc_legacy_const_generics(2)]
39pub unsafe fn _mm512_clmulepi64_epi128<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
40 static_assert_uimm_bits!(IMM8, 8);
41 pclmulqdq_512(a, round_key:b, IMM8 as u8)
42}
43
44/// Performs a carry-less multiplication of two 64-bit polynomials over the
45/// finite field GF(2) - in each of the 2 128-bit lanes.
46///
47/// The immediate byte is used for determining which halves of each lane `a` and `b`
48/// should be used. Immediate bits other than 0 and 4 are ignored.
49/// All lanes share immediate byte.
50///
51/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_clmulepi64_epi128)
52#[inline]
53#[target_feature(enable = "vpclmulqdq")]
54#[cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))]
55#[rustc_legacy_const_generics(2)]
56pub unsafe fn _mm256_clmulepi64_epi128<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
57 static_assert_uimm_bits!(IMM8, 8);
58 pclmulqdq_256(a, round_key:b, IMM8 as u8)
59}
60
#[cfg(test)]
mod tests {
    // Every hexadecimal constant in this module is a raw bit pattern, not a
    // signed quantity. The __mXXXi constructors merely happen to take signed
    // integers, so literals that overflow them are expected here.
    #![allow(overflowing_literals)]

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    // Known-answer test. One 128-bit test vector is replicated into every
    // lane with `$broadcast`, `$clmul` is run for each of the four half
    // selectors, and the result is compared against the replicated expected
    // product using `$assert`.
    macro_rules! verify_kat_pclmul {
        ($broadcast:ident, $clmul:ident, $assert:ident) => {
            // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
            let lhs = $broadcast(_mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d));
            let rhs = $broadcast(_mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d));
            // Expected products, named after the immediate that yields them.
            let want_00 = $broadcast(_mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451));
            let want_10 = $broadcast(_mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315));
            let want_01 = $broadcast(_mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9));
            let want_11 = $broadcast(_mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed));

            $assert($clmul::<0x00>(lhs, rhs), want_00);
            $assert($clmul::<0x10>(lhs, rhs), want_10);
            $assert($clmul::<0x01>(lhs, rhs), want_01);
            $assert($clmul::<0x11>(lhs, rhs), want_11);

            // Multiplying the low half (only bit 63 set) by itself must set
            // exactly bit 126 of the 128-bit product.
            let top_bit = $broadcast(_mm_set_epi64x(0x0000000000000000, 0x8000000000000000));
            let squared = $broadcast(_mm_set_epi64x(0x4000000000000000, 0x0000000000000000));
            $assert($clmul::<0x00>(top_bit, top_bit), squared);
        }
    }

    // Expands a lane-indexed operation for every lane index, highest first.
    // This is needed because the extract intrinsics take the lane index as a
    // const generic, so it cannot come from a runtime loop variable.
    //
    // Two forms are supported, each for 4 or 2 lanes:
    //   * `target[N] = op::<N>(source);` stores lane `i` of `source` into
    //     `target[i]`;
    //   * `assert_eq_m128i(op::<N>(vec), expected[N]);` asserts that lane `i`
    //     of `vec` equals `expected[i]`.
    macro_rules! unroll {
        ($target:ident[4] = $op:ident::<4>($source:ident);) => {
            $target[3] = $op::<3>($source);
            $target[2] = $op::<2>($source);
            $target[1] = $op::<1>($source);
            $target[0] = $op::<0>($source);
        };
        ($target:ident[2] = $op:ident::<2>($source:ident);) => {
            $target[1] = $op::<1>($source);
            $target[0] = $op::<0>($source);
        };
        (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => {
            assert_eq_m128i($op::<3>($vec_res), $lin_res[3]);
            assert_eq_m128i($op::<2>($vec_res), $lin_res[2]);
            assert_eq_m128i($op::<1>($vec_res), $lin_res[1]);
            assert_eq_m128i($op::<0>($vec_res), $lin_res[0]);
        };
        (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => {
            assert_eq_m128i($op::<1>($vec_res), $lin_res[1]);
            assert_eq_m128i($op::<0>($vec_res), $lin_res[0]);
        };
    }

    // Cross-checks one immediate variant of the 512-bit operation against the
    // 128-bit reference, using inputs that differ in every lane.
    //
    // `linear` is the 128-bit reference operation and `vectorized` is the
    // 512-bit operation under test; each 128-bit lane of the wide result must
    // equal `linear` applied to the matching input lanes.
    #[target_feature(enable = "vpclmulqdq,avx512f")]
    unsafe fn verify_512_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
    ) {
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Split both inputs into their four 128-bit lanes.
        let mut a_lanes = [_mm_setzero_si128(); 4];
        unroll! {a_lanes[4] = _mm512_extracti32x4_epi32::<4>(a);}
        let mut b_lanes = [_mm_setzero_si128(); 4];
        unroll! {b_lanes[4] = _mm512_extracti32x4_epi32::<4>(b);}

        let wide = vectorized(a, b);

        // Reference result: the 128-bit operation applied lane by lane.
        let mut expected = [_mm_setzero_si128(); 4];
        for (slot, (&x, &y)) in expected
            .iter_mut()
            .zip(a_lanes.iter().zip(b_lanes.iter()))
        {
            *slot = linear(x, y);
        }
        unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(wide),expected[4]);}
    }

    // Same cross-check as `verify_512_helper`, for the 256-bit (VL) operation.
    // Only the low 256 bits of the 512-bit constants (lanes 0 and 1) are
    // passed to `vectorized` and compared.
    #[target_feature(enable = "vpclmulqdq,avx512vl")]
    unsafe fn verify_256_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
    ) {
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Lanes 0 and 1 of each input, used for the reference computation.
        let mut a_lanes = [_mm_setzero_si128(); 2];
        unroll! {a_lanes[2] = _mm512_extracti32x4_epi32::<2>(a);}
        let mut b_lanes = [_mm_setzero_si128(); 2];
        unroll! {b_lanes[2] = _mm512_extracti32x4_epi32::<2>(b);}

        let wide = vectorized(
            _mm512_extracti64x4_epi64::<0>(a),
            _mm512_extracti64x4_epi64::<0>(b),
        );

        // Reference result: the 128-bit operation applied lane by lane.
        let mut expected = [_mm_setzero_si128(); 2];
        for (slot, (&x, &y)) in expected
            .iter_mut()
            .zip(a_lanes.iter().zip(b_lanes.iter()))
        {
            *slot = linear(x, y);
        }
        unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(wide),expected[2]);}
    }

    #[simd_test(enable = "vpclmulqdq,avx512f")]
    unsafe fn test_mm512_clmulepi64_epi128() {
        verify_kat_pclmul!(
            _mm512_broadcast_i32x4,
            _mm512_clmulepi64_epi128,
            assert_eq_m512i
        );

        // One cross-check per half selector (bits 0 and 4 of the immediate).
        verify_512_helper(
            |x, y| _mm_clmulepi64_si128::<0x00>(x, y),
            |x, y| _mm512_clmulepi64_epi128::<0x00>(x, y),
        );
        verify_512_helper(
            |x, y| _mm_clmulepi64_si128::<0x01>(x, y),
            |x, y| _mm512_clmulepi64_epi128::<0x01>(x, y),
        );
        verify_512_helper(
            |x, y| _mm_clmulepi64_si128::<0x10>(x, y),
            |x, y| _mm512_clmulepi64_epi128::<0x10>(x, y),
        );
        verify_512_helper(
            |x, y| _mm_clmulepi64_si128::<0x11>(x, y),
            |x, y| _mm512_clmulepi64_epi128::<0x11>(x, y),
        );
    }

    #[simd_test(enable = "vpclmulqdq,avx512vl")]
    unsafe fn test_mm256_clmulepi64_epi128() {
        verify_kat_pclmul!(
            _mm256_broadcastsi128_si256,
            _mm256_clmulepi64_epi128,
            assert_eq_m256i
        );

        // One cross-check per half selector (bits 0 and 4 of the immediate).
        verify_256_helper(
            |x, y| _mm_clmulepi64_si128::<0x00>(x, y),
            |x, y| _mm256_clmulepi64_epi128::<0x00>(x, y),
        );
        verify_256_helper(
            |x, y| _mm_clmulepi64_si128::<0x01>(x, y),
            |x, y| _mm256_clmulepi64_epi128::<0x01>(x, y),
        );
        verify_256_helper(
            |x, y| _mm_clmulepi64_si128::<0x10>(x, y),
            |x, y| _mm256_clmulepi64_epi128::<0x10>(x, y),
        );
        verify_256_helper(
            |x, y| _mm_clmulepi64_si128::<0x11>(x, y),
            |x, y| _mm256_clmulepi64_epi128::<0x11>(x, y),
        );
    }
}
259