1 | //! Vectorized Carry-less Multiplication (VCLMUL) |
2 | //! |
3 | //! The reference is [Intel 64 and IA-32 Architectures Software Developer's |
4 | //! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). |
5 | //! |
6 | //! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf |
7 | |
8 | use crate::core_arch::x86::__m256i; |
9 | use crate::core_arch::x86::__m512i; |
10 | |
11 | #[cfg (test)] |
12 | use stdarch_test::assert_instr; |
13 | |
// Raw LLVM intrinsics backing the widened carry-less multiply.
// The second operand is named `round_key` only by convention (the declaration
// mirrors the AES intrinsic signatures); it is an ordinary polynomial operand.
#[allow(improper_ctypes)]
extern "C" {
    // 256-bit form: one VPCLMULQDQ across 2 x 128-bit lanes.
    #[link_name = "llvm.x86.pclmulqdq.256"]
    fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
    // 512-bit form: one VPCLMULQDQ across 4 x 128-bit lanes.
    #[link_name = "llvm.x86.pclmulqdq.512"]
    fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i;
}
21 | |
22 | // for some odd reason on x86_64 we generate the correct long name instructions |
23 | // but on i686 we generate the short name + imm8 |
24 | // so we need to special-case on that... |
25 | |
26 | /// Performs a carry-less multiplication of two 64-bit polynomials over the |
27 | /// finite field GF(2) - in each of the 4 128-bit lanes. |
28 | /// |
29 | /// The immediate byte is used for determining which halves of each lane `a` and `b` |
30 | /// should be used. Immediate bits other than 0 and 4 are ignored. |
31 | /// All lanes share immediate byte. |
32 | /// |
33 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_clmulepi64_epi128) |
34 | #[inline ] |
35 | #[target_feature (enable = "vpclmulqdq,avx512f" )] |
36 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
37 | // technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise |
38 | #[cfg_attr (test, assert_instr(vpclmul, IMM8 = 0))] |
39 | #[rustc_legacy_const_generics (2)] |
40 | pub unsafe fn _mm512_clmulepi64_epi128<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i { |
41 | static_assert_uimm_bits!(IMM8, 8); |
42 | pclmulqdq_512(a, round_key:b, IMM8 as u8) |
43 | } |
44 | |
45 | /// Performs a carry-less multiplication of two 64-bit polynomials over the |
46 | /// finite field GF(2) - in each of the 2 128-bit lanes. |
47 | /// |
48 | /// The immediate byte is used for determining which halves of each lane `a` and `b` |
49 | /// should be used. Immediate bits other than 0 and 4 are ignored. |
50 | /// All lanes share immediate byte. |
51 | /// |
52 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_clmulepi64_epi128) |
53 | #[inline ] |
54 | #[target_feature (enable = "vpclmulqdq" )] |
55 | #[unstable (feature = "stdarch_x86_avx512" , issue = "111137" )] |
56 | #[cfg_attr (test, assert_instr(vpclmul, IMM8 = 0))] |
57 | #[rustc_legacy_const_generics (2)] |
58 | pub unsafe fn _mm256_clmulepi64_epi128<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i { |
59 | static_assert_uimm_bits!(IMM8, 8); |
60 | pclmulqdq_256(a, round_key:b, IMM8 as u8) |
61 | } |
62 | |
#[cfg(test)]
mod tests {
    // The constants in the tests below are just bit patterns. They should not
    // be interpreted as integers; signedness does not make sense for them, but
    // __mXXXi happens to be defined in terms of signed integers.
    #![allow(overflowing_literals)]

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    // Known-answer test for a widened clmul intrinsic. `$broadcast` splats a
    // __m128i into every 128-bit lane of the widened type, `$clmul` is the
    // intrinsic under test, and `$assert` compares vectors of the widened
    // width — so each expected 128-bit product is checked in all lanes at once.
    macro_rules! verify_kat_pclmul {
        ($broadcast:ident, $clmul:ident, $assert:ident) => {
            // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
            let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
            let a = $broadcast(a);
            let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
            let b = $broadcast(b);
            // rXY = expected product when the immediate selects half Y of `a`
            // and half X of `b` (the four possible half-selector immediates).
            let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
            let r00 = $broadcast(r00);
            let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
            let r01 = $broadcast(r01);
            let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
            let r10 = $broadcast(r10);
            let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
            let r11 = $broadcast(r11);

            $assert($clmul::<0x00>(a, b), r00);
            $assert($clmul::<0x10>(a, b), r01);
            $assert($clmul::<0x01>(a, b), r10);
            $assert($clmul::<0x11>(a, b), r11);

            // Extra sanity check: carry-less squaring of x^63 yields x^126,
            // i.e. bit 62 of the upper qword, with no carry into other bits.
            let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
            let a0 = $broadcast(a0);
            let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
            let r = $broadcast(r);
            $assert($clmul::<0x00>(a0, a0), r);
        }
    }

    // Expands per-lane extract/assert statements with const-generic lane
    // indices into straight-line code — a runtime `for` loop cannot drive a
    // const generic. The `[4]` arms peel indices 3 and 2, then recurse into
    // the matching `[2]` arm for indices 1 and 0.
    macro_rules! unroll {
        ($target:ident[4] = $op:ident::<4>($source:ident);) => {
            $target[3] = $op::<3>($source);
            $target[2] = $op::<2>($source);
            unroll! {$target[2] = $op::<2>($source);}
        };
        ($target:ident[2] = $op:ident::<2>($source:ident);) => {
            $target[1] = $op::<1>($source);
            $target[0] = $op::<0>($source);
        };
        (assert_eq_m128i($op:ident::<4>($vec_res:ident),$lin_res:ident[4]);) => {
            assert_eq_m128i($op::<3>($vec_res), $lin_res[3]);
            assert_eq_m128i($op::<2>($vec_res), $lin_res[2]);
            unroll! {assert_eq_m128i($op::<2>($vec_res),$lin_res[2]);}
        };
        (assert_eq_m128i($op:ident::<2>($vec_res:ident),$lin_res:ident[2]);) => {
            assert_eq_m128i($op::<1>($vec_res), $lin_res[1]);
            assert_eq_m128i($op::<0>($vec_res), $lin_res[0]);
        };
    }

    // this function tests one of the possible 4 instances
    // with different inputs across lanes
    //
    // Decomposes the 512-bit operands into their four 128-bit lanes, applies
    // the 128-bit `linear` reference per lane, and checks each lane of the
    // `vectorized` result against it.
    #[target_feature(enable = "vpclmulqdq,avx512f")]
    unsafe fn verify_512_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
    ) {
        // Arbitrary bit patterns, distinct per lane so lane mix-ups are caught.
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Split both operands into their 128-bit lanes.
        let mut a_decomp = [_mm_setzero_si128(); 4];
        unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);}
        let mut b_decomp = [_mm_setzero_si128(); 4];
        unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);}

        let r = vectorized(a, b);
        // Expected result: the 128-bit reference applied lane by lane.
        let mut e_decomp = [_mm_setzero_si128(); 4];
        for i in 0..4 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);}
    }

    // this function tests one of the possible 4 instances
    // with different inputs across lanes for the VL version
    //
    // Same scheme as `verify_512_helper`, but only the low two 128-bit lanes
    // are used: the 256-bit intrinsic is fed the low 256 bits of `a`/`b`.
    #[target_feature(enable = "vpclmulqdq,avx512vl")]
    unsafe fn verify_256_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
    ) {
        // Arbitrary bit patterns, distinct per lane so lane mix-ups are caught.
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Low two 128-bit lanes of each operand.
        let mut a_decomp = [_mm_setzero_si128(); 2];
        unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);}
        let mut b_decomp = [_mm_setzero_si128(); 2];
        unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);}

        // Run the 256-bit intrinsic on the low halves of the 512-bit inputs.
        let r = vectorized(
            _mm512_extracti64x4_epi64::<0>(a),
            _mm512_extracti64x4_epi64::<0>(b),
        );
        let mut e_decomp = [_mm_setzero_si128(); 2];
        for i in 0..2 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);}
    }

    #[simd_test(enable = "vpclmulqdq,avx512f")]
    unsafe fn test_mm512_clmulepi64_epi128() {
        // Known-answer vectors, splatted across all 4 lanes.
        verify_kat_pclmul!(
            _mm512_broadcast_i32x4,
            _mm512_clmulepi64_epi128,
            assert_eq_m512i
        );

        // Per-lane agreement with the 128-bit intrinsic for all four
        // half-selector immediates.
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
            |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b),
        );
    }

    #[simd_test(enable = "vpclmulqdq,avx512vl")]
    unsafe fn test_mm256_clmulepi64_epi128() {
        // Known-answer vectors, splatted across both lanes.
        verify_kat_pclmul!(
            _mm256_broadcastsi128_si256,
            _mm256_clmulepi64_epi128,
            assert_eq_m256i
        );

        // Per-lane agreement with the 128-bit intrinsic for all four
        // half-selector immediates.
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x00>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x01>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x10>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128::<0x11>(a, b),
            |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b),
        );
    }
}
261 | |