| 1 | //! Vectorized Carry-less Multiplication (VCLMUL) | 
| 2 | //! | 
|---|
| 3 | //! The reference is [Intel 64 and IA-32 Architectures Software Developer's | 
|---|
| 4 | //! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241). | 
|---|
| 5 | //! | 
|---|
| 6 | //! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf | 
|---|
| 7 |  | 
|---|
| 8 | use crate::core_arch::x86::__m256i; | 
|---|
| 9 | use crate::core_arch::x86::__m512i; | 
|---|
| 10 |  | 
|---|
| 11 | #[ cfg(test)] | 
|---|
| 12 | use stdarch_test::assert_instr; | 
|---|
| 13 |  | 
|---|
| 14 | #[ allow(improper_ctypes)] | 
|---|
| 15 | unsafe extern "C"{ | 
|---|
| 16 | #[ link_name= "llvm.x86.pclmulqdq.256"] | 
|---|
| 17 | unsafefn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i; | 
|---|
| 18 | #[ link_name= "llvm.x86.pclmulqdq.512"] | 
|---|
| 19 | unsafefn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i; | 
|---|
| 20 | } | 
|---|
| 21 |  | 
|---|
// For some odd reason, on x86_64 we generate the correct long-name instructions,
// but on i686 we generate the short name plus an imm8 operand,
// so we need to special-case that.
|---|
| 25 |  | 
|---|
| 26 | /// Performs a carry-less multiplication of two 64-bit polynomials over the | 
|---|
| 27 | /// finite field GF(2) - in each of the 4 128-bit lanes. | 
|---|
| 28 | /// | 
|---|
| 29 | /// The immediate byte is used for determining which halves of each lane `a` and `b` | 
|---|
| 30 | /// should be used. Immediate bits other than 0 and 4 are ignored. | 
|---|
| 31 | /// All lanes share immediate byte. | 
|---|
| 32 | /// | 
|---|
| 33 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_clmulepi64_epi128) | 
|---|
| 34 | #[ inline] | 
|---|
| 35 | #[ target_feature(enable = "vpclmulqdq,avx512f")] | 
|---|
| 36 | #[ stable(feature = "stdarch_x86_avx512", since = "1.89")] | 
|---|
| 37 | // technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise | 
|---|
| 38 | #[ cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))] | 
|---|
| 39 | #[ rustc_legacy_const_generics(2)] | 
|---|
| 40 | pub fn _mm512_clmulepi64_epi128<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i { | 
|---|
| 41 | static_assert_uimm_bits!(IMM8, 8); | 
|---|
| 42 | unsafe { pclmulqdq_512(a, round_key:b, IMM8 as u8) } | 
|---|
| 43 | } | 
|---|
| 44 |  | 
|---|
| 45 | /// Performs a carry-less multiplication of two 64-bit polynomials over the | 
|---|
| 46 | /// finite field GF(2) - in each of the 2 128-bit lanes. | 
|---|
| 47 | /// | 
|---|
| 48 | /// The immediate byte is used for determining which halves of each lane `a` and `b` | 
|---|
| 49 | /// should be used. Immediate bits other than 0 and 4 are ignored. | 
|---|
| 50 | /// All lanes share immediate byte. | 
|---|
| 51 | /// | 
|---|
| 52 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_clmulepi64_epi128) | 
|---|
| 53 | #[ inline] | 
|---|
| 54 | #[ target_feature(enable = "vpclmulqdq")] | 
|---|
| 55 | #[ stable(feature = "stdarch_x86_avx512", since = "1.89")] | 
|---|
| 56 | #[ cfg_attr(test, assert_instr(vpclmul, IMM8 = 0))] | 
|---|
| 57 | #[ rustc_legacy_const_generics(2)] | 
|---|
| 58 | pub fn _mm256_clmulepi64_epi128<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i { | 
|---|
| 59 | static_assert_uimm_bits!(IMM8, 8); | 
|---|
| 60 | unsafe { pclmulqdq_256(a, round_key:b, IMM8 as u8) } | 
|---|
| 61 | } | 
|---|
| 62 |  | 
|---|
| 63 | #[ cfg(test)] | 
|---|
| 64 | mod tests { | 
|---|
| 65 | // The constants in the tests below are just bit patterns. They should not | 
|---|
| 66 | // be interpreted as integers; signedness does not make sense for them, but | 
|---|
| 67 | // __mXXXi happens to be defined in terms of signed integers. | 
|---|
| 68 | #![ allow(overflowing_literals)] | 
|---|
| 69 |  | 
|---|
| 70 | use stdarch_test::simd_test; | 
|---|
| 71 |  | 
|---|
| 72 | use crate::core_arch::x86::*; | 
|---|
| 73 |  | 
|---|
// Known-answer test shared by every vector width.
//
// `$broadcast` replicates a 128-bit value into the wider vector type,
// `$clmul` is the intrinsic under test and `$assert` is the matching
// vector equality assertion.
macro_rules! verify_kat_pclmul {
    ($broadcast:ident, $clmul:ident, $assert:ident) => {
        // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
        let lhs = $broadcast(_mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d));
        let rhs = $broadcast(_mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d));
        let expected_00 = $broadcast(_mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451));
        let expected_01 = $broadcast(_mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315));
        let expected_10 = $broadcast(_mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9));
        let expected_11 = $broadcast(_mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed));

        // One check per combination of the two selector bits of the immediate.
        $assert($clmul::<0x00>(lhs, rhs), expected_00);
        $assert($clmul::<0x10>(lhs, rhs), expected_01);
        $assert($clmul::<0x01>(lhs, rhs), expected_10);
        $assert($clmul::<0x11>(lhs, rhs), expected_11);

        // Multiplying the low half (only bit 63 set) by itself must give a
        // product with only bit 126 set, i.e. bit 62 of the upper half.
        let top_bit = $broadcast(_mm_set_epi64x(0x0000000000000000, 0x8000000000000000));
        let top_bit_squared = $broadcast(_mm_set_epi64x(0x4000000000000000, 0x0000000000000000));
        $assert($clmul::<0x00>(top_bit, top_bit), top_bit_squared);
    };
}
|---|
| 102 |  | 
|---|
// Expands an operation that takes a const-generic index into one statement
// per index, counting down to zero. This stands in for a loop, since the
// index has to be a compile-time constant.
macro_rules! unroll {
    // `dst[i] = op::<i>(src)` for i = 3, 2, 1, 0.
    ($dst:ident[4] = $func:ident::<4>($src:ident);) => {
        $dst[3] = $func::<3>($src);
        $dst[2] = $func::<2>($src);
        $dst[1] = $func::<1>($src);
        $dst[0] = $func::<0>($src);
    };
    // `dst[i] = op::<i>(src)` for i = 1, 0.
    ($dst:ident[2] = $func:ident::<2>($src:ident);) => {
        $dst[1] = $func::<1>($src);
        $dst[0] = $func::<0>($src);
    };
    // `assert_eq_m128i(op::<i>(actual), expected[i])` for i = 3, 2, 1, 0.
    (assert_eq_m128i($func:ident::<4>($actual:ident),$expected:ident[4]);) => {
        assert_eq_m128i($func::<3>($actual), $expected[3]);
        assert_eq_m128i($func::<2>($actual), $expected[2]);
        assert_eq_m128i($func::<1>($actual), $expected[1]);
        assert_eq_m128i($func::<0>($actual), $expected[0]);
    };
    // `assert_eq_m128i(op::<i>(actual), expected[i])` for i = 1, 0.
    (assert_eq_m128i($func:ident::<2>($actual:ident),$expected:ident[2]);) => {
        assert_eq_m128i($func::<1>($actual), $expected[1]);
        assert_eq_m128i($func::<0>($actual), $expected[0]);
    };
}
|---|
| 123 |  | 
|---|
| 124 | // this function tests one of the possible 4 instances | 
|---|
| 125 | // with different inputs across lanes | 
|---|
| 126 | #[ target_feature(enable = "vpclmulqdq,avx512f")] | 
|---|
| 127 | unsafe fn verify_512_helper( | 
|---|
| 128 | linear: unsafe fn(__m128i, __m128i) -> __m128i, | 
|---|
| 129 | vectorized: unsafe fn(__m512i, __m512i) -> __m512i, | 
|---|
| 130 | ) { | 
|---|
| 131 | let a = _mm512_set_epi64( | 
|---|
| 132 | 0xDCB4DB3657BF0B7D, | 
|---|
| 133 | 0x18DB0601068EDD9F, | 
|---|
| 134 | 0xB76B908233200DC5, | 
|---|
| 135 | 0xE478235FA8E22D5E, | 
|---|
| 136 | 0xAB05CFFA2621154C, | 
|---|
| 137 | 0x1171B47A186174C9, | 
|---|
| 138 | 0x8C6B6C0E7595CEC9, | 
|---|
| 139 | 0xBE3E7D4934E961BD, | 
|---|
| 140 | ); | 
|---|
| 141 | let b = _mm512_set_epi64( | 
|---|
| 142 | 0x672F6F105A94CEA7, | 
|---|
| 143 | 0x8298B8FFCA5F829C, | 
|---|
| 144 | 0xA3927047B3FB61D8, | 
|---|
| 145 | 0x978093862CDE7187, | 
|---|
| 146 | 0xB1927AB22F31D0EC, | 
|---|
| 147 | 0xA9A5DA619BE4D7AF, | 
|---|
| 148 | 0xCA2590F56884FDC6, | 
|---|
| 149 | 0x19BE9F660038BDB5, | 
|---|
| 150 | ); | 
|---|
| 151 |  | 
|---|
| 152 | let mut a_decomp = [_mm_setzero_si128(); 4]; | 
|---|
| 153 | unroll! {a_decomp[4] = _mm512_extracti32x4_epi32::<4>(a);} | 
|---|
| 154 | let mut b_decomp = [_mm_setzero_si128(); 4]; | 
|---|
| 155 | unroll! {b_decomp[4] = _mm512_extracti32x4_epi32::<4>(b);} | 
|---|
| 156 |  | 
|---|
| 157 | let r = vectorized(a, b); | 
|---|
| 158 | let mut e_decomp = [_mm_setzero_si128(); 4]; | 
|---|
| 159 | for i in 0..4 { | 
|---|
| 160 | e_decomp[i] = linear(a_decomp[i], b_decomp[i]); | 
|---|
| 161 | } | 
|---|
| 162 | unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32::<4>(r),e_decomp[4]);} | 
|---|
| 163 | } | 
|---|
| 164 |  | 
|---|
| 165 | // this function tests one of the possible 4 instances | 
|---|
| 166 | // with different inputs across lanes for the VL version | 
|---|
| 167 | #[ target_feature(enable = "vpclmulqdq,avx512vl")] | 
|---|
| 168 | unsafe fn verify_256_helper( | 
|---|
| 169 | linear: unsafe fn(__m128i, __m128i) -> __m128i, | 
|---|
| 170 | vectorized: unsafe fn(__m256i, __m256i) -> __m256i, | 
|---|
| 171 | ) { | 
|---|
| 172 | let a = _mm512_set_epi64( | 
|---|
| 173 | 0xDCB4DB3657BF0B7D, | 
|---|
| 174 | 0x18DB0601068EDD9F, | 
|---|
| 175 | 0xB76B908233200DC5, | 
|---|
| 176 | 0xE478235FA8E22D5E, | 
|---|
| 177 | 0xAB05CFFA2621154C, | 
|---|
| 178 | 0x1171B47A186174C9, | 
|---|
| 179 | 0x8C6B6C0E7595CEC9, | 
|---|
| 180 | 0xBE3E7D4934E961BD, | 
|---|
| 181 | ); | 
|---|
| 182 | let b = _mm512_set_epi64( | 
|---|
| 183 | 0x672F6F105A94CEA7, | 
|---|
| 184 | 0x8298B8FFCA5F829C, | 
|---|
| 185 | 0xA3927047B3FB61D8, | 
|---|
| 186 | 0x978093862CDE7187, | 
|---|
| 187 | 0xB1927AB22F31D0EC, | 
|---|
| 188 | 0xA9A5DA619BE4D7AF, | 
|---|
| 189 | 0xCA2590F56884FDC6, | 
|---|
| 190 | 0x19BE9F660038BDB5, | 
|---|
| 191 | ); | 
|---|
| 192 |  | 
|---|
| 193 | let mut a_decomp = [_mm_setzero_si128(); 2]; | 
|---|
| 194 | unroll! {a_decomp[2] = _mm512_extracti32x4_epi32::<2>(a);} | 
|---|
| 195 | let mut b_decomp = [_mm_setzero_si128(); 2]; | 
|---|
| 196 | unroll! {b_decomp[2] = _mm512_extracti32x4_epi32::<2>(b);} | 
|---|
| 197 |  | 
|---|
| 198 | let r = vectorized( | 
|---|
| 199 | _mm512_extracti64x4_epi64::<0>(a), | 
|---|
| 200 | _mm512_extracti64x4_epi64::<0>(b), | 
|---|
| 201 | ); | 
|---|
| 202 | let mut e_decomp = [_mm_setzero_si128(); 2]; | 
|---|
| 203 | for i in 0..2 { | 
|---|
| 204 | e_decomp[i] = linear(a_decomp[i], b_decomp[i]); | 
|---|
| 205 | } | 
|---|
| 206 | unroll! {assert_eq_m128i(_mm256_extracti128_si256::<2>(r),e_decomp[2]);} | 
|---|
| 207 | } | 
|---|
| 208 |  | 
|---|
| 209 | #[simd_test(enable = "vpclmulqdq,avx512f")] | 
|---|
| 210 | unsafe fn test_mm512_clmulepi64_epi128() { | 
|---|
| 211 | verify_kat_pclmul!( | 
|---|
| 212 | _mm512_broadcast_i32x4, | 
|---|
| 213 | _mm512_clmulepi64_epi128, | 
|---|
| 214 | assert_eq_m512i | 
|---|
| 215 | ); | 
|---|
| 216 |  | 
|---|
| 217 | verify_512_helper( | 
|---|
| 218 | |a, b| _mm_clmulepi64_si128::<0x00>(a, b), | 
|---|
| 219 | |a, b| _mm512_clmulepi64_epi128::<0x00>(a, b), | 
|---|
| 220 | ); | 
|---|
| 221 | verify_512_helper( | 
|---|
| 222 | |a, b| _mm_clmulepi64_si128::<0x01>(a, b), | 
|---|
| 223 | |a, b| _mm512_clmulepi64_epi128::<0x01>(a, b), | 
|---|
| 224 | ); | 
|---|
| 225 | verify_512_helper( | 
|---|
| 226 | |a, b| _mm_clmulepi64_si128::<0x10>(a, b), | 
|---|
| 227 | |a, b| _mm512_clmulepi64_epi128::<0x10>(a, b), | 
|---|
| 228 | ); | 
|---|
| 229 | verify_512_helper( | 
|---|
| 230 | |a, b| _mm_clmulepi64_si128::<0x11>(a, b), | 
|---|
| 231 | |a, b| _mm512_clmulepi64_epi128::<0x11>(a, b), | 
|---|
| 232 | ); | 
|---|
| 233 | } | 
|---|
| 234 |  | 
|---|
| 235 | #[simd_test(enable = "vpclmulqdq,avx512vl")] | 
|---|
| 236 | unsafe fn test_mm256_clmulepi64_epi128() { | 
|---|
| 237 | verify_kat_pclmul!( | 
|---|
| 238 | _mm256_broadcastsi128_si256, | 
|---|
| 239 | _mm256_clmulepi64_epi128, | 
|---|
| 240 | assert_eq_m256i | 
|---|
| 241 | ); | 
|---|
| 242 |  | 
|---|
| 243 | verify_256_helper( | 
|---|
| 244 | |a, b| _mm_clmulepi64_si128::<0x00>(a, b), | 
|---|
| 245 | |a, b| _mm256_clmulepi64_epi128::<0x00>(a, b), | 
|---|
| 246 | ); | 
|---|
| 247 | verify_256_helper( | 
|---|
| 248 | |a, b| _mm_clmulepi64_si128::<0x01>(a, b), | 
|---|
| 249 | |a, b| _mm256_clmulepi64_epi128::<0x01>(a, b), | 
|---|
| 250 | ); | 
|---|
| 251 | verify_256_helper( | 
|---|
| 252 | |a, b| _mm_clmulepi64_si128::<0x10>(a, b), | 
|---|
| 253 | |a, b| _mm256_clmulepi64_epi128::<0x10>(a, b), | 
|---|
| 254 | ); | 
|---|
| 255 | verify_256_helper( | 
|---|
| 256 | |a, b| _mm_clmulepi64_si128::<0x11>(a, b), | 
|---|
| 257 | |a, b| _mm256_clmulepi64_epi128::<0x11>(a, b), | 
|---|
| 258 | ); | 
|---|
| 259 | } | 
|---|
| 260 | } | 
|---|
| 261 |  | 
|---|