| 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | /* |
| 3 | * POLYVAL library functions |
| 4 | * |
| 5 | * Copyright 2025 Google LLC |
| 6 | */ |
| 7 | |
| 8 | #include <crypto/polyval.h> |
| 9 | #include <linux/export.h> |
| 10 | #include <linux/module.h> |
| 11 | #include <linux/string.h> |
| 12 | #include <linux/unaligned.h> |
| 13 | |
| 14 | /* |
| 15 | * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL |
| 16 | * interprets the message as the coefficients of a polynomial in GF(2^128) and |
| 17 | * evaluates that polynomial at a secret point. POLYVAL has a simple |
| 18 | * mathematical relationship with GHASH, but it uses a better field convention |
| 19 | * which makes it easier and faster to implement. |
| 20 | * |
| 21 | * POLYVAL is not a cryptographic hash function, and it should be used only by |
| 22 | * algorithms that are specifically designed to use it. |
| 23 | * |
| 24 | * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated |
| 25 | * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452) |
| 26 | * |
| 27 | * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2" |
| 28 | * (https://eprint.iacr.org/2021/1441.pdf). |
| 29 | * |
| 30 | * This file provides a library API for POLYVAL. This API can delegate to |
| 31 | * either a generic implementation or an architecture-optimized implementation. |
| 32 | * |
| 33 | * For the generic implementation, we don't use the traditional table approach |
| 34 | * to GF(2^128) multiplication. That approach is not constant-time and requires |
| 35 | * a lot of memory. Instead, we use a different approach which emulates |
| 36 | * carryless multiplication using standard multiplications by spreading the data |
| 37 | * bits apart using "holes". This allows the carries to spill harmlessly. This |
| 38 | * approach is borrowed from BoringSSL, which in turn credits BearSSL's |
| 39 | * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the |
| 40 | * "holes" trick and a presentation by Shay Gueron |
| 41 | * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the |
| 42 | * 256-bit => 128-bit reduction algorithm. |
| 43 | */ |
| 44 | |
| 45 | #ifdef CONFIG_ARCH_SUPPORTS_INT128 |
| 46 | |
| 47 | /* Do a 64 x 64 => 128 bit carryless multiplication. */ |
| 48 | static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi) |
| 49 | { |
| 50 | /* |
| 51 | * With 64-bit multiplicands and one term every 4 bits, there would be |
| 52 | * up to 64 / 4 = 16 one bits per column when each multiplication is |
| 53 | * written out as a series of additions in the schoolbook manner. |
| 54 | * Unfortunately, that doesn't work since the value 16 is 1 too large to |
| 55 | * fit in 4 bits. Carries would sometimes overflow into the next term. |
| 56 | * |
| 57 | * Using one term every 5 bits would work. However, that would cost |
| 58 | * 5 x 5 = 25 multiplications instead of 4 x 4 = 16. |
| 59 | * |
| 60 | * Instead, mask off 4 bits from one multiplicand, giving a max of 15 |
| 61 | * one bits per column. Then handle those 4 bits separately. |
| 62 | */ |
| 63 | u64 a0 = a & 0x1111111111111110; |
| 64 | u64 a1 = a & 0x2222222222222220; |
| 65 | u64 a2 = a & 0x4444444444444440; |
| 66 | u64 a3 = a & 0x8888888888888880; |
| 67 | |
| 68 | u64 b0 = b & 0x1111111111111111; |
| 69 | u64 b1 = b & 0x2222222222222222; |
| 70 | u64 b2 = b & 0x4444444444444444; |
| 71 | u64 b3 = b & 0x8888888888888888; |
| 72 | |
| 73 | /* Multiply the high 60 bits of @a by @b. */ |
| 74 | u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^ |
| 75 | (a2 * (u128)b2) ^ (a3 * (u128)b1); |
| 76 | u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^ |
| 77 | (a2 * (u128)b3) ^ (a3 * (u128)b2); |
| 78 | u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^ |
| 79 | (a2 * (u128)b0) ^ (a3 * (u128)b3); |
| 80 | u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^ |
| 81 | (a2 * (u128)b1) ^ (a3 * (u128)b0); |
| 82 | |
| 83 | /* Multiply the low 4 bits of @a by @b. */ |
| 84 | u64 e0 = -(a & 1) & b; |
| 85 | u64 e1 = -((a >> 1) & 1) & b; |
| 86 | u64 e2 = -((a >> 2) & 1) & b; |
| 87 | u64 e3 = -((a >> 3) & 1) & b; |
| 88 | u64 = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3); |
| 89 | u64 = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61); |
| 90 | |
| 91 | /* Add all the intermediate products together. */ |
| 92 | *out_lo = (((u64)c0) & 0x1111111111111111) ^ |
| 93 | (((u64)c1) & 0x2222222222222222) ^ |
| 94 | (((u64)c2) & 0x4444444444444444) ^ |
| 95 | (((u64)c3) & 0x8888888888888888) ^ extra_lo; |
| 96 | *out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^ |
| 97 | (((u64)(c1 >> 64)) & 0x2222222222222222) ^ |
| 98 | (((u64)(c2 >> 64)) & 0x4444444444444444) ^ |
| 99 | (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi; |
| 100 | } |
| 101 | |
| 102 | #else /* CONFIG_ARCH_SUPPORTS_INT128 */ |
| 103 | |
| 104 | /* Do a 32 x 32 => 64 bit carryless multiplication. */ |
| 105 | static u64 clmul32(u32 a, u32 b) |
| 106 | { |
| 107 | /* |
| 108 | * With 32-bit multiplicands and one term every 4 bits, there are up to |
| 109 | * 32 / 4 = 8 one bits per column when each multiplication is written |
| 110 | * out as a series of additions in the schoolbook manner. The value 8 |
| 111 | * fits in 4 bits, so the carries don't overflow into the next term. |
| 112 | */ |
| 113 | u32 a0 = a & 0x11111111; |
| 114 | u32 a1 = a & 0x22222222; |
| 115 | u32 a2 = a & 0x44444444; |
| 116 | u32 a3 = a & 0x88888888; |
| 117 | |
| 118 | u32 b0 = b & 0x11111111; |
| 119 | u32 b1 = b & 0x22222222; |
| 120 | u32 b2 = b & 0x44444444; |
| 121 | u32 b3 = b & 0x88888888; |
| 122 | |
| 123 | u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^ |
| 124 | (a2 * (u64)b2) ^ (a3 * (u64)b1); |
| 125 | u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^ |
| 126 | (a2 * (u64)b3) ^ (a3 * (u64)b2); |
| 127 | u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^ |
| 128 | (a2 * (u64)b0) ^ (a3 * (u64)b3); |
| 129 | u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^ |
| 130 | (a2 * (u64)b1) ^ (a3 * (u64)b0); |
| 131 | |
| 132 | /* Add all the intermediate products together. */ |
| 133 | return (c0 & 0x1111111111111111) ^ |
| 134 | (c1 & 0x2222222222222222) ^ |
| 135 | (c2 & 0x4444444444444444) ^ |
| 136 | (c3 & 0x8888888888888888); |
| 137 | } |
| 138 | |
| 139 | /* Do a 64 x 64 => 128 bit carryless multiplication. */ |
| 140 | static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi) |
| 141 | { |
| 142 | u32 a_lo = (u32)a; |
| 143 | u32 a_hi = a >> 32; |
| 144 | u32 b_lo = (u32)b; |
| 145 | u32 b_hi = b >> 32; |
| 146 | |
| 147 | /* Karatsuba multiplication */ |
| 148 | u64 lo = clmul32(a_lo, b_lo); |
| 149 | u64 hi = clmul32(a_hi, b_hi); |
| 150 | u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi; |
| 151 | |
| 152 | *out_lo = lo ^ (mi << 32); |
| 153 | *out_hi = hi ^ (mi >> 32); |
| 154 | } |
| 155 | #endif /* !CONFIG_ARCH_SUPPORTS_INT128 */ |
| 156 | |
| 157 | /* Compute @a = @a * @b * x^-128 in the POLYVAL field. */ |
| 158 | static void __maybe_unused |
| 159 | polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b) |
| 160 | { |
| 161 | u64 c0, c1, c2, c3, mi0, mi1; |
| 162 | |
| 163 | /* |
| 164 | * Carryless-multiply @a by @b using Karatsuba multiplication. Store |
| 165 | * the 256-bit product in @c0 (low) through @c3 (high). |
| 166 | */ |
| 167 | clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), out_lo: &c0, out_hi: &c1); |
| 168 | clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), out_lo: &c2, out_hi: &c3); |
| 169 | clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi), |
| 170 | out_lo: &mi0, out_hi: &mi1); |
| 171 | mi0 ^= c0 ^ c2; |
| 172 | mi1 ^= c1 ^ c3; |
| 173 | c1 ^= mi0; |
| 174 | c2 ^= mi1; |
| 175 | |
| 176 | /* |
| 177 | * Cancel out the low 128 bits of the product by adding multiples of |
| 178 | * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each |
| 179 | * of which cancels out 64 bits. Note that we break G(x) into three |
| 180 | * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1. |
| 181 | */ |
| 182 | |
| 183 | /* |
| 184 | * First, add G(x) times c0 as follows: |
| 185 | * |
| 186 | * (c0, c1, c2) = (0, |
| 187 | * c1 + (c0 * (x^63 + x^62 + x^57) mod x^64), |
| 188 | * c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64)) |
| 189 | */ |
| 190 | c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57); |
| 191 | c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7); |
| 192 | |
| 193 | /* |
| 194 | * Second, add G(x) times the new c1: |
| 195 | * |
| 196 | * (c1, c2, c3) = (0, |
| 197 | * c2 + (c1 * (x^63 + x^62 + x^57) mod x^64), |
| 198 | * c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64)) |
| 199 | */ |
| 200 | c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57); |
| 201 | c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7); |
| 202 | |
| 203 | /* Return (c2, c3). This implicitly multiplies by x^-128. */ |
| 204 | a->lo = cpu_to_le64(c2); |
| 205 | a->hi = cpu_to_le64(c3); |
| 206 | } |
| 207 | |
| 208 | static void __maybe_unused |
| 209 | polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key, |
| 210 | const u8 *data, size_t nblocks) |
| 211 | { |
| 212 | do { |
| 213 | acc->lo ^= get_unaligned((__le64 *)data); |
| 214 | acc->hi ^= get_unaligned((__le64 *)(data + 8)); |
| 215 | polyval_mul_generic(a: acc, b: key); |
| 216 | data += POLYVAL_BLOCK_SIZE; |
| 217 | } while (--nblocks); |
| 218 | } |
| 219 | |
| 220 | /* Include the arch-optimized implementation of POLYVAL, if one is available. */ |
| 221 | #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH |
| 222 | #include "polyval.h" /* $(SRCARCH)/polyval.h */ |
| 223 | void polyval_preparekey(struct polyval_key *key, |
| 224 | const u8 raw_key[POLYVAL_BLOCK_SIZE]) |
| 225 | { |
| 226 | polyval_preparekey_arch(key, raw_key); |
| 227 | } |
| 228 | EXPORT_SYMBOL_GPL(polyval_preparekey); |
| 229 | #endif /* Else, polyval_preparekey() is an inline function. */ |
| 230 | |
| 231 | /* |
| 232 | * polyval_mul_generic() and polyval_blocks_generic() take the key as a |
| 233 | * polyval_elem rather than a polyval_key, so that arch-optimized |
| 234 | * implementations with a different key format can use it as a fallback (if they |
| 235 | * have H^1 stored somewhere in their struct). Thus, the following dispatch |
| 236 | * code is needed to pass the appropriate key argument. |
| 237 | */ |
| 238 | |
| 239 | static void polyval_mul(struct polyval_ctx *ctx) |
| 240 | { |
| 241 | #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH |
| 242 | polyval_mul_arch(acc: &ctx->acc, key: ctx->key); |
| 243 | #else |
| 244 | polyval_mul_generic(&ctx->acc, &ctx->key->h); |
| 245 | #endif |
| 246 | } |
| 247 | |
| 248 | static void polyval_blocks(struct polyval_ctx *ctx, |
| 249 | const u8 *data, size_t nblocks) |
| 250 | { |
| 251 | #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH |
| 252 | polyval_blocks_arch(acc: &ctx->acc, key: ctx->key, data, nblocks); |
| 253 | #else |
| 254 | polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks); |
| 255 | #endif |
| 256 | } |
| 257 | |
| 258 | void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len) |
| 259 | { |
| 260 | if (unlikely(ctx->partial)) { |
| 261 | size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial); |
| 262 | |
| 263 | len -= n; |
| 264 | while (n--) |
| 265 | ctx->acc.bytes[ctx->partial++] ^= *data++; |
| 266 | if (ctx->partial < POLYVAL_BLOCK_SIZE) |
| 267 | return; |
| 268 | polyval_mul(ctx); |
| 269 | } |
| 270 | if (len >= POLYVAL_BLOCK_SIZE) { |
| 271 | size_t nblocks = len / POLYVAL_BLOCK_SIZE; |
| 272 | |
| 273 | polyval_blocks(ctx, data, nblocks); |
| 274 | data += len & ~(POLYVAL_BLOCK_SIZE - 1); |
| 275 | len &= POLYVAL_BLOCK_SIZE - 1; |
| 276 | } |
| 277 | for (size_t i = 0; i < len; i++) |
| 278 | ctx->acc.bytes[i] ^= data[i]; |
| 279 | ctx->partial = len; |
| 280 | } |
| 281 | EXPORT_SYMBOL_GPL(polyval_update); |
| 282 | |
| 283 | void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE]) |
| 284 | { |
| 285 | if (unlikely(ctx->partial)) |
| 286 | polyval_mul(ctx); |
| 287 | memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE); |
| 288 | memzero_explicit(s: ctx, count: sizeof(*ctx)); |
| 289 | } |
| 290 | EXPORT_SYMBOL_GPL(polyval_final); |
| 291 | |
| 292 | #ifdef polyval_mod_init_arch |
| 293 | static int __init polyval_mod_init(void) |
| 294 | { |
| 295 | polyval_mod_init_arch(); |
| 296 | return 0; |
| 297 | } |
| 298 | subsys_initcall(polyval_mod_init); |
| 299 | |
| 300 | static void __exit polyval_mod_exit(void) |
| 301 | { |
| 302 | } |
| 303 | module_exit(polyval_mod_exit); |
| 304 | #endif |
| 305 | |
| 306 | MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function" ); |
| 307 | MODULE_LICENSE("GPL" ); |
| 308 | |