| 1 | /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ |
| 2 | // |
| 3 | // AES-NI optimized AES-GCM for x86_64 |
| 4 | // |
| 5 | // Copyright 2024 Google LLC |
| 6 | // |
| 7 | // Author: Eric Biggers <ebiggers@google.com> |
| 8 | // |
| 9 | //------------------------------------------------------------------------------ |
| 10 | // |
| 11 | // This file is dual-licensed, meaning that you can use it under your choice of |
| 12 | // either of the following two licenses: |
| 13 | // |
| 14 | // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy |
| 15 | // of the License at |
| 16 | // |
| 17 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 18 | // |
| 19 | // Unless required by applicable law or agreed to in writing, software |
| 20 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 21 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 22 | // See the License for the specific language governing permissions and |
| 23 | // limitations under the License. |
| 24 | // |
| 25 | // or |
| 26 | // |
| 27 | // Redistribution and use in source and binary forms, with or without |
| 28 | // modification, are permitted provided that the following conditions are met: |
| 29 | // |
| 30 | // 1. Redistributions of source code must retain the above copyright notice, |
| 31 | // this list of conditions and the following disclaimer. |
| 32 | // |
| 33 | // 2. Redistributions in binary form must reproduce the above copyright |
| 34 | // notice, this list of conditions and the following disclaimer in the |
| 35 | // documentation and/or other materials provided with the distribution. |
| 36 | // |
| 37 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 38 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 39 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 40 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| 41 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 42 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 43 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 44 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 45 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 46 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 47 | // POSSIBILITY OF SUCH DAMAGE. |
| 48 | // |
| 49 | //------------------------------------------------------------------------------ |
| 50 | // |
| 51 | // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that |
| 52 | // support the original set of AES instructions, i.e. AES-NI. Two |
| 53 | // implementations are provided, one that uses AVX and one that doesn't. They |
| 54 | // are very similar, being generated by the same macros. The only difference is |
| 55 | // that the AVX implementation takes advantage of VEX-coded instructions in some |
| 56 | // places to avoid some 'movdqu' and 'movdqa' instructions. The AVX |
| 57 | // implementation does *not* use 256-bit vectors, as AES is not supported on |
| 58 | // 256-bit vectors until the VAES feature (which this file doesn't target). |
| 59 | // |
| 60 | // The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1 |
| 61 | // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems |
| 62 | // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) |
| 63 | // |
| 64 | // The design generally follows that of aes-gcm-vaes-avx512.S, and that file is |
| 65 | // more thoroughly commented. This file has the following notable changes: |
| 66 | // |
| 67 | // - The vector length is fixed at 128-bit, i.e. xmm registers. This means |
| 68 | // there is only one AES block (and GHASH block) per register. |
| 69 | // |
| 70 | // - Without AVX512, only 16 SIMD registers are available instead of 32. We |
| 71 | // work around this by being much more careful about using registers, |
| 72 | // relying heavily on loads to load values as they are needed. |
| 73 | // |
| 74 | // - Masking is not available either. We work around this by implementing |
| 75 | // partial block loads and stores using overlapping scalar loads and stores |
| 76 | // combined with shifts and SSE4.1 insertion and extraction instructions. |
| 77 | // |
| 78 | // - The main loop is organized differently due to the different design |
| 79 | // constraints. First, with just one AES block per SIMD register, on some |
| 80 | // CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore |
| 81 | // do an 8-register wide loop. Considering that and the fact that we have |
| 82 | // just 16 SIMD registers to work with, it's not feasible to cache AES |
| 83 | // round keys and GHASH key powers in registers across loop iterations. |
| 84 | // That's not ideal, but also not actually that bad, since loads can run in |
| 85 | // parallel with other instructions. Significantly, this also makes it |
| 86 | // possible to roll up the inner loops, relying on hardware loop unrolling |
| 87 | // instead of software loop unrolling, greatly reducing code size. |
| 88 | // |
| 89 | // - We implement the GHASH multiplications in the main loop using Karatsuba |
| 90 | // multiplication instead of schoolbook multiplication. This saves one |
| 91 | // pclmulqdq instruction per block, at the cost of one 64-bit load, one |
| 92 | // pshufd, and 0.25 pxors per block. (This is without the three-argument |
| 93 | // XOR support that would be provided by AVX512, which would be more |
| 94 | // beneficial to schoolbook than Karatsuba.) |
| 95 | // |
| 96 | // As a rough approximation, we can assume that Karatsuba multiplication is |
| 97 | // faster than schoolbook multiplication in this context if one pshufd and |
| 98 | // 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit |
| 99 | // load is "free" due to running in parallel with arithmetic instructions.) |
| 100 | // This is true on AMD CPUs, including all that support pclmulqdq up to at |
| 101 | // least Zen 3. It's also true on older Intel CPUs: Westmere through |
| 102 | // Haswell on the Core side, and Silvermont through Goldmont Plus on the |
| 103 | // low-power side. On some of these CPUs, pclmulqdq is quite slow, and the |
| 104 | // benefit of Karatsuba should be substantial. On newer Intel CPUs, |
| 105 | // schoolbook multiplication should be faster, but only marginally. |
| 106 | // |
| 107 | // Not all these CPUs were available to be tested. However, benchmarks on |
| 108 | // available CPUs suggest that this approximation is plausible. Switching |
| 109 | // to Karatsuba showed negligible change (< 1%) on Intel Broadwell, |
| 110 | // Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%. |
| 111 | // Considering that and the fact that Karatsuba should be even more |
| 112 | // beneficial on older Intel CPUs, it seems like the right choice here. |
| 113 | // |
| 114 | // An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be |
| 115 | // saved by using a multiplication-less reduction method. We don't do that |
| 116 | // because it would require a large number of shift and xor instructions, |
| 117 | // making it less worthwhile and likely harmful on newer CPUs. |
| 118 | // |
| 119 | // It does make sense to sometimes use a different reduction optimization |
| 120 | // that saves a pclmulqdq, though: precompute the hash key times x^64, and |
| 121 | // multiply the low half of the data block by the hash key with the extra |
| 122 | // factor of x^64. This eliminates one step of the reduction. However, |
| 123 | // this is incompatible with Karatsuba multiplication. Therefore, for |
| 124 | // multi-block processing we use Karatsuba multiplication with a regular |
| 125 | // reduction. For single-block processing, we use the x^64 optimization. |
| 126 | |
| 127 | #include <linux/linkage.h> |
| 128 | |
// Read-only constants.  The start of the data is 16-byte aligned, and the
// layout below keeps every 16-byte constant on a 16-byte boundary.
.section .rodata
.balign 16
	// pshufb control mask that reverses the order of the 16 bytes of a
	// vector.
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f
	// x^63 + x^62 + x^57: the multiplier used when folding during the
	// GHASH reduction.  Only ever loaded as a 64-bit value.
.Lgfpoly:
	.quad	0xc200000000000000
	// The value 1, loaded as a 64-bit value and added to the counter.
.Lone:
	.quad	1
	// High qword: the .Lgfpoly value with bit 0 also set.  Low qword: 1.
	// Used as a mask when the hash subkey is multiplied by x.
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001
	// 16 bytes of 0xff followed by 16 bytes of 0x00.  Loading 16 bytes
	// from '.Lzeropad_mask + 16 - len' produces a mask of 'len' 0xff bytes
	// and the rest zeroes.
.Lzeropad_mask:
	.fill	16, 1, 0xff
	.fill	16, 1, 0

// Byte offsets of the fields of struct aes_gcm_key_aesni used by this file.
//
// AESKEYLEN:      32-bit AES key length in bytes.
// H_POWERS:       eight 16-byte hash key powers, H^8 first and H^1 last.
// H_POWERS_XORED: eight 8-byte values, the two halves of each H^i XOR'd
//                 together, in the same order as H_POWERS.
// H_TIMES_X64:    the 16-byte value H^1 * x^64.
#define OFFSETOF_AESKEYLEN	480
#define OFFSETOF_H_POWERS	496
#define OFFSETOF_H_POWERS_XORED	624
#define OFFSETOF_H_TIMES_X64	688

.text
| 152 | |
// Three-operand carryless multiply: \dst = clmul(\src1, \src2) with the
// qword halves selected by \imm, leaving \src2 unmodified.
//
// Without AVX, \src2 is copied to \dst with movdqa and the two-operand
// pclmulqdq is then applied to the copy.  That fallback requires all operands
// to be distinct and any memory operand to be 16-byte aligned.  With AVX, a
// single vpclmulqdq is emitted.
.macro	_vpclmulqdq	imm, src1, src2, dst
.if USE_AVX == 0
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.else
	vpclmulqdq	\imm, \src1, \src2, \dst
.endif
.endm
| 163 | |
// Three-operand byte shuffle: \dst = \src2 with its bytes shuffled according
// to the control mask \src1, leaving \src2 unmodified.
//
// Without AVX, \src2 is copied to \dst with movdqa and pshufb is applied to
// the copy.  That fallback requires all operands to be distinct and any
// memory operand to be 16-byte aligned.  With AVX, a single vpshufb is
// emitted.
.macro	_vpshufb	src1, src2, dst
.if USE_AVX == 0
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.else
	vpshufb		\src1, \src2, \dst
.endif
.endm
| 174 | |
// Three-operand bitwise AND: \dst = \src1 & \src2.
//
// Without AVX, \src1 is copied to \dst with movdqu and \src2 is then AND'ed
// into the copy.  That fallback requires all operands to be distinct.  With
// AVX, a single vpand is emitted.
.macro	_vpand		src1, src2, dst
.if USE_AVX == 0
	movdqu		\src1, \dst
	pand		\src2, \dst
.else
	vpand		\src1, \src2, \dst
.endif
.endm
| 185 | |
// \reg ^= \mem, where \mem is a memory operand that need not be aligned and
// \reg is an xmm register.
//
// Without AVX, \mem is first loaded into the temporary xmm register \tmp with
// movdqu.  With AVX, \mem is used directly as a vpxor operand and \tmp is left
// untouched.
.macro	_xor_mem_to_reg	mem, reg, tmp
.if USE_AVX == 0
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.else
	vpxor		\mem, \reg, \reg
.endif
.endm
| 196 | |
// Do a ptest of the memory operand \mem (which need not be aligned) against
// the xmm register \reg.  Only the flags are produced; \reg is not modified.
//
// Without AVX, \mem is first loaded into the temporary xmm register \tmp with
// movdqu.  With AVX, \mem is used directly as a vptest operand and \tmp is
// left untouched.
.macro	_test_mem	mem, reg, tmp
.if USE_AVX == 0
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.else
	vptest		\mem, \reg
.endif
.endm
| 207 | |
// Load LEN bytes, where 1 <= LEN <= 15 and LEN is passed in %ecx, from the
// pointer \src into the low bytes of the xmm register \dst, and zeroize the
// remaining bytes of \dst.
//
// Each size class is handled with at most two scalar loads that may overlap
// each other but never touch memory outside of [\src, \src + LEN).
//
// Clobbers %rax, %rcx, and \tmp{64,32}.  \tmp64 and \tmp32 must be the 64-bit
// and 32-bit names of the same general-purpose register.
.macro	_load_partial_block	src, dst, tmp64, tmp32
	subl		$8, %ecx		// %ecx = LEN - 8
	jle		.Lload_le8\@

	// 9 <= LEN <= 15: the low qword is loaded directly, and the high qword
	// is built from the last 8 bytes of the buffer.
	movq		(\src), \dst		// Bytes 0 through 7
	movq		(\src,%rcx), %rax	// Bytes LEN-8 through LEN-1
	negl		%ecx
	shll		$3, %ecx
	shrq		%cl, %rax		// Drop the 16-LEN bytes loaded twice
	pinsrq		$1, %rax, \dst
	jmp		.Lload_done\@

.Lload_le8\@:
	addl		$4, %ecx		// %ecx = LEN - 4
	jl		.Lload_lt4\@

	// 4 <= LEN <= 8: combine the first 4 bytes with the last 4 bytes.
	movl		(\src), %eax		// Bytes 0 through 3
	movl		(\src,%rcx), \tmp32	// Bytes LEN-4 through LEN-1
	jmp		.Lload_merge\@

.Lload_lt4\@:
	// 1 <= LEN <= 3: combine the first byte with the last 2 bytes, if any.
	addl		$2, %ecx		// %ecx = LEN - 2
	movzbl		(\src), %eax		// Byte 0 (flags are left intact)
	jl		.Lload_to_xmm\@		// LEN == 1: nothing more to load
	movzwl		(\src,%rcx), \tmp32	// Bytes LEN-2 through LEN-1
.Lload_merge\@:
	// Move the tail to its byte position and merge it with the head.
	shll		$3, %ecx
	shlq		%cl, \tmp64
	orq		\tmp64, %rax
.Lload_to_xmm\@:
	movq		%rax, \dst		// Also zeroizes the high qword
.Lload_done\@:
.endm
| 246 | |
// Store the low LEN bytes of the xmm register \src, where 1 <= LEN <= 15 and
// LEN is passed in %ecx, to the pointer \dst.
//
// For LEN >= 4 this is done with two scalar stores that may overlap each
// other.  The store of the tail is issued first so that the store of the head
// overwrites any overlapping bytes with the correct values.  No memory
// outside of [\dst, \dst + LEN) is written.
//
// Clobbers %rax, %rcx, and %rsi.
.macro	_store_partial_block	src, dst
	subl		$8, %ecx		// %ecx = LEN - 8
	jl		.Lstore_lt8\@

	// 8 <= LEN <= 15: rotate the high qword so that its low LEN-8 bytes
	// land at the end of an 8-byte store to \dst + LEN - 8.
	pextrq		$1, \src, %rax
	movl		%ecx, %esi		// %rsi = LEN - 8
	shll		$3, %ecx
	rorq		%cl, %rax
	movq		%rax, (\dst,%rsi)	// Bytes LEN-8 through LEN-1
	movq		\src, (\dst)		// Bytes 0 through 7
	jmp		.Lstore_done\@

.Lstore_lt8\@:
	addl		$4, %ecx		// %ecx = LEN - 4
	jl		.Lstore_lt4\@

	// 4 <= LEN <= 7: same technique using 32-bit words.
	pextrd		$1, \src, %eax
	movl		%ecx, %esi		// %rsi = LEN - 4
	shll		$3, %ecx
	rorl		%cl, %eax
	movl		%eax, (\dst,%rsi)	// Bytes LEN-4 through LEN-1
	movd		\src, (\dst)		// Bytes 0 through 3
	jmp		.Lstore_done\@

.Lstore_lt4\@:
	// 1 <= LEN <= 3: store byte by byte.  pextrb leaves the flags from the
	// cmpl intact, so both branches below test the same comparison.
	pextrb		$0, \src, 0(\dst)
	cmpl		$-2, %ecx		// Compare LEN - 4 with -2
	jl		.Lstore_done\@		// LEN == 1
	pextrb		$1, \src, 1(\dst)
	je		.Lstore_done\@		// LEN == 2
	pextrb		$2, \src, 2(\dst)
.Lstore_done\@:
.endm
| 285 | |
// Emit step \i of the GHASH multiplication of \a by \b, which leaves the
// reduced product in \b.  The multiplication is complete once this macro has
// been invoked with \i=0 through \i=9 in order; splitting it into steps lets
// callers interleave it with other instructions.
//
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain
// the .Lgfpoly constant, and \t0-\t1 must be temporary registers.  \t0 and
// \t1 carry intermediate values between steps.
.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1

	// Steps 0-2: \t0 = MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
	_vpclmulqdq	$0x01, \a, \b, \t0
.endif
.if \i == 1
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
.endif
.if \i == 2
	pxor		\t1, \t0
.endif

	// Steps 3-5: \b = HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.if \i == 3
	_vpclmulqdq	$0x11, \a, \b, \t1
.endif
.if \i == 4
	pclmulqdq	$0x10, \a_times_x64, \b
.endif
.if \i == 5
	pxor		\t1, \b
.endif

	// Steps 6-9: fold MI into HI.
.if \i == 6
	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
.endif
.if \i == 7
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.endif
.if \i == 8
	pxor		\t1, \b
.endif
.if \i == 9
	pxor		\t0, \b
.endif
.endm
| 319 | |
// GHASH-multiply \a by \b and store the reduced product in \b, by running all
// ten steps of _ghash_mul_step back to back.  The operands have the same
// requirements as for _ghash_mul_step.
.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
	_ghash_mul_step	0, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	1, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	2, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	3, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	4, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	5, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	6, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	7, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	8, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
	_ghash_mul_step	9, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endm
| 327 | |
// Karatsuba-multiply \a by \b without reducing, and XOR the three partial
// products into the accumulators \lo, \mi, and \hi.  The accumulated value
// must later be reduced with _ghash_reduce.
//
// \lo, \mi, and \hi must be zero on the first call.  \a_xored must hold
// a_L + a_H, i.e. the two halves of \a XOR'd together, in its low qword.
// \t0 is a temporary register.  \b is clobbered.
.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0

	// \lo += a_L * b_L
	_vpclmulqdq	$0x00, \a, \b, \t0
	pxor		\t0, \lo

	// \t0 = b_L + b_H.  This must be computed before \b is overwritten.
	pshufd		$0x4e, \b, \t0
	pxor		\b, \t0

	// \hi += a_H * b_H.  The product is formed in place in \b.
	pclmulqdq	$0x11, \a, \b
	pxor		\b, \hi

	// \mi += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0
	pxor		\t0, \mi
.endm
| 350 | |
// Reduce the unreduced product held in \lo, \mi, and \hi, as accumulated by
// _ghash_mul_noreduce, and store the result in \dst.  \t0 is a temporary
// register that receives the .Lgfpoly constant.  \lo and \mi are clobbered;
// \hi is only read.
.macro	_ghash_reduce	lo, mi, hi, dst, t0

	movq		.Lgfpoly(%rip), \t0

	// Karatsuba leaves the middle term as (a_L + a_H) * (b_L + b_H), so add
	// LO and HI to it to obtain the true middle part of the product.
	pxor		\lo, \mi
	pxor		\hi, \mi

	// Fold LO into MI.  \dst serves as a scratch register here.
	pshufd		$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo
	pxor		\dst, \mi
	pxor		\lo, \mi

	// Fold MI into HI, producing the final result in \dst.
	pshufd		$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
	pxor		\hi, \dst
	pxor		\mi, \dst
.endm
| 373 | |
// Begin the GHASH update of a set of 8 ciphertext blocks.
//
// Over the whole set, the update computes:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//		    blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro handles only the first term: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and uses it to initialize the xmm registers LO, MI, and
// GHASH_ACC a.k.a. HI, in which the unreduced product is gathered.  It also
// zeroes the inner block counter %rax.  That counter advances by 8 for each
// block of the set and is later used to index by 8*blknum and 16*blknum.
//
// This macro and _ghash_update_continue_8x use Karatsuba multiplication rather
// than schoolbook multiplication in order to need fewer pclmulqdq
// instructions.  The file comment explains that choice in more detail.
//
// Both macros read the ciphertext blocks blk[0-7] from DST when encrypting
// (\enc != 0) or from SRC when decrypting (\enc == 0).  They read the
// precomputed hash key powers H^i and their XOR'd-together halves from the
// struct pointed to by KEY.  Both macros clobber TMP[0-2].
.macro	_ghash_update_begin_8x	enc

	// Start the inner block counter at zero.
	xorl		%eax, %eax

	// TMP0 = 'a' = H^8, the highest hash key power.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// TMP1 = the first ciphertext block, byte-reflected.
.if \enc == 0
	movdqu		(SRC), TMP1
.else
	movdqu		(DST), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// GHASH_ACC = 'b' = blk0 + GHASH_ACC, the value to multiply by 'a'.
	pxor		TMP1, GHASH_ACC

	// MI = b_L + b_H
	pshufd		$0x4e, GHASH_ACC, MI
	pxor		GHASH_ACC, MI

	// LO = a_L * b_L
	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	// HI = a_H * b_H, formed in place in GHASH_ACC
	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H), using the stored a_L + a_H for H^8
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm
| 428 | |
// Continue the GHASH update of a set of 8 ciphertext blocks that was started
// by _ghash_update_begin_8x.  Each invocation advances the inner block counter
// %rax by 8, does the unreduced multiplication of the next ciphertext block by
// the next lowest key power, and accumulates the result into LO, MI, and
// GHASH_ACC a.k.a. HI.  Clobbers TMP[0-2].
.macro	_ghash_update_continue_8x enc
	addl		$8, %eax		// %rax = 8*blknum

	// TMP0 = 'a' = the next lowest key power, at byte offset 16*blknum.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// TMP1 = 'b' = the next ciphertext block, byte-reflected.
.if \enc == 0
	movdqu		(SRC,%rax,2), TMP1
.else
	movdqu		(DST,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
	pxor		TMP2, LO

	// TMP2 = b_L + b_H.  This must be computed before TMP1 is overwritten.
	pshufd		$0x4e, TMP1, TMP2
	pxor		TMP1, TMP2

	// HI += a_H * b_H
	pclmulqdq	$0x11, TMP0, TMP1
	pxor		TMP1, GHASH_ACC

	// MI += (a_L + a_H) * (b_L + b_H), where a_L + a_H is loaded from byte
	// offset 8*blknum of the XOR'd-halves table.
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
	pxor		TMP2, MI
.endm
| 463 | |
// Emit step \i (0 or 1) of the reduction of LO, MI, and GHASH_ACC a.k.a. HI
// into GHASH_ACC.  This is the same computation as _ghash_reduce, except that
// it is hardcoded to the registers of the main loop, it uses one register for
// both HI and the destination, and it is split into two steps.  Step 0 loads
// the .Lgfpoly constant into TMP1, which must then be preserved until step 1
// has been emitted.  TMP2 is used as a temporary.
//
// It would be possible to save one pshufd by shuffling MI and XOR'ing LO into
// it, rather than shuffling LO, XOR'ing LO into MI, and then shuffling MI.
// However, that would lengthen the critical path, and it seems to slightly
// hurt performance.
.macro	_ghash_update_end_8x_step	i

	// Step 0: MI += LO + HI (for Karatsuba), then fold LO into MI.
.if \i == 0
	movq		.Lgfpoly(%rip), TMP1
	pxor		LO, MI
	pxor		GHASH_ACC, MI
	pshufd		$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO
	pxor		TMP2, MI
	pxor		LO, MI
.endif

	// Step 1: fold MI into HI, leaving the reduced result in GHASH_ACC.
.if \i == 1
	pshufd		$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
	pxor		TMP2, GHASH_ACC
	pxor		MI, GHASH_ACC
.endif
.endm
| 488 | |
// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Starting from the expanded AES key already present in |key|, derive the
// GHASH subkey and fill in the GHASH-related fields of the key struct: the
// hash key powers H^1 through H^8, the XOR'd-together halves of each power,
// and H^1 * x^64.
.macro	_aes_gcm_precompute

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi
	.set	H_CUR,		%xmm2
	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
	.set	GFPOLY,		%xmm5

	// The raw hash subkey is the encryption of an all-zeroes block.  XOR'ing
	// that block with the zero-th round key just gives the zero-th round
	// key, so start from it directly.  RNDKEYLAST_PTR is set to
	// KEY + 6*16 + 4*keylen, the address of the last round key.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	leaq		6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		(KEY), H_POW1
	leaq		16(KEY), %rax
1:
	aesenc		(%rax), H_POW1
	addq		$16, %rax
	cmpq		%rax, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey so that GHASH's bit-reflected values
	// can be operated on directly: reflect its bytes, then multiply it by
	// x^-1 (in the backwards interpretation of polynomial coefficients used
	// by the GCM spec), which is the same as x^1 (in the alternative,
	// natural interpretation of polynomial coefficients).
	//
	// paddq shifts each 64-bit half left by one bit.  %xmm0 is built from
	// the sign bits of the original value and masked so that it restores
	// the bit carried from the low half into the high half and, if the top
	// bit was shifted out, XORs in the reduction constant.
	pshufb		.Lbswap_mask(%rip), H_POW1
	movdqa		H_POW1, %xmm0
	pshufd		$0xd3, %xmm0, %xmm0
	psrad		$31, %xmm0
	paddq		H_POW1, H_POW1
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
	pxor		%xmm0, H_POW1

	// H^1 goes in the last slot of the key powers table.
	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.  %xmm0 gets H^1 with its halves swapped.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// XOR'ing H^1 into its half-swapped copy leaves the two halves of H^1
	// XOR'd together in each half of %xmm0.  Store the low half.
	pxor		H_POW1, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.  %eax is
	// 8 times the table index.  It counts down from 6*8 to 0, so the powers
	// are stored from the second-to-last slot back to the first.
	movdqa		H_POW1, H_CUR
	movl		$6*8, %eax
.Lprecompute_power\@:
	// H_CUR = H^i = H^{i-1} * H^1
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
	// Store H^i.
	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	pxor		H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	subl		$8, %eax
	jge		.Lprecompute_power\@

	RET
.endm
| 562 | |
// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// Process the AAD (Additional Authenticated Data) in GCM: using the key |key|,
// update the GHASH accumulator |ghash_acc| with the data given by |aad| and
// |aadlen|.  |ghash_acc| must be all zeroes on the first call.  |aadlen| must
// be a multiple of 16, except on the last call where it can be any length.
// The caller must do any buffering needed to ensure this.
//
// Full blocks are hashed one at a time.  A trailing partial block, if any, is
// zero-padded to a full block and hashed in the same way.
.macro	_aes_gcm_aad_update

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
	.set	GFPOLY,		%xmm6

	// Load the constants, the current accumulator, and the key material.
	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Hash the full blocks.  AADLEN is kept biased by -16 inside the loop,
	// so it goes negative once less than a full block remains.
	subl		$16, AADLEN
	jl		.Laad_tail\@
.Laad_full_block\@:
	movdqu		(AAD), %xmm0
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
	addq		$16, AAD
	subl		$16, AADLEN
	jge		.Laad_full_block\@
.Laad_tail\@:
	// Undo the bias.  AADLEN is now the length of the partial block at the
	// end, or zero if there isn't one.
	addl		$16, AADLEN
	jz		.Laad_store_acc\@

	// Hash the partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block takes the length in %ecx, which is AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_store_acc\@:
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
	RET
.endm
| 621 | |
// Generate eight counter blocks and begin encrypting them: for each of
// AESDATA[0-7] in turn, byte-swap the current little-endian counter LE_CTR
// into the register, XOR in the zero-th AES round key, and then increment
// LE_CTR.  LE_CTR ends up advanced by 8.  Clobbers TMP0 and TMP1.
.macro	_ctr_begin_8x
	movq		.Lone(%rip), TMP0	// Counter increment
	movdqa		(KEY), TMP1		// zero-th round key
.irp blk, AESDATA0,AESDATA1,AESDATA2,AESDATA3,AESDATA4,AESDATA5,AESDATA6,AESDATA7
	_vpshufb	BSWAP_MASK, LE_CTR, \blk
	pxor		TMP1, \blk
	paddd		TMP0, LE_CTR
.endr
.endm
| 634 | |
// Apply one non-last AES round with the round key \round_key to each of
// AESDATA[0-7], in order.
.macro	_aesenc_8x	round_key
.irp blk, AESDATA0,AESDATA1,AESDATA2,AESDATA3,AESDATA4,AESDATA5,AESDATA6,AESDATA7
	aesenc		\round_key, \blk
.endr
.endm
| 641 | |
// Apply the last AES round with the round key \round_key to each of
// AESDATA[0-7], in order.
.macro	_aesenclast_8x	round_key
.irp blk, AESDATA0,AESDATA1,AESDATA2,AESDATA3,AESDATA4,AESDATA5,AESDATA6,AESDATA7
	aesenclast	\round_key, \blk
.endr
.endm
| 648 | |
// XOR the eight 16-byte blocks at SRC into the keystream blocks held in
// AESDATA[0-7], then write the eight results to DST.  All eight blocks are
// read from SRC before anything is written to DST.  Clobbers TMP0.
.macro	_xor_data_8x
	// AESDATA[i] ^= SRC[i]
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, TMP0
.endr
	// DST[i] = AESDATA[i]
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
.endr
.endm
| 659 | |
| 660 | // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key, |
| 661 | // const u32 le_ctr[4], u8 ghash_acc[16], |
| 662 | // const u8 *src, u8 *dst, int datalen); |
| 663 | // |
| 664 | // This macro generates a GCM encryption or decryption update function with the |
| 665 | // above prototype (with \enc selecting which one). |
| 666 | // |
| 667 | // This function computes the next portion of the CTR keystream, XOR's it with |
| 668 | // |datalen| bytes from |src|, and writes the resulting encrypted or decrypted |
| 669 | // data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the |
| 670 | // next |datalen| ciphertext bytes. |
| 671 | // |
| 672 | // |datalen| must be a multiple of 16, except on the last call where it can be |
| 673 | // any length. The caller must do any buffering needed to ensure this. Both |
| 674 | // in-place and out-of-place en/decryption are supported. |
| 675 | // |
| 676 | // |le_ctr| must give the current counter in little-endian format. For a new |
| 677 | // message, the low word of the counter must be 2. This function loads the |
| 678 | // counter from |le_ctr| and increments the loaded counter as needed, but it |
| 679 | // does *not* store the updated counter back to |le_ctr|. The caller must |
| 680 | // update |le_ctr| if any more data segments follow. Internally, only the low |
| 681 | // 32-bit word of the counter is incremented, following the GCM standard. |
| 682 | .macro _aes_gcm_update enc |
| 683 | |
| 684 | // Function arguments |
| 685 | .set KEY, %rdi |
| 686 | .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg |
| 687 | .set GHASH_ACC_PTR, %rdx |
| 688 | .set SRC, %rcx |
| 689 | .set DST, %r8 |
| 690 | .set DATALEN, %r9d |
| 691 | .set DATALEN64, %r9 // Zero-extend DATALEN before using! |
| 692 | // Note: the code setting up for _load_partial_block assumes that SRC is |
| 693 | // in %rcx (and that DATALEN is *not* in %rcx). |
| 694 | |
| 695 | // Additional local variables |
| 696 | |
| 697 | // %rax and %rsi are used as temporary registers. Note: %rsi overlaps |
| 698 | // with LE_CTR_PTR, which is used only at the beginning. |
| 699 | |
| 700 | .set AESKEYLEN, %r10d // AES key length in bytes |
| 701 | .set AESKEYLEN64, %r10 |
| 702 | .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key |
| 703 | |
| 704 | // Put the most frequently used values in %xmm0-%xmm7 to reduce code |
| 705 | // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.) |
| 706 | .set TMP0, %xmm0 |
| 707 | .set TMP1, %xmm1 |
| 708 | .set TMP2, %xmm2 |
| 709 | .set LO, %xmm3 // Low part of unreduced product |
| 710 | .set MI, %xmm4 // Middle part of unreduced product |
| 711 | .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also |
| 712 | // the high part of unreduced product |
| 713 | .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes |
| 714 | .set LE_CTR, %xmm7 // Little-endian counter value |
| 715 | .set AESDATA0, %xmm8 |
| 716 | .set AESDATA1, %xmm9 |
| 717 | .set AESDATA2, %xmm10 |
| 718 | .set AESDATA3, %xmm11 |
| 719 | .set AESDATA4, %xmm12 |
| 720 | .set AESDATA5, %xmm13 |
| 721 | .set AESDATA6, %xmm14 |
| 722 | .set AESDATA7, %xmm15 |
| 723 | |
| 724 | movdqa .Lbswap_mask(%rip), BSWAP_MASK |
| 725 | movdqu (GHASH_ACC_PTR), GHASH_ACC |
| 726 | movdqu (LE_CTR_PTR), LE_CTR |
| 727 | |
| 728 | movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN |
| 729 | lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR |
| 730 | |
| 731 | // If there are at least 8*16 bytes of data, then continue into the main |
| 732 | // loop, which processes 8*16 bytes of data per iteration. |
| 733 | // |
| 734 | // The main loop interleaves AES and GHASH to improve performance on |
| 735 | // CPUs that can execute these instructions in parallel. When |
| 736 | // decrypting, the GHASH input (the ciphertext) is immediately |
| 737 | // available. When encrypting, we instead encrypt a set of 8 blocks |
| 738 | // first and then GHASH those blocks while encrypting the next set of 8, |
| 739 | // repeat that as needed, and finally GHASH the last set of 8 blocks. |
| 740 | // |
| 741 | // Code size optimization: Prefer adding or subtracting -8*16 over 8*16, |
| 742 | // as this makes the immediate fit in a signed byte, saving 3 bytes. |
| 743 | add $-8*16, DATALEN |
| 744 | jl .Lcrypt_loop_8x_done\@ |
| 745 | .if \enc |
| 746 | // Encrypt the first 8 plaintext blocks. |
| 747 | _ctr_begin_8x |
| 748 | lea 16(KEY), %rsi |
| 749 | .p2align 4 |
| 750 | 1: |
| 751 | movdqa (%rsi), TMP0 |
| 752 | _aesenc_8x TMP0 |
| 753 | add $16, %rsi |
| 754 | cmp %rsi, RNDKEYLAST_PTR |
| 755 | jne 1b |
| 756 | movdqa (%rsi), TMP0 |
| 757 | _aesenclast_8x TMP0 |
| 758 | _xor_data_8x |
| 759 | // Don't increment DST until the ciphertext blocks have been hashed. |
| 760 | sub $-8*16, SRC |
| 761 | add $-8*16, DATALEN |
| 762 | jl .Lghash_last_ciphertext_8x\@ |
| 763 | .endif |
| 764 | |
| 765 | .p2align 4 |
| 766 | .Lcrypt_loop_8x\@: |
| 767 | |
| 768 | // Generate the next set of 8 counter blocks and start encrypting them. |
| 769 | _ctr_begin_8x |
| 770 | lea 16(KEY), %rsi |
| 771 | |
| 772 | // Do a round of AES, and start the GHASH update of 8 ciphertext blocks |
| 773 | // by doing the unreduced multiplication for the first ciphertext block. |
| 774 | movdqa (%rsi), TMP0 |
| 775 | add $16, %rsi |
| 776 | _aesenc_8x TMP0 |
| 777 | _ghash_update_begin_8x \enc |
| 778 | |
| 779 | // Do 7 more rounds of AES, and continue the GHASH update by doing the |
| 780 | // unreduced multiplication for the remaining ciphertext blocks. |
| 781 | .p2align 4 |
| 782 | 1: |
| 783 | movdqa (%rsi), TMP0 |
| 784 | add $16, %rsi |
| 785 | _aesenc_8x TMP0 |
| 786 | _ghash_update_continue_8x \enc |
| 787 | cmp $7*8, %eax |
| 788 | jne 1b |
| 789 | |
| 790 | // Do the remaining AES rounds. |
| 791 | .p2align 4 |
| 792 | 1: |
| 793 | movdqa (%rsi), TMP0 |
| 794 | add $16, %rsi |
| 795 | _aesenc_8x TMP0 |
| 796 | cmp %rsi, RNDKEYLAST_PTR |
| 797 | jne 1b |
| 798 | |
| 799 | // Do the GHASH reduction and the last round of AES. |
| 800 | movdqa (RNDKEYLAST_PTR), TMP0 |
| 801 | _ghash_update_end_8x_step 0 |
| 802 | _aesenclast_8x TMP0 |
| 803 | _ghash_update_end_8x_step 1 |
| 804 | |
| 805 | // XOR the data with the AES-CTR keystream blocks. |
| 806 | .if \enc |
| 807 | sub $-8*16, DST |
| 808 | .endif |
| 809 | _xor_data_8x |
| 810 | sub $-8*16, SRC |
| 811 | .if !\enc |
| 812 | sub $-8*16, DST |
| 813 | .endif |
| 814 | add $-8*16, DATALEN |
| 815 | jge .Lcrypt_loop_8x\@ |
| 816 | |
| 817 | .if \enc |
| 818 | .Lghash_last_ciphertext_8x\@: |
| 819 | // Update GHASH with the last set of 8 ciphertext blocks. |
| 820 | _ghash_update_begin_8x \enc |
| 821 | .p2align 4 |
| 822 | 1: |
| 823 | _ghash_update_continue_8x \enc |
| 824 | cmp $7*8, %eax |
| 825 | jne 1b |
| 826 | _ghash_update_end_8x_step 0 |
| 827 | _ghash_update_end_8x_step 1 |
| 828 | sub $-8*16, DST |
| 829 | .endif |
| 830 | |
| 831 | .Lcrypt_loop_8x_done\@: |
| 832 | |
| 833 | sub $-8*16, DATALEN |
| 834 | jz .Ldone\@ |
| 835 | |
| 836 | // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep |
| 837 | // things simple and keep the code size down by just going one block at |
| 838 | // a time, again taking advantage of hardware loop unrolling. Since |
| 839 | // there are enough key powers available for all remaining data, we do |
| 840 | // the GHASH multiplications unreduced, and only reduce at the very end. |
| 841 | |
| 842 | .set HI, TMP2 |
| 843 | .set H_POW, AESDATA0 |
| 844 | .set H_POW_XORED, AESDATA1 |
| 845 | .set ONE, AESDATA2 |
| 846 | |
| 847 | movq .Lone(%rip), ONE |
| 848 | |
| 849 | // Start collecting the unreduced GHASH intermediate value LO, MI, HI. |
| 850 | pxor LO, LO |
| 851 | pxor MI, MI |
| 852 | pxor HI, HI |
| 853 | |
| 854 | // Set up a block counter %rax to contain 8*(8-n), where n is the number |
| 855 | // of blocks that remain, counting any partial block. This will be used |
| 856 | // to access the key powers H^n through H^1. |
| 857 | mov DATALEN, %eax |
| 858 | neg %eax |
| 859 | and $~15, %eax |
| 860 | sar $1, %eax |
| 861 | add $64, %eax |
| 862 | |
| 863 | sub $16, DATALEN |
| 864 | jl .Lcrypt_loop_1x_done\@ |
| 865 | |
| 866 | // Process the data one full block at a time. |
| 867 | .Lcrypt_loop_1x\@: |
| 868 | |
| 869 | // Encrypt the next counter block. |
| 870 | _vpshufb BSWAP_MASK, LE_CTR, TMP0 |
| 871 | paddd ONE, LE_CTR |
| 872 | pxor (KEY), TMP0 |
| 873 | lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size |
| 874 | cmp $24, AESKEYLEN |
| 875 | jl 128f // AES-128? |
| 876 | je 192f // AES-192? |
| 877 | // AES-256 |
| 878 | aesenc -7*16(%rsi), TMP0 |
| 879 | aesenc -6*16(%rsi), TMP0 |
| 880 | 192: |
| 881 | aesenc -5*16(%rsi), TMP0 |
| 882 | aesenc -4*16(%rsi), TMP0 |
| 883 | 128: |
| 884 | .irp i, -3,-2,-1,0,1,2,3,4,5 |
| 885 | aesenc \i*16(%rsi), TMP0 |
| 886 | .endr |
| 887 | aesenclast (RNDKEYLAST_PTR), TMP0 |
| 888 | |
| 889 | // Load the next key power H^i. |
| 890 | movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW |
| 891 | movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED |
| 892 | |
| 893 | // XOR the keystream block that was just generated in TMP0 with the next |
| 894 | // source data block and store the resulting en/decrypted data to DST. |
| 895 | .if \enc |
| 896 | _xor_mem_to_reg (SRC), TMP0, tmp=TMP1 |
| 897 | movdqu TMP0, (DST) |
| 898 | .else |
| 899 | movdqu (SRC), TMP1 |
| 900 | pxor TMP1, TMP0 |
| 901 | movdqu TMP0, (DST) |
| 902 | .endif |
| 903 | |
| 904 | // Update GHASH with the ciphertext block. |
| 905 | .if \enc |
| 906 | pshufb BSWAP_MASK, TMP0 |
| 907 | pxor TMP0, GHASH_ACC |
| 908 | .else |
| 909 | pshufb BSWAP_MASK, TMP1 |
| 910 | pxor TMP1, GHASH_ACC |
| 911 | .endif |
| 912 | _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 |
| 913 | pxor GHASH_ACC, GHASH_ACC |
| 914 | |
| 915 | add $8, %eax |
| 916 | add $16, SRC |
| 917 | add $16, DST |
| 918 | sub $16, DATALEN |
| 919 | jge .Lcrypt_loop_1x\@ |
| 920 | .Lcrypt_loop_1x_done\@: |
| 921 | // Check whether there is a partial block at the end. |
| 922 | add $16, DATALEN |
| 923 | jz .Lghash_reduce\@ |
| 924 | |
| 925 | // Process a partial block of length 1 <= DATALEN <= 15. |
| 926 | |
| 927 | // Encrypt a counter block for the last time. |
| 928 | pshufb BSWAP_MASK, LE_CTR |
| 929 | pxor (KEY), LE_CTR |
| 930 | lea 16(KEY), %rsi |
| 931 | 1: |
| 932 | aesenc (%rsi), LE_CTR |
| 933 | add $16, %rsi |
| 934 | cmp %rsi, RNDKEYLAST_PTR |
| 935 | jne 1b |
| 936 | aesenclast (RNDKEYLAST_PTR), LE_CTR |
| 937 | |
| 938 | // Load the lowest key power, H^1. |
| 939 | movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW |
| 940 | movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED |
| 941 | |
| 942 | // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is |
| 943 | // in %rcx, but _load_partial_block needs DATALEN in %rcx instead. |
| 944 | // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC. |
| 945 | mov SRC, RNDKEYLAST_PTR |
| 946 | mov DATALEN, %ecx |
| 947 | _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi |
| 948 | |
| 949 | // XOR the keystream block that was just generated in LE_CTR with the |
| 950 | // source data block and store the resulting en/decrypted data to DST. |
| 951 | pxor TMP0, LE_CTR |
| 952 | mov DATALEN, %ecx |
| 953 | _store_partial_block LE_CTR, DST |
| 954 | |
| 955 | // If encrypting, zero-pad the final ciphertext block for GHASH. (If |
| 956 | // decrypting, this was already done by _load_partial_block.) |
| 957 | .if \enc |
| 958 | lea .Lzeropad_mask+16(%rip), %rax |
| 959 | sub DATALEN64, %rax |
| 960 | _vpand (%rax), LE_CTR, TMP0 |
| 961 | .endif |
| 962 | |
| 963 | // Update GHASH with the final ciphertext block. |
| 964 | pshufb BSWAP_MASK, TMP0 |
| 965 | pxor TMP0, GHASH_ACC |
| 966 | _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 |
| 967 | |
| 968 | .Lghash_reduce\@: |
| 969 | // Finally, do the GHASH reduction. |
| 970 | _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0 |
| 971 | |
| 972 | .Ldone\@: |
| 973 | // Store the updated GHASH accumulator back to memory. |
| 974 | movdqu GHASH_ACC, (GHASH_ACC_PTR) |
| 975 | |
| 976 | RET |
| 977 | .endm |
| 978 | |
| 979 | // void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key, |
| 980 | // const u32 le_ctr[4], u8 ghash_acc[16], |
| 981 | // u64 total_aadlen, u64 total_datalen); |
| 982 | // bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key, |
| 983 | // const u32 le_ctr[4], const u8 ghash_acc[16], |
| 984 | // u64 total_aadlen, u64 total_datalen, |
| 985 | // const u8 tag[16], int taglen); |
| 986 | // |
| 987 | // This macro generates one of the above two functions (with \enc selecting |
| 988 | // which one). Both functions finish computing the GCM authentication tag by |
| 989 | // updating GHASH with the lengths block and encrypting the GHASH accumulator. |
| 990 | // |total_aadlen| and |total_datalen| must be the total length of the additional |
| 991 | // authenticated data and the en/decrypted data in bytes, respectively. |
| 992 | // |
| 993 | // The encryption function then stores the full-length (16-byte) computed |
| 994 | // authentication tag to |ghash_acc|. The decryption function instead loads the |
| 995 | // expected authentication tag (the one that was transmitted) from the 16-byte |
| 996 | // buffer |tag|, compares its first |taglen| bytes (4 <= |taglen| <= 16) to the |
| 997 | // computed tag in constant time, and returns true if and only if they match. |
| 998 | .macro _aes_gcm_final enc |
| 999 | |
| 1000 | // Function arguments. The first six arrive in registers in prototype order; |taglen| (decryption only) is the seventh and is read from the stack. |
| 1001 | .set KEY, %rdi |
| 1002 | .set LE_CTR_PTR, %rsi |
| 1003 | .set GHASH_ACC_PTR, %rdx |
| 1004 | .set TOTAL_AADLEN, %rcx |
| 1005 | .set TOTAL_DATALEN, %r8 |
| 1006 | .set TAG, %r9 |
| 1007 | .set TAGLEN, %r10d // Originally at 8(%rsp); loaded only when decrypting |
| 1008 | .set TAGLEN64, %r10 |
| 1009 | |
| 1010 | // Additional local variables. |
| 1011 | // %rax and %xmm0-%xmm2 are used as temporary registers; %xmm0 holds the counter block until its encryption has been XOR'ed into GHASH_ACC. |
| 1012 | .set AESKEYLEN, %r11d |
| 1013 | .set AESKEYLEN64, %r11 |
| 1014 | .set BSWAP_MASK, %xmm3 |
| 1015 | .set GHASH_ACC, %xmm4 |
| 1016 | .set H_POW1, %xmm5 // H^1 |
| 1017 | .set H_POW1_X64, %xmm6 // H^1 * x^64 |
| 1018 | .set GFPOLY, %xmm7 |
| 1019 | |
| 1020 | movdqa .Lbswap_mask(%rip), BSWAP_MASK |
| 1021 | movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN |
| 1022 | |
| 1023 | // Set up a counter block with 1 in the low 32-bit word. Encrypting this |
| 1024 | // counter gives the block that is XOR'ed with the GHASH accumulator to form the auth tag. |
| 1025 | movdqu (LE_CTR_PTR), %xmm0 |
| 1026 | mov $1, %eax |
| 1027 | pinsrd $0, %eax, %xmm0 |
| 1028 | |
| 1029 | // Build the lengths block (data length in the low qword, AAD length in the high qword, both converted to bits) and XOR it into the GHASH accumulator. |
| 1030 | movq TOTAL_DATALEN, GHASH_ACC |
| 1031 | pinsrq $1, TOTAL_AADLEN, GHASH_ACC |
| 1032 | psllq $3, GHASH_ACC // Bytes to bits |
| 1033 | _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1 |
| 1034 | |
| 1035 | movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 |
| 1036 | movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 |
| 1037 | movq .Lgfpoly(%rip), GFPOLY |
| 1038 | |
| 1039 | // Make %rax point to the 6th from last AES round key, so that the last round key is at 6*16(%rax). (Using signed |
| 1040 | // byte offsets -7*16 through 6*16 decreases code size.) |
| 1041 | lea (KEY,AESKEYLEN64,4), %rax |
| 1042 | |
| 1043 | // AES-encrypt the counter block and also multiply GHASH_ACC by H^1. |
| 1044 | // Interleave the AES and GHASH instructions to improve performance: each of the last 10 AES rounds is followed by one of the 10 steps (0-9) of _ghash_mul_step. |
| 1045 | pshufb BSWAP_MASK, %xmm0 |
| 1046 | pxor (KEY), %xmm0 |
| 1047 | cmp $24, AESKEYLEN |
| 1048 | jl 128f // AES-128? |
| 1049 | je 192f // AES-192? |
| 1050 | // AES-256: four extra rounds in total (two here, then the two shared with AES-192) |
| 1051 | aesenc -7*16(%rax), %xmm0 |
| 1052 | aesenc -6*16(%rax), %xmm0 |
| 1053 | 192: |
| 1054 | aesenc -5*16(%rax), %xmm0 |
| 1055 | aesenc -4*16(%rax), %xmm0 |
| 1056 | 128: |
| 1057 | .irp i, 0,1,2,3,4,5,6,7,8 |
| 1058 | aesenc (\i-3)*16(%rax), %xmm0 |
| 1059 | _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 |
| 1060 | .endr |
| 1061 | aesenclast 6*16(%rax), %xmm0 |
| 1062 | _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 |
| 1063 | |
| 1064 | // Undo the byte reflection of the GHASH accumulator. |
| 1065 | pshufb BSWAP_MASK, GHASH_ACC |
| 1066 | |
| 1067 | // Encrypt the GHASH accumulator by XOR'ing it with the encrypted counter block in %xmm0. |
| 1068 | pxor %xmm0, GHASH_ACC |
| 1069 | |
| 1070 | .if \enc |
| 1071 | // Return the computed auth tag by storing all 16 bytes to |ghash_acc|. |
| 1072 | movdqu GHASH_ACC, (GHASH_ACC_PTR) |
| 1073 | .else |
| 1074 | .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN (%rcx), which was last read when building the lengths block |
| 1075 | |
| 1076 | // Verify the auth tag in constant time by XOR'ing the transmitted and |
| 1077 | // computed auth tags together and using the ptest instruction to check |
| 1078 | // whether the first TAGLEN bytes of the result are zero. The mask is read from .Lzeropad_mask+16-TAGLEN, and %eax is zeroed before the test so that sete only has to write %al. |
| 1079 | _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0 |
| 1080 | movl 8(%rsp), TAGLEN |
| 1081 | lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR |
| 1082 | sub TAGLEN64, ZEROPAD_MASK_PTR |
| 1083 | xor %eax, %eax |
| 1084 | _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0 |
| 1085 | sete %al |
| 1086 | .endif |
| 1087 | RET |
| 1088 | .endm |
| 1089 | |
| 1090 | .set USE_AVX, 0 /* Expand the six entry points below with USE_AVX = 0; these are the *_aesni variants. NOTE(review): USE_AVX is presumably tested by helper macros defined earlier in the file - confirm there. */ |
| 1091 | SYM_FUNC_START(aes_gcm_precompute_aesni) |
| 1092 | _aes_gcm_precompute |
| 1093 | SYM_FUNC_END(aes_gcm_precompute_aesni) |
| 1094 | SYM_FUNC_START(aes_gcm_aad_update_aesni) |
| 1095 | _aes_gcm_aad_update |
| 1096 | SYM_FUNC_END(aes_gcm_aad_update_aesni) |
| 1097 | SYM_FUNC_START(aes_gcm_enc_update_aesni) |
| 1098 | _aes_gcm_update 1 |
| 1099 | SYM_FUNC_END(aes_gcm_enc_update_aesni) |
| 1100 | SYM_FUNC_START(aes_gcm_dec_update_aesni) |
| 1101 | _aes_gcm_update 0 |
| 1102 | SYM_FUNC_END(aes_gcm_dec_update_aesni) |
| 1103 | SYM_FUNC_START(aes_gcm_enc_final_aesni) |
| 1104 | _aes_gcm_final 1 |
| 1105 | SYM_FUNC_END(aes_gcm_enc_final_aesni) |
| 1106 | SYM_FUNC_START(aes_gcm_dec_final_aesni) |
| 1107 | _aes_gcm_final 0 |
| 1108 | SYM_FUNC_END(aes_gcm_dec_final_aesni) |
| 1109 | |
| 1110 | .set USE_AVX, 1 /* Expand the same six macros with USE_AVX = 1; these are the *_aesni_avx variants. The enc/dec argument (1 = encrypt, 0 = decrypt) is passed to _aes_gcm_update and _aes_gcm_final as \enc. NOTE(review): USE_AVX is presumably tested by helper macros defined earlier in the file - confirm there. */ |
| 1111 | SYM_FUNC_START(aes_gcm_precompute_aesni_avx) |
| 1112 | _aes_gcm_precompute |
| 1113 | SYM_FUNC_END(aes_gcm_precompute_aesni_avx) |
| 1114 | SYM_FUNC_START(aes_gcm_aad_update_aesni_avx) |
| 1115 | _aes_gcm_aad_update |
| 1116 | SYM_FUNC_END(aes_gcm_aad_update_aesni_avx) |
| 1117 | SYM_FUNC_START(aes_gcm_enc_update_aesni_avx) |
| 1118 | _aes_gcm_update 1 |
| 1119 | SYM_FUNC_END(aes_gcm_enc_update_aesni_avx) |
| 1120 | SYM_FUNC_START(aes_gcm_dec_update_aesni_avx) |
| 1121 | _aes_gcm_update 0 |
| 1122 | SYM_FUNC_END(aes_gcm_dec_update_aesni_avx) |
| 1123 | SYM_FUNC_START(aes_gcm_enc_final_aesni_avx) |
| 1124 | _aes_gcm_final 1 |
| 1125 | SYM_FUNC_END(aes_gcm_enc_final_aesni_avx) |
| 1126 | SYM_FUNC_START(aes_gcm_dec_final_aesni_avx) |
| 1127 | _aes_gcm_final 0 |
| 1128 | SYM_FUNC_END(aes_gcm_dec_final_aesni_avx) |
| 1129 | |