| 1 | /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ |
| 2 | // |
| 3 | // AES-XTS for modern x86_64 CPUs |
| 4 | // |
| 5 | // Copyright 2024 Google LLC |
| 6 | // |
| 7 | // Author: Eric Biggers <ebiggers@google.com> |
| 8 | // |
| 9 | //------------------------------------------------------------------------------ |
| 10 | // |
| 11 | // This file is dual-licensed, meaning that you can use it under your choice of |
| 12 | // either of the following two licenses: |
| 13 | // |
| 14 | // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy |
| 15 | // of the License at |
| 16 | // |
| 17 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 18 | // |
| 19 | // Unless required by applicable law or agreed to in writing, software |
| 20 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 21 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 22 | // See the License for the specific language governing permissions and |
| 23 | // limitations under the License. |
| 24 | // |
| 25 | // or |
| 26 | // |
| 27 | // Redistribution and use in source and binary forms, with or without |
| 28 | // modification, are permitted provided that the following conditions are met: |
| 29 | // |
| 30 | // 1. Redistributions of source code must retain the above copyright notice, |
| 31 | // this list of conditions and the following disclaimer. |
| 32 | // |
| 33 | // 2. Redistributions in binary form must reproduce the above copyright |
| 34 | // notice, this list of conditions and the following disclaimer in the |
| 35 | // documentation and/or other materials provided with the distribution. |
| 36 | // |
| 37 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 38 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 39 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 40 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| 41 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 42 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 43 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 44 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 45 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 46 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 47 | // POSSIBILITY OF SUCH DAMAGE. |
| 48 | |
| 49 | /* |
| 50 | * This file implements AES-XTS for modern x86_64 CPUs. To handle the |
| 51 | * complexities of coding for x86 SIMD, e.g. where every vector length needs |
| 52 | * different code, it uses a macro to generate several implementations that |
| 53 | * share similar source code but are targeted at different CPUs, listed below: |
| 54 | * |
| 55 | * AES-NI && AVX |
| 56 | * - 128-bit vectors (1 AES block per vector) |
| 57 | * - VEX-coded instructions |
| 58 | * - xmm0-xmm15 |
| 59 | * - This is for older CPUs that lack VAES but do have AVX. |
| 60 | * |
| 61 | * VAES && VPCLMULQDQ && AVX2 |
| 62 | * - 256-bit vectors (2 AES blocks per vector) |
| 63 | * - VEX-coded instructions |
| 64 | * - ymm0-ymm15 |
| 65 | * - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's |
| 66 | * Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm |
| 67 | * registers (e.g. Intel's Ice Lake). |
| 68 | * |
| 69 | * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 |
| 70 | * - 512-bit vectors (4 AES blocks per vector) |
| 71 | * - EVEX-coded instructions |
| 72 | * - zmm0-zmm31 |
| 73 | * - This is for CPUs that have good AVX512 support. |
| 74 | * |
| 75 | * This file doesn't have an implementation for AES-NI alone (without AVX), as |
| 76 | * the lack of VEX would make all the assembly code different. |
| 77 | * |
| 78 | * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of |
| 79 | * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be |
| 80 | * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might |
| 81 | * need to start also providing an implementation using VAES alone. |
| 82 | * |
| 83 | * The AES-XTS implementations in this file support everything required by the |
| 84 | * crypto API, including support for arbitrary input lengths and multi-part |
| 85 | * processing. However, they are most heavily optimized for the common case of |
| 86 | * power-of-2 length inputs that are processed in a single part (disk sectors). |
| 87 | */ |
| 88 | |
| 89 | #include <linux/linkage.h> |
| 90 | #include <linux/cfi_types.h> |
| 91 | |
| 92 | .section .rodata |
| 93 | .p2align 4 /* align to 2^4 = 16 bytes */ |
| 94 | .Lgf_poly: |
| 95 | // The low 64 bits of this value represent the polynomial x^7 + x^2 + x |
| 96 | // + 1. It is the value that must be XOR'd into the low 64 bits of the |
| 97 | // tweak each time a 1 is carried out of the high 64 bits. |
| 98 | // |
| 99 | // The high 64 bits of this value is just the internal carry bit that |
| 100 | // exists when there's a carry out of the low 64 bits of the tweak. |
| 101 | .quad 0x87, 1 /* low qword 0x87 = binary 10000111; high qword = 1 */ |
| 102 | |
| 103 | // These are the shift amounts that are needed when multiplying by [x^0, |
| 104 | // x^1, x^2, x^3] to compute the first vector of tweaks when VL=64. |
| 105 | // |
| 106 | // The right shifts by 64 are expected to zeroize the destination. |
| 107 | // 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the |
| 108 | // amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would. |
| 109 | .Lrshift_amounts: |
| 110 | .byte 64, 64, 63, 63, 62, 62, 61, 61 /* 64 - n for n = 0..3, one pair of entries per 128-bit lane */ |
| 111 | .Llshift_amounts: |
| 112 | .byte 0, 0, 1, 1, 2, 2, 3, 3 /* n for n = 0..3; both tables are widened to qwords with vpmovzxbq when loaded */ |
| 113 | |
| 114 | // This table contains constants for vpshufb and vpblendvb, used to |
| 115 | // handle variable byte shifts and blending during ciphertext stealing |
| 116 | // on CPUs that don't support AVX512-style masking. It is read at a byte offset that depends on LEN (table + LEN and table + 32 - LEN), so each 16-byte load covers part of a run of 0x80 bytes and part of the identity indices 0x00-0x0f. |
| 117 | .Lcts_permute_table: |
| 118 | .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 |
| 119 | .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 |
| 120 | .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 |
| 121 | .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f |
| 122 | .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 |
| 123 | .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 |
| 124 | .text |
| 125 | |
| 126 | .macro _define_Vi i /* Alias V<i> to %xmm<i>, %ymm<i>, or %zmm<i>, whichever matches the selected VL. */ |
| 127 | .if VL == 64 |
| 128 | .set V\i, %zmm\i |
| 129 | .elseif VL == 32 |
| 130 | .set V\i, %ymm\i |
| 131 | .elseif VL == 16 |
| 132 | .set V\i, %xmm\i |
| 133 | .else |
| 134 | .error "Unsupported Vector Length (VL)" |
| 135 | .endif |
| 136 | .endm |
| 137 | |
| 138 | .macro _define_aliases /* Bind the symbolic register names used throughout this file, according to VL and USE_AVX512. */ |
| 139 | // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers |
| 140 | // are available, that map to the xmm, ymm, or zmm registers according |
| 141 | // to the selected Vector Length (VL). |
| 142 | .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 |
| 143 | _define_Vi \i |
| 144 | .endr |
| 145 | .if USE_AVX512 |
| 146 | .irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 |
| 147 | _define_Vi \i |
| 148 | .endr |
| 149 | .endif |
| 150 | |
| 151 | // Function parameters |
| 152 | .set KEY, %rdi // Initially points to crypto_aes_ctx, then is |
| 153 | // advanced to point to 7th-from-last round key |
| 154 | .set SRC, %rsi // Pointer to next source data |
| 155 | .set DST, %rdx // Pointer to next destination data |
| 156 | .set LEN, %ecx // Remaining length in bytes |
| 157 | .set LEN8, %cl /* lowest byte of LEN */ |
| 158 | .set LEN64, %rcx /* the full 64-bit register that contains LEN */ |
| 159 | .set TWEAK, %r8 // Pointer to next tweak |
| 160 | |
| 161 | // %rax holds the AES key length in bytes. |
| 162 | .set KEYLEN, %eax |
| 163 | .set KEYLEN64, %rax |
| 164 | |
| 165 | // %r9-r11 are available as temporaries. |
| 166 | |
| 167 | // V0-V3 hold the data blocks during the main loop, or temporary values |
| 168 | // otherwise. V4-V5 hold temporary values. |
| 169 | |
| 170 | // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak. |
| 171 | .set TWEAK0_XMM, %xmm6 |
| 172 | .set TWEAK0, V6 |
| 173 | .set TWEAK1_XMM, %xmm7 |
| 174 | .set TWEAK1, V7 |
| 175 | .set TWEAK2, V8 |
| 176 | .set TWEAK3, V9 |
| 177 | |
| 178 | // V10-V13 are used for computing the next values of TWEAK[0-3]. |
| 179 | .set NEXT_TWEAK0, V10 |
| 180 | .set NEXT_TWEAK1, V11 |
| 181 | .set NEXT_TWEAK2, V12 |
| 182 | .set NEXT_TWEAK3, V13 |
| 183 | |
| 184 | // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes. |
| 185 | .set GF_POLY_XMM, %xmm14 |
| 186 | .set GF_POLY, V14 |
| 187 | |
| 188 | // V15 holds the key for AES "round 0", copied to all 128-bit lanes. |
| 189 | .set KEY0_XMM, %xmm15 |
| 190 | .set KEY0, V15 |
| 191 | |
| 192 | // If 32 SIMD registers are available, then V16-V29 hold the remaining |
| 193 | // AES round keys, copied to all 128-bit lanes. |
| 194 | // |
| 195 | // AES-128, AES-192, and AES-256 use different numbers of round keys. |
| 196 | // To allow handling all three variants efficiently, we align the round |
| 197 | // keys to the *end* of this register range. I.e., AES-128 uses |
| 198 | // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14. |
| 199 | // (All also use KEY0 for the XOR-only "round" at the beginning.) |
| 200 | .if USE_AVX512 |
| 201 | .set KEY1_XMM, %xmm16 |
| 202 | .set KEY1, V16 |
| 203 | .set KEY2_XMM, %xmm17 |
| 204 | .set KEY2, V17 |
| 205 | .set KEY3_XMM, %xmm18 |
| 206 | .set KEY3, V18 |
| 207 | .set KEY4_XMM, %xmm19 |
| 208 | .set KEY4, V19 |
| 209 | .set KEY5_XMM, %xmm20 |
| 210 | .set KEY5, V20 |
| 211 | .set KEY6_XMM, %xmm21 |
| 212 | .set KEY6, V21 |
| 213 | .set KEY7_XMM, %xmm22 |
| 214 | .set KEY7, V22 |
| 215 | .set KEY8_XMM, %xmm23 |
| 216 | .set KEY8, V23 |
| 217 | .set KEY9_XMM, %xmm24 |
| 218 | .set KEY9, V24 |
| 219 | .set KEY10_XMM, %xmm25 |
| 220 | .set KEY10, V25 |
| 221 | .set KEY11_XMM, %xmm26 |
| 222 | .set KEY11, V26 |
| 223 | .set KEY12_XMM, %xmm27 |
| 224 | .set KEY12, V27 |
| 225 | .set KEY13_XMM, %xmm28 |
| 226 | .set KEY13, V28 |
| 227 | .set KEY14_XMM, %xmm29 |
| 228 | .set KEY14, V29 |
| 229 | .endif |
| 230 | // V30-V31 are currently unused. |
| 231 | .endm |
| 232 | |
| 233 | // Copy one whole vector from \src to \dst (memory to register or register to memory): vmovdqu8 once VL reaches 64, plain vmovdqu below that. |
| 234 | .macro _vmovdqu src, dst |
| 235 | .if VL >= 64 |
| 236 | vmovdqu8 \src, \dst |
| 237 | .else |
| 238 | vmovdqu \src, \dst |
| 239 | .endif |
| 240 | .endm |
| 241 | |
| 242 | // Replicate the 128-bit value \src into the vector \dst. With VL=16 the vector is a single 128-bit lane, so this is just a load. |
| 243 | .macro _vbroadcast128 src, dst |
| 244 | .if VL == 32 |
| 245 | vbroadcasti128 \src, \dst |
| 246 | .elseif VL == 16 |
| 247 | vmovdqu \src, \dst |
| 248 | .else |
| 249 | vbroadcasti32x4 \src, \dst |
| 250 | .endif |
| 251 | .endm |
| 252 | |
| 253 | // Write \src1 XOR \src2 to \dst: vpxord once VL reaches 64, plain vpxor below that. |
| 254 | .macro _vpxor src1, src2, dst |
| 255 | .if VL >= 64 |
| 256 | vpxord \src1, \src2, \dst |
| 257 | .else |
| 258 | vpxor \src1, \src2, \dst |
| 259 | .endif |
| 260 | .endm |
| 261 | |
| 262 | // XOR \src1 and \src2 into \src3_and_dst, which serves as both the third input and the destination. |
| 263 | .macro _xor3 src1, src2, src3_and_dst |
| 264 | .if !USE_AVX512 |
| 265 | vpxor \src1, \src3_and_dst, \src3_and_dst |
| 266 | vpxor \src2, \src3_and_dst, \src3_and_dst |
| 267 | .else |
| 268 | // With immediate 0x96, vpternlogd computes the XOR of its three operands. |
| 269 | vpternlogd $0x96, \src1, \src2, \src3_and_dst |
| 270 | .endif |
| 271 | .endm |
| 272 | |
| 273 | // Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak |
| 274 | // (by multiplying by the polynomial 'x') and write it to \dst. Clobbers \tmp and reads GF_POLY_XMM. \dst may be the same register as \src, since \src is not read again after \dst is written. |
| 275 | .macro _next_tweak src, tmp, dst |
| 276 | vpshufd $0x13, \src, \tmp /* tmp dword 0 = src dword 3, tmp dword 2 = src dword 1 (dwords 1 and 3 get src dword 0) */ |
| 277 | vpaddq \src, \src, \dst /* doubling shifts each 64-bit half left by one bit */ |
| 278 | vpsrad $31, \tmp, \tmp /* replicate the top bit of every dword across that dword */ |
| 279 | .if USE_AVX512 |
| 280 | vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst /* immediate 0x78: dst = dst XOR (tmp AND GF_POLY_XMM), in one instruction */ |
| 281 | .else |
| 282 | vpand GF_POLY_XMM, \tmp, \tmp /* keep 0x87 in the low half and 1 in the high half only where a bit was carried out */ |
| 283 | vpxor \tmp, \dst, \dst |
| 284 | .endif |
| 285 | .endm |
| 286 | |
| 287 | // Given the XTS tweak(s) in the vector \src, compute the next vector of |
| 288 | // tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst. |
| 289 | // |
| 290 | // If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute |
| 291 | // all tweaks in the vector in parallel. If VL=16, we just do the regular |
| 292 | // computation without vpclmulqdq, as it's the faster method for a single tweak. Clobbers \tmp1, and also \tmp2 when VL > 16. |
| 293 | .macro _next_tweakvec src, tmp1, tmp2, dst |
| 294 | .if VL == 16 |
| 295 | _next_tweak \src, \tmp1, \dst |
| 296 | .else |
| 297 | vpsrlq $64 - VL/16, \src, \tmp1 /* per qword: the VL/16 top bits that the left shift below pushes out */ |
| 298 | vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2 /* bits pushed out of the high qword, carrylessly multiplied by the low qword of GF_POLY (0x87) */ |
| 299 | vpslldq $8, \tmp1, \tmp1 /* bits pushed out of the low qword move up into the high qword */ |
| 300 | vpsllq $VL/16, \src, \dst /* shift both qwords of every tweak left by VL/16 bits */ |
| 301 | _xor3 \tmp1, \tmp2, \dst |
| 302 | .endif |
| 303 | .endm |
| 304 | |
| 305 | // Given the first XTS tweak at (TWEAK), compute the first set of tweaks and |
| 306 | // store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. Also loads GF_POLY from .Lgf_poly, which the later tweak computations rely on. |
| 307 | .macro _compute_first_set_of_tweaks |
| 308 | .if VL == 16 /* one tweak per register: each TWEAKn is the previous one multiplied by x */ |
| 309 | vmovdqu (TWEAK), TWEAK0_XMM |
| 310 | vmovdqu .Lgf_poly(%rip), GF_POLY |
| 311 | _next_tweak TWEAK0, %xmm0, TWEAK1 |
| 312 | _next_tweak TWEAK1, %xmm0, TWEAK2 |
| 313 | _next_tweak TWEAK2, %xmm0, TWEAK3 |
| 314 | .elseif VL == 32 |
| 315 | vmovdqu (TWEAK), TWEAK0_XMM |
| 316 | vbroadcasti128 .Lgf_poly(%rip), GF_POLY |
| 317 | |
| 318 | // Compute the first vector of tweaks. |
| 319 | _next_tweak TWEAK0_XMM, %xmm0, %xmm1 |
| 320 | vinserti128 $1, %xmm1, TWEAK0, TWEAK0 /* the second tweak goes into the upper 128 bits of TWEAK0 */ |
| 321 | |
| 322 | // Compute the next three vectors of tweaks: |
| 323 | // TWEAK1 = TWEAK0 * [x^2, x^2] |
| 324 | // TWEAK2 = TWEAK0 * [x^4, x^4] |
| 325 | // TWEAK3 = TWEAK0 * [x^6, x^6] |
| 326 | vpsrlq $64 - 2, TWEAK0, V0 /* V0, V2, V4: the bits that the left shifts below push out of each qword */ |
| 327 | vpsrlq $64 - 4, TWEAK0, V2 |
| 328 | vpsrlq $64 - 6, TWEAK0, V4 |
| 329 | vpclmulqdq $0x01, GF_POLY, V0, V1 /* V1, V3, V5: bits pushed out of the high qwords, carrylessly multiplied by 0x87 */ |
| 330 | vpclmulqdq $0x01, GF_POLY, V2, V3 |
| 331 | vpclmulqdq $0x01, GF_POLY, V4, V5 |
| 332 | vpslldq $8, V0, V0 /* bits pushed out of the low qwords move up into the high qwords */ |
| 333 | vpslldq $8, V2, V2 |
| 334 | vpslldq $8, V4, V4 |
| 335 | vpsllq $2, TWEAK0, TWEAK1 |
| 336 | vpsllq $4, TWEAK0, TWEAK2 |
| 337 | vpsllq $6, TWEAK0, TWEAK3 |
| 338 | vpxor V0, TWEAK1, TWEAK1 |
| 339 | vpxor V2, TWEAK2, TWEAK2 |
| 340 | vpxor V4, TWEAK3, TWEAK3 |
| 341 | vpxor V1, TWEAK1, TWEAK1 |
| 342 | vpxor V3, TWEAK2, TWEAK2 |
| 343 | vpxor V5, TWEAK3, TWEAK3 |
| 344 | .else |
| 345 | vbroadcasti32x4 (TWEAK), TWEAK0 |
| 346 | vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY |
| 347 | |
| 348 | // Compute the first vector of tweaks: |
| 349 | // TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3] |
| 350 | vpmovzxbq .Lrshift_amounts(%rip), V4 /* widen the eight byte-sized shift counts to one count per qword */ |
| 351 | vpsrlvq V4, TWEAK0, V0 |
| 352 | vpclmulqdq $0x01, GF_POLY, V0, V1 |
| 353 | vpmovzxbq .Llshift_amounts(%rip), V4 |
| 354 | vpslldq $8, V0, V0 |
| 355 | vpsllvq V4, TWEAK0, TWEAK0 |
| 356 | vpternlogd $0x96, V0, V1, TWEAK0 /* immediate 0x96: three-way XOR */ |
| 357 | |
| 358 | // Compute the next three vectors of tweaks: |
| 359 | // TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4] |
| 360 | // TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8] |
| 361 | // TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12] |
| 362 | // x^8 only needs byte-aligned shifts, so optimize accordingly. |
| 363 | vpsrlq $64 - 4, TWEAK0, V0 |
| 364 | vpsrldq $(64 - 8) / 8, TWEAK0, V2 /* 128-bit byte shift: the high qword of V2 ends up holding the top 8 bits of each tweak */ |
| 365 | vpsrlq $64 - 12, TWEAK0, V4 |
| 366 | vpclmulqdq $0x01, GF_POLY, V0, V1 |
| 367 | vpclmulqdq $0x01, GF_POLY, V2, V3 |
| 368 | vpclmulqdq $0x01, GF_POLY, V4, V5 |
| 369 | vpslldq $8, V0, V0 |
| 370 | vpslldq $8, V4, V4 |
| 371 | vpsllq $4, TWEAK0, TWEAK1 |
| 372 | vpslldq $8 / 8, TWEAK0, TWEAK2 /* 128-bit shift left by one byte, so V2 needs no separate low-to-high carry step */ |
| 373 | vpsllq $12, TWEAK0, TWEAK3 |
| 374 | vpternlogd $0x96, V0, V1, TWEAK1 |
| 375 | vpxord V3, TWEAK2, TWEAK2 |
| 376 | vpternlogd $0x96, V4, V5, TWEAK3 |
| 377 | .endif |
| 378 | .endm |
| 379 | |
| 380 | // Do one step in computing the next set of tweaks using the method of just |
| 381 | // multiplying by x repeatedly (the same method _next_tweak uses). Steps 0-4 produce NEXT_TWEAK0, 5-9 NEXT_TWEAK1, 10-14 NEXT_TWEAK2 and 15-19 NEXT_TWEAK3; step 1000 copies NEXT_TWEAK[0-3] into TWEAK[0-3]. Any other value of \i emits nothing. PREV_TWEAK and NEXT_TWEAK are only re-bound at steps 0, 5, 10 and 15, so the steps must be issued in increasing order. Uses V5 as scratch. |
| 382 | .macro _tweak_step_mulx i |
| 383 | .if \i == 0 |
| 384 | .set PREV_TWEAK, TWEAK3 /* the first new tweak continues from the last tweak of the current set */ |
| 385 | .set NEXT_TWEAK, NEXT_TWEAK0 |
| 386 | .elseif \i == 5 |
| 387 | .set PREV_TWEAK, NEXT_TWEAK0 |
| 388 | .set NEXT_TWEAK, NEXT_TWEAK1 |
| 389 | .elseif \i == 10 |
| 390 | .set PREV_TWEAK, NEXT_TWEAK1 |
| 391 | .set NEXT_TWEAK, NEXT_TWEAK2 |
| 392 | .elseif \i == 15 |
| 393 | .set PREV_TWEAK, NEXT_TWEAK2 |
| 394 | .set NEXT_TWEAK, NEXT_TWEAK3 |
| 395 | .endif |
| 396 | .if \i >= 0 && \i < 20 && \i % 5 == 0 |
| 397 | vpshufd $0x13, PREV_TWEAK, V5 /* the five instructions below are the non-AVX512 sequence of _next_tweak, one per step */ |
| 398 | .elseif \i >= 0 && \i < 20 && \i % 5 == 1 |
| 399 | vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK |
| 400 | .elseif \i >= 0 && \i < 20 && \i % 5 == 2 |
| 401 | vpsrad $31, V5, V5 |
| 402 | .elseif \i >= 0 && \i < 20 && \i % 5 == 3 |
| 403 | vpand GF_POLY, V5, V5 |
| 404 | .elseif \i >= 0 && \i < 20 && \i % 5 == 4 |
| 405 | vpxor V5, NEXT_TWEAK, NEXT_TWEAK |
| 406 | .elseif \i == 1000 |
| 407 | vmovdqa NEXT_TWEAK0, TWEAK0 /* final step: the new set becomes the current set */ |
| 408 | vmovdqa NEXT_TWEAK1, TWEAK1 |
| 409 | vmovdqa NEXT_TWEAK2, TWEAK2 |
| 410 | vmovdqa NEXT_TWEAK3, TWEAK3 |
| 411 | .endif |
| 412 | .endm |
| 413 | |
| 414 | // Do one step in computing the next set of tweaks using the VPCLMULQDQ method |
| 415 | // (the same method _next_tweakvec uses for VL > 16). This means multiplying |
| 416 | // each tweak by x^(4*VL/16) independently. |
| 417 | // |
| 418 | // Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed |
| 419 | // shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq |
| 420 | // to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves |
| 421 | // instructions directly. The 128-bit right shift (vpsrldq) performs better |
| 422 | // than a 64-bit right shift on Intel CPUs in the context where it is used here, |
| 423 | // because it runs on a different execution port from the AES instructions. Only the even steps 0 through 14 and step 1000 emit code; every other value of \i emits nothing. |
| 424 | .macro _tweak_step_pclmul i |
| 425 | .if \i == 0 |
| 426 | vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 /* steps 0-6: move the top 4*VL/16 bits of each tweak down to the bottom of its 128-bit lane */ |
| 427 | .elseif \i == 2 |
| 428 | vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 |
| 429 | .elseif \i == 4 |
| 430 | vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2 |
| 431 | .elseif \i == 6 |
| 432 | vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3 |
| 433 | .elseif \i == 8 |
| 434 | vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0 /* steps 8-14: carryless multiply of those bits (low qword) by the low qword of GF_POLY (0x87) */ |
| 435 | .elseif \i == 10 |
| 436 | vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1 |
| 437 | .elseif \i == 12 |
| 438 | vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2 |
| 439 | .elseif \i == 14 |
| 440 | vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3 |
| 441 | .elseif \i == 1000 |
| 442 | vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0 /* final step: shift each 128-bit tweak left by 4*VL/16 bits, then XOR in the product computed above */ |
| 443 | vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1 |
| 444 | vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2 |
| 445 | vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3 |
| 446 | _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0 |
| 447 | _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1 |
| 448 | _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2 |
| 449 | _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3 |
| 450 | .endif |
| 451 | .endm |
| 452 | |
| 453 | // _tweak_step emits step \i of the computation that derives the next set of |
| 454 | // tweaks from TWEAK[0-3]. A full computation requires invoking it with ascending |
| 455 | // values of \i covering at least 0 through 19, and then 1000 as the final step. |
| 456 | // |
| 457 | // Splitting the work into steps lets the caller interleave it with the AES |
| 458 | // en/decryptions, which increases performance in some cases. Clobbers V5. |
| 459 | .macro _tweak_step i |
| 460 | .if VL != 16 |
| 461 | _tweak_step_pclmul \i |
| 462 | .else |
| 463 | _tweak_step_mulx \i |
| 464 | .endif |
| 465 | .endm |
| 466 | |
| 467 | .macro _setup_round_keys enc /* Load KEY0, rebase KEY onto the round keys for the chosen direction, and with USE_AVX512 cache the remaining round keys in registers. Expects KEYLEN to be loaded already; modifies the flags when USE_AVX512 is set. */ |
| 468 | |
| 469 | // Select either the encryption round keys or the decryption round keys. |
| 470 | .if \enc |
| 471 | .set OFFS, 0 |
| 472 | .else |
| 473 | .set OFFS, 240 |
| 474 | .endif |
| 475 | |
| 476 | // Load the round key for "round 0". |
| 477 | _vbroadcast128 OFFS(KEY), KEY0 |
| 478 | |
| 479 | // Increment KEY to make it so that 7*16(KEY) is the last round key. |
| 480 | // For AES-128, increment by 3*16, resulting in the 10 round keys (not |
| 481 | // counting the zero-th round key which was just loaded into KEY0) being |
| 482 | // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use |
| 483 | // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment |
| 484 | // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY). |
| 485 | // |
| 486 | // This rebasing provides two benefits. First, it makes the offset to |
| 487 | // any round key be in the range [-96, 112], fitting in a signed byte. |
| 488 | // This shortens VEX-encoded instructions that access the later round |
| 489 | // keys which otherwise would need 4-byte offsets. Second, it makes it |
| 490 | // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the |
| 491 | // beginning. Skipping rounds at the end doesn't work as well because |
| 492 | // the last round needs different instructions. |
| 493 | // |
| 494 | // An alternative approach would be to roll up all the round loops. We |
| 495 | // don't do that because (a) it isn't compatible with caching the round |
| 496 | // keys in registers which we do when possible (see below), (b) we |
| 497 | // interleave the AES rounds with the XTS tweak computation, and (c) it |
| 498 | // seems unwise to rely *too* heavily on the CPU's branch predictor. |
| 499 | lea OFFS-16(KEY, KEYLEN64, 4), KEY /* KEY += OFFS + 4*KEYLEN - 16, i.e. OFFS plus 48, 80 or 112 for KEYLEN = 16, 24 or 32 */ |
| 500 | |
| 501 | // If all 32 SIMD registers are available, cache all the round keys. |
| 502 | .if USE_AVX512 |
| 503 | cmp $24, KEYLEN |
| 504 | jl .Laes128\@ /* KEYLEN below 24: KEY1-KEY4 are not loaded */ |
| 505 | je .Laes192\@ /* KEYLEN equal to 24: KEY1-KEY2 are not loaded */ |
| 506 | vbroadcasti32x4 -6*16(KEY), KEY1 |
| 507 | vbroadcasti32x4 -5*16(KEY), KEY2 |
| 508 | .Laes192\@: |
| 509 | vbroadcasti32x4 -4*16(KEY), KEY3 |
| 510 | vbroadcasti32x4 -3*16(KEY), KEY4 |
| 511 | .Laes128\@: |
| 512 | vbroadcasti32x4 -2*16(KEY), KEY5 |
| 513 | vbroadcasti32x4 -1*16(KEY), KEY6 |
| 514 | vbroadcasti32x4 0*16(KEY), KEY7 |
| 515 | vbroadcasti32x4 1*16(KEY), KEY8 |
| 516 | vbroadcasti32x4 2*16(KEY), KEY9 |
| 517 | vbroadcasti32x4 3*16(KEY), KEY10 |
| 518 | vbroadcasti32x4 4*16(KEY), KEY11 |
| 519 | vbroadcasti32x4 5*16(KEY), KEY12 |
| 520 | vbroadcasti32x4 6*16(KEY), KEY13 |
| 521 | vbroadcasti32x4 7*16(KEY), KEY14 |
| 522 | .endif |
| 523 | .endm |
| 524 | |
| 525 | // Apply one non-final AES round to the block(s) in \data with the round key(s) |
| 526 | // in \key: an encryption round if \enc==1, a decryption round if \enc==0. The |
| 527 | // width of the registers passed in decides how many AES blocks are processed. |
| 528 | .macro _vaes enc, key, data |
| 529 | .if !\enc |
| 530 | vaesdec \key, \data, \data |
| 531 | .else |
| 532 | vaesenc \key, \data, \data |
| 533 | .endif |
| 534 | .endm |
| 535 | |
| 536 | // Counterpart of _vaes for the final AES round (vaesenclast / vaesdeclast). |
| 537 | .macro _vaeslast enc, key, data |
| 538 | .if !\enc |
| 539 | vaesdeclast \key, \data, \data |
| 540 | .else |
| 541 | vaesenclast \key, \data, \data |
| 542 | .endif |
| 543 | .endm |
| 544 | |
| 545 | // Apply one non-final AES round, number \i, to the block(s) in \data, with one |
| 546 | // round key shared by all block(s). With USE_AVX512 the key comes from its cached register; otherwise it is read from memory relative to KEY: |
| 547 | // used directly as a memory operand when \xmm_suffix is given, or broadcast into \tmp first when it is empty. May clobber \tmp. |
| 548 | .macro _vaes_1x enc, i, xmm_suffix, data, tmp |
| 549 | .if !USE_AVX512 |
| 550 | .ifb \xmm_suffix |
| 551 | _vbroadcast128 (\i-7)*16(KEY), \tmp |
| 552 | _vaes \enc, \tmp, \data |
| 553 | .else |
| 554 | _vaes \enc, (\i-7)*16(KEY), \data |
| 555 | .endif |
| 556 | .else |
| 557 | _vaes \enc, KEY\i\xmm_suffix, \data |
| 558 | .endif |
| 559 | .endm |
| 560 | |
| 561 | // Do a single non-last round of AES en/decryption on the blocks in registers |
| 562 | // V0-V3, using the same key for all blocks. The round key is loaded from the |
| 563 | // appropriate register or memory location for round \i. In addition, does two |
| 564 | // steps of the computation of the next set of tweaks. May clobber V4 and V5. The two steps are numbered 2*(\i-5) and 2*(\i-5)+1; step numbers that the _tweak_step_* macros do not act on (for example the negative ones produced when \i < 5) emit no tweak instructions. |
| 565 | .macro _vaes_4x enc, i |
| 566 | .if USE_AVX512 /* the round key is already cached in a register */ |
| 567 | _tweak_step (2*(\i-5)) |
| 568 | _vaes \enc, KEY\i, V0 |
| 569 | _vaes \enc, KEY\i, V1 |
| 570 | _tweak_step (2*(\i-5) + 1) |
| 571 | _vaes \enc, KEY\i, V2 |
| 572 | _vaes \enc, KEY\i, V3 |
| 573 | .else /* the round key is first broadcast from memory into V4 */ |
| 574 | _vbroadcast128 (\i-7)*16(KEY), V4 |
| 575 | _tweak_step (2*(\i-5)) |
| 576 | _vaes \enc, V4, V0 |
| 577 | _vaes \enc, V4, V1 |
| 578 | _tweak_step (2*(\i-5) + 1) |
| 579 | _vaes \enc, V4, V2 |
| 580 | _vaes \enc, V4, V3 |
| 581 | .endif |
| 582 | .endm |
| 583 | |
| 584 | // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, |
| 585 | // then XOR with \tweak again) of the block(s) in \data. To process a single |
| 586 | // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of |
| 587 | // length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp. Also modifies the flags, because it compares KEYLEN against 24 to choose how many rounds to run. |
| 588 | .macro _aes_crypt enc, xmm_suffix, tweak, data, tmp |
| 589 | _xor3 KEY0\xmm_suffix, \tweak, \data |
| 590 | cmp $24, KEYLEN /* below 24: rounds 1-4 are skipped; equal to 24: rounds 1-2 are skipped */ |
| 591 | jl .Laes128\@ |
| 592 | je .Laes192\@ |
| 593 | _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp |
| 594 | _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp |
| 595 | .Laes192\@: |
| 596 | _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp |
| 597 | _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp |
| 598 | .Laes128\@: |
| 599 | .irp i, 5,6,7,8,9,10,11,12,13 |
| 600 | _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp |
| 601 | .endr |
| 602 | .if USE_AVX512 |
| 603 | vpxord KEY14\xmm_suffix, \tweak, \tmp /* last round key XOR tweak: feeding this to the final round applies the second tweak XOR at the same time */ |
| 604 | .else |
| 605 | .ifnb \xmm_suffix |
| 606 | vpxor 7*16(KEY), \tweak, \tmp /* same combination, with the last round key read from memory */ |
| 607 | .else |
| 608 | _vbroadcast128 7*16(KEY), \tmp |
| 609 | vpxor \tweak, \tmp, \tmp |
| 610 | .endif |
| 611 | .endif |
| 612 | _vaeslast \enc, \tmp, \data |
| 613 | .endm |
| 614 | |
| 615 | .macro _aes_xts_crypt enc |
| 616 | _define_aliases |
| 617 | |
| 618 | .if !\enc |
| 619 | // When decrypting a message whose length isn't a multiple of the AES |
| 620 | // block length, exclude the last full block from the main loop by |
| 621 | // subtracting 16 from LEN. This is needed because ciphertext stealing |
| 622 | // decryption uses the last two tweaks in reverse order. We'll handle |
| 623 | // the last full block and the partial block specially at the end. |
| 624 | lea -16(LEN), %eax |
| 625 | test $15, LEN8 |
| 626 | cmovnz %eax, LEN |
| 627 | .endif |
| 628 | |
| 629 | // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). |
| 630 | movl 480(KEY), KEYLEN |
| 631 | |
| 632 | // Setup the pointer to the round keys and cache as many as possible. |
| 633 | _setup_round_keys \enc |
| 634 | |
| 635 | // Compute the first set of tweaks TWEAK[0-3]. |
| 636 | _compute_first_set_of_tweaks |
| 637 | |
| 638 | add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 |
| 639 | jl .Lhandle_remainder\@ |
| 640 | |
| 641 | .Lmain_loop\@: |
| 642 | // This is the main loop, en/decrypting 4*VL bytes per iteration. |
| 643 | |
| 644 | // XOR each source block with its tweak and the zero-th round key. |
| 645 | .if USE_AVX512 |
| 646 | vmovdqu8 0*VL(SRC), V0 |
| 647 | vmovdqu8 1*VL(SRC), V1 |
| 648 | vmovdqu8 2*VL(SRC), V2 |
| 649 | vmovdqu8 3*VL(SRC), V3 |
| 650 | vpternlogd $0x96, TWEAK0, KEY0, V0 |
| 651 | vpternlogd $0x96, TWEAK1, KEY0, V1 |
| 652 | vpternlogd $0x96, TWEAK2, KEY0, V2 |
| 653 | vpternlogd $0x96, TWEAK3, KEY0, V3 |
| 654 | .else |
| 655 | vpxor 0*VL(SRC), KEY0, V0 |
| 656 | vpxor 1*VL(SRC), KEY0, V1 |
| 657 | vpxor 2*VL(SRC), KEY0, V2 |
| 658 | vpxor 3*VL(SRC), KEY0, V3 |
| 659 | vpxor TWEAK0, V0, V0 |
| 660 | vpxor TWEAK1, V1, V1 |
| 661 | vpxor TWEAK2, V2, V2 |
| 662 | vpxor TWEAK3, V3, V3 |
| 663 | .endif |
| 664 | cmp $24, KEYLEN |
| 665 | jl .Laes128\@ |
| 666 | je .Laes192\@ |
| 667 | // Do all the AES rounds on the data blocks, interleaved with |
| 668 | // the computation of the next set of tweaks. |
| 669 | _vaes_4x \enc, 1 |
| 670 | _vaes_4x \enc, 2 |
| 671 | .Laes192\@: |
| 672 | _vaes_4x \enc, 3 |
| 673 | _vaes_4x \enc, 4 |
| 674 | .Laes128\@: |
| 675 | .irp i, 5,6,7,8,9,10,11,12,13 |
| 676 | _vaes_4x \enc, \i |
| 677 | .endr |
| 678 | // Do the last AES round, then XOR the results with the tweaks again. |
| 679 | // Reduce latency by doing the XOR before the vaesenclast, utilizing the |
| 680 | // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) |
| 681 | // (and likewise for vaesdeclast). |
| 682 | .if USE_AVX512 |
| 683 | _tweak_step 18 |
| 684 | _tweak_step 19 |
| 685 | vpxord TWEAK0, KEY14, V4 |
| 686 | vpxord TWEAK1, KEY14, V5 |
| 687 | _vaeslast \enc, V4, V0 |
| 688 | _vaeslast \enc, V5, V1 |
| 689 | vpxord TWEAK2, KEY14, V4 |
| 690 | vpxord TWEAK3, KEY14, V5 |
| 691 | _vaeslast \enc, V4, V2 |
| 692 | _vaeslast \enc, V5, V3 |
| 693 | .else |
| 694 | _vbroadcast128 7*16(KEY), V4 |
| 695 | _tweak_step 18 // uses V5 |
| 696 | _tweak_step 19 // uses V5 |
| 697 | vpxor TWEAK0, V4, V5 |
| 698 | _vaeslast \enc, V5, V0 |
| 699 | vpxor TWEAK1, V4, V5 |
| 700 | _vaeslast \enc, V5, V1 |
| 701 | vpxor TWEAK2, V4, V5 |
| 702 | vpxor TWEAK3, V4, V4 |
| 703 | _vaeslast \enc, V5, V2 |
| 704 | _vaeslast \enc, V4, V3 |
| 705 | .endif |
| 706 | |
| 707 | // Store the destination blocks. |
| 708 | _vmovdqu V0, 0*VL(DST) |
| 709 | _vmovdqu V1, 1*VL(DST) |
| 710 | _vmovdqu V2, 2*VL(DST) |
| 711 | _vmovdqu V3, 3*VL(DST) |
| 712 | |
| 713 | // Finish computing the next set of tweaks. |
| 714 | _tweak_step 1000 |
| 715 | |
| 716 | sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 |
| 717 | sub $-4*VL, DST |
| 718 | add $-4*VL, LEN |
| 719 | jge .Lmain_loop\@ |
| 720 | |
| 721 | // Check for the uncommon case where the data length isn't a multiple of |
| 722 | // 4*VL. Handle it out-of-line in order to optimize for the common |
| 723 | // case. In the common case, just fall through to the ret. |
| 724 | test $4*VL-1, LEN8 |
| 725 | jnz .Lhandle_remainder\@ |
| 726 | .Ldone\@: |
| 727 | // Store the next tweak back to *TWEAK to support continuation calls. |
| 728 | vmovdqu TWEAK0_XMM, (TWEAK) |
| 729 | .if VL > 16 |
| 730 | vzeroupper |
| 731 | .endif |
| 732 | RET |
| 733 | |
| 734 | .Lhandle_remainder\@: |
| 735 | |
| 736 | // En/decrypt any remaining full blocks, one vector at a time. |
| 737 | .if VL > 16 |
| 738 | add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL. |
| 739 | jl .Lvec_at_a_time_done\@ |
| 740 | .Lvec_at_a_time\@: |
| 741 | _vmovdqu (SRC), V0 |
| 742 | _aes_crypt \enc, , TWEAK0, V0, tmp=V1 |
| 743 | _vmovdqu V0, (DST) |
| 744 | _next_tweakvec TWEAK0, V0, V1, TWEAK0 |
| 745 | add $VL, SRC |
| 746 | add $VL, DST |
| 747 | sub $VL, LEN |
| 748 | jge .Lvec_at_a_time\@ |
| 749 | .Lvec_at_a_time_done\@: |
| 750 | add $VL-16, LEN // Undo extra sub of VL, then sub 16. |
| 751 | .else |
| 752 | add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16. |
| 753 | .endif |
| 754 | |
| 755 | // En/decrypt any remaining full blocks, one at a time. |
| 756 | jl .Lblock_at_a_time_done\@ |
| 757 | .Lblock_at_a_time\@: |
| 758 | vmovdqu (SRC), %xmm0 |
| 759 | _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 |
| 760 | vmovdqu %xmm0, (DST) |
| 761 | _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM |
| 762 | add $16, SRC |
| 763 | add $16, DST |
| 764 | sub $16, LEN |
| 765 | jge .Lblock_at_a_time\@ |
| 766 | .Lblock_at_a_time_done\@: |
| 767 | add $16, LEN // Undo the extra sub of 16. |
| 768 | // Now 0 <= LEN <= 15. If LEN is zero, we're done. |
| 769 | jz .Ldone\@ |
| 770 | |
| 771 | // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN. |
| 772 | // Do ciphertext stealing to process the last 16 + LEN bytes. |
| 773 | |
| 774 | .if \enc |
| 775 | // If encrypting, the main loop already encrypted the last full block to |
| 776 | // create the CTS intermediate ciphertext. Prepare for the rest of CTS |
| 777 | // by rewinding the pointers and loading the intermediate ciphertext. |
| 778 | sub $16, SRC |
| 779 | sub $16, DST |
| 780 | vmovdqu (DST), %xmm0 |
| 781 | .else |
| 782 | // If decrypting, the main loop didn't decrypt the last full block |
| 783 | // because CTS decryption uses the last two tweaks in reverse order. |
| 784 | // Do it now by advancing the tweak and decrypting the last full block. |
| 785 | _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM |
| 786 | vmovdqu (SRC), %xmm0 |
| 787 | _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 |
| 788 | .endif |
| 789 | |
| 790 | .if USE_AVX512 |
| 791 | // Create a mask that has the first LEN bits set. |
| 792 | mov $-1, %r9d |
| 793 | bzhi LEN, %r9d, %r9d |
| 794 | kmovd %r9d, %k1 |
| 795 | |
| 796 | // Swap the first LEN bytes of the en/decryption of the last full block |
| 797 | // with the partial block. Note that to support in-place en/decryption, |
| 798 | // the load from the src partial block must happen before the store to |
| 799 | // the dst partial block. |
| 800 | vmovdqa %xmm0, %xmm1 |
| 801 | vmovdqu8 16(SRC), %xmm0{%k1} |
| 802 | vmovdqu8 %xmm1, 16(DST){%k1} |
| 803 | .else |
| 804 | lea .Lcts_permute_table(%rip), %r9 |
| 805 | |
| 806 | // Load the src partial block, left-aligned. Note that to support |
| 807 | // in-place en/decryption, this must happen before the store to the dst |
| 808 | // partial block. |
| 809 | vmovdqu (SRC, LEN64, 1), %xmm1 |
| 810 | |
| 811 | // Shift the first LEN bytes of the en/decryption of the last full block |
| 812 | // to the end of a register, then store it to DST+LEN. This stores the |
| 813 | // dst partial block. It also writes to the second part of the dst last |
| 814 | // full block, but that part is overwritten later. |
| 815 | vpshufb (%r9, LEN64, 1), %xmm0, %xmm2 |
| 816 | vmovdqu %xmm2, (DST, LEN64, 1) |
| 817 | |
| 818 | // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...]. |
| 819 | sub LEN64, %r9 |
| 820 | vmovdqu 32(%r9), %xmm3 |
| 821 | |
| 822 | // Shift the src partial block to the beginning of its register. |
| 823 | vpshufb %xmm3, %xmm1, %xmm1 |
| 824 | |
| 825 | // Do a blend to generate the src partial block followed by the second |
| 826 | // part of the en/decryption of the last full block. |
| 827 | vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 |
| 828 | .endif |
| 829 | // En/decrypt again and store the last full block. |
| 830 | _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 |
| 831 | vmovdqu %xmm0, (DST) |
| 832 | jmp .Ldone\@ |
| 833 | .endm |
| 834 | |
// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//			   u8 iv[AES_BLOCK_SIZE]);
//
// Derive the first XTS tweak: AES-encrypt the 16 bytes at |iv| with the key
// schedule at |tweak_key| and write the result back over |iv|.  Only
// xmm-register AES and AVX instructions are used here, so the CPU needs AES-NI
// and AVX, but not necessarily VAES or AVX512.
//
// In:	  %rdi = tweak_key, %rsi = iv
// Out:	  the 16 bytes at |iv| are replaced by their encryption
// Clobb: %rax, %rdi, %xmm0, flags
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
	.set	TWEAK_KEY,	%rdi
	.set	IV,		%rsi
	.set	KEYLEN,		%eax
	.set	KEYLEN64,	%rax

	// Fetch the AES key length in bytes.  Being a 32-bit load, it also
	// zeroes the upper half of KEYLEN64, which the lea below depends on.
	// NOTE(review): offset 480 is presumably the key_length field of
	// struct crypto_aes_ctx -- confirm against the struct definition.
	movl		480(TWEAK_KEY), KEYLEN

	// Round 0: xor the IV with the round key at the start of the schedule.
	vmovdqu		(IV), %xmm0
	vpxor		(TWEAK_KEY), %xmm0, %xmm0

	// Re-base TWEAK_KEY to schedule + 4*KEYLEN - 16.  Relative to this
	// pointer the last round key is at 7*16 for every key size, and only
	// the index of the first remaining round key varies: -6*16 when
	// KEYLEN > 24, -4*16 when KEYLEN == 24, -2*16 when KEYLEN < 24.
	lea		-16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY

	// Enter the round sequence at the point matching the key size.  The
	// larger key sizes fall through into the rounds of the smaller ones.
	cmp		$24, KEYLEN
	jl		.Lencrypt_iv_aes128
	je		.Lencrypt_iv_aes192
	// KEYLEN > 24: two rounds beyond what AES-192 needs.
.irp i, -6,-5
	vaesenc		\i*16(TWEAK_KEY), %xmm0, %xmm0
.endr
.Lencrypt_iv_aes192:
	// Two rounds beyond what AES-128 needs.
.irp i, -4,-3
	vaesenc		\i*16(TWEAK_KEY), %xmm0, %xmm0
.endr
.Lencrypt_iv_aes128:
	// The nine middle rounds shared by every key size, then the last round.
.irp i, -2,-1,0,1,2,3,4,5,6
	vaesenc		\i*16(TWEAK_KEY), %xmm0, %xmm0
.endr
	vaesenclast	7*16(TWEAK_KEY), %xmm0, %xmm0

	// Overwrite the IV with the encrypted result, i.e. the first tweak.
	vmovdqu		%xmm0, (IV)
	RET
SYM_FUNC_END(aes_xts_encrypt_iv)
| 866 | |
| 867 | // Below are the actual AES-XTS encryption and decryption functions, |
| 868 | // instantiated from the above macro. They all have the following prototype: |
| 869 | // |
| 870 | // void (*xts_crypt_func)(const struct crypto_aes_ctx *key, |
| 871 | // const u8 *src, u8 *dst, int len, |
| 872 | // u8 tweak[AES_BLOCK_SIZE]); |
| 873 | // |
| 874 | // |key| is the data key. |tweak| contains the next tweak; the encryption of |
| 875 | // the original IV with the tweak key was already done. This function supports |
| 876 | // incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and |
| 877 | // |len| must be a multiple of 16 except on the last call. If |len| is a |
| 878 | // multiple of 16, then this function updates |tweak| to contain the next tweak. |
| 879 | |
// AES-NI + AVX variant: 16-byte vectors, no AVX512 code paths.
//
// VL and USE_AVX512 must be assigned before each expansion of _aes_xts_crypt,
// because the macro body tests them with .if while it is being expanded.
.set	VL,		16
.set	USE_AVX512,	0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
	_aes_xts_crypt	enc=1		// enc=1: expand the encryption flavor
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
	_aes_xts_crypt	enc=0		// enc=0: expand the decryption flavor
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
| 888 | |
// VAES + AVX2 variant: 32-byte vectors, no AVX512 code paths.
//
// VL and USE_AVX512 must be assigned before each expansion of _aes_xts_crypt,
// because the macro body tests them with .if while it is being expanded.
.set	VL,		32
.set	USE_AVX512,	0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
	_aes_xts_crypt	enc=1		// enc=1: expand the encryption flavor
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
	_aes_xts_crypt	enc=0		// enc=0: expand the decryption flavor
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
| 897 | |
// VAES + AVX512 variant: 64-byte vectors, with the AVX512-only code paths
// (such as the masked partial-block load/store in ciphertext stealing) enabled.
//
// VL and USE_AVX512 must be assigned before each expansion of _aes_xts_crypt,
// because the macro body tests them with .if while it is being expanded.
.set	VL,		64
.set	USE_AVX512,	1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512)
	_aes_xts_crypt	enc=1		// enc=1: expand the encryption flavor
SYM_FUNC_END(aes_xts_encrypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512)
	_aes_xts_crypt	enc=0		// enc=0: expand the decryption flavor
SYM_FUNC_END(aes_xts_decrypt_vaes_avx512)
| 906 | |