| 1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
| 2 | /* |
| 3 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES |
| 4 | * |
| 5 | * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> |
| 6 | */ |
| 7 | |
| 8 | /* included by aes-ce.S and aes-neon.S */ |
| 9 | |
/* Everything below is assembled into the code section. */
	.text
	.align		4

/*
 * MAX_STRIDE is the number of blocks the bulk loops in this file handle
 * per iteration. It falls back to 4 when nothing has defined it before
 * this point.
 */
#ifndef MAX_STRIDE
#define MAX_STRIDE		4
#endif

/*
 * ST4(x) expands to x only when MAX_STRIDE == 4 and to nothing otherwise;
 * ST5(x) is the exact complement. They let a single instruction stream
 * carry both the 4-way and the 5-way variant of a loop body.
 */
#if MAX_STRIDE != 4
#define ST4(x...)
#define ST5(x...)		x
#else
#define ST4(x...)		x
#define ST5(x...)
#endif
| 24 | |
/*
 * Out-of-line wrapper around the encrypt_block4x macro so that callers can
 * reach it with a plain bl.
 *
 * In/out: v0-v3 (four blocks, processed in place)
 * In:     w3 = rounds, x2 = round key pointer
 * x8 and w7 are handed to the macro as well - presumably as temporaries;
 * confirm against the macro definition in the including file.
 */
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)
| 29 | |
/*
 * Out-of-line wrapper around the decrypt_block4x macro so that callers can
 * reach it with a plain bl.
 *
 * In/out: v0-v3 (four blocks, processed in place)
 * In:     w3 = rounds, x2 = round key pointer
 * x8 and w7 are handed to the macro as well - presumably as temporaries;
 * confirm against the macro definition in the including file.
 */
SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)
| 34 | |
#if MAX_STRIDE == 5
/*
 * Five-block counterparts of the 4x wrappers; only assembled when
 * MAX_STRIDE is 5.
 *
 * In/out: v0-v4 (five blocks, processed in place)
 * In:     w3 = rounds, x2 = round key pointer
 * x8 and w7 are handed to the macros as well - presumably as temporaries;
 * confirm against the macro definitions in the including file.
 */
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif
| 46 | |
/*
 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks)
 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks)
 *
 * Register use in aes_ecb_encrypt:
 *   x0 - out, post-incremented by every store
 *   x1 - in, post-incremented by every load
 *   x2 - rk, w3 - rounds
 *   w4 - blocks still to be processed
 *   x5, w6 - handed to enc_prepare/encrypt_block (presumably temporaries)
 */

AES_FUNC_START(aes_ecb_encrypt)
	frame_push	0				/* the bulk loop uses bl */

	enc_prepare	w3, x2, x5

	/* Bulk path: MAX_STRIDE blocks per iteration. */
.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE
	b.mi		.Lecbenc1x			/* fewer than MAX_STRIDE left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* first 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx

	/* Tail path: undo the last subtraction, then go block by block. */
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE
	b.eq		.Lecbencout			/* nothing left over */
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lecbencloop
.Lecbencout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_encrypt)
| 82 | |
| 83 | |
/*
 * ECB decryption; same argument layout as aes_ecb_encrypt:
 *   x0 - out, post-incremented by every store
 *   x1 - in, post-incremented by every load
 *   x2 - rk, w3 - rounds
 *   w4 - blocks still to be processed
 *   x5, w6 - handed to dec_prepare/decrypt_block (presumably temporaries)
 */
AES_FUNC_START(aes_ecb_decrypt)
	frame_push	0				/* the bulk loop uses bl */

	dec_prepare	w3, x2, x5

	/* Bulk path: MAX_STRIDE blocks per iteration. */
.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	b.mi		.Lecbdec1x			/* fewer than MAX_STRIDE left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* first 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx

	/* Tail path: undo the last subtraction, then go block by block. */
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE
	b.eq		.Lecbdecout			/* nothing left over */
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lecbdecloop
.Lecbdecout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_decrypt)
| 112 | |
| 113 | |
/*
 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
 *			 int rounds, int blocks, u8 iv[],
 *			 u32 const rk2[]);
 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
 *			 int rounds, int blocks, u8 iv[],
 *			 u32 const rk2[]);
 *
 * Encryption side. v4 carries the chaining value throughout and is written
 * back to iv[] on exit. No frame is pushed because nothing here uses bl.
 */

	/*
	 * ESSIV entry: encrypt the IV with rk2 (x6) using a fixed 14 rounds,
	 * switch over to rk1 (x2) and join the plain CBC loop.
	 */
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* fetch iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* fetch iv */
	enc_prepare	w3, x2, x6

	/*
	 * Four blocks are loaded and stored together, but each encryption
	 * depends on the previous ciphertext so they run back to back.
	 */
.Lcbcencloop4x:
	subs		w4, w4, #4
	b.mi		.Lcbcenc1x			/* fewer than 4 left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* chain in the iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b			/* last ct is the next iv */
	b		.Lcbcencloop4x

	/* Tail: restore the block count and finish one block at a time. */
.Lcbcenc1x:
	adds		w4, w4, #4
	b.eq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* chain with previous ct/iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* hand back the iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
| 170 | |
/*
 * CBC decryption, with an ESSIV entry point in front of it.
 *
 * x0 = out, x1 = in, x2 = rk (rk1 for ESSIV), w3 = rounds, w4 = blocks,
 * x5 = iv, x6 = rk2 (ESSIV only). cbciv is a register alias defined by the
 * including file; it carries the chaining value and is written back to
 * iv[] on exit.
 */

	/*
	 * ESSIV entry: encrypt the IV with rk2 (x6) using a fixed 14 rounds,
	 * then fall into the common path, which sets up rk1 for decryption.
	 */
AES_FUNC_START(aes_essiv_cbc_decrypt)
	ld1		{cbciv.16b}, [x5]		/* fetch iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	ld1		{cbciv.16b}, [x5]		/* fetch iv */
.Lessivcbcdecstart:
	frame_push	0				/* the bulk loop uses bl */
	dec_prepare	w3, x2, x6

	/*
	 * Bulk path: decrypt MAX_STRIDE blocks at once. Copies of the leading
	 * ciphertext blocks are kept in registers for the chaining xor; the
	 * trailing ones are reloaded from memory after the call.
	 */
.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	b.mi		.Lcbcdec1x			/* fewer than MAX_STRIDE left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* ct blocks 0-3 */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* ct block 4 */
	mov		v5.16b, v0.16b			/* keep ct0 */
	mov		v6.16b, v1.16b			/* keep ct1 */
	mov		v7.16b, v2.16b			/* keep ct2 */
	bl		aes_decrypt_block5x
	sub		x1, x1, #32			/* back up to ct3 */
	eor		v0.16b, v0.16b, cbciv.16b	/* pt0 = D(ct0) ^ iv */
	eor		v1.16b, v1.16b, v5.16b		/* pt1 = D(ct1) ^ ct0 */
	ld1		{v5.16b}, [x1], #16		/* reload ct3 */
	ld1		{cbciv.16b}, [x1], #16		/* reload ct4: next iv */
	eor		v2.16b, v2.16b, v6.16b		/* pt2 = D(ct2) ^ ct1 */
	eor		v3.16b, v3.16b, v7.16b		/* pt3 = D(ct3) ^ ct2 */
	eor		v4.16b, v4.16b, v5.16b		/* pt4 = D(ct4) ^ ct3 */
#else
	mov		v4.16b, v0.16b			/* keep ct0 */
	mov		v5.16b, v1.16b			/* keep ct1 */
	mov		v6.16b, v2.16b			/* keep ct2 */
	bl		aes_decrypt_block4x
	sub		x1, x1, #16			/* back up to ct3 */
	eor		v0.16b, v0.16b, cbciv.16b	/* pt0 = D(ct0) ^ iv */
	eor		v1.16b, v1.16b, v4.16b		/* pt1 = D(ct1) ^ ct0 */
	ld1		{cbciv.16b}, [x1], #16		/* reload ct3: next iv */
	eor		v2.16b, v2.16b, v5.16b		/* pt2 = D(ct2) ^ ct1 */
	eor		v3.16b, v3.16b, v6.16b		/* pt3 = D(ct3) ^ ct2 */
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx

	/* Tail: restore the block count and finish one block at a time. */
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE
	b.eq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* next ct block */
	mov		v0.16b, v1.16b			/* work on a copy in v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv gives pt */
	mov		cbciv.16b, v1.16b		/* this ct is the next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* hand back the iv */
	frame_pop
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
| 236 | |
| 237 | |
/*
 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
 *		       int rounds, int bytes, u8 const iv[])
 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
 *		       int rounds, int bytes, u8 const iv[])
 *
 * Both routines handle exactly two blocks: one full block followed by a
 * tail of (bytes - 16) bytes. The second load and the first store are
 * placed at offset (bytes - 16), so they overlap the first block whenever
 * the tail is shorter than 16 bytes.
 */

AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16			/* x4 = tail length */
	add		x9, x8, #32
	add		x8, x8, x4			/* x8 = table + tail */
	sub		x9, x9, x4			/* x9 = table + 32 - tail */
	ld1		{v3.16b}, [x8]			/* permute vector for the ct */
	ld1		{v4.16b}, [x9]			/* permute vector for the pt tail */

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* fetch iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* chain in the iv */
	tbl		v1.16b, {v1.16b}, v4.16b	/* 0xff indices yield zero bytes */
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4			/* x4 = out + tail length */
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)
| 273 | |
/*
 * CBC ciphertext-stealing decryption of one full block plus a tail.
 *
 * x0 = out, x1 = in, x2 = rk, w3 = rounds, x4 = bytes, x5 = iv.
 * Mirrors aes_cbc_cts_encrypt: the second load and the first store are
 * placed at offset (bytes - 16) and may overlap the first block.
 */
AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16			/* x4 = tail length */
	add		x9, x8, #32
	add		x8, x8, x4			/* x8 = table + tail */
	sub		x9, x9, x4			/* x9 = table + 32 - tail */
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* fetch iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b		/* v2 ends up at out + tail */

	tbx		v0.16b, {v1.16b}, v4.16b	/* 0xff indices keep v0 bytes */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* unchain with the iv */

	add		x4, x0, x4			/* x4 = out + tail length */
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)
| 302 | |
/*
 * 48-byte index table for tbl/tbx: the identity permutation 0..15 with
 * sixteen 0xff bytes on either side. Loading 16 bytes at
 * (table + n) or (table + 32 - n) gives an index vector that shifts a
 * block by n bytes; 0xff is out of range for a one-register lookup.
 */
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
| 313 | |
/*
 * This macro generates the code for CTR and XCTR mode.
 *
 * xctr selects the variant: 0 emits CTR, non-zero emits XCTR. The value is
 * also appended to every named local label so that both expansions can
 * live in the same object. vctr is a register alias defined by the
 * including file.
 */
	.macro ctr_encrypt xctr
	// Arguments
	OUT		.req x0
	IN		.req x1
	KEY		.req x2
	ROUNDS_W	.req w3
	BYTES_W		.req w4
	IV		.req x5
	BYTE_CTR_W	.req w6		// XCTR only
	// Intermediate values
	CTR_W		.req w11	// XCTR only
	CTR		.req x11	// XCTR only
	IV_PART		.req x12
	BLOCKS		.req x13
	BLOCKS_W	.req w13

	frame_push	0

	enc_prepare	ROUNDS_W, KEY, IV_PART
	ld1		{vctr.16b}, [IV]

	/*
	 * Keep 64 bits of the IV in a register. For CTR mode this lets us
	 * easily increment the IV. For XCTR mode this lets us efficiently XOR
	 * the 64-bit counter with the IV.
	 */
	.if \xctr
		umov		IV_PART, vctr.d[0]
		lsr		CTR_W, BYTE_CTR_W, #4	// byte count -> block count
	.else
		umov		IV_PART, vctr.d[1]
		rev		IV_PART, IV_PART	// byte-reverse for integer adds
	.endif

.LctrloopNx\xctr:
	/* BLOCKS = min((bytes + 15) / 16, MAX_STRIDE); bytes -= 16 * MAX_STRIDE */
	add		BLOCKS_W, BYTES_W, #15
	sub		BYTES_W, BYTES_W, #MAX_STRIDE << 4
	lsr		BLOCKS_W, BLOCKS_W, #4
	mov		w8, #MAX_STRIDE
	cmp		BLOCKS_W, w8
	csel		BLOCKS_W, BLOCKS_W, w8, lt

	/*
	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
	 *
	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
	 * handling code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
	 */
	.if \xctr
		add		CTR, CTR, BLOCKS
	.else
		adds		IV_PART, IV_PART, BLOCKS	// carry is tested by b.cs below
	.endif
	mov		v0.16b, vctr.16b
	mov		v1.16b, vctr.16b
	mov		v2.16b, vctr.16b
	mov		v3.16b, vctr.16b
ST5(	mov		v4.16b, vctr.16b		)
	.if \xctr
		/* Low 64 bits of block i = IV_PART ^ (CTR - (MAX_STRIDE - 1 - i)) */
		sub		x6, CTR, #MAX_STRIDE - 1
		sub		x7, CTR, #MAX_STRIDE - 2
		sub		x8, CTR, #MAX_STRIDE - 3
		sub		x9, CTR, #MAX_STRIDE - 4
ST5(		sub		x10, CTR, #MAX_STRIDE - 5	)
		eor		x6, x6, IV_PART
		eor		x7, x7, IV_PART
		eor		x8, x8, IV_PART
		eor		x9, x9, IV_PART
ST5(		eor		x10, x10, IV_PART		)
		mov		v0.d[0], x6
		mov		v1.d[0], x7
		mov		v2.d[0], x8
		mov		v3.d[0], x9
ST5(		mov		v4.d[0], x10			)
	.else
		b.cs		0f
		.subsection	1
		/*
		 * This subsection handles carries.
		 *
		 * Conditional branching here is allowed with respect to time
		 * invariance since the branches are dependent on the IV instead
		 * of the plaintext or key.  This code is rarely executed in
		 * practice anyway.
		 */

		/* Apply carry to outgoing counter. */
0:		umov		x8, vctr.d[0]
		rev		x8, x8
		add		x8, x8, #1
		rev		x8, x8
		ins		vctr.d[0], x8

		/*
		 * Apply carry to counter blocks if needed.
		 *
		 * Since the carry flag was set, we know 0 <= IV_PART <
		 * MAX_STRIDE.  Using the value of IV_PART we can determine how
		 * many counter blocks need to be updated.
		 *
		 * Each slot below is exactly two instructions (8 bytes), which
		 * is what the "lsl #3" in the branch target computation relies
		 * on; do not add or remove instructions between br and 1:.
		 */
		cbz		IV_PART, 2f
		adr		x16, 1f
		sub		x16, x16, IV_PART, lsl #3
		br		x16
		bti		c
		mov		v0.d[0], vctr.d[0]
		bti		c
		mov		v1.d[0], vctr.d[0]
		bti		c
		mov		v2.d[0], vctr.d[0]
		bti		c
		mov		v3.d[0], vctr.d[0]
ST5(		bti		c				)
ST5(		mov		v4.d[0], vctr.d[0]		)
1:		b		2f
		.previous

		/* Write back the outgoing counter, then fill in blocks 1..N-1. */
2:		rev		x7, IV_PART
		ins		vctr.d[1], x7
		sub		x7, IV_PART, #MAX_STRIDE - 1
		sub		x8, IV_PART, #MAX_STRIDE - 2
		sub		x9, IV_PART, #MAX_STRIDE - 3
		rev		x7, x7
		rev		x8, x8
		mov		v1.d[1], x7
		rev		x9, x9
ST5(		sub		x10, IV_PART, #MAX_STRIDE - 4	)
		mov		v2.d[1], x8
ST5(		rev		x10, x10			)
		mov		v3.d[1], x9
ST5(		mov		v4.d[1], x10			)
	.endif

	/*
	 * If there are at least MAX_STRIDE blocks left, XOR the data with
	 * keystream and store.  Otherwise jump to tail handling.
	 */
	tbnz		BYTES_W, #31, .Lctrtail\xctr	// negative: short final pass
	ld1		{v5.16b-v7.16b}, [IN], #48
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [IN], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [IN], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [OUT], #64
ST5(	st1		{v4.16b}, [OUT], #16		)
	cbz		BYTES_W, .Lctrout\xctr
	b		.LctrloopNx\xctr

.Lctrout\xctr:
	.if !\xctr
		st1		{vctr.16b}, [IV] /* return next CTR value */
	.endif
	frame_pop
	ret

.Lctrtail\xctr:
	/*
	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
	 *
	 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
	 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
	 * v4 should have the next two counter blocks.
	 *
	 * This allows us to store the ciphertext by writing to overlapping
	 * regions of memory.  Any invalid ciphertext blocks get overwritten by
	 * correctly computed blocks.  This approach greatly simplifies the
	 * logic for storing the ciphertext.
	 */
	mov		x16, #16
	ands		w7, BYTES_W, #0xf		// w7 = bytes mod 16
	csel		x13, x7, x16, ne		// x13 = length of the last block

	/* x14/x15/x16: post-increment (16 or 0) for each leading block. */
ST5(	cmp		BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(	csel		x14, x16, xzr, gt		)
	cmp		BYTES_W, #48 - (MAX_STRIDE << 4)
	csel		x15, x16, xzr, gt
	cmp		BYTES_W, #32 - (MAX_STRIDE << 4)
	csel		x16, x16, xzr, gt
	cmp		BYTES_W, #16 - (MAX_STRIDE << 4)

	adr_l		x9, .Lcts_permute_table
	add		x9, x9, x13
	b.le		.Lctrtail1x\xctr		// at most 16 bytes in total

ST5(	ld1		{v5.16b}, [IN], x14		)
	ld1		{v6.16b}, [IN], x15
	ld1		{v7.16b}, [IN], x16

ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)

	ld1		{v8.16b}, [IN], x13
	ld1		{v9.16b}, [IN]
	ld1		{v10.16b}, [x9]

ST4(	eor		v6.16b, v6.16b, v0.16b		)
ST4(	eor		v7.16b, v7.16b, v1.16b		)
ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
ST4(	eor		v8.16b, v8.16b, v2.16b		)
ST4(	eor		v9.16b, v9.16b, v3.16b		)

ST5(	eor		v5.16b, v5.16b, v0.16b		)
ST5(	eor		v6.16b, v6.16b, v1.16b		)
ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
ST5(	eor		v7.16b, v7.16b, v2.16b		)
ST5(	eor		v8.16b, v8.16b, v3.16b		)
ST5(	eor		v9.16b, v9.16b, v4.16b		)

ST5(	st1		{v5.16b}, [OUT], x14		)
	st1		{v6.16b}, [OUT], x15
	st1		{v7.16b}, [OUT], x16
	add		x13, x13, OUT
	st1		{v9.16b}, [x13]		// overlapping stores
	st1		{v8.16b}, [OUT]
	b		.Lctrout\xctr

.Lctrtail1x\xctr:
	/*
	 * Handle <= 16 bytes of plaintext
	 *
	 * This code always reads and writes 16 bytes.  To avoid out of bounds
	 * accesses, XCTR and CTR modes must use a temporary buffer when
	 * encrypting/decrypting less than 16 bytes.
	 *
	 * This code is unusual in that it loads the input and stores the output
	 * relative to the end of the buffers rather than relative to the start.
	 * This causes unusual behaviour when encrypting/decrypting less than 16
	 * bytes; the end of the data is expected to be at the end of the
	 * temporary buffer rather than the start of the data being at the start
	 * of the temporary buffer.
	 */
	sub		x8, x7, #16
	csel		x7, x7, x8, eq			// flags: still from the last cmp
	add		IN, IN, x7
	add		OUT, OUT, x7
	ld1		{v5.16b}, [IN]
	ld1		{v6.16b}, [OUT]
ST5(	mov		v3.16b, v4.16b			)
	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	ld1		{v10.16b-v11.16b}, [x9]
	tbl		v3.16b, {v3.16b}, v10.16b
	sshr		v11.16b, v11.16b, #7		// 0xff -> 0xff, index bytes -> 0
	eor		v5.16b, v5.16b, v3.16b
	bif		v5.16b, v6.16b, v11.16b		// keep old OUT bytes where mask is 0
	st1		{v5.16b}, [OUT]
	b		.Lctrout\xctr

	// Arguments
	.unreq OUT
	.unreq IN
	.unreq KEY
	.unreq ROUNDS_W
	.unreq BYTES_W
	.unreq IV
	.unreq BYTE_CTR_W	// XCTR only
	// Intermediate values
	.unreq CTR_W		// XCTR only
	.unreq CTR		// XCTR only
	.unreq IV_PART
	.unreq BLOCKS
	.unreq BLOCKS_W
	.endm
| 586 | |
/*
 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int bytes, u8 ctr[])
 *
 * CTR variant of the ctr_encrypt macro (xctr = 0); the updated counter is
 * stored back to ctr[] on return.
 *
 * The input and output buffers must always be at least 16 bytes even if
 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
 * accesses will occur.  The data to be encrypted/decrypted is expected
 * to be at the end of this 16-byte temporary buffer rather than the
 * start.
 */

AES_FUNC_START(aes_ctr_encrypt)
	ctr_encrypt	0
AES_FUNC_END(aes_ctr_encrypt)
| 601 | |
/*
 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		    int bytes, u8 const iv[], int byte_ctr)
 *
 * XCTR variant of the ctr_encrypt macro (xctr = 1); iv[] is only read.
 *
 * The input and output buffers must always be at least 16 bytes even if
 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
 * accesses will occur.  The data to be encrypted/decrypted is expected
 * to be at the end of this 16-byte temporary buffer rather than the
 * start.
 */

AES_FUNC_START(aes_xctr_encrypt)
	ctr_encrypt	1
AES_FUNC_END(aes_xctr_encrypt)
| 616 | |
| 617 | |
/*
 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int bytes, u8 const rk2[], u8 iv[], int first)
 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int bytes, u8 const rk2[], u8 iv[], int first)
 */

	/*
	 * next_tweak out, in, tmp
	 *
	 * Derive the following tweak from \in and place it in \out; \in and
	 * \out may be the same register, \tmp is clobbered. Each 64-bit half
	 * of \in is doubled, and the top bit shifted out of one half is
	 * folded into the other half through the constants held in xtsmask
	 * (a register alias defined by the including file).
	 */
	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63			/* all-ones where the top bit is set */
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d		/* double both halves */
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8	/* swap the two halves */
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm
| 632 | |
| 633 | .macro xts_load_mask, tmp |
| 634 | movi xtsmask.2s, #0x1 |
| 635 | movi \tmp\().2s, #0x87 |
| 636 | uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s |
| 637 | .endm |
| 638 | |
/*
 * XTS encryption with ciphertext stealing for a trailing partial block.
 *
 *   x0 - out		x1 - in		x2 - rk1 (data key)
 *   w3 - rounds	w4 - bytes	x5 - rk2 (tweak key)
 *   x6 - iv, read on entry and updated with the running tweak on exit
 *   w7 - first: when zero, the tweak in iv[] is advanced once before use;
 *	  otherwise rk2 is set up and, unless xts_cts_skip_tw branches
 *	  away, iv[] is encrypted with it to form the first tweak.
 *
 * v4 holds the current tweak, v5-v7 the following three in the bulk loop,
 * v8 is the temporary handed to next_tweak/xts_load_mask.
 * xts_cts_skip_tw and xts_reload_mask are macros supplied by the including
 * file; their exact behaviour is not visible here.
 *
 * A call with bytes == 0 only writes the tweak back to iv[]; nothing is
 * stored to out[].
 */
AES_FUNC_START(aes_xts_encrypt)
	frame_push	0				/* the bulk loop uses bl */

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	b.mi		.Lxtsenc1x			/* fewer than 4 blocks left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	/*
	 * Zero is only possible here when the function was entered with
	 * bytes == 0 (the bulk loop leaves via .Lxtsencret instead of coming
	 * back with nothing left). No block has been loaded into v0 in that
	 * case, so skip the store of v0 and just write back the tweak.
	 */
	b.eq		.Lxtsencret
	subs		w4, w4, #16
	b.mi		.LxtsencctsNx			/* only a partial block left */
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b.mi		.Lxtsenccts			/* partial block follows */
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]			/* hand back the tweak */
	frame_pop
	ret

	/*
	 * Ciphertext stealing. When entered via .LxtsencctsNx the previous
	 * full block is the last one of the bulk loop: it is still in v3 and
	 * was already stored, so step the output pointer back over it.
	 */
.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* makes .Lxtsencctsout finish */
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)
| 724 | |
/*
 * XTS decryption with ciphertext stealing for a trailing partial block.
 *
 *   x0 - out		x1 - in		x2 - rk1 (data key)
 *   w3 - rounds	w4 - bytes	x5 - rk2 (tweak key)
 *   x6 - iv, read on entry and updated with the running tweak on exit
 *   w7 - first: when zero, the tweak in iv[] is advanced once before use;
 *	  otherwise iv[] is encrypted with rk2 to form the first tweak
 *	  unless xts_cts_skip_tw branches around that step.
 *
 * v4 holds the current tweak, v5-v7 the following three in the bulk loop,
 * v8 is the temporary handed to next_tweak/xts_load_mask.
 * xts_cts_skip_tw and xts_reload_mask are macros supplied by the including
 * file; their exact behaviour is not visible here.
 */
AES_FUNC_START(aes_xts_decrypt)
	frame_push	0				/* the bulk loop uses bl */

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	b.mi		.Lxtsdec1x			/* fewer than 4 blocks left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used */
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	b.eq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16		/* does not touch the flags */
	b.mi		.Lxtsdeccts			/* flags from the preceding subs */
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]			/* hand back the tweak */
	frame_pop
	ret

	/*
	 * Ciphertext stealing: the block in v0 is decrypted with the tweak
	 * after the current one (v5), then the rebuilt block goes through
	 * .Lxtsdecctsout with the current tweak (v4).
	 */
.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr			/* makes .Lxtsdecctsout finish */
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)
| 817 | |
/*
 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
 *		  int blocks, u8 dg[], int enc_before, int enc_after)
 *
 *   x0 - in		x1 - rk		w2 - rounds	w3 - blocks
 *   x4 - dg, loaded on entry and stored on every exit path
 *   w5 - enc_before: when non-zero, dg is encrypted once before any input
 *	  is mixed in
 *   w6 - enc_after: when zero, the encryption after the final block is
 *	  skipped
 *
 * Returns in w0 the number of blocks left unprocessed; this can be
 * non-zero only when cond_yield (a macro supplied elsewhere) branches to
 * .Lmacout early.
 */
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* fetch dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

	/* Four blocks per iteration; they chain, so they run back to back. */
.Lmacloop4x:
	subs		w3, w3, #4
	b.mi		.Lmac1x				/* fewer than 4 left */
	ld1		{v1.16b-v4.16b}, [x0], #64	/* next 4 pt blocks */
	eor		v0.16b, v0.16b, v1.16b		/* mix block into dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq			/* last block ? enc_after : ~0 */
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* dg is current before a possible yield */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x

	/* Tail: restore the block count and finish one block at a time. */
.Lmac1x:
	add		w3, w3, #4
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* mix block into dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq			/* last block ? enc_after : ~0 */
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* hand back dg */
	mov		w0, w3				/* blocks not yet processed */
	ret
AES_FUNC_END(aes_mac_update)
| 867 | |