1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES |
4 | * |
5 | * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> |
6 | */ |
7 | |
8 | /* included by aes-ce.S and aes-neon.S */ |
9 | |
10 | .text |
11 | .align 4 |
12 | |
13 | #ifndef MAX_STRIDE |
14 | #define MAX_STRIDE 4 |
15 | #endif |
16 | |
17 | #if MAX_STRIDE == 4 |
18 | #define ST4(x...) x |
19 | #define ST5(x...) |
20 | #else |
21 | #define ST4(x...) |
22 | #define ST5(x...) x |
23 | #endif |
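/*
 * ST4() and ST5() expand to their arguments only when MAX_STRIDE is 4 or
 * 5, respectively.  This lets the 4-way and 5-way interleaved code paths
 * share a single body, with the stride-specific instructions assembled
 * in or out as appropriate.
 */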
24 | |
25 | SYM_FUNC_START_LOCAL(aes_encrypt_block4x) |
26 | encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 |
27 | ret |
28 | SYM_FUNC_END(aes_encrypt_block4x) |
29 | |
30 | SYM_FUNC_START_LOCAL(aes_decrypt_block4x) |
31 | decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 |
32 | ret |
33 | SYM_FUNC_END(aes_decrypt_block4x) |
34 | |
35 | #if MAX_STRIDE == 5 |
36 | SYM_FUNC_START_LOCAL(aes_encrypt_block5x) |
37 | encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 |
38 | ret |
39 | SYM_FUNC_END(aes_encrypt_block5x) |
40 | |
41 | SYM_FUNC_START_LOCAL(aes_decrypt_block5x) |
42 | decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 |
43 | ret |
44 | SYM_FUNC_END(aes_decrypt_block5x) |
45 | #endif |
46 | |
47 | /* |
 * aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		   int blocks)
 * aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		   int blocks)
52 | */ |
53 | |
54 | AES_FUNC_START(aes_ecb_encrypt) |
55 | frame_push 0 |
56 | |
57 | enc_prepare w3, x2, x5 |
58 | |
59 | .LecbencloopNx: |
60 | subs w4, w4, #MAX_STRIDE |
61 | bmi .Lecbenc1x |
62 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
63 | ST4( bl aes_encrypt_block4x ) |
64 | ST5( ld1 {v4.16b}, [x1], #16 ) |
65 | ST5( bl aes_encrypt_block5x ) |
66 | st1 {v0.16b-v3.16b}, [x0], #64 |
67 | ST5( st1 {v4.16b}, [x0], #16 ) |
68 | b .LecbencloopNx |
69 | .Lecbenc1x: |
70 | adds w4, w4, #MAX_STRIDE |
71 | beq .Lecbencout |
72 | .Lecbencloop: |
73 | ld1 {v0.16b}, [x1], #16 /* get next pt block */ |
74 | encrypt_block v0, w3, x2, x5, w6 |
75 | st1 {v0.16b}, [x0], #16 |
76 | subs w4, w4, #1 |
77 | bne .Lecbencloop |
78 | .Lecbencout: |
79 | frame_pop |
80 | ret |
81 | AES_FUNC_END(aes_ecb_encrypt) |
82 | |
83 | |
84 | AES_FUNC_START(aes_ecb_decrypt) |
85 | frame_push 0 |
86 | |
87 | dec_prepare w3, x2, x5 |
88 | |
89 | .LecbdecloopNx: |
90 | subs w4, w4, #MAX_STRIDE |
91 | bmi .Lecbdec1x |
92 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
93 | ST4( bl aes_decrypt_block4x ) |
94 | ST5( ld1 {v4.16b}, [x1], #16 ) |
95 | ST5( bl aes_decrypt_block5x ) |
96 | st1 {v0.16b-v3.16b}, [x0], #64 |
97 | ST5( st1 {v4.16b}, [x0], #16 ) |
98 | b .LecbdecloopNx |
99 | .Lecbdec1x: |
100 | adds w4, w4, #MAX_STRIDE |
101 | beq .Lecbdecout |
102 | .Lecbdecloop: |
103 | ld1 {v0.16b}, [x1], #16 /* get next ct block */ |
104 | decrypt_block v0, w3, x2, x5, w6 |
105 | st1 {v0.16b}, [x0], #16 |
106 | subs w4, w4, #1 |
107 | bne .Lecbdecloop |
108 | .Lecbdecout: |
109 | frame_pop |
110 | ret |
111 | AES_FUNC_END(aes_ecb_decrypt) |
112 | |
113 | |
114 | /* |
 * aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		   int blocks, u8 iv[])
 * aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		   int blocks, u8 iv[])
119 | * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[], |
120 | * int rounds, int blocks, u8 iv[], |
121 | * u32 const rk2[]); |
122 | * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], |
123 | * int rounds, int blocks, u8 iv[], |
124 | * u32 const rk2[]); |
125 | */ |
126 | |
127 | AES_FUNC_START(aes_essiv_cbc_encrypt) |
128 | ld1 {v4.16b}, [x5] /* get iv */ |
129 | |
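	/*
	 * ESSIV derives the actual CBC IV by encrypting the supplied IV
	 * with a second key schedule (x6 = rk2, always AES-256, hence the
	 * hard-coded 14 rounds) before switching to the bulk key in x2.
	 */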
130 | mov w8, #14 /* AES-256: 14 rounds */ |
131 | enc_prepare w8, x6, x7 |
132 | encrypt_block v4, w8, x6, x7, w9 |
133 | enc_switch_key w3, x2, x6 |
134 | b .Lcbcencloop4x |
135 | |
136 | AES_FUNC_START(aes_cbc_encrypt) |
137 | ld1 {v4.16b}, [x5] /* get iv */ |
138 | enc_prepare w3, x2, x6 |
139 | |
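	/*
	 * CBC encryption is inherently serial: each block is chained through
	 * the previous ciphertext, so the loop below is a straight 4-way
	 * unroll rather than an interleaved multi-block pipeline.
	 */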
140 | .Lcbcencloop4x: |
141 | subs w4, w4, #4 |
142 | bmi .Lcbcenc1x |
143 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
144 | eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */ |
145 | encrypt_block v0, w3, x2, x6, w7 |
146 | eor v1.16b, v1.16b, v0.16b |
147 | encrypt_block v1, w3, x2, x6, w7 |
148 | eor v2.16b, v2.16b, v1.16b |
149 | encrypt_block v2, w3, x2, x6, w7 |
150 | eor v3.16b, v3.16b, v2.16b |
151 | encrypt_block v3, w3, x2, x6, w7 |
152 | st1 {v0.16b-v3.16b}, [x0], #64 |
153 | mov v4.16b, v3.16b |
154 | b .Lcbcencloop4x |
155 | .Lcbcenc1x: |
156 | adds w4, w4, #4 |
157 | beq .Lcbcencout |
158 | .Lcbcencloop: |
159 | ld1 {v0.16b}, [x1], #16 /* get next pt block */ |
160 | eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */ |
161 | encrypt_block v4, w3, x2, x6, w7 |
162 | st1 {v4.16b}, [x0], #16 |
163 | subs w4, w4, #1 |
164 | bne .Lcbcencloop |
165 | .Lcbcencout: |
166 | st1 {v4.16b}, [x5] /* return iv */ |
167 | ret |
168 | AES_FUNC_END(aes_cbc_encrypt) |
169 | AES_FUNC_END(aes_essiv_cbc_encrypt) |
170 | |
171 | AES_FUNC_START(aes_essiv_cbc_decrypt) |
172 | ld1 {cbciv.16b}, [x5] /* get iv */ |
173 | |
174 | mov w8, #14 /* AES-256: 14 rounds */ |
175 | enc_prepare w8, x6, x7 |
176 | encrypt_block cbciv, w8, x6, x7, w9 |
177 | b .Lessivcbcdecstart |
178 | |
179 | AES_FUNC_START(aes_cbc_decrypt) |
180 | ld1 {cbciv.16b}, [x5] /* get iv */ |
181 | .Lessivcbcdecstart: |
182 | frame_push 0 |
183 | dec_prepare w3, x2, x6 |
184 | |
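	/*
	 * Unlike encryption, CBC decryption parallelises well: each plaintext
	 * block depends on only two ciphertext blocks, so MAX_STRIDE blocks
	 * are decrypted at once and unchained afterwards.
	 */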
185 | .LcbcdecloopNx: |
186 | subs w4, w4, #MAX_STRIDE |
187 | bmi .Lcbcdec1x |
188 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
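	/*
	 * The decrypt is done in place, so the ciphertext blocks needed for
	 * unchaining are saved in spare registers first; the ones that do
	 * not fit (including the next cbciv) are reloaded from memory after
	 * the decrypt, which is why x1 is rewound below.
	 */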
189 | #if MAX_STRIDE == 5 |
190 | ld1 {v4.16b}, [x1], #16 /* get 1 ct block */ |
191 | mov v5.16b, v0.16b |
192 | mov v6.16b, v1.16b |
193 | mov v7.16b, v2.16b |
194 | bl aes_decrypt_block5x |
195 | sub x1, x1, #32 |
196 | eor v0.16b, v0.16b, cbciv.16b |
197 | eor v1.16b, v1.16b, v5.16b |
198 | ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */ |
199 | ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ |
200 | eor v2.16b, v2.16b, v6.16b |
201 | eor v3.16b, v3.16b, v7.16b |
202 | eor v4.16b, v4.16b, v5.16b |
203 | #else |
204 | mov v4.16b, v0.16b |
205 | mov v5.16b, v1.16b |
206 | mov v6.16b, v2.16b |
207 | bl aes_decrypt_block4x |
208 | sub x1, x1, #16 |
209 | eor v0.16b, v0.16b, cbciv.16b |
210 | eor v1.16b, v1.16b, v4.16b |
211 | ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ |
212 | eor v2.16b, v2.16b, v5.16b |
213 | eor v3.16b, v3.16b, v6.16b |
214 | #endif |
215 | st1 {v0.16b-v3.16b}, [x0], #64 |
216 | ST5( st1 {v4.16b}, [x0], #16 ) |
217 | b .LcbcdecloopNx |
218 | .Lcbcdec1x: |
219 | adds w4, w4, #MAX_STRIDE |
220 | beq .Lcbcdecout |
221 | .Lcbcdecloop: |
222 | ld1 {v1.16b}, [x1], #16 /* get next ct block */ |
223 | mov v0.16b, v1.16b /* ...and copy to v0 */ |
224 | decrypt_block v0, w3, x2, x6, w7 |
225 | eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */ |
226 | mov cbciv.16b, v1.16b /* ct is next iv */ |
227 | st1 {v0.16b}, [x0], #16 |
228 | subs w4, w4, #1 |
229 | bne .Lcbcdecloop |
230 | .Lcbcdecout: |
231 | st1 {cbciv.16b}, [x5] /* return iv */ |
232 | frame_pop |
233 | ret |
234 | AES_FUNC_END(aes_cbc_decrypt) |
235 | AES_FUNC_END(aes_essiv_cbc_decrypt) |
236 | |
237 | |
238 | /* |
239 | * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], |
240 | * int rounds, int bytes, u8 const iv[]) |
241 | * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], |
242 | * int rounds, int bytes, u8 const iv[]) |
243 | */ |
244 | |
245 | AES_FUNC_START(aes_cbc_cts_encrypt) |
246 | adr_l x8, .Lcts_permute_table |
247 | sub x4, x4, #16 |
248 | add x9, x8, #32 |
249 | add x8, x8, x4 |
250 | sub x9, x9, x4 |
251 | ld1 {v3.16b}, [x8] |
252 | ld1 {v4.16b}, [x9] |
253 | |
254 | ld1 {v0.16b}, [x1], x4 /* overlapping loads */ |
255 | ld1 {v1.16b}, [x1] |
256 | |
257 | ld1 {v5.16b}, [x5] /* get iv */ |
258 | enc_prepare w3, x2, x6 |
259 | |
260 | eor v0.16b, v0.16b, v5.16b /* xor with iv */ |
261 | tbl v1.16b, {v1.16b}, v4.16b |
262 | encrypt_block v0, w3, x2, x6, w7 |
263 | |
264 | eor v1.16b, v1.16b, v0.16b |
265 | tbl v0.16b, {v0.16b}, v3.16b |
266 | encrypt_block v1, w3, x2, x6, w7 |
267 | |
268 | add x4, x0, x4 |
269 | st1 {v0.16b}, [x4] /* overlapping stores */ |
270 | st1 {v1.16b}, [x0] |
271 | ret |
272 | AES_FUNC_END(aes_cbc_cts_encrypt) |
273 | |
274 | AES_FUNC_START(aes_cbc_cts_decrypt) |
275 | adr_l x8, .Lcts_permute_table |
276 | sub x4, x4, #16 |
277 | add x9, x8, #32 |
278 | add x8, x8, x4 |
279 | sub x9, x9, x4 |
280 | ld1 {v3.16b}, [x8] |
281 | ld1 {v4.16b}, [x9] |
282 | |
283 | ld1 {v0.16b}, [x1], x4 /* overlapping loads */ |
284 | ld1 {v1.16b}, [x1] |
285 | |
286 | ld1 {v5.16b}, [x5] /* get iv */ |
287 | dec_prepare w3, x2, x6 |
288 | |
289 | decrypt_block v0, w3, x2, x6, w7 |
290 | tbl v2.16b, {v0.16b}, v3.16b |
291 | eor v2.16b, v2.16b, v1.16b |
292 | |
293 | tbx v0.16b, {v1.16b}, v4.16b |
294 | decrypt_block v0, w3, x2, x6, w7 |
295 | eor v0.16b, v0.16b, v5.16b /* xor with iv */ |
296 | |
297 | add x4, x0, x4 |
298 | st1 {v2.16b}, [x4] /* overlapping stores */ |
299 | st1 {v0.16b}, [x0] |
300 | ret |
301 | AES_FUNC_END(aes_cbc_cts_decrypt) |
302 | |
	.section	".rodata", "a"
304 | .align 6 |
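/*
 * Permute masks for the tbl/tbx-based tail handling: indexing into this
 * table by the partial-block length yields vectors whose 0xff entries
 * select zero (tbl) or leave the destination byte untouched (tbx), while
 * entries 0x0-0xf select the corresponding input byte.
 */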
305 | .Lcts_permute_table: |
306 | .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
307 | .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
308 | .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 |
309 | .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf |
310 | .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
311 | .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
312 | .previous |
313 | |
314 | /* |
315 | * This macro generates the code for CTR and XCTR mode. |
316 | */ |
317 | .macro ctr_encrypt xctr |
318 | // Arguments |
319 | OUT .req x0 |
320 | IN .req x1 |
321 | KEY .req x2 |
322 | ROUNDS_W .req w3 |
323 | BYTES_W .req w4 |
324 | IV .req x5 |
325 | BYTE_CTR_W .req w6 // XCTR only |
326 | // Intermediate values |
327 | CTR_W .req w11 // XCTR only |
328 | CTR .req x11 // XCTR only |
329 | IV_PART .req x12 |
330 | BLOCKS .req x13 |
331 | BLOCKS_W .req w13 |
332 | |
333 | frame_push 0 |
334 | |
335 | enc_prepare ROUNDS_W, KEY, IV_PART |
336 | ld1 {vctr.16b}, [IV] |
337 | |
338 | /* |
339 | * Keep 64 bits of the IV in a register. For CTR mode this lets us |
340 | * easily increment the IV. For XCTR mode this lets us efficiently XOR |
341 | * the 64-bit counter with the IV. |
342 | */ |
343 | .if \xctr |
344 | umov IV_PART, vctr.d[0] |
345 | lsr CTR_W, BYTE_CTR_W, #4 |
346 | .else |
347 | umov IV_PART, vctr.d[1] |
348 | rev IV_PART, IV_PART |
349 | .endif |
350 | |
351 | .LctrloopNx\xctr: |
352 | add BLOCKS_W, BYTES_W, #15 |
353 | sub BYTES_W, BYTES_W, #MAX_STRIDE << 4 |
354 | lsr BLOCKS_W, BLOCKS_W, #4 |
355 | mov w8, #MAX_STRIDE |
356 | cmp BLOCKS_W, w8 |
357 | csel BLOCKS_W, BLOCKS_W, w8, lt |
358 | |
359 | /* |
360 | * Set up the counter values in v0-v{MAX_STRIDE-1}. |
361 | * |
362 | * If we are encrypting less than MAX_STRIDE blocks, the tail block |
363 | * handling code expects the last keystream block to be in |
364 | * v{MAX_STRIDE-1}. For example: if encrypting two blocks with |
365 | * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks. |
366 | */ |
367 | .if \xctr |
368 | add CTR, CTR, BLOCKS |
369 | .else |
370 | adds IV_PART, IV_PART, BLOCKS |
371 | .endif |
372 | mov v0.16b, vctr.16b |
373 | mov v1.16b, vctr.16b |
374 | mov v2.16b, vctr.16b |
375 | mov v3.16b, vctr.16b |
376 | ST5( mov v4.16b, vctr.16b ) |
377 | .if \xctr |
378 | sub x6, CTR, #MAX_STRIDE - 1 |
379 | sub x7, CTR, #MAX_STRIDE - 2 |
380 | sub x8, CTR, #MAX_STRIDE - 3 |
381 | sub x9, CTR, #MAX_STRIDE - 4 |
382 | ST5( sub x10, CTR, #MAX_STRIDE - 5 ) |
383 | eor x6, x6, IV_PART |
384 | eor x7, x7, IV_PART |
385 | eor x8, x8, IV_PART |
386 | eor x9, x9, IV_PART |
387 | ST5( eor x10, x10, IV_PART ) |
388 | mov v0.d[0], x6 |
389 | mov v1.d[0], x7 |
390 | mov v2.d[0], x8 |
391 | mov v3.d[0], x9 |
392 | ST5( mov v4.d[0], x10 ) |
393 | .else |
394 | bcs 0f |
395 | .subsection 1 |
396 | /* |
397 | * This subsection handles carries. |
398 | * |
399 | * Conditional branching here is allowed with respect to time |
400 | * invariance since the branches are dependent on the IV instead |
401 | * of the plaintext or key. This code is rarely executed in |
402 | * practice anyway. |
403 | */ |
404 | |
405 | /* Apply carry to outgoing counter. */ |
406 | 0: umov x8, vctr.d[0] |
407 | rev x8, x8 |
408 | add x8, x8, #1 |
409 | rev x8, x8 |
410 | ins vctr.d[0], x8 |
411 | |
412 | /* |
413 | * Apply carry to counter blocks if needed. |
414 | * |
415 | * Since the carry flag was set, we know 0 <= IV_PART < |
416 | * MAX_STRIDE. Using the value of IV_PART we can determine how |
417 | * many counter blocks need to be updated. |
418 | */ |
419 | cbz IV_PART, 2f |
420 | adr x16, 1f |
421 | sub x16, x16, IV_PART, lsl #3 |
422 | br x16 |
423 | bti c |
424 | mov v0.d[0], vctr.d[0] |
425 | bti c |
426 | mov v1.d[0], vctr.d[0] |
427 | bti c |
428 | mov v2.d[0], vctr.d[0] |
429 | bti c |
430 | mov v3.d[0], vctr.d[0] |
431 | ST5( bti c ) |
432 | ST5( mov v4.d[0], vctr.d[0] ) |
433 | 1: b 2f |
434 | .previous |
435 | |
436 | 2: rev x7, IV_PART |
437 | ins vctr.d[1], x7 |
438 | sub x7, IV_PART, #MAX_STRIDE - 1 |
439 | sub x8, IV_PART, #MAX_STRIDE - 2 |
440 | sub x9, IV_PART, #MAX_STRIDE - 3 |
441 | rev x7, x7 |
442 | rev x8, x8 |
443 | mov v1.d[1], x7 |
444 | rev x9, x9 |
445 | ST5( sub x10, IV_PART, #MAX_STRIDE - 4 ) |
446 | mov v2.d[1], x8 |
447 | ST5( rev x10, x10 ) |
448 | mov v3.d[1], x9 |
449 | ST5( mov v4.d[1], x10 ) |
450 | .endif |
451 | |
452 | /* |
453 | * If there are at least MAX_STRIDE blocks left, XOR the data with |
454 | * keystream and store. Otherwise jump to tail handling. |
455 | */ |
456 | tbnz BYTES_W, #31, .Lctrtail\xctr |
457 | ld1 {v5.16b-v7.16b}, [IN], #48 |
458 | ST4( bl aes_encrypt_block4x ) |
459 | ST5( bl aes_encrypt_block5x ) |
460 | eor v0.16b, v5.16b, v0.16b |
461 | ST4( ld1 {v5.16b}, [IN], #16 ) |
462 | eor v1.16b, v6.16b, v1.16b |
463 | ST5( ld1 {v5.16b-v6.16b}, [IN], #32 ) |
464 | eor v2.16b, v7.16b, v2.16b |
465 | eor v3.16b, v5.16b, v3.16b |
466 | ST5( eor v4.16b, v6.16b, v4.16b ) |
467 | st1 {v0.16b-v3.16b}, [OUT], #64 |
468 | ST5( st1 {v4.16b}, [OUT], #16 ) |
469 | cbz BYTES_W, .Lctrout\xctr |
470 | b .LctrloopNx\xctr |
471 | |
472 | .Lctrout\xctr: |
473 | .if !\xctr |
474 | st1 {vctr.16b}, [IV] /* return next CTR value */ |
475 | .endif |
476 | frame_pop |
477 | ret |
478 | |
479 | .Lctrtail\xctr: |
480 | /* |
481 | * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext |
482 | * |
483 | * This code expects the last keystream block to be in v{MAX_STRIDE-1}. |
484 | * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and |
485 | * v4 should have the next two counter blocks. |
486 | * |
487 | * This allows us to store the ciphertext by writing to overlapping |
488 | * regions of memory. Any invalid ciphertext blocks get overwritten by |
489 | * correctly computed blocks. This approach greatly simplifies the |
490 | * logic for storing the ciphertext. |
491 | */ |
492 | mov x16, #16 |
493 | ands w7, BYTES_W, #0xf |
494 | csel x13, x7, x16, ne |
495 | |
496 | ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4)) |
497 | ST5( csel x14, x16, xzr, gt ) |
498 | cmp BYTES_W, #48 - (MAX_STRIDE << 4) |
499 | csel x15, x16, xzr, gt |
500 | cmp BYTES_W, #32 - (MAX_STRIDE << 4) |
501 | csel x16, x16, xzr, gt |
502 | cmp BYTES_W, #16 - (MAX_STRIDE << 4) |
503 | |
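	/*
	 * x13 is the length of the final (possibly partial) block, 1-16
	 * bytes.  x14 (5-way only), x15 and x16 are either 16 or 0, so each
	 * ld1/st1 below either advances to the next block or re-accesses the
	 * same address; this lets one code path handle any number of
	 * remaining blocks via overlapping accesses.
	 */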
504 | adr_l x9, .Lcts_permute_table |
505 | add x9, x9, x13 |
506 | ble .Lctrtail1x\xctr |
507 | |
508 | ST5( ld1 {v5.16b}, [IN], x14 ) |
509 | ld1 {v6.16b}, [IN], x15 |
510 | ld1 {v7.16b}, [IN], x16 |
511 | |
512 | ST4( bl aes_encrypt_block4x ) |
513 | ST5( bl aes_encrypt_block5x ) |
514 | |
515 | ld1 {v8.16b}, [IN], x13 |
516 | ld1 {v9.16b}, [IN] |
517 | ld1 {v10.16b}, [x9] |
518 | |
519 | ST4( eor v6.16b, v6.16b, v0.16b ) |
520 | ST4( eor v7.16b, v7.16b, v1.16b ) |
521 | ST4( tbl v3.16b, {v3.16b}, v10.16b ) |
522 | ST4( eor v8.16b, v8.16b, v2.16b ) |
523 | ST4( eor v9.16b, v9.16b, v3.16b ) |
524 | |
525 | ST5( eor v5.16b, v5.16b, v0.16b ) |
526 | ST5( eor v6.16b, v6.16b, v1.16b ) |
527 | ST5( tbl v4.16b, {v4.16b}, v10.16b ) |
528 | ST5( eor v7.16b, v7.16b, v2.16b ) |
529 | ST5( eor v8.16b, v8.16b, v3.16b ) |
530 | ST5( eor v9.16b, v9.16b, v4.16b ) |
531 | |
532 | ST5( st1 {v5.16b}, [OUT], x14 ) |
533 | st1 {v6.16b}, [OUT], x15 |
534 | st1 {v7.16b}, [OUT], x16 |
535 | add x13, x13, OUT |
536 | st1 {v9.16b}, [x13] // overlapping stores |
537 | st1 {v8.16b}, [OUT] |
538 | b .Lctrout\xctr |
539 | |
540 | .Lctrtail1x\xctr: |
541 | /* |
542 | * Handle <= 16 bytes of plaintext |
543 | * |
544 | * This code always reads and writes 16 bytes. To avoid out of bounds |
545 | * accesses, XCTR and CTR modes must use a temporary buffer when |
546 | * encrypting/decrypting less than 16 bytes. |
547 | * |
548 | * This code is unusual in that it loads the input and stores the output |
549 | * relative to the end of the buffers rather than relative to the start. |
550 | * This causes unusual behaviour when encrypting/decrypting less than 16 |
551 | * bytes; the end of the data is expected to be at the end of the |
552 | * temporary buffer rather than the start of the data being at the start |
553 | * of the temporary buffer. |
554 | */ |
555 | sub x8, x7, #16 |
556 | csel x7, x7, x8, eq |
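	/*
	 * x7 is now 0 if exactly 16 bytes remain, or bytes - 16 (negative)
	 * otherwise, so the single 16-byte load and store below end exactly
	 * at the end of the data.
	 */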
557 | add IN, IN, x7 |
558 | add OUT, OUT, x7 |
559 | ld1 {v5.16b}, [IN] |
560 | ld1 {v6.16b}, [OUT] |
561 | ST5( mov v3.16b, v4.16b ) |
562 | encrypt_block v3, ROUNDS_W, KEY, x8, w7 |
563 | ld1 {v10.16b-v11.16b}, [x9] |
564 | tbl v3.16b, {v3.16b}, v10.16b |
565 | sshr v11.16b, v11.16b, #7 |
566 | eor v5.16b, v5.16b, v3.16b |
567 | bif v5.16b, v6.16b, v11.16b |
568 | st1 {v5.16b}, [OUT] |
569 | b .Lctrout\xctr |
570 | |
571 | // Arguments |
572 | .unreq OUT |
573 | .unreq IN |
574 | .unreq KEY |
575 | .unreq ROUNDS_W |
576 | .unreq BYTES_W |
577 | .unreq IV |
578 | .unreq BYTE_CTR_W // XCTR only |
579 | // Intermediate values |
580 | .unreq CTR_W // XCTR only |
581 | .unreq CTR // XCTR only |
582 | .unreq IV_PART |
583 | .unreq BLOCKS |
584 | .unreq BLOCKS_W |
585 | .endm |
586 | |
587 | /* |
 * aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		   int bytes, u8 ctr[])
 *
 * The input and output buffers must always be at least 16 bytes, even
 * when encrypting/decrypting less than 16 bytes; otherwise out-of-bounds
 * accesses will occur.  When less than 16 bytes are processed, the data
 * is expected to sit at the end of the 16-byte temporary buffer rather
 * than at its start.
596 | */ |
597 | |
598 | AES_FUNC_START(aes_ctr_encrypt) |
599 | ctr_encrypt 0 |
600 | AES_FUNC_END(aes_ctr_encrypt) |
601 | |
602 | /* |
 * aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		    int bytes, u8 const iv[], int byte_ctr)
 *
 * The input and output buffers must always be at least 16 bytes, even
 * when encrypting/decrypting less than 16 bytes; otherwise out-of-bounds
 * accesses will occur.  When less than 16 bytes are processed, the data
 * is expected to sit at the end of the 16-byte temporary buffer rather
 * than at its start.
611 | */ |
612 | |
613 | AES_FUNC_START(aes_xctr_encrypt) |
614 | ctr_encrypt 1 |
615 | AES_FUNC_END(aes_xctr_encrypt) |
616 | |
617 | |
618 | /* |
 * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		   int bytes, u32 const rk2[], u8 iv[], int first)
 * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		   int bytes, u32 const rk2[], u8 iv[], int first)
623 | */ |
624 | |
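/*
 * next_tweak computes the next XTS tweak by multiplying the current one
 * by x in GF(2^128), using the reducing polynomial x^128 + x^7 + x^2 +
 * x + 1.  Roughly, as a C sketch (illustrative only):
 *
 *	carry    = tweak_hi >> 63;
 *	tweak_hi = (tweak_hi << 1) | (tweak_lo >> 63);
 *	tweak_lo = (tweak_lo << 1) ^ (carry ? 0x87 : 0);
 *
 * The sshr/and/ext/eor sequence below does this for both 64-bit halves
 * at once, with xtsmask supplying the 0x87 and inter-lane carry bits.
 */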
625 | .macro next_tweak, out, in, tmp |
626 | sshr \tmp\().2d, \in\().2d, #63 |
627 | and \tmp\().16b, \tmp\().16b, xtsmask.16b |
628 | add \out\().2d, \in\().2d, \in\().2d |
629 | ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 |
630 | eor \out\().16b, \out\().16b, \tmp\().16b |
631 | .endm |
632 | |
633 | .macro xts_load_mask, tmp |
634 | movi xtsmask.2s, #0x1 |
635 | movi \tmp\().2s, #0x87 |
636 | uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s |
637 | .endm |
638 | |
639 | AES_FUNC_START(aes_xts_encrypt) |
640 | frame_push 0 |
641 | |
642 | ld1 {v4.16b}, [x6] |
643 | xts_load_mask v8 |
644 | cbz w7, .Lxtsencnotfirst |
645 | |
646 | enc_prepare w3, x5, x8 |
647 | xts_cts_skip_tw w7, .LxtsencNx |
648 | encrypt_block v4, w3, x5, x8, w7 /* first tweak */ |
649 | enc_switch_key w3, x2, x8 |
650 | b .LxtsencNx |
651 | |
652 | .Lxtsencnotfirst: |
653 | enc_prepare w3, x2, x8 |
654 | .LxtsencloopNx: |
655 | next_tweak v4, v4, v8 |
656 | .LxtsencNx: |
657 | subs w4, w4, #64 |
658 | bmi .Lxtsenc1x |
659 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
660 | next_tweak v5, v4, v8 |
661 | eor v0.16b, v0.16b, v4.16b |
662 | next_tweak v6, v5, v8 |
663 | eor v1.16b, v1.16b, v5.16b |
664 | eor v2.16b, v2.16b, v6.16b |
665 | next_tweak v7, v6, v8 |
666 | eor v3.16b, v3.16b, v7.16b |
667 | bl aes_encrypt_block4x |
668 | eor v3.16b, v3.16b, v7.16b |
669 | eor v0.16b, v0.16b, v4.16b |
670 | eor v1.16b, v1.16b, v5.16b |
671 | eor v2.16b, v2.16b, v6.16b |
672 | st1 {v0.16b-v3.16b}, [x0], #64 |
673 | mov v4.16b, v7.16b |
674 | cbz w4, .Lxtsencret |
675 | xts_reload_mask v8 |
676 | b .LxtsencloopNx |
677 | .Lxtsenc1x: |
678 | adds w4, w4, #64 |
679 | beq .Lxtsencout |
680 | subs w4, w4, #16 |
681 | bmi .LxtsencctsNx |
682 | .Lxtsencloop: |
683 | ld1 {v0.16b}, [x1], #16 |
684 | .Lxtsencctsout: |
685 | eor v0.16b, v0.16b, v4.16b |
686 | encrypt_block v0, w3, x2, x8, w7 |
687 | eor v0.16b, v0.16b, v4.16b |
688 | cbz w4, .Lxtsencout |
689 | subs w4, w4, #16 |
690 | next_tweak v4, v4, v8 |
691 | bmi .Lxtsenccts |
692 | st1 {v0.16b}, [x0], #16 |
693 | b .Lxtsencloop |
694 | .Lxtsencout: |
695 | st1 {v0.16b}, [x0] |
696 | .Lxtsencret: |
697 | st1 {v4.16b}, [x6] |
698 | frame_pop |
699 | ret |
700 | |
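	/*
	 * Entered with fewer than 16 bytes left after the 4x loop: the last
	 * full ciphertext block is still in v3, so rewind the output pointer
	 * and redo that block together with the stolen tail.
	 */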
701 | .LxtsencctsNx: |
702 | mov v0.16b, v3.16b |
703 | sub x0, x0, #16 |
704 | .Lxtsenccts: |
705 | adr_l x8, .Lcts_permute_table |
706 | |
707 | add x1, x1, w4, sxtw /* rewind input pointer */ |
708 | add w4, w4, #16 /* # bytes in final block */ |
709 | add x9, x8, #32 |
710 | add x8, x8, x4 |
711 | sub x9, x9, x4 |
712 | add x4, x0, x4 /* output address of final block */ |
713 | |
714 | ld1 {v1.16b}, [x1] /* load final block */ |
715 | ld1 {v2.16b}, [x8] |
716 | ld1 {v3.16b}, [x9] |
717 | |
718 | tbl v2.16b, {v0.16b}, v2.16b |
719 | tbx v0.16b, {v1.16b}, v3.16b |
720 | st1 {v2.16b}, [x4] /* overlapping stores */ |
721 | mov w4, wzr |
722 | b .Lxtsencctsout |
723 | AES_FUNC_END(aes_xts_encrypt) |
724 | |
725 | AES_FUNC_START(aes_xts_decrypt) |
726 | frame_push 0 |
727 | |
728 | /* subtract 16 bytes if we are doing CTS */ |
729 | sub w8, w4, #0x10 |
730 | tst w4, #0xf |
731 | csel w4, w4, w8, eq |
732 | |
733 | ld1 {v4.16b}, [x6] |
734 | xts_load_mask v8 |
735 | xts_cts_skip_tw w7, .Lxtsdecskiptw |
736 | cbz w7, .Lxtsdecnotfirst |
737 | |
738 | enc_prepare w3, x5, x8 |
739 | encrypt_block v4, w3, x5, x8, w7 /* first tweak */ |
740 | .Lxtsdecskiptw: |
741 | dec_prepare w3, x2, x8 |
742 | b .LxtsdecNx |
743 | |
744 | .Lxtsdecnotfirst: |
745 | dec_prepare w3, x2, x8 |
746 | .LxtsdecloopNx: |
747 | next_tweak v4, v4, v8 |
748 | .LxtsdecNx: |
749 | subs w4, w4, #64 |
750 | bmi .Lxtsdec1x |
751 | ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
752 | next_tweak v5, v4, v8 |
753 | eor v0.16b, v0.16b, v4.16b |
754 | next_tweak v6, v5, v8 |
755 | eor v1.16b, v1.16b, v5.16b |
756 | eor v2.16b, v2.16b, v6.16b |
757 | next_tweak v7, v6, v8 |
758 | eor v3.16b, v3.16b, v7.16b |
759 | bl aes_decrypt_block4x |
760 | eor v3.16b, v3.16b, v7.16b |
761 | eor v0.16b, v0.16b, v4.16b |
762 | eor v1.16b, v1.16b, v5.16b |
763 | eor v2.16b, v2.16b, v6.16b |
764 | st1 {v0.16b-v3.16b}, [x0], #64 |
765 | mov v4.16b, v7.16b |
766 | cbz w4, .Lxtsdecout |
767 | xts_reload_mask v8 |
768 | b .LxtsdecloopNx |
769 | .Lxtsdec1x: |
770 | adds w4, w4, #64 |
771 | beq .Lxtsdecout |
772 | subs w4, w4, #16 |
773 | .Lxtsdecloop: |
774 | ld1 {v0.16b}, [x1], #16 |
775 | bmi .Lxtsdeccts |
776 | .Lxtsdecctsout: |
777 | eor v0.16b, v0.16b, v4.16b |
778 | decrypt_block v0, w3, x2, x8, w7 |
779 | eor v0.16b, v0.16b, v4.16b |
780 | st1 {v0.16b}, [x0], #16 |
781 | cbz w4, .Lxtsdecout |
782 | subs w4, w4, #16 |
783 | next_tweak v4, v4, v8 |
784 | b .Lxtsdecloop |
785 | .Lxtsdecout: |
786 | st1 {v4.16b}, [x6] |
787 | frame_pop |
788 | ret |
789 | |
790 | .Lxtsdeccts: |
791 | adr_l x8, .Lcts_permute_table |
792 | |
793 | add x1, x1, w4, sxtw /* rewind input pointer */ |
794 | add w4, w4, #16 /* # bytes in final block */ |
795 | add x9, x8, #32 |
796 | add x8, x8, x4 |
797 | sub x9, x9, x4 |
798 | add x4, x0, x4 /* output address of final block */ |
799 | |
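	/*
	 * CTS decryption swaps the tweak order: the last full ciphertext
	 * block is decrypted with the next tweak (v5), while the block
	 * reassembled from it and the stolen tail uses the current one (v4).
	 */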
800 | next_tweak v5, v4, v8 |
801 | |
802 | ld1 {v1.16b}, [x1] /* load final block */ |
803 | ld1 {v2.16b}, [x8] |
804 | ld1 {v3.16b}, [x9] |
805 | |
806 | eor v0.16b, v0.16b, v5.16b |
807 | decrypt_block v0, w3, x2, x8, w7 |
808 | eor v0.16b, v0.16b, v5.16b |
809 | |
810 | tbl v2.16b, {v0.16b}, v2.16b |
811 | tbx v0.16b, {v1.16b}, v3.16b |
812 | |
813 | st1 {v2.16b}, [x4] /* overlapping stores */ |
814 | mov w4, wzr |
815 | b .Lxtsdecctsout |
816 | AES_FUNC_END(aes_xts_decrypt) |
817 | |
818 | /* |
819 | * aes_mac_update(u8 const in[], u32 const rk[], int rounds, |
820 | * int blocks, u8 dg[], int enc_before, int enc_after) |
821 | */ |
822 | AES_FUNC_START(aes_mac_update) |
823 | ld1 {v0.16b}, [x4] /* get dg */ |
824 | enc_prepare w2, x1, x7 |
825 | cbz w5, .Lmacloop4x |
826 | |
827 | encrypt_block v0, w2, x1, x7, w8 |
828 | |
829 | .Lmacloop4x: |
830 | subs w3, w3, #4 |
831 | bmi .Lmac1x |
832 | ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ |
833 | eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ |
834 | encrypt_block v0, w2, x1, x7, w8 |
835 | eor v0.16b, v0.16b, v2.16b |
836 | encrypt_block v0, w2, x1, x7, w8 |
837 | eor v0.16b, v0.16b, v3.16b |
838 | encrypt_block v0, w2, x1, x7, w8 |
839 | eor v0.16b, v0.16b, v4.16b |
840 | cmp w3, wzr |
841 | csinv x5, x6, xzr, eq |
842 | cbz w5, .Lmacout |
843 | encrypt_block v0, w2, x1, x7, w8 |
844 | st1 {v0.16b}, [x4] /* return dg */ |
845 | cond_yield .Lmacout, x7, x8 |
846 | b .Lmacloop4x |
847 | .Lmac1x: |
848 | add w3, w3, #4 |
849 | .Lmacloop: |
850 | cbz w3, .Lmacout |
851 | ld1 {v1.16b}, [x0], #16 /* get next pt block */ |
852 | eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ |
853 | |
854 | subs w3, w3, #1 |
855 | csinv x5, x6, xzr, eq |
856 | cbz w5, .Lmacout |
857 | |
858 | .Lmacenc: |
859 | encrypt_block v0, w2, x1, x7, w8 |
860 | b .Lmacloop |
861 | |
862 | .Lmacout: |
863 | st1 {v0.16b}, [x4] /* return dg */ |
864 | mov w0, w3 |
865 | ret |
866 | AES_FUNC_END(aes_mac_update) |
867 | |