1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions |
4 | * |
5 | * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> |
6 | */ |
7 | |
8 | #include <linux/linkage.h> |
9 | #include <asm/assembler.h> |
10 | |
11 | .text |
12 | .arch armv8-a |
13 | .fpu crypto-neon-fp-armv8 |
14 | .align 3 |
15 | |
	/*
	 * One full AES encryption round on \state using round key \key:
	 * aese = AddRoundKey + SubBytes + ShiftRows, aesmc = MixColumns.
	 */
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm
20 | |
	/*
	 * One full AES decryption round on \state using round key \key:
	 * aesd = AddRoundKey + InvSubBytes + InvShiftRows, aesimc = InvMixColumns.
	 */
	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm
25 | |
	/* Two consecutive encryption rounds on q0 with keys \key1 and \key2 */
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm
30 | |
	/* Two consecutive decryption rounds on q0 with keys \key1 and \key2 */
	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm
35 | |
	/*
	 * Final two encryption rounds on q0: a full round with \key1, the
	 * last round (aese without MixColumns) with \key2, then xor in the
	 * final round key \key3.
	 */
	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm
41 | |
	/*
	 * Final two decryption rounds on q0: a full round with \key1, the
	 * last round (aesd without InvMixColumns) with \key2, then xor in
	 * the final round key \key3.
	 */
	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm
47 | |
	/*
	 * 4-way interleaved version of enc_dround: two encryption rounds
	 * applied to four independent blocks in q0-q3, interleaved to keep
	 * the AES pipeline busy.
	 */
	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm
58 | |
	/*
	 * 4-way interleaved version of dec_dround: two decryption rounds
	 * applied to four independent blocks in q0-q3.
	 */
	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm
69 | |
	/*
	 * 4-way interleaved version of enc_fround: final two encryption
	 * rounds plus last-key xor applied to four blocks in q0-q3.
	 */
	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm
84 | |
	/*
	 * 4-way interleaved version of dec_fround: final two decryption
	 * rounds plus last-key xor applied to four blocks in q0-q3.
	 */
	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm
99 | |
	/*
	 * Run a complete AES transform on the state in q0 (q0-q3 for the
	 * _4x flavours), dispatching on the round count in r3 (10/12/14
	 * for AES-128/192/256). On entry, q8/q9 hold the first two round
	 * keys (consumed by the first \dround), ip points at the 3rd
	 * round key, and q14 holds the final round key. Round key loads
	 * are interleaved with the rounds to hide the load latency.
	 */
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!	@ round keys 3-4
	\dround		q8, q9			@ rounds 1-2
	vld1.32		{q12-q13}, [ip]!	@ round keys 5-6
	\dround		q10, q11		@ rounds 3-4
	vld1.32		{q10-q11}, [ip]!	@ round keys 7-8
	\dround		q12, q13		@ rounds 5-6
	vld1.32		{q12-q13}, [ip]!	@ round keys 9-10
	\dround		q10, q11		@ rounds 7-8
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!	@ round keys 11-12
	\dround		q12, q13		@ rounds 9-10
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]		@ round keys 13-14
	\dround		q10, q11		@ rounds 11-12
0:	\fround		q12, q13, q14		@ final two rounds + last key
	bx		lr

1:	\fround		q10, q11, q14		@ final two rounds + last key
	bx		lr
	.endm
122 | |
123 | /* |
124 | * Internal, non-AAPCS compliant functions that implement the core AES |
 * transforms. These should preserve all registers except q0 - q3 and ip
126 | * Arguments: |
127 | * q0 : first in/output block |
128 | * q1 : second in/output block (_4x version only) |
129 | * q2 : third in/output block (_4x version only) |
130 | * q3 : fourth in/output block (_4x version only) |
131 | * q8 : first round key |
 * q9 : second round key
133 | * q14 : final round key |
134 | * r2 : address of round key array |
135 | * r3 : number of rounds |
136 | */ |
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:				@ entered directly (ip preset) by
						@ ce_aes_xts_init to encrypt the tweak
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)
143 | |
	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)
149 | |
	.align		6
aes_encrypt_4x:					@ encrypt 4 blocks (q0-q3) at once
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)
155 | |
	.align		6
aes_decrypt_4x:					@ decrypt 4 blocks (q0-q3) at once
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)
161 | |
	/*
	 * Set up the fixed round key registers used by the internal aes_*
	 * routines: q8/q9 = first two round keys, q14 = last round key
	 * (at \rk + \rounds * 16). Clobbers ip; the aes_* entry points
	 * re-point ip at the 3rd round key themselves.
	 */
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm
167 | |
/*
 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks)
 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks)
 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg)
	prepare_key	r2, r3
.Lecbencloop4x:					@ 4 blocks per iteration
	subs		r4, r4, #4
	bmi		.Lecbenc1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4		@ undo bias: 0..3 blocks remain
	beq		.Lecbencout
.Lecbencloop:					@ single-block tail loop
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)
199 | |
ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg)
	prepare_key	r2, r3
.Lecbdecloop4x:					@ 4 blocks per iteration
	subs		r4, r4, #4
	bmi		.Lecbdec1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4		@ undo bias: 0..3 blocks remain
	beq		.Lecbdecout
.Lecbdecloop:					@ single-block tail loop
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)
225 | |
/*
 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 iv[])
 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 iv[])
 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q0}, [r5]		@ q0 carries the chaining value
	prepare_key	r2, r3
.Lcbcencloop:					@ CBC encryption is inherently serial
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ return final ct block as next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
247 | |
ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:					@ 4 blocks per iteration
	subs		r4, r4, #4
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0			@ stash the ciphertext: each block
	vmov		q5, q1			@ is the chaining value of the next
	vmov		q6, q2
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15		@ xor each result with the previous
	veor		q1, q1, q4		@ ct block (or the iv)
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7			@ last ct block becomes the next iv
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4		@ undo bias: 0..3 blocks remain
	beq		.Lcbcdecout
	vmov		q6, q14			@ preserve last round key
.Lcbcdecloop:
	@ Fold the CBC xor into the final round of aes_decrypt for free:
	@ the last round xors q14, so set q14 = prev_ct ^ last_round_key.
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ return next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
287 | |
288 | |
289 | /* |
290 | * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], |
291 | * int rounds, int bytes, u8 const iv[]) |
292 | * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], |
293 | * int rounds, int bytes, u8 const iv[]) |
294 | */ |
295 | |
ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = bytes, r5 = iv

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16		@ r4 = size of final partial block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]		@ q5/q6 = vtbl permute vectors
	vld1.8		{q6}, [lr]		@ ... selected by the tail size

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q3}, [ip]		@ q3 = final (partial) pt block

	vld1.8		{q1}, [r5]		@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1		@ xor with iv
	bl		aes_encrypt		@ q0 = ct of last full block

	vtbl.8		d4, {d0-d1}, d10	@ move the full ct block into the
	vtbl.8		d5, {d0-d1}, d11	@ ... stolen output position (q2)
	vtbl.8		d2, {d6-d7}, d12	@ align the partial pt block for
	vtbl.8		d3, {d6-d7}, d13	@ ... chaining (q1)

	veor		q0, q0, q1		@ CBC-chain into the final block
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)
333 | |
ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = bytes, r5 = iv

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16		@ r4 = size of final partial block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]		@ q5/q6 = vtbl permute vectors
	vld1.8		{q6}, [lr]		@ ... selected by the tail size

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q1}, [ip]		@ q1 = final (partial) ct block

	vld1.8		{q3}, [r5]		@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt		@ decrypt next-to-last ct block

	vtbl.8		d4, {d0-d1}, d10	@ q2 = bytes to steal from the
	vtbl.8		d5, {d0-d1}, d11	@ ... decrypted block
	vtbx.8		d0, {d2-d3}, d12	@ merge the partial ct block
	vtbx.8		d1, {d2-d3}, d13	@ ... over q0's stolen bytes

	veor		q1, q1, q2		@ recover the final pt bytes
	bl		aes_decrypt		@ decrypt reassembled last block
	veor		q0, q0, q3		@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)
371 | |
372 | |
/*
 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 ctr[])
 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop		@ yes: use 1x path that handles carry
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x

	/*
	 * NOTE: the sequence below has been carefully tweaked to avoid
	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
	 * may produce an incorrect result if they take their input from a
	 * register of which a single 32-bit lane has been updated the last
	 * time it was modified. To work around this, the lanes of registers
	 * q0-q3 below are not manipulated individually, and the different
	 * counter values are prepared by successive manipulations of q7.
	 */
	add		ip, r6, #1
	vmov		q0, q7
	rev		ip, ip
	add		lr, r6, #2
	vmov		s31, ip			@ set lane 3 of q1 via q7
	add		ip, r6, #3
	rev		lr, lr
	vmov		q1, q7
	vmov		s31, lr			@ set lane 3 of q2 via q7
	rev		ip, ip
	vmov		q2, q7
	vmov		s31, ip			@ set lane 3 of q3 via q7
	add		r6, r6, #4
	vmov		q3, q7

	vld1.8		{q4-q5}, [r1]!		@ load 4 pt blocks into q4-q6,q15
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!
	bl		aes_encrypt_4x		@ encrypt the 4 counter blocks
	veor		q0, q0, q4		@ xor keystream with plaintext
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip			@ update ctr word in q7 (whole-reg
	b		.Lctrloop4x		@ copies above avoid the erratum)
.Lctr1x:
	adds		r4, r4, #4		@ undo bias: 0..3 blocks remain
	beq		.Lctrout
.Lctrloop:					@ single-block path, carry-safe
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip			@ vmov/rev leave flags intact, so
	bcs		.Lctrcarry		@ C here is still from the adds

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:					@ ripple carry into upper ctr words
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone		@ stop once no further carry
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)
467 | |
/*
 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		      int bytes, u8 iv[], u32 const rk2[], int first)
 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		      int bytes, u8 iv[], u32 const rk2[], int first)
 */
474 | |
	/*
	 * Compute the next XTS tweak: \out = \in * x in GF(2^128) with the
	 * reduction polynomial x^128 + x^7 + x^2 + x + 1. \const holds the
	 * mask vector composed by ce_aes_xts_init; \tmp is clobbered.
	 */
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63		@ broadcast MSB of each half
	vand		\tmp, \tmp, \const	@ select carry / 0x87 reduction
	vadd.u64	\out, \in, \in		@ shift both halves left by one
	vext.8		\tmp, \tmp, \tmp, #8	@ swap halves: carry crosses over
	veor		\out, \out, \tmp
	.endm
482 | |
	/*
	 * Shared prologue for the XTS routines: compose the tweak mask
	 * vector in q15, load r4 = bytes, r5 = iv and r6 = 'first' from
	 * the stack, and load the iv into q0. When first == 1 (start of a
	 * new message), the iv is encrypted with the second AES key (rk2)
	 * to produce the initial tweak, via a tail call into aes_encrypt.
	 */
ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7		@ q15 = { 0x1, 0x87 } 64-bit lanes

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
501 | |
ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0			@ q4 = current tweak

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x		@ if so, use the tweak as is

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x		@ fewer than 4 full blocks left
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10	@ derive tweaks for blocks 2-4
	veor		q0, q0, q4		@ pre-whiten with the tweaks
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4		@ post-whiten with the same tweaks
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64		@ undo bias: 0..63 bytes remain
	beq		.Lxtsencout
	subs		r4, r4, #16
	bmi		.LxtsencctsNx		@ partial tail right after 4x batch
.Lxtsencloop:					@ single-block loop
	vld1.8		{q0}, [r1]!
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts		@ partial final block: do CTS
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]		@ return next tweak
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3			@ reprocess last ct block of the
	sub		r0, r0, #16		@ 4x batch for ciphertext stealing
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]		@ q2/q3 = vtbl permute vectors
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4		@ steal ct bytes for the tail
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6		@ merge partial pt over q0
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsencctsout		@ encrypt reassembled final block
ENDPROC(ce_aes_xts_encrypt)
588 | |
589 | |
ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0			@ q4 = current tweak

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf		@ CTS needs the last full block
	subne		r4, r4, #0x10		@ handled out of line (below)

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x		@ if so, use the tweak as is

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x		@ fewer than 4 full blocks left
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10	@ derive tweaks for blocks 2-4
	veor		q0, q0, q4		@ pre-whiten with the tweaks
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4		@ post-whiten with the same tweaks
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64		@ undo bias: 0..63 bytes remain
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:					@ single-block loop
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts		@ partial final block: do CTS
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]		@ return next tweak
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6		@ q5 = tweak of the stolen block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]		@ q2/q3 = vtbl permute vectors
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5		@ decrypt last full ct block with
	bl		aes_decrypt		@ ... the next tweak (CTS order)
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4		@ steal pt bytes for the tail
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6		@ merge partial ct over q0
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsdecctsout		@ decrypt reassembled final block
ENDPROC(ce_aes_xts_decrypt)
680 | |
681 | /* |
682 | * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the |
683 | * AES sbox substitution on each byte in |
684 | * 'input' |
685 | */ |
ENTRY(ce_aes_sub)
	vdup.32		q1, r0			@ replicate input across all lanes
	veor		q0, q0, q0		@ zero state: aese xors key first,
	aese.8		q0, q1			@ so this yields SubBytes(input);
						@ ShiftRows is harmless here since
						@ all four words are identical
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)
693 | |
694 | /* |
695 | * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns |
696 | * operation on round key *src |
697 | */ |
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]		@ load round key from *src
	aesimc.8	q0, q0			@ apply InvMixColumns
	vst1.32		{q0}, [r0]		@ store result to *dst
	bx		lr
ENDPROC(ce_aes_invert)
704 | |
	/*
	 * Permute table for ciphertext stealing: 16 bytes of 0xff, the
	 * identity permutation 0..15, then 16 more bytes of 0xff. Indexing
	 * into it by the tail size yields vtbl/vtbx index vectors that
	 * shift a partial block into place (out-of-range 0xff indices
	 * produce zero / leave the destination untouched).
	 */
	.section	".rodata" , "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
714 | |