/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

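//
// Register naming: XL/XM/XH hold the low, middle and high Karatsuba
// partial products of the GHASH multiplication, SHASH holds the hash
// key H (with SHASH2_* caching the XOR of its two halves for the
// middle product), and HH, HH3 and HH4 hold H^2, H^3 and H^4 for the
// 4-way aggregated code path.
//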
	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

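//
// 64x64 -> 128 bit carryless multiply using a single PMULL instruction.
// The b1..b4 operands are ignored; they are accepted only so that
// __pmull_p64 and __pmull_p8 can be invoked interchangeably from the
// \pn-expanded call sites in ghash_update below.
//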
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

/*
 * This implementation of 64x64 -> 128 bit polynomial multiplication
 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
 * "Fast Software Polynomial Multiplication on ARM Processors Using
 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
 *
 * It has been slightly tweaked for in-order performance, and to allow
 * 'rq' to overlap with 'ad' or 'bd'.
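 *
 * The gist: vmull.p8 performs eight independent 8x8 -> 16 bit carryless
 * multiplies. The full 64x64 product is assembled from the byte-aligned
 * product D = A*B plus products of byte-rotated operands (A1*B, A*B1,
 * etc.), whose pairwise XORs carry the cross terms; the k16/k32/k48
 * masks strip the bits that wrapped around during rotation, and the
 * vext instructions slide each partial sum into its final byte
 * position before everything is XORed into the result.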
 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

//
// PMULL (64x64->128) based reduction for CPUs that can do
// it in a single instruction.
//
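// MASK (set up by the callers) holds 0xc200000000000000 (0xe1 << 57)
// in each 64-bit lane; bits 57, 62 and 63 correspond to the x^7, x^2
// and x terms of the GHASH polynomial x^128 + x^7 + x^2 + x + 1 in the
// bit-reflected representation GHASH uses, so two carryless multiplies
// by MASK fold the 256-bit product back into 128 bits.
//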
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

//
// Alternative reduction for CPUs that lack support for the
// 64x64->128 PMULL instruction
//
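// The multiplies by the reduction constant are open-coded here as XORs
// of shifts: left by 57, 62 and 63 (matching the bits of MASK above),
// and, for the high half of the product, effectively right by their
// 64-bit mirror images 7, 2 and 1.
//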
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

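//
// Core GHASH loop: r0 = #blocks, r1 = 128-bit digest, r2 = input,
// r3 = key struct. With \head, a pointer to an optional head block is
// taken from the stack. \pn selects the p64 or p8 multiply above, \enc
// (enc or dec) fuses the AES-CTR macros into the loop, and \aggregate
// enables the 4-way aggregated path.
//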
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0			// set flags for 'bne 0b' below
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// process blocks one by one until
	bne		2f			// #blocks is a multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

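	// 4-way aggregation: with the digest X folded into block C0 above,
	// compute (X + C0).H^4 + C1.H^3 + C2.H^2 + C3.H using one Karatsuba
	// 3-multiply sequence per block, and sum the partial products so
	// that only a single reduction is needed for all four blocks.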
	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm

/*
 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 *			   struct ghash_key const *k, const char *head)
 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

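//
// The byte-rotated copies of the key (B1..B4 in the scheme above) are
// loop invariant, so precompute them into s1l..s4h; passing them to
// __pmull_p8 lets the macro skip the vext instructions that would
// otherwise recompute them for every block.
//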
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)

	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	ek0		.req	q14
	ek1		.req	q15

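//
// Perform one AES round on each of the \regs q registers: AESE does
// AddRoundKey/SubBytes/ShiftRows and AESMC does MixColumns.
//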
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

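//
// AES-encrypt the blocks in \regs using the key schedule at \rkp, for
// \rounds in {10, 12, 14}. Round keys are loaded two at a time into
// ek0/ek1 so that each load can overlap with the round that uses the
// other key register; the final round has no MixColumns (hence the
// bare AESE) and ends with the last round key XOR.
//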
	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

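//
// Generate and encrypt one counter block: the 12-byte IV at r5 is read
// as two overlapping 8-byte loads (so nothing is accessed beyond the
// IV), rotated so that the big-endian counter in r7 can be inserted as
// the final 32-bit word, and encrypted into e0 using the round keys at
// r3 + 64 (past the four GHASH keys).
//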
pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12-byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

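//
// As above, but produce four consecutive key stream blocks in e0..e3,
// bumping and re-inserting the big-endian counter for each one.
//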
pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

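//
// Produce the key stream for the tail block (current counter value) in
// e0, and for the tag in e1: GCM reserves counter value 1 for masking
// the authentication tag.
//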
pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)

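//
// CTR encryption and decryption use the same key stream; the
// difference is which side feeds GHASH. The enc macros overwrite the
// input registers with ciphertext, which the enclosing ghash_update
// code then hashes, while the dec macros leave the (ciphertext) input
// registers intact for hashing and write the recovered plaintext out
// via e0..e3.
//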
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

/*
 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

/*
 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

/*
 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
 *			    struct gcm_key const *k, char *head,
 *			    char *iv, int rounds, u32 counter)
 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Lenc_final

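	// A partial tail block remains: derive two vtbl permute vectors
	// from the remaining byte count in r0, one that aligns the key
	// stream with the final r0 bytes of the buffer, and one that
	// extracts those r0 bytes, zero padded, as the final GHASH input
	// (vtbl yields zero for the 0xff table entries).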
	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind r2, which ghash_update
						// advanced past the final block
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

/*
 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
 *			   struct gcm_key const *k, char *head,
 *			   char *iv, int rounds, u32 counter,
 *			   const char *otag, int authsize)
 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Ldec_final

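	// Same tail handling as the encrypt path above, except that the
	// GHASH input is extracted from the buffer contents (the
	// ciphertext) before the key stream XOR rather than after it.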
	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

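	// Verify the tag without data-dependent branches: compare
	// bytewise, mask the result down to the first authsize bytes using
	// the 0xff entries of the permute table, and fold any mismatching
	// (0xff) byte into the return value with pairwise signed minimums.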
	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

	.section	".rodata", "a", %progbits
	.align		5
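// Sliding-window permute masks for vtbl: a 16-byte window loaded at
// offset n selects the identity bytes 0x00..0x0f shifted by n, and the
// surrounding 0xff entries make vtbl produce zero, which the tail and
// tag handling above uses to shift and zero-pad partial blocks.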
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff