/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .arch           armv8-a
        .fpu            crypto-neon-fp-armv8

        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        XH2             .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        HH              .req    q10
        HH3             .req    q11
        HH4             .req    q12
        HH34            .req    q13

        HH_L            .req    d20
        HH_H            .req    d21
        HH3_L           .req    d22
        HH3_H           .req    d23
        HH4_L           .req    d24
        HH4_H           .req    d25
        HH34_L          .req    d26
        HH34_H          .req    d27
        SHASH2_H        .req    d29

        XL2             .req    q5
        XM2             .req    q6
        T2              .req    q7
        T3              .req    q8

        XL2_L           .req    d10
        XL2_H           .req    d11
        XM2_L           .req    d12
        XM2_H           .req    d13
        T3_L            .req    d16
        T3_H            .req    d17

        .text

        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
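        /*
         * Rough sketch of the approach: a single vmull.p8 of 'ad' and 'bd'
         * only produces the eight "diagonal" 8x8 partial products (D = A*B
         * below). The missing cross terms are generated from byte-rotated
         * copies of the operands (A1..A4 and B1..B4), combined pairwise,
         * masked with the k16/k32/k48 constants and realigned with vext.8
         * before being folded into the 128-bit result. The b1..b4 arguments
         * let the caller pass in pre-rotated copies of 'bd', as is done for
         * the fixed GHASH key in the p8 code path below.
         */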
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
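        //
        // GHASH operates in GF(2^128), defined by the irreducible polynomial
        // x^128 + x^7 + x^2 + x + 1. In the bit order used here, that
        // polynomial is represented by the 0xe1 << 57 constant which the
        // callers preload into MASK, and the double-width product gathered
        // in XL/XM/XH is folded back to 128 bits with two carryless
        // multiplications by MASK.
        //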
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
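        //
        // Without a 64x64 carryless multiply, the same reduction is done
        // with shifts and XORs: the left shifts by 57, 62 and 63 below
        // account for the x^7, x^2 and x terms of the field polynomial,
        // and the final right shifts perform the second folding step.
        //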
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm

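        //
        // Main GHASH/GCM processing macro. The callers pass the number of
        // blocks in r0, a pointer to the running digest in r1, the source
        // pointer in r2 and the key structure in r3; with 'head' set, an
        // optional pointer to a partial head block is taken from the first
        // stack argument. When 'enc' is given, the matching \enc\()_1x and
        // \enc\()_4x macros are invoked so the data is en/decrypted in the
        // same pass that it is folded into the digest.
        //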
        .macro          ghash_update, pn, enc, aggregate=1, head=1
        vld1.64         {XL}, [r1]

        .if             \head
        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               3f
        .endif

0:      .ifc            \pn, p64
        .if             \aggregate
        tst             r0, #3                  // skip until #blocks is a
        bne             2f                      // round multiple of 4

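        // 4-way aggregation: each iteration folds four blocks into the
        // digest using the precomputed powers of the hash key (H^4, H^3,
        // H^2 and H in HH4, HH3, HH and SHASH, respectively), so only one
        // reduction is needed per four blocks.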
        vld1.8          {XL2-XM2}, [r2]!
1:      vld1.8          {T2-T3}, [r2]!

        .ifnb           \enc
        \enc\()_4x      XL2, XM2, T2, T3

        add             ip, r3, #16
        vld1.64         {HH}, [ip, :128]!
        vld1.64         {HH3-HH4}, [ip, :128]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57
        .endif

        vrev64.8        XL2, XL2
        vrev64.8        XM2, XM2

        subs            r0, r0, #4

        vext.8          T1, XL2, XL2, #8
        veor            XL2_H, XL2_H, XL_L
        veor            XL, XL, T1

        vrev64.8        T1, T3
        vrev64.8        T3, T2

        vmull.p64       XH, HH4_H, XL_H         // a1 * b1
        veor            XL2_H, XL2_H, XL_H
        vmull.p64       XL, HH4_L, XL_L         // a0 * b0
        vmull.p64       XM, HH34_H, XL2_H       // (a1 + a0)(b1 + b0)

        vmull.p64       XH2, HH3_H, XM2_L       // a1 * b1
        veor            XM2_L, XM2_L, XM2_H
        vmull.p64       XL2, HH3_L, XM2_H       // a0 * b0
        vmull.p64       XM2, HH34_L, XM2_L      // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, HH_H, T3_L         // a1 * b1
        veor            T3_L, T3_L, T3_H
        vmull.p64       XL2, HH_L, T3_H         // a0 * b0
        vmull.p64       XM2, SHASH2_H, T3_L     // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, SHASH_H, T1_L      // a1 * b1
        veor            T1_L, T1_L, T1_H
        vmull.p64       XL2, SHASH_L, T1_H      // a0 * b0
        vmull.p64       XM2, SHASH2_p64, T1_L   // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        beq             4f

        vld1.8          {XL2-XM2}, [r2]!

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_p64

        veor            T1, T1, XH
        veor            XL, XL, T1

        b               1b
        .endif
        .endif

2:      vld1.8          {T1}, [r2]!

        .ifnb           \enc
        \enc\()_1x      T1
        veor            SHASH2_p64, SHASH_L, SHASH_H
        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57
        .endif

        subs            r0, r0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
        vrev64.8        T1, T1

        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

4:      veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
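        /*
         * The key structure at r3 is assumed to start with the four 16-byte
         * precomputed powers of the hash key (H, H^2, H^3, H^4, in that
         * order), which the 4-way aggregation above relies on; the p8
         * fallback below only uses the first entry.
         */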
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]!
        vld1.64         {HH}, [r3]!
        vld1.64         {HH3-HH4}, [r3]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
        vst1.64         {XL}, [r1]

        bx              lr
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
        vst1.64         {XL}, [r1]

        bx              lr
ENDPROC(pmull_ghash_update_p8)

        e0              .req    q9
        e1              .req    q10
        e2              .req    q11
        e3              .req    q12
        e0l             .req    d18
        e0h             .req    d19
        e2l             .req    d22
        e2h             .req    d23
        e3l             .req    d24
        e3h             .req    d25
        ctr             .req    q13
        ctr0            .req    d26
        ctr1            .req    d27

        ek0             .req    q14
        ek1             .req    q15

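        //
        // Apply one AES round to each of the given state registers: aese.8
        // performs AddRoundKey + SubBytes + ShiftRows with round key \rk,
        // and aesmc.8 performs MixColumns.
        //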
        .macro          round, rk:req, regs:vararg
        .irp            r, \regs
        aese.8          \r, \rk
        aesmc.8         \r, \r
        .endr
        .endm

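        //
        // Encrypt the given state registers with the expanded key at \rkp.
        // \rounds is 10, 12 or 14 for AES-128/192/256; the two branches
        // below run zero, two or four extra leading rounds accordingly,
        // and the final round omits MixColumns and ends by XORing in the
        // last round key.
        //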
        .macro          aes_encrypt, rkp, rounds, regs:vararg
        vld1.8          {ek0-ek1}, [\rkp, :128]!
        cmp             \rounds, #12
        blt             .L\@                    // AES-128

        round           ek0, \regs
        vld1.8          {ek0}, [\rkp, :128]!
        round           ek1, \regs
        vld1.8          {ek1}, [\rkp, :128]!

        beq             .L\@                    // AES-192

        round           ek0, \regs
        vld1.8          {ek0}, [\rkp, :128]!
        round           ek1, \regs
        vld1.8          {ek1}, [\rkp, :128]!

.L\@:   .rept           4
        round           ek0, \regs
        vld1.8          {ek0}, [\rkp, :128]!
        round           ek1, \regs
        vld1.8          {ek1}, [\rkp, :128]!
        .endr

        round           ek0, \regs
        vld1.8          {ek0}, [\rkp, :128]

        .irp            r, \regs
        aese.8          \r, ek1
        .endr
        .irp            r, \regs
        veor            \r, \r, ek0
        .endr
        .endm

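        //
        // Generate a single block of AES-CTR key stream in e0: the 16-byte
        // counter block is assembled from the 12-byte IV at r5 and the
        // 32-bit block counter in r7 (stored big-endian in the last word),
        // and the counter is advanced by one.
        //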
pmull_aes_encrypt:
        add             ip, r5, #4
        vld1.8          {ctr0}, [r5]            // load 12 byte IV
        vld1.8          {ctr1}, [ip]
        rev             r8, r7
        vext.8          ctr1, ctr1, ctr1, #4
        add             r7, r7, #1
        vmov.32         ctr1[1], r8
        vmov            e0, ctr

        add             ip, r3, #64
        aes_encrypt     ip, r6, e0
        bx              lr
ENDPROC(pmull_aes_encrypt)

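        //
        // Same as pmull_aes_encrypt, but produces four consecutive key
        // stream blocks in e0-e3, advancing the block counter in r7 by 4.
        //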
pmull_aes_encrypt_4x:
        add             ip, r5, #4
        vld1.8          {ctr0}, [r5]
        vld1.8          {ctr1}, [ip]
        rev             r8, r7
        vext.8          ctr1, ctr1, ctr1, #4
        add             r7, r7, #1
        vmov.32         ctr1[1], r8
        rev             ip, r7
        vmov            e0, ctr
        add             r7, r7, #1
        vmov.32         ctr1[1], ip
        rev             r8, r7
        vmov            e1, ctr
        add             r7, r7, #1
        vmov.32         ctr1[1], r8
        rev             ip, r7
        vmov            e2, ctr
        add             r7, r7, #1
        vmov.32         ctr1[1], ip
        vmov            e3, ctr

        add             ip, r3, #64
        aes_encrypt     ip, r6, e0, e1, e2, e3
        bx              lr
ENDPROC(pmull_aes_encrypt_4x)

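        //
        // Produce the key stream block for the final, possibly partial
        // block of data in e0, as well as the encryption of the initial
        // counter block (counter value 1, big-endian) in e1, which is
        // XORed into the GHASH result below to form the authentication tag.
        //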
pmull_aes_encrypt_final:
        add             ip, r5, #4
        vld1.8          {ctr0}, [r5]
        vld1.8          {ctr1}, [ip]
        rev             r8, r7
        vext.8          ctr1, ctr1, ctr1, #4
        mov             r7, #1 << 24            // BE #1 for the tag
        vmov.32         ctr1[1], r8
        vmov            e0, ctr
        vmov.32         ctr1[1], r7
        vmov            e1, ctr

        add             ip, r3, #64
        aes_encrypt     ip, r6, e0, e1
        bx              lr
ENDPROC(pmull_aes_encrypt_final)

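        //
        // These macros are passed to ghash_update via the 'enc' argument:
        // they generate key stream, XOR it with the block(s) that were just
        // loaded for GHASH, and store the result to the destination pointer
        // in r4. For encryption the input registers are overwritten with
        // the ciphertext before it is hashed; for decryption the key stream
        // is XORed into a copy, so the unmodified ciphertext is what gets
        // folded into the digest.
        //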
        .macro          enc_1x, in0
        bl              pmull_aes_encrypt
        veor            \in0, \in0, e0
        vst1.8          {\in0}, [r4]!
        .endm

        .macro          dec_1x, in0
        bl              pmull_aes_encrypt
        veor            e0, e0, \in0
        vst1.8          {e0}, [r4]!
        .endm

        .macro          enc_4x, in0, in1, in2, in3
        bl              pmull_aes_encrypt_4x

        veor            \in0, \in0, e0
        veor            \in1, \in1, e1
        veor            \in2, \in2, e2
        veor            \in3, \in3, e3

        vst1.8          {\in0-\in1}, [r4]!
        vst1.8          {\in2-\in3}, [r4]!
        .endm

        .macro          dec_4x, in0, in1, in2, in3
        bl              pmull_aes_encrypt_4x

        veor            e0, e0, \in0
        veor            e1, e1, \in1
        veor            e2, e2, \in2
        veor            e3, e3, \in3

        vst1.8          {e0-e1}, [r4]!
        vst1.8          {e2-e3}, [r4]!
        .endm

        /*
         * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
         *                        struct gcm_key const *k, char *dst,
         *                        char *iv, int rounds, u32 counter)
         */
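        /*
         * Per the AAPCS, the first four arguments are passed in r0-r3; the
         * prologue below pushes six registers (24 bytes), so dst and iv are
         * fetched from [sp, #24] into r4/r5, and rounds and counter from
         * [sp, #32] into r6/r7. The other GCM entry points below follow the
         * same convention.
         */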
ENTRY(pmull_gcm_encrypt)
        push            {r4-r8, lr}
        ldrd            r4, r5, [sp, #24]
        ldrd            r6, r7, [sp, #32]

        vld1.64         {SHASH}, [r3]

        ghash_update    p64, enc, head=0
        vst1.64         {XL}, [r1]

        pop             {r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
         *                        struct gcm_key const *k, char *dst,
         *                        char *iv, int rounds, u32 counter)
         */
ENTRY(pmull_gcm_decrypt)
        push            {r4-r8, lr}
        ldrd            r4, r5, [sp, #24]
        ldrd            r6, r7, [sp, #32]

        vld1.64         {SHASH}, [r3]

        ghash_update    p64, dec, head=0
        vst1.64         {XL}, [r1]

        pop             {r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

        /*
         * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
         *                          struct gcm_key const *k, char *head,
         *                          char *iv, int rounds, u32 counter)
         */
ENTRY(pmull_gcm_enc_final)
        push            {r4-r8, lr}
        ldrd            r4, r5, [sp, #24]
        ldrd            r6, r7, [sp, #32]

        bl              pmull_aes_encrypt_final

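        // r0 holds the number of bytes in the final, partial block. If it
        // is nonzero, the permute vectors loaded below are used to encrypt
        // those bytes in place in the buffer passed as 'head' (r4), and to
        // present the zero-padded ciphertext as a head block (label 3 in
        // ghash_update) so that it is folded into the digest as well.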
        cmp             r0, #0
        beq             .Lenc_final

        mov_l           ip, .Lpermute
        sub             r4, r4, #16
        add             r8, ip, r0
        add             ip, ip, #32
        add             r4, r4, r0
        sub             ip, ip, r0

        vld1.8          {e3}, [r8]              // permute vector for key stream
        vld1.8          {e2}, [ip]              // permute vector for ghash input

        vtbl.8          e3l, {e0}, e3l
        vtbl.8          e3h, {e0}, e3h

        vld1.8          {e0}, [r4]              // encrypt tail block
        veor            e0, e0, e3
        vst1.8          {e0}, [r4]

        vtbl.8          T1_L, {e0}, e2l
        vtbl.8          T1_H, {e0}, e2h

        vld1.64         {XL}, [r1]
.Lenc_final:
        vld1.64         {SHASH}, [r3, :128]
        vmov.i8         MASK, #0xe1
        veor            SHASH2_p64, SHASH_L, SHASH_H
        vshl.u64        MASK, MASK, #57
        mov             r0, #1
        bne             3f                      // process head block first
        ghash_update    p64, aggregate=0, head=0

        vrev64.8        XL, XL
        vext.8          XL, XL, XL, #8
        veor            XL, XL, e1

        sub             r2, r2, #16             // rewind src pointer
        vst1.8          {XL}, [r2]              // store tag

        pop             {r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

        /*
         * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
         *                         struct gcm_key const *k, char *head,
         *                         char *iv, int rounds, u32 counter,
         *                         const char *otag, int authsize)
         */
ENTRY(pmull_gcm_dec_final)
        push            {r4-r8, lr}
        ldrd            r4, r5, [sp, #24]
        ldrd            r6, r7, [sp, #32]

        bl              pmull_aes_encrypt_final

        cmp             r0, #0
        beq             .Ldec_final

        mov_l           ip, .Lpermute
        sub             r4, r4, #16
        add             r8, ip, r0
        add             ip, ip, #32
        add             r4, r4, r0
        sub             ip, ip, r0

        vld1.8          {e3}, [r8]              // permute vector for key stream
        vld1.8          {e2}, [ip]              // permute vector for ghash input

        vtbl.8          e3l, {e0}, e3l
        vtbl.8          e3h, {e0}, e3h

        vld1.8          {e0}, [r4]

        vtbl.8          T1_L, {e0}, e2l
        vtbl.8          T1_H, {e0}, e2h

        veor            e0, e0, e3
        vst1.8          {e0}, [r4]

        vld1.64         {XL}, [r1]
.Ldec_final:
        vld1.64         {SHASH}, [r3]
        vmov.i8         MASK, #0xe1
        veor            SHASH2_p64, SHASH_L, SHASH_H
        vshl.u64        MASK, MASK, #57
        mov             r0, #1
        bne             3f                      // process head block first
        ghash_update    p64, aggregate=0, head=0

        vrev64.8        XL, XL
        vext.8          XL, XL, XL, #8
        veor            XL, XL, e1

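        // Compare the calculated tag with the expected tag, without
        // branching on the contents: only the first 'authsize' bytes take
        // part in the comparison, and r0 is returned as zero if the tags
        // match and nonzero otherwise.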
        mov_l           ip, .Lpermute
        ldrd            r2, r3, [sp, #40]       // otag and authsize
        vld1.8          {T1}, [r2]
        add             ip, ip, r3
        vceq.i8         T1, T1, XL              // compare tags
        vmvn            T1, T1                  // 0 for eq, -1 for ne

        vld1.8          {e0}, [ip]
        vtbl.8          XL_L, {T1}, e0l         // keep authsize bytes only
        vtbl.8          XL_H, {T1}, e0h

        vpmin.s8        XL_L, XL_L, XL_H        // take the minimum s8 across the vector
        vpmin.s8        XL_L, XL_L, XL_L
        vmov.32         r0, XL_L[0]             // fail if != 0x0

        pop             {r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

        .section        ".rodata", "a", %progbits
        .align          5
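        /*
         * The 0xff entries in this table produce zero bytes when used as
         * vtbl.8 indices; loading a 16 byte window at a variable offset
         * therefore yields a permute vector that selects and shifts a
         * partial block while zeroing the remaining bytes.
         */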
.Lpermute:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
        .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
