1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions |
4 | * |
5 | * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> |
6 | */ |
7 | |
8 | #include <linux/linkage.h> |
9 | #include <asm/assembler.h> |
10 | |
/*
 * Code goes in .text; the "+crypto" architecture extension is requested so
 * that the assembler accepts the sha1* instructions used further down.
 */
11 | .text |
12 | .arch armv8-a+crypto |
13 |
/*
 * k0-k3: vectors holding the four round constants. Each one is filled by
 * loadrc at the top of __sha1_ce_transform, which replicates a single 32-bit
 * constant into every lane.
 */
14 | k0 .req v0 |
15 | k1 .req v1 |
16 | k2 .req v2 |
17 | k3 .req v3 |
18 |
/*
 * t0/t1: temporaries holding "schedule words + round constant". The
 * add_only macro alternates between them: the even variant consumes t0 and
 * writes t1, the odd variant consumes t1 and (optionally) writes t0.
 */
19 | t0 .req v4 |
20 | t1 .req v5 |
21 |
/*
 * Hash state kept across blocks:
 *   dgav (v6)      - the four state words loaded from [x0]
 *   dgb/dgbv (v7)  - the fifth state word loaded from [x0, #16]; dgb is the
 *                    32-bit scalar view, dgbv the vector view of the same
 *                    register
 * dga (q6) names the same register as dgav; it is not referenced by the
 * code visible in this file.
 */
22 | dga .req q6 |
23 | dgav .req v6 |
24 | dgb .req s7 |
25 | dgbv .req v7 |
26 |
/*
 * Working values for the block currently being processed:
 *   dg0q/dg0s/dg0v (v12) - working copy of dgav, updated by every sha1<op>
 *   dg1s/dg1v (v13)      - sha1h result written by the odd steps
 *   dg2s (v14)           - sha1h result written by the even steps
 * The q/s/v suffixes are different views of one register, chosen to match
 * the operand form each instruction expects.
 */
27 | dg0q .req q12 |
28 | dg0s .req s12 |
29 | dg0v .req v12 |
30 | dg1s .req s13 |
31 | dg1v .req v13 |
32 | dg2s .req s14 |
33 | |
/*
 * add_only - one sha1<op> hash step without a message-schedule update.
 *
 *   op   suffix appended to "sha1" to pick the instruction (callers in this
 *        file pass c, p or m)
 *   ev   the literal word "ev" selects the even variant; any other text
 *        (callers pass "od") selects the odd variant
 *   rc   name of the round-constant vector (no arrangement suffix) that is
 *        added to the words for the FOLLOWING step
 *   s0   number N of the register vN holding the words for the following
 *        step; may be left blank in the odd variant only, in which case
 *        nothing is precomputed
 *   dg1  even variant only: optional replacement for dg1s as the second
 *        operand of sha1<op> (used on the first step of a block to pass dgb)
 *
 * Even variant: consumes t0, writes t1 and dg2s.
 * Odd variant:  consumes t1 and dg2s, writes dg1s and (if s0 is given) t0.
 * Both variants update dg0q. In both, sha1h reads dg0s before sha1<op>
 * overwrites dg0q, so the order of those two instructions must be kept.
 */
34 | .macro add_only, op, ev, rc, s0, dg1 |
35 | .ifc \ev, ev |
/* even step: precompute t1 for the next (odd) step; s0 is required here */
36 | add t1.4s, v\s0\().4s, \rc\().4s |
/* dg2s <- sha1h of the current first state word; read by the next odd step */
37 | sha1h dg2s, dg0s |
38 | .ifnb \dg1 |
39 | sha1\op dg0q, \dg1, t0.4s |
40 | .else |
41 | sha1\op dg0q, dg1s, t0.4s |
42 | .endif |
43 | .else |
/* odd step: t0 for the next (even) step is only built when s0 was supplied */
44 | .ifnb \s0 |
45 | add t0.4s, v\s0\().4s, \rc\().4s |
46 | .endif |
/* dg1s <- sha1h of the current first state word; read by the next even step */
47 | sha1h dg1s, dg0s |
48 | sha1\op dg0q, dg2s, t1.4s |
49 | .endif |
50 | .endm |
51 | |
/*
 * add_update - one hash step (see add_only) combined with a message-schedule
 * update of v<s0>.
 *
 *   op, ev, rc, dg1  forwarded unchanged to add_only
 *   s0               register number whose contents are replaced by the
 *                    sha1su0/sha1su1 pair
 *   s1, s2, s3       register numbers of the other three schedule vectors
 *                    read by sha1su0 (s1, s2) and sha1su1 (s3)
 *
 * Note that add_only receives \s1, not \s0: the words precomputed for the
 * following step come from v<s1>, while v<s0> is being rewritten here. The
 * hash step is placed between sha1su0 and sha1su1.
 */
52 | .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 |
53 | sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s |
54 | add_only \op, \ev, \rc, \s1, \dg1 |
55 | sha1su1 v\s0\().4s, v\s3\().4s |
56 | .endm |
57 | |
/*
 * loadrc - replicate a 32-bit constant into every lane of a vector.
 *
 *   k    destination vector including its arrangement (callers pass kN.4s)
 *   val  the constant
 *   tmp  general-purpose scratch register (callers pass a w register);
 *        clobbered
 *
 * The constant is assembled in \tmp from two 16-bit pieces: movz writes bits
 * 0-15 (:abs_g0_nc:) and clears the rest, movk then inserts bits 16-31
 * (:abs_g1:). dup copies \tmp into each lane of \k.
 */
58 | .macro loadrc, k, val, tmp |
59 | movz \tmp, :abs_g0_nc:\val |
60 | movk \tmp, :abs_g1:\val |
61 | dup \k, \tmp |
62 | .endm |
63 | |
64 | /* |
65 | * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, |
66 | * int blocks) |
67 | */ |
/*
 * Runs the SHA-1 block function over 64-byte blocks read from src and
 * writes the updated five state words back to the start of *sst. If the
 * finalize field of *sst is non-zero, one extra padding block built from the
 * count field is processed after the last input block.
 *
 * Register use, as visible in the body:
 *   x0      sst; state words at [x0] and [x0, #16], other fields reached
 *           through the sha1_ce_offsetof_* symbols
 *   x1      src, post-incremented by 64 for every block loaded
 *   w2      blocks still to process; decremented when a block is loaded
 *   w4/x4   finalize flag, later scratch for the count offset and value
 *   w6      scratch for loadrc; x5/x6 are handed to cond_yield
 *   x7, x8  scratch while building the padding block
 *   v0-v3   round constants, v4-v5 temporaries, v6-v7 state,
 *           v8-v11 message schedule, v12-v14 working state
 *
 * Returns in w0 the value left in w2: 0 when every block was consumed, or
 * the number of blocks not yet processed when the cond_yield exit is taken.
 *
 * NOTE(review): there is no check for blocks == 0; a block is loaded and w2
 * decremented before w2 is tested, so callers must pass blocks >= 1 -
 * confirm against the C caller.
 * NOTE(review): v8-v14 are written without being saved here; presumably the
 * caller owns the whole SIMD register file while this runs - confirm.
 */
68 | SYM_FUNC_START(__sha1_ce_transform) |
69 | /* load round constants */ |
70 | loadrc k0.4s, 0x5a827999, w6 |
71 | loadrc k1.4s, 0x6ed9eba1, w6 |
72 | loadrc k2.4s, 0x8f1bbcdc, w6 |
73 | loadrc k3.4s, 0xca62c1d6, w6 |
74 |
75 | /* load state */ |
76 | ld1 {dgav.4s}, [x0] |
77 | ldr dgb, [x0, #16] |
78 |
79 | /* load sha1_ce_state::finalize */ |
/*
 * ldr_l comes from <asm/assembler.h>; presumably it loads the 32-bit value
 * stored at the named symbol, using x4 as address scratch - confirm. That
 * value is then used as a byte offset from sst. The w4 load zero-extends,
 * so the later 64-bit test of x4 sees exactly the 32-bit flag.
 */
80 | ldr_l w4, sha1_ce_offsetof_finalize, x4 |
81 | ldr w4, [x0, x4] |
82 |
83 | /* load input */ |
/* label 0: top of the per-block loop; fetch 64 bytes into v8-v11 */
84 | 0: ld1 {v8.4s-v11.4s}, [x1], #64 |
85 | sub w2, w2, #1 |
86 |
/*
 * rev32 byte-swaps each 32-bit word. CPU_LE() is defined outside this file;
 * presumably it emits its argument only on little-endian builds - confirm.
 */
87 | CPU_LE( rev32 v8.16b, v8.16b ) |
88 | CPU_LE( rev32 v9.16b, v9.16b ) |
89 | CPU_LE( rev32 v10.16b, v10.16b ) |
90 | CPU_LE( rev32 v11.16b, v11.16b ) |
91 |
/*
 * label 1: entry to the rounds, shared by input blocks and by the padding
 * block built at label 2 (which therefore skips the rev32 above).
 * t0 for the first step must be formed now, because the first add_update
 * starts overwriting v8 with new schedule words. dg0 becomes the working
 * copy of the state; dgav/dgb keep the values from before this block.
 */
92 | 1: add t0.4s, v8.4s, k0.4s |
93 | mov dg0v.16b, dgav.16b |
94 |
/*
 * Twenty hash steps in four groups of five (sha1c, sha1p, sha1m, sha1p).
 * The rc argument is added to the words for the step AFTER the one being
 * issued, which is why the last step of each group already names the next
 * group's constant. The very first step passes dgb as the second sha1c
 * operand because dg1s has not been produced yet.
 */
95 | add_update c, ev, k0, 8, 9, 10, 11, dgb |
96 | add_update c, od, k0, 9, 10, 11, 8 |
97 | add_update c, ev, k0, 10, 11, 8, 9 |
98 | add_update c, od, k0, 11, 8, 9, 10 |
99 | add_update c, ev, k1, 8, 9, 10, 11 |
100 |
101 | add_update p, od, k1, 9, 10, 11, 8 |
102 | add_update p, ev, k1, 10, 11, 8, 9 |
103 | add_update p, od, k1, 11, 8, 9, 10 |
104 | add_update p, ev, k1, 8, 9, 10, 11 |
105 | add_update p, od, k2, 9, 10, 11, 8 |
106 |
107 | add_update m, ev, k2, 10, 11, 8, 9 |
108 | add_update m, od, k2, 11, 8, 9, 10 |
109 | add_update m, ev, k2, 8, 9, 10, 11 |
110 | add_update m, od, k2, 9, 10, 11, 8 |
111 | add_update m, ev, k3, 10, 11, 8, 9 |
112 |
/*
 * The remaining steps use add_only, i.e. no further schedule update is
 * issued. The final step passes no s0, so no sum is precomputed after it.
 */
113 | add_update p, od, k3, 11, 8, 9, 10 |
114 | add_only p, ev, k3, 9 |
115 | add_only p, od, k3, 10 |
116 | add_only p, ev, k3, 11 |
117 | add_only p, od |
118 |
119 | /* update state */ |
/*
 * dg1v lane 0 holds the sha1h result written by the last (odd) step. The
 * .2s add also touches lane 1 of dgbv, but only the 32-bit dgb view is
 * ever read or stored.
 */
120 | add dgbv.2s, dgbv.2s, dg1v.2s |
121 | add dgav.4s, dgav.4s, dg0v.4s |
122 |
/*
 * No blocks left: go decide about finalization. Otherwise give cond_yield a
 * chance to leave early through label 3 (w2 is non-zero there, so the
 * caller sees how many blocks remain), else continue with the next block.
 * cond_yield is defined in <asm/assembler.h>; its condition is not visible
 * here, and x5/x6 are presumably scratch for it - confirm.
 */
123 | cbz w2, 2f |
124 | cond_yield 3f, x5, x6 |
125 | b 0b |
126 |
127 | /* |
128 | * Final block: add padding and total bit count. |
129 | * Skip if the input size was not a round multiple of the block size, |
130 | * the padding is handled by the C code in that case. |
131 | */ |
/*
 * x4 is the finalize flag on the first visit and 0 on the second visit
 * (cleared below), so the padding block is processed at most once.
 */
132 | 2: cbz x4, 3f |
/* fetch the 64-bit count field through its offset symbol */
133 | ldr_l w4, sha1_ce_offsetof_count, x4 |
134 | ldr x4, [x0, x4] |
/*
 * Build the padding block directly in v8-v11, in the word order the rounds
 * expect:
 *   v8  = { 0x80000000, 0, 0, 0 }  (fmov to d8 clears the upper half of v8)
 *   v9  = v10 = 0
 *   v11 = { 0, 0, x7[31:0], x7[63:32] }
 * x7 is the count rotated right by 29, which equals (count << 3) with its
 * 32-bit halves swapped as long as the top three bits of count are zero.
 */
135 | movi v9.2d, #0 |
136 | mov x8, #0x80000000 |
137 | movi v10.2d, #0 |
138 | ror x7, x4, #29 // ror(lsl(x4, 3), 32) |
139 | fmov d8, x8 |
/* clear the flag; w2 is already 0, so after the rounds we reach label 3 */
140 | mov x4, #0 |
141 | mov v11.d[0], xzr |
142 | mov v11.d[1], x7 |
143 | b 1b |
144 |
145 | /* store new state */ |
146 | 3: st1 {dgav.4s}, [x0] |
147 | str dgb, [x0, #16] |
/* return the number of blocks that were not processed */
148 | mov w0, w2 |
149 | ret |
150 | SYM_FUNC_END(__sha1_ce_transform) |
151 | |