1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * Copyright (C) 2024 Christophe Leroy <christophe.leroy@csgroup.eu>, CS GROUP France |
4 | */ |
5 | |
6 | #include <linux/linkage.h> |
7 | |
8 | #include <asm/ppc_asm.h> |
9 | /* Arguments as received; r4-r6 are reused once consumed (val4 and key0/key1 below alias them). */ |
10 | #define dst_bytes r3 |
11 | #define key r4 |
12 | #define counter r5 |
13 | #define nblocks r6 |
14 | /* Working registers: r0 counts the remaining blocks, r4 holds the constant 4 after the key is loaded. */ |
15 | #define idx_r0 r0 |
16 | #define val4 r4 |
17 | /* First four state words; stored little-endian their bytes read "expand 32-byte k" in ASCII. */ |
18 | #define const0 0x61707865 |
19 | #define const1 0x3320646e |
20 | #define const2 0x79622d32 |
21 | #define const3 0x6b206574 |
22 | /* Key words; key0 (r5) and key1 (r6) overwrite the counter pointer and the block count. */ |
23 | #define key0 r5 |
24 | #define key1 r6 |
25 | #define key2 r7 |
26 | #define key3 r8 |
27 | #define key4 r9 |
28 | #define key5 r10 |
29 | #define key6 r11 |
30 | #define key7 r12 |
31 | /* The two 32-bit words of the block counter. */ |
32 | #define counter0 r14 |
33 | #define counter1 r15 |
34 | /* The sixteen 32-bit words of the working state, in r16-r31. */ |
35 | #define state0 r16 |
36 | #define state1 r17 |
37 | #define state2 r18 |
38 | #define state3 r19 |
39 | #define state4 r20 |
40 | #define state5 r21 |
41 | #define state6 r22 |
42 | #define state7 r23 |
43 | #define state8 r24 |
44 | #define state9 r25 |
45 | #define state10 r26 |
46 | #define state11 r27 |
47 | #define state12 r28 |
48 | #define state13 r29 |
49 | #define state14 r30 |
50 | #define state15 r31 |
51 | /* Four quarter-rounds on the register sets (a1,b1,c1,d1)..(a4,b4,c4,d4), updated in place; each step is issued for all four sets before the next step (presumably to keep independent instructions adjacent). */ |
52 | .macro quarterround4 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4 |
53 | add \a1, \a1, \b1 /* a += b */ |
54 | add \a2, \a2, \b2 |
55 | add \a3, \a3, \b3 |
56 | add \a4, \a4, \b4 |
57 | xor \d1, \d1, \a1 /* d ^= a */ |
58 | xor \d2, \d2, \a2 |
59 | xor \d3, \d3, \a3 |
60 | xor \d4, \d4, \a4 |
61 | rotlwi \d1, \d1, 16 /* d = rotate-left-32(d, 16) */ |
62 | rotlwi \d2, \d2, 16 |
63 | rotlwi \d3, \d3, 16 |
64 | rotlwi \d4, \d4, 16 |
65 | add \c1, \c1, \d1 /* c += d */ |
66 | add \c2, \c2, \d2 |
67 | add \c3, \c3, \d3 |
68 | add \c4, \c4, \d4 |
69 | xor \b1, \b1, \c1 /* b ^= c */ |
70 | xor \b2, \b2, \c2 |
71 | xor \b3, \b3, \c3 |
72 | xor \b4, \b4, \c4 |
73 | rotlwi \b1, \b1, 12 /* b = rotate-left-32(b, 12) */ |
74 | rotlwi \b2, \b2, 12 |
75 | rotlwi \b3, \b3, 12 |
76 | rotlwi \b4, \b4, 12 |
77 | add \a1, \a1, \b1 /* a += b */ |
78 | add \a2, \a2, \b2 |
79 | add \a3, \a3, \b3 |
80 | add \a4, \a4, \b4 |
81 | xor \d1, \d1, \a1 /* d ^= a */ |
82 | xor \d2, \d2, \a2 |
83 | xor \d3, \d3, \a3 |
84 | xor \d4, \d4, \a4 |
85 | rotlwi \d1, \d1, 8 /* d = rotate-left-32(d, 8) */ |
86 | rotlwi \d2, \d2, 8 |
87 | rotlwi \d3, \d3, 8 |
88 | rotlwi \d4, \d4, 8 |
89 | add \c1, \c1, \d1 /* c += d */ |
90 | add \c2, \c2, \d2 |
91 | add \c3, \c3, \d3 |
92 | add \c4, \c4, \d4 |
93 | xor \b1, \b1, \c1 /* b ^= c */ |
94 | xor \b2, \b2, \c2 |
95 | xor \b3, \b3, \c3 |
96 | xor \b4, \b4, \c4 |
97 | rotlwi \b1, \b1, 7 /* b = rotate-left-32(b, 7) */ |
98 | rotlwi \b2, \b2, 7 |
99 | rotlwi \b3, \b3, 7 |
100 | rotlwi \b4, \b4, 7 |
101 | .endm |
102 | /* Preprocessor wrapper around quarterround4: pastes "state" onto each numeric argument, so callers pass state word indices (0-15) instead of register names. */ |
103 | #define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4) \ |
104 | quarterround4 state##a1 state##b1 state##c1 state##d1 \ |
105 | state##a2 state##b2 state##c2 state##d2 \ |
106 | state##a3 state##b3 state##c3 state##d3 \ |
107 | state##a4 state##b4 state##c4 state##d4 |
108 | |
109 | /* |
110 | * Very basic 32-bit implementation of ChaCha20 (with __powerpc64__ variants of the register |
111 | * save/restore and counter handling). Produces a given positive number of blocks of output |
112 | * with a nonce of 0, taking an input key and 8-byte counter. Importantly does not spill key or state words to the stack: only the counter pointer and the caller's r14-r31 are saved there. Its arguments are: |
113 | * |
114 | * r3: output bytes |
115 | * r4: 32-byte key input |
116 | * r5: 8-byte counter input/output (saved on stack) |
117 | * r6: number of 64-byte blocks to write to output (must be non-zero: it is only tested after a block has been written) |
118 | * |
119 | * Register usage inside the function: |
120 | * r0: counter of blocks (initialised with r6); r4: value '4' after key has been read |
121 | * r5-r12: key |
122 | * r14-r15: counter |
123 | * r16-r31: state (r31 also briefly holds the round-loop count) |
124 | */ |
125 | SYM_FUNC_START(__arch_chacha20_blocks_nostack) |
126 | #ifdef __powerpc64__ |
127 | std counter, -216(r1) /* keep the counter pointer: r5 is reused as key0 */ |
128 | /* Save r14-r31 below r1 without moving the stack pointer (presumably relying on the ABI's protected area below r1 - TODO confirm). */ |
129 | std r14, -144(r1) |
130 | std r15, -136(r1) |
131 | std r16, -128(r1) |
132 | std r17, -120(r1) |
133 | std r18, -112(r1) |
134 | std r19, -104(r1) |
135 | std r20, -96(r1) |
136 | std r21, -88(r1) |
137 | std r22, -80(r1) |
138 | std r23, -72(r1) |
139 | std r24, -64(r1) |
140 | std r25, -56(r1) |
141 | std r26, -48(r1) |
142 | std r27, -40(r1) |
143 | std r28, -32(r1) |
144 | std r29, -24(r1) |
145 | std r30, -16(r1) |
146 | std r31, -8(r1) |
147 | #else |
148 | stwu r1, -96(r1) /* allocate a 96-byte frame */ |
149 | stw counter, 20(r1) /* keep the counter pointer: r5 is reused as key0 */ |
150 | #ifdef __BIG_ENDIAN__ |
151 | stmw r14, 24(r1) /* store r14-r31 with a single instruction */ |
152 | #else |
153 | stw r14, 24(r1) /* same saves, one register at a time (presumably because stmw is not usable in little-endian mode) */ |
154 | stw r15, 28(r1) |
155 | stw r16, 32(r1) |
156 | stw r17, 36(r1) |
157 | stw r18, 40(r1) |
158 | stw r19, 44(r1) |
159 | stw r20, 48(r1) |
160 | stw r21, 52(r1) |
161 | stw r22, 56(r1) |
162 | stw r23, 60(r1) |
163 | stw r24, 64(r1) |
164 | stw r25, 68(r1) |
165 | stw r26, 72(r1) |
166 | stw r27, 76(r1) |
167 | stw r28, 80(r1) |
168 | stw r29, 84(r1) |
169 | stw r30, 88(r1) |
170 | stw r31, 92(r1) |
171 | #endif |
172 | #endif /* __powerpc64__ */ |
173 | /* Load the two counter words; on 64-bit, counter0 additionally carries counter1 in its upper half. */ |
174 | lwz counter0, 0(counter) |
175 | lwz counter1, 4(counter) |
176 | #ifdef __powerpc64__ |
177 | rldimi counter0, counter1, 32, 0 /* counter0 = (counter1 << 32) | counter0 */ |
178 | #endif |
179 | mr idx_r0, nblocks /* r6 is about to be overwritten by key1 */ |
180 | subi dst_bytes, dst_bytes, 4 /* bias by -4: the stores below address the output at +4 and up */ |
181 | /* Load the eight key words; key0 and key1 overwrite the counter pointer and nblocks. */ |
182 | lwz key0, 0(key) |
183 | lwz key1, 4(key) |
184 | lwz key2, 8(key) |
185 | lwz key3, 12(key) |
186 | lwz key4, 16(key) |
187 | lwz key5, 20(key) |
188 | lwz key6, 24(key) |
189 | lwz key7, 28(key) |
190 | /* The key pointer is no longer needed: r4 becomes the index constant 4 used by the stwbrx stores. */ |
191 | li val4, 4 |
192 | .Lblock: |
193 | li r31, 10 /* r31 (state15) is scratch here; 10 iterations of two QUARTERROUND4 each */ |
194 | /* state0-3 = constants, built from their @ha and @l halves. */ |
195 | lis state0, const0@ha |
196 | lis state1, const1@ha |
197 | lis state2, const2@ha |
198 | lis state3, const3@ha |
199 | addi state0, state0, const0@l |
200 | addi state1, state1, const1@l |
201 | addi state2, state2, const2@l |
202 | addi state3, state3, const3@l |
203 | |
204 | mtctr r31 /* CTR = iteration count for the bdnz below */ |
205 | /* state4-11 = key */ |
206 | mr state4, key0 |
207 | mr state5, key1 |
208 | mr state6, key2 |
209 | mr state7, key3 |
210 | mr state8, key4 |
211 | mr state9, key5 |
212 | mr state10, key6 |
213 | mr state11, key7 |
214 | /* state12-13 = counter */ |
215 | mr state12, counter0 |
216 | mr state13, counter1 |
217 | /* state14-15 = 0 (the nonce); this also discards the loop count left in r31. */ |
218 | li state14, 0 |
219 | li state15, 0 |
220 | |
221 | .Lpermute: |
222 | QUARTERROUND4( 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15) /* column round */ |
223 | QUARTERROUND4( 0, 5,10,15, 1, 6,11,12, 2, 7, 8,13, 3, 4, 9,14) /* diagonal round */ |
224 | |
225 | bdnz .Lpermute /* decrement CTR, loop while non-zero */ |
226 | /* Add the initial state (constants, key, counter) to the permuted state; state14-15 started as 0 and need no addition. */ |
227 | addis state0, state0, const0@ha |
228 | addis state1, state1, const1@ha |
229 | addis state2, state2, const2@ha |
230 | addis state3, state3, const3@ha |
231 | addi state0, state0, const0@l |
232 | addi state1, state1, const1@l |
233 | addi state2, state2, const2@l |
234 | addi state3, state3, const3@l |
235 | |
236 | add state4, state4, key0 |
237 | add state5, state5, key1 |
238 | add state6, state6, key2 |
239 | add state7, state7, key3 |
240 | add state8, state8, key4 |
241 | add state9, state9, key5 |
242 | add state10, state10, key6 |
243 | add state11, state11, key7 |
244 | |
245 | add state12, state12, counter0 |
246 | add state13, state13, counter1 |
247 | /* Write the 64-byte block as little-endian 32-bit words. */ |
248 | #ifdef __BIG_ENDIAN__ |
249 | stwbrx state0, val4, dst_bytes /* byte-reversed store at dst_bytes + 4 */ |
250 | addi dst_bytes, dst_bytes, 8 /* advance two words; the next pair uses index 0 then 4 */ |
251 | stwbrx state1, 0, dst_bytes |
252 | stwbrx state2, val4, dst_bytes |
253 | addi dst_bytes, dst_bytes, 8 |
254 | stwbrx state3, 0, dst_bytes |
255 | stwbrx state4, val4, dst_bytes |
256 | addi dst_bytes, dst_bytes, 8 |
257 | stwbrx state5, 0, dst_bytes |
258 | stwbrx state6, val4, dst_bytes |
259 | addi dst_bytes, dst_bytes, 8 |
260 | stwbrx state7, 0, dst_bytes |
261 | stwbrx state8, val4, dst_bytes |
262 | addi dst_bytes, dst_bytes, 8 |
263 | stwbrx state9, 0, dst_bytes |
264 | stwbrx state10, val4, dst_bytes |
265 | addi dst_bytes, dst_bytes, 8 |
266 | stwbrx state11, 0, dst_bytes |
267 | stwbrx state12, val4, dst_bytes |
268 | addi dst_bytes, dst_bytes, 8 |
269 | stwbrx state13, 0, dst_bytes |
270 | stwbrx state14, val4, dst_bytes |
271 | addi dst_bytes, dst_bytes, 8 |
272 | stwbrx state15, 0, dst_bytes /* dst_bytes is left 4 bytes before the next block */ |
273 | #else |
274 | stw state0, 4(dst_bytes) /* offsets start at 4 because of the -4 bias */ |
275 | stw state1, 8(dst_bytes) |
276 | stw state2, 12(dst_bytes) |
277 | stw state3, 16(dst_bytes) |
278 | stw state4, 20(dst_bytes) |
279 | stw state5, 24(dst_bytes) |
280 | stw state6, 28(dst_bytes) |
281 | stw state7, 32(dst_bytes) |
282 | stw state8, 36(dst_bytes) |
283 | stw state9, 40(dst_bytes) |
284 | stw state10, 44(dst_bytes) |
285 | stw state11, 48(dst_bytes) |
286 | stw state12, 52(dst_bytes) |
287 | stw state13, 56(dst_bytes) |
288 | stw state14, 60(dst_bytes) |
289 | stwu state15, 64(dst_bytes) /* update form: dst_bytes += 64 for the next block */ |
290 | #endif |
291 | /* Decrement the block count; the CR0 result set here is consumed by the bne below. */ |
292 | subic. idx_r0, idx_r0, 1 /* subi. can't use r0 as source */ |
293 | /* Increment the 64-bit counter; none of these instructions is a record form, so CR0 is left untouched. */ |
294 | #ifdef __powerpc64__ |
295 | addi counter0, counter0, 1 |
296 | srdi counter1, counter0, 32 /* refresh the high word from the 64-bit value */ |
297 | #else |
298 | addic counter0, counter0, 1 /* low word + 1, carry out in CA */ |
299 | addze counter1, counter1 /* high word += CA */ |
300 | #endif |
301 | |
302 | bne .Lblock |
303 | /* Write the updated counter back through the saved pointer. */ |
304 | #ifdef __powerpc64__ |
305 | ld counter, -216(r1) |
306 | #else |
307 | lwz counter, 20(r1) |
308 | #endif |
309 | stw counter0, 0(counter) |
310 | stw counter1, 4(counter) |
311 | /* Clear r6-r12, which still hold key words 1-7 (r5/key0 now holds the counter pointer again). */ |
312 | li r6, 0 |
313 | li r7, 0 |
314 | li r8, 0 |
315 | li r9, 0 |
316 | li r10, 0 |
317 | li r11, 0 |
318 | li r12, 0 |
319 | /* Restore the caller's r14-r31, which also replaces the counter and state values held in them. */ |
320 | #ifdef __powerpc64__ |
321 | ld r14, -144(r1) |
322 | ld r15, -136(r1) |
323 | ld r16, -128(r1) |
324 | ld r17, -120(r1) |
325 | ld r18, -112(r1) |
326 | ld r19, -104(r1) |
327 | ld r20, -96(r1) |
328 | ld r21, -88(r1) |
329 | ld r22, -80(r1) |
330 | ld r23, -72(r1) |
331 | ld r24, -64(r1) |
332 | ld r25, -56(r1) |
333 | ld r26, -48(r1) |
334 | ld r27, -40(r1) |
335 | ld r28, -32(r1) |
336 | ld r29, -24(r1) |
337 | ld r30, -16(r1) |
338 | ld r31, -8(r1) |
339 | #else |
340 | #ifdef __BIG_ENDIAN__ |
341 | lmw r14, 24(r1) |
342 | #else |
343 | lwz r14, 24(r1) |
344 | lwz r15, 28(r1) |
345 | lwz r16, 32(r1) |
346 | lwz r17, 36(r1) |
347 | lwz r18, 40(r1) |
348 | lwz r19, 44(r1) |
349 | lwz r20, 48(r1) |
350 | lwz r21, 52(r1) |
351 | lwz r22, 56(r1) |
352 | lwz r23, 60(r1) |
353 | lwz r24, 64(r1) |
354 | lwz r25, 68(r1) |
355 | lwz r26, 72(r1) |
356 | lwz r27, 76(r1) |
357 | lwz r28, 80(r1) |
358 | lwz r29, 84(r1) |
359 | lwz r30, 88(r1) |
360 | lwz r31, 92(r1) |
361 | #endif |
362 | addi r1, r1, 96 /* release the 96-byte frame */ |
363 | #endif /* __powerpc64__ */ |
364 | blr |
365 | SYM_FUNC_END(__arch_chacha20_blocks_nostack) |
366 | |