/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used. So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions. This is faster than using explicit rotate
 * instructions. To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount. The rotation amount is then fixed up just in time
 * when the values are used. 'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
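
/*
 * Illustration only (not part of the upstream file): a rough C model of the
 * deferred-rotation scheme described above, tracking register contents rather
 * than true values.  It assumes u32 from <linux/types.h> and the ror32()
 * rotate-right helper from <linux/bitops.h>; the name qround_sketch is
 * hypothetical and exists only to mirror what the _halfround macro below does
 * with its implicit 'ror' operands.
 *
 *	static void qround_sketch(u32 *a, u32 *b, u32 *c, u32 *d,
 *				  unsigned int brot, unsigned int drot)
 *	{
 *		// a += b; d ^= a; d = rol(d, 16): leave d un-rotated and
 *		// remember it must now be read with a right-rotate of 16.
 *		*a += ror32(*b, brot);
 *		*d = *a ^ ror32(*d, drot);	// new drot == 16
 *
 *		// c += d; b ^= c; b = rol(b, 12)
 *		*c += ror32(*d, 16);
 *		*b = *c ^ ror32(*b, brot);	// new brot == 20
 *
 *		// a += b; d ^= a; d = rol(d, 8)
 *		*a += ror32(*b, 20);
 *		*d = *a ^ ror32(*d, 16);	// new drot == 24
 *
 *		// c += d; b ^= c; b = rol(b, 7)
 *		*c += ror32(*d, 24);
 *		*b = *c ^ ror32(*b, 20);	// new brot == 25
 *	}
 */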

        // ChaCha state registers
        X0      .req    r0
        X1      .req    r1
        X2      .req    r2
        X3      .req    r3
        X4      .req    r4
        X5      .req    r5
        X6      .req    r6
        X7      .req    r7
        X8_X10  .req    r8      // shared by x8 and x10
        X9_X11  .req    r9      // shared by x9 and x11
        X12     .req    r10
        X13     .req    r11
        X14     .req    r12
        X15     .req    r14

.macro _le32_bswap_4x   a, b, c, d, tmp
#ifdef __ARMEB__
        rev_l           \a, \tmp
        rev_l           \b, \tmp
        rev_l           \c, \tmp
        rev_l           \d, \tmp
#endif
.endm

.macro __ldrd           a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
        ldrd            \a, \b, [\src, #\offset]
#else
        ldr             \a, [\src, #\offset]
        ldr             \b, [\src, #\offset + 4]
#endif
.endm

.macro __strd           a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
        strd            \a, \b, [\dst, #\offset]
#else
        str             \a, [\dst, #\offset]
        str             \b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround       a1, b1, c1, d1,  a2, b2, c2, d2

        // a += b; d ^= a; d = rol(d, 16);
        add             \a1, \a1, \b1, ror #brot
        add             \a2, \a2, \b2, ror #brot
        eor             \d1, \a1, \d1, ror #drot
        eor             \d2, \a2, \d2, ror #drot
        // drot == 32 - 16 == 16

        // c += d; b ^= c; b = rol(b, 12);
        add             \c1, \c1, \d1, ror #16
        add             \c2, \c2, \d2, ror #16
        eor             \b1, \c1, \b1, ror #brot
        eor             \b2, \c2, \b2, ror #brot
        // brot == 32 - 12 == 20

        // a += b; d ^= a; d = rol(d, 8);
        add             \a1, \a1, \b1, ror #20
        add             \a2, \a2, \b2, ror #20
        eor             \d1, \a1, \d1, ror #16
        eor             \d2, \a2, \d2, ror #16
        // drot == 32 - 8 == 24

        // c += d; b ^= c; b = rol(b, 7);
        add             \c1, \c1, \d1, ror #24
        add             \c2, \c2, \d2, ror #24
        eor             \b1, \c1, \b1, ror #20
        eor             \b2, \c2, \b2, ror #20
        // brot == 32 - 7 == 25
.endm

.macro _doubleround

        // column round

        // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
        _halfround      X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

        // save (x8, x9); restore (x10, x11)
        __strd          X8_X10, X9_X11, sp, 0
        __ldrd          X8_X10, X9_X11, sp, 8

        // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
        _halfround      X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

        .set brot, 25
        .set drot, 24

        // diagonal round

        // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
        _halfround      X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

        // save (x10, x11); restore (x8, x9)
        __strd          X8_X10, X9_X11, sp, 8
        __ldrd          X8_X10, X9_X11, sp, 0

        // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
        _halfround      X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute  nrounds
        .set brot, 0
        .set drot, 0
        .rept \nrounds / 2
        _doubleround
        .endr
.endm
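
/*
 * Illustration only: a plain C reference model of the permutation implemented
 * by _chacha_permute above, written with a conventional quarterround (no
 * deferred rotations) to show the column/diagonal word indexing.  It assumes
 * u32 and the rol32() helper from the usual kernel headers; QR and
 * chacha_permute_sketch are hypothetical names.
 *
 *	#define QR(a, b, c, d) do {				\
 *		x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);	\
 *		x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);	\
 *		x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 8);	\
 *		x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 7);	\
 *	} while (0)
 *
 *	static void chacha_permute_sketch(u32 x[16], int nrounds)
 *	{
 *		int i;
 *
 *		for (i = 0; i < nrounds; i += 2) {
 *			// column round
 *			QR(0, 4,  8, 12); QR(1, 5,  9, 13);
 *			QR(2, 6, 10, 14); QR(3, 7, 11, 15);
 *			// diagonal round
 *			QR(0, 5, 10, 15); QR(1, 6, 11, 12);
 *			QR(2, 7,  8, 13); QR(3, 4,  9, 14);
 *		}
 *	}
 */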

.macro _chacha          nrounds

.Lnext_block\@:
        // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
        // Registers contain x0-x9,x12-x15.

        // Do the core ChaCha permutation to update x0-x15.
        _chacha_permute \nrounds

        add             sp, #8
        // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers contain x0-x9,x12-x15.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
        push            {X8_X10, X9_X11, X12, X13, X14, X15}

        // Load (OUT, IN, LEN).
        ldr             r14, [sp, #96]
        ldr             r12, [sp, #100]
        ldr             r11, [sp, #104]

        orr             r10, r14, r12

        // Use slow path if fewer than 64 bytes remain.
        cmp             r11, #64
        blt             .Lxor_slowpath\@

        // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
        // ARMv6+, since ldmia and stmia (used below) still require alignment.
        tst             r10, #3
        bne             .Lxor_slowpath\@

        // Fast path: XOR 64 bytes of aligned data.

        // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // x0-x3
        __ldrd          r8, r9, sp, 32
        __ldrd          r10, r11, sp, 40
        add             X0, X0, r8
        add             X1, X1, r9
        add             X2, X2, r10
        add             X3, X3, r11
        _le32_bswap_4x  X0, X1, X2, X3, r8
        ldmia           r12!, {r8-r11}
        eor             X0, X0, r8
        eor             X1, X1, r9
        eor             X2, X2, r10
        eor             X3, X3, r11
        stmia           r14!, {X0-X3}

        // x4-x7
        __ldrd          r8, r9, sp, 48
        __ldrd          r10, r11, sp, 56
        add             X4, r8, X4, ror #brot
        add             X5, r9, X5, ror #brot
        ldmia           r12!, {X0-X3}
        add             X6, r10, X6, ror #brot
        add             X7, r11, X7, ror #brot
        _le32_bswap_4x  X4, X5, X6, X7, r8
        eor             X4, X4, X0
        eor             X5, X5, X1
        eor             X6, X6, X2
        eor             X7, X7, X3
        stmia           r14!, {X4-X7}

        // x8-x15
        pop             {r0-r7}                 // (x8-x9,x12-x15,x10-x11)
        __ldrd          r8, r9, sp, 32
        __ldrd          r10, r11, sp, 40
        add             r0, r0, r8              // x8
        add             r1, r1, r9              // x9
        add             r6, r6, r10             // x10
        add             r7, r7, r11             // x11
        _le32_bswap_4x  r0, r1, r6, r7, r8
        ldmia           r12!, {r8-r11}
        eor             r0, r0, r8              // x8
        eor             r1, r1, r9              // x9
        eor             r6, r6, r10             // x10
        eor             r7, r7, r11             // x11
        stmia           r14!, {r0,r1,r6,r7}
        ldmia           r12!, {r0,r1,r6,r7}
        __ldrd          r8, r9, sp, 48
        __ldrd          r10, r11, sp, 56
        add             r2, r8, r2, ror #drot   // x12
        add             r3, r9, r3, ror #drot   // x13
        add             r4, r10, r4, ror #drot  // x14
        add             r5, r11, r5, ror #drot  // x15
        _le32_bswap_4x  r2, r3, r4, r5, r9
        ldr             r9, [sp, #72]           // load LEN
        eor             r2, r2, r0              // x12
        eor             r3, r3, r1              // x13
        eor             r4, r4, r6              // x14
        eor             r5, r5, r7              // x15
        subs            r9, #64                 // decrement and check LEN
        stmia           r14!, {r2-r5}

        beq             .Ldone\@

.Lprepare_for_next_block\@:

        // Stack: x0-x15 OUT IN LEN

        // Increment block counter (x12)
        add             r8, #1

        // Store updated (OUT, IN, LEN)
        str             r14, [sp, #64]
        str             r12, [sp, #68]
        str             r9, [sp, #72]

        mov             r14, sp

        // Store updated block counter (x12)
        str             r8, [sp, #48]

        sub             sp, #16

        // Reload state and do next block
        ldmia           r14!, {r0-r11}          // load x0-x11
        __strd          r10, r11, sp, 8         // store x10-x11 before state
        ldmia           r14, {r10-r12,r14}      // load x12-x15
        b               .Lnext_block\@

.Lxor_slowpath\@:
        // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
        // We handle it by storing the 64 bytes of keystream to the stack, then
        // XOR-ing the needed portion with the data.
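
        // Illustration only: at the C level this path amounts to generating
        // one 64-byte keystream block into a temporary buffer and then doing
        // something like the kernel helper
        //      crypto_xor_cpy(dst, src, keystream, min(len, 64U));
        // the word-at-a-time and byte-at-a-time loops below open-code that XOR.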

        // Allocate keystream buffer
        sub             sp, #64
        mov             r14, sp

        // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // Save keystream for x0-x3
        __ldrd          r8, r9, sp, 96
        __ldrd          r10, r11, sp, 104
        add             X0, X0, r8
        add             X1, X1, r9
        add             X2, X2, r10
        add             X3, X3, r11
        _le32_bswap_4x  X0, X1, X2, X3, r8
        stmia           r14!, {X0-X3}

        // Save keystream for x4-x7
        __ldrd          r8, r9, sp, 112
        __ldrd          r10, r11, sp, 120
        add             X4, r8, X4, ror #brot
        add             X5, r9, X5, ror #brot
        add             X6, r10, X6, ror #brot
        add             X7, r11, X7, ror #brot
        _le32_bswap_4x  X4, X5, X6, X7, r8
        add             r8, sp, #64
        stmia           r14!, {X4-X7}

        // Save keystream for x8-x15
        ldm             r8, {r0-r7}             // (x8-x9,x12-x15,x10-x11)
        __ldrd          r8, r9, sp, 128
        __ldrd          r10, r11, sp, 136
        add             r0, r0, r8              // x8
        add             r1, r1, r9              // x9
        add             r6, r6, r10             // x10
        add             r7, r7, r11             // x11
        _le32_bswap_4x  r0, r1, r6, r7, r8
        stmia           r14!, {r0,r1,r6,r7}
        __ldrd          r8, r9, sp, 144
        __ldrd          r10, r11, sp, 152
        add             r2, r8, r2, ror #drot   // x12
        add             r3, r9, r3, ror #drot   // x13
        add             r4, r10, r4, ror #drot  // x14
        add             r5, r11, r5, ror #drot  // x15
        _le32_bswap_4x  r2, r3, r4, r5, r9
        stmia           r14, {r2-r5}

        // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
        // Registers: r8 is block counter, r12 is IN.

        ldr             r9, [sp, #168]          // LEN
        ldr             r14, [sp, #160]         // OUT
        cmp             r9, #64
        mov             r0, sp
        movle           r1, r9
        movgt           r1, #64
        // r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
        orr             r2, r12, r14
        tst             r2, #3                  // IN or OUT misaligned?
        bne             .Lxor_next_byte\@
.endif

        // XOR a word at a time
.rept 16
        subs            r1, #4
        blt             .Lxor_words_done\@
        ldr             r2, [r12], #4
        ldr             r3, [r0], #4
        eor             r2, r2, r3
        str             r2, [r14], #4
.endr
        b               .Lxor_slowpath_done\@
.Lxor_words_done\@:
        ands            r1, r1, #3
        beq             .Lxor_slowpath_done\@

        // XOR a byte at a time
.Lxor_next_byte\@:
        ldrb            r2, [r12], #1
        ldrb            r3, [r0], #1
        eor             r2, r2, r3
        strb            r2, [r14], #1
        subs            r1, #1
        bne             .Lxor_next_byte\@

.Lxor_slowpath_done\@:
        subs            r9, #64
        add             sp, #96
        bgt             .Lprepare_for_next_block\@

.Ldone\@:
.endm   // _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *                   const u32 *state, int nrounds);
 */
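/*
 * Illustration only: a minimal sketch of how C glue code might drive this
 * routine.  It assumes 'state' uses the standard ChaCha matrix layout
 * (4 constant words, 8 key words, block counter in word 12, nonce in words
 * 13-15) and relies on chacha_doarm() working on a stack copy of the state
 * without writing the counter back, so the caller advances it.  The name
 * chacha_crypt_sketch is hypothetical; CHACHA_BLOCK_SIZE (64) and
 * DIV_ROUND_UP() are the usual kernel definitions.
 *
 *	static void chacha_crypt_sketch(u32 state[16], u8 *dst, const u8 *src,
 *					unsigned int bytes, int nrounds)
 *	{
 *		chacha_doarm(dst, src, bytes, state, nrounds);
 *		state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
 *	}
 */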
ENTRY(chacha_doarm)
        cmp             r2, #0                  // len == 0?
        reteq           lr

        ldr             ip, [sp]
        cmp             ip, #12
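        // ip now holds 'nrounds', the fifth argument, which is passed on the
        // stack per the AAPCS.  The Z flag set by this comparison is consumed
        // by the 'beq 1f' below to select the ChaCha12 variant.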

        push            {r0-r2,r4-r11,lr}

        // Push state x0-x15 onto stack.
        // Also store an extra copy of x10-x11 just before the state.

        add             X12, r3, #48
        ldm             X12, {X12,X13,X14,X15}
        push            {X12,X13,X14,X15}
        sub             sp, sp, #64

        __ldrd          X8_X10, X9_X11, r3, 40
        __strd          X8_X10, X9_X11, sp, 8
        __strd          X8_X10, X9_X11, sp, 56
        ldm             r3, {X0-X9_X11}
        __strd          X0, X1, sp, 16
        __strd          X2, X3, sp, 24
        __strd          X4, X5, sp, 32
        __strd          X6, X7, sp, 40
        __strd          X8_X10, X9_X11, sp, 48

        beq             1f
        _chacha         20

0:      add             sp, #76
        pop             {r4-r11, pc}

1:      _chacha         12
        b               0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
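/*
 * Illustration only: this routine runs the bare permutation on the input
 * state and writes words 0-3 and 12-15 of the result to 'out', i.e. the
 * HChaCha construction used for XChaCha subkey derivation.  A rough C
 * equivalent, reusing the hypothetical chacha_permute_sketch() model shown
 * earlier in this file:
 *
 *	static void hchacha_sketch(const u32 state[16], u32 out[8], int nrounds)
 *	{
 *		u32 x[16];
 *
 *		memcpy(x, state, sizeof(x));
 *		chacha_permute_sketch(x, nrounds);
 *		memcpy(&out[0], &x[0],  4 * sizeof(u32));	// words 0-3
 *		memcpy(&out[4], &x[12], 4 * sizeof(u32));	// words 12-15
 *	}
 */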
ENTRY(hchacha_block_arm)
        push            {r1,r4-r11,lr}

        cmp             r2, #12                 // ChaCha12 ?

        mov             r14, r0
        ldmia           r14!, {r0-r11}          // load x0-x11
        push            {r10-r11}               // store x10-x11 to stack
        ldm             r14, {r10-r12,r14}      // load x12-x15
        sub             sp, #8

        beq             1f
        _chacha_permute 20

        // Skip over (unused0-unused1, x10-x11)
0:      add             sp, #16

        // Fix up rotations of x12-x15
        ror             X12, X12, #drot
        ror             X13, X13, #drot
        pop             {r4}                    // load 'out'
        ror             X14, X14, #drot
        ror             X15, X15, #drot

        // Store (x0-x3,x12-x15) to 'out'
        stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}

        pop             {r4-r11,pc}

1:      _chacha_permute 12
        b               0b
ENDPROC(hchacha_block_arm)
