1/* SPDX-License-Identifier: GPL-2.0 OR MIT */
2/*
3 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
4 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
5 */
6
/*
 * MASK_U32 keeps bits 2..5 of a byte count, i.e. the part of a count
 * below one block that is made up of whole 32-bit words (0, 4, ..., 60).
 */
#define MASK_U32		0x3c
/* Bytes consumed/produced per block; IN, OUT and BYTES step by this amount. */
#define CHACHA20_BLOCK_SIZE	64
/* Stack frame size in bytes; chacha_crypt_arch stores $s0-$s7 in it. */
#define STACK_SIZE		32

/*
 * Register assignment for the sixteen 32-bit working words X0..X15.
 * X11..X15 live in $s registers, which chacha_crypt_arch saves on entry
 * and restores on exit.
 */
#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
/*
 * Pasting helpers: X(3) becomes X3, which then expands to its register.
 * AXR relies on X(n) so that it can be invoked with bare word numbers.
 */
#define T(n)	T ## n
#define X(n)	X ## n
32
/* Input arguments */
/* STATE: base address of the 16-word state, read with lw at offsets 0..60. */
#define STATE		$a0
/* OUT: destination pointer, advanced by one block per loop iteration. */
#define OUT		$a1
/* IN: source pointer, advanced by one block per loop iteration. */
#define IN		$a2
/* BYTES: number of bytes still to process. */
#define BYTES		$a3

/* Output argument */
/* NONCE[0] (the state word at offset 48) is kept in a register while
 * blocks are processed, instead of being updated in memory on every
 * iteration.
 * Must be incremented every loop iteration.
 * It is written back to 48(STATE) when processing finishes.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* Shares $s7 with SAVED_CA; SAVED_CA is only loaded in the partial-block
 * paths, after IS_UNALIGNED has been tested for that block.
 */
#define IS_UNALIGNED	$s7
54
/*
 * Endianness helpers.
 *
 * MSB/LSB are the byte offsets added to the word address for the
 * lwl/swl (MSB) and lwr/swr (LSB) halves of an unaligned word access.
 *
 * CPU_TO_LE32(n) byte-reverses register n on big-endian builds
 * (wsbh swaps the bytes inside each halfword, rotr 16 then swaps the
 * halfwords) and expands to nothing on little-endian builds.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define CPU_TO_LE32(n) \
	wsbh	n, n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define CPU_TO_LE32(n)
#endif
66
/*
 * Word iterators.
 *
 * FOR_EACH_WORD(x) invokes the macro x once per word index in ascending
 * order (0..15); FOR_EACH_WORD_REV(x) does the same in descending order
 * (15..0). Each invocation is followed by a ';' statement separator.
 */
#define FOR_EACH_WORD(x) \
	x( 0); x( 1); x( 2); x( 3); \
	x( 4); x( 5); x( 6); x( 7); \
	x( 8); x( 9); x(10); x(11); \
	x(12); x(13); x(14); x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); x(14); x(13); x(12); \
	x(11); x(10); x( 9); x( 8); \
	x( 7); x( 6); x( 5); x( 4); \
	x( 3); x( 2); x( 1); x( 0);
102
/*
 * PLUS_ONE(x) yields x + 1 as a literal token (for x in 0..15) by pasting
 * x onto the PLUS_ONE_ prefix and looking the result up in this table.
 * A literal is needed because the value is pasted into a label name.
 */
#define PLUS_ONE_0	1
#define PLUS_ONE_1	2
#define PLUS_ONE_2	3
#define PLUS_ONE_3	4
#define PLUS_ONE_4	5
#define PLUS_ONE_5	6
#define PLUS_ONE_6	7
#define PLUS_ONE_7	8
#define PLUS_ONE_8	9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
/*
 * CONCAT3 pastes three tokens; the extra level of indirection lets the
 * arguments (e.g. PLUS_ONE(x)) expand before they are pasted.
 */
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)
122
/*
 * STORE_UNALIGNED(x): finish word x of the block and write it out using
 * unaligned accesses.
 *
 *   - defines the label .Lchacha_mips_xor_unaligned_<x+1>_b in front of the
 *     code for word x; since the macro is expanded for x = 15..0, branching
 *     to label <n>_b processes words n-1 down to 0
 *   - loads state word x into T0 (skipped for word 12, whose value is
 *     held in NONCE_0)
 *   - loads input word x into T1 with lwl/lwr
 *   - adds the state word (or NONCE_0) to Xx, converts with CPU_TO_LE32,
 *     XORs with the input word and stores the result with swl/swr
 *
 * Clobbers T0, T1 and Xx.
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

/*
 * STORE_ALIGNED(x): same as STORE_UNALIGNED(x) but uses plain lw/sw for
 * the input and output words, and defines the label
 * .Lchacha_mips_xor_aligned_<x+1>_b instead.
 *
 * Clobbers T0, T1 and Xx.
 */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);
154
/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn
 * Every jumptable entry must be equal in size.
 *
 * Entry x is two instructions (8 bytes): a branch to the store-chain label
 * <x>_b, which processes words x-1..0, and, in the branch delay slot, the
 * computation of SAVED_X = Xx + SAVED_CA (or Xx + NONCE_0 for word 12).
 * SAVED_X is the word that the byte-wise tail code XORs with the
 * remaining 1..3 input bytes.
 * The entry is assembled under ".set noreorder" so the assembler keeps
 * exactly these two instructions; ".set reorder" is re-enabled afterwards.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Same as JMPTBL_ALIGNED, but branches into the unaligned store chain. */
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder
181
/*
 * AXR: one add / xor / rotate step applied to four independent lanes.
 *
 *   X(A..D) += X(K..N)
 *   X(V..Z) ^= X(A..D)
 *   X(V..Z)  = X(V..Z) rotated right by (32 - S), i.e. rotated left by S
 *
 * Arguments are bare word numbers (0..15) and S is the left-rotation
 * amount. The four lanes are interleaved instruction by instruction.
 */
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotr	X(V), 32 - S; \
	rotr	X(W), 32 - S; \
	rotr	X(Y), 32 - S; \
	rotr	X(Z), 32 - S;
195
/*
 * chacha_crypt_arch
 *
 * XORs BYTES bytes read from IN with generated keystream and writes the
 * result to OUT, one 64-byte block per loop iteration.
 *
 * Inputs:
 *   STATE ($a0)	16-word state, read at offsets 0..60
 *   OUT   ($a1)	destination buffer
 *   IN    ($a2)	source buffer
 *   BYTES ($a3)	byte count; 0 returns immediately
 *   16($sp) on entry	number of rounds (presumably the fifth C argument -
 *			confirm against the C prototype). The round loop
 *			subtracts 2 per pass and only exits at exactly 0.
 *
 * Effects:
 *   - the state word at offset 48 is kept in NONCE_0, incremented once per
 *     block and written back to 48(STATE) before returning
 *   - $s0-$s7 are saved in the stack frame and restored on exit
 *   - $at is used as a working register (".set noat")
 *
 * IN and OUT may be unaligned: if either has one of its low two address
 * bits set, the lwl/lwr/swl/swr store chain is used instead of lw/sw.
 */
.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Nothing to do when BYTES == 0: return without touching STATE. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	/* First block: skip the pointer/counter advance below. */
	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	/* Advance to the next block. */
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	/* Load the working words from STATE; word 12 comes from NONCE_0. */
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	/* Each pass performs two rounds: the first four AXR lines work on
	 * words (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15), the last four
	 * on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).
	 */
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	/* From here on BYTES is biased by one block: >= 0 means the current
	 * block is a full one, < 0 means it is a partial last block.
	 */
	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	/* Words 15..0; the jump table also enters this chain part-way. */
	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte count: 0, 4, ..., 60) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset: the 6-bit value in $at is
	 * inserted at bit 1, i.e. doubled, giving 8 bytes per word, which is
	 * the size of one two-instruction jump table entry.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE: the state word that belongs to the first
	 * partial word. This overwrites IS_UNALIGNED (same register).
	 */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value (0 or -1..-3) */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	/* Words 15..0; the jump table also enters this chain part-way. */
	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to right location in state.
	 * Every path from here ends at .Lchacha_mips_xor_done, which stores
	 * the incremented NONCE_0 to the same location again.
	 */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* BYTES >= 0: done. Otherwise fall through to byte handling.
	 * The addiu below sits in the branch delay slot, so it executes on
	 * both paths. Jump table entry 0 (fewer than 4 bytes in total) also
	 * lands on it via the two *_0_b labels.
	 */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	/* Tail of 1..3 bytes.
	 * $at     = byte offset of the first tail byte within the block
	 * BYTES   = -(number of tail bytes)
	 * SAVED_X = keystream word for the tail; byte i of the tail uses
	 *           bits 8*i .. 8*i+7, brought to the low byte by rotr 8.
	 */
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte count: 0, 4, ..., 60) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset ($at doubled: 8 bytes per
	 * jump table entry)
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE (overwrites IS_UNALIGNED, same register) */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value (0 or -1..-3) */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at
419
/*
 * hchacha_block_arch
 *
 * Input arguments
 *   STATE	$a0	16 input words, read at offsets 0..60
 *   OUT	$a1	receives 8 words: X0..X3 followed by X12..X15
 *   NROUND	$a2	round count; reduced by 2 per loop pass, the loop
 *			exits when it reaches exactly 0
 *
 * X12..X15 are remapped below onto $a3, $at, $v0 and STATE, which leaves
 * X11 ($s6) as the only $s register in use; it is saved in the stack frame
 * and restored before returning.
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

/* Load word x of STATE into its working register Xx. */
#define HCHACHA_LOAD(x) \
	lw	X ## x, (x*4)(STATE)

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Open the frame and park X11 ($s6) in it. */
	addiu	$sp, $sp, -STACK_SIZE
	sw	X11, 0($sp)

	/* Fetch X0..X15 in ascending order. X15 aliases STATE, so it has to
	 * be the final load; FOR_EACH_WORD guarantees that ordering.
	 */
	FOR_EACH_WORD(HCHACHA_LOAD)

.Loop_hchacha_xor_rounds:
	/* Two rounds per pass, same AXR schedule as chacha_crypt_arch. */
	addiu	$a2, $a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Emit the first and last rows of the permuted state. */
	sw	X0,   0(OUT)
	sw	X1,   4(OUT)
	sw	X2,   8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	/* Bring back the caller's $s6, drop the frame and return. */
	lw	X11, 0($sp)
	addiu	$sp, $sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at

#undef HCHACHA_LOAD
492

source code of linux/lib/crypto/mips/chacha-core.S