blake2s-core.S source code [linux/arch/arm/crypto/blake2s-core.S]

1	/* SPDX-License-Identifier: GPL-2.0-or-later */
2	/*
3	* BLAKE2s digest algorithm, ARM scalar implementation
4	*
5	* Copyright 2020 Google LLC
6	*
7	* Author: Eric Biggers <ebiggers@google.com>
8	*/
9
10	#include <linux/linkage.h>
11	#include <asm/assembler.h>
12
13	// Registers used to hold message words temporarily. There aren't
14	// enough ARM registers to hold the whole message block, so we have to
15	// load the words on-demand.
// Both aliases are pure scratch. r14 is the link register; blake2s_compress
// pushes lr on entry and only needs the saved copy again for its final pop
// into pc, so r14 is free in between.
16	M_0 .req r12
17	M_1 .req r14
18
19	// The BLAKE2s initialization vector
// Eight 32-bit words, IV[0..7]. blake2s_compress addresses the table with
// 'adr' and uses it to seed v[8..15]: IV[0..3] become v[8..11] unchanged,
// while IV[4..7] are XORed with t[0], t[1], f[0] and f[1] to form v[12..15].
20	.Lblake2s_IV:
21	.word `0x6A09E667`, `0xBB67AE85`, `0x3C6EF372`, `0xA54FF53A`
22	.word `0x510E527F`, `0x9B05688C`, `0x1F83D9AB`, `0x5BE0CD19`
23
// __ldrd a, b, src, offset
//
// Load two adjacent 32-bit words: \a = [\src + \offset] and
// \b = [\src + \offset + 4].  Targets older than ARMv6 get a pair of
// single-word loads; everything else gets one doubleword load.
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ < 6
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#else
	ldrd		\a, \b, [\src, #\offset]
#endif
.endm
32
// __strd a, b, dst, offset
//
// Store two adjacent 32-bit words: [\dst + \offset] = \a and
// [\dst + \offset + 4] = \b.  Targets older than ARMv6 get a pair of
// single-word stores; everything else gets one doubleword store.
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ < 6
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#else
	strd		\a, \b, [\dst, #\offset]
#endif
.endm
41
// _le32_bswap a, tmp
//
// Byte-order fixup for one 32-bit little-endian word held in \a.  Expands to
// nothing unless __ARMEB__ is defined; when it is, rev_l is invoked on \a
// with \tmp as its second operand.
// NOTE(review): rev_l is not defined in this file (presumably it comes from
// <asm/assembler.h>); it is assumed to byte-reverse \a in place and to be
// allowed to clobber \tmp -- confirm against its definition.
.macro _le32_bswap	a, tmp
#if defined(__ARMEB__)
	rev_l		\a, \tmp
#endif
.endm
47
// _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
//
// Apply the same byte-order fixup as _le32_bswap to eight registers, \a
// through \h in that order, sharing \tmp as the scratch operand for every
// step.  Expands to nothing unless __ARMEB__ is defined.
.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
#ifdef __ARMEB__
	rev_l		\a, \tmp
	rev_l		\b, \tmp
	rev_l		\c, \tmp
	rev_l		\d, \tmp
	rev_l		\e, \tmp
	rev_l		\f, \tmp
	rev_l		\g, \tmp
	rev_l		\h, \tmp
#endif
.endm
58
59	// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
60	// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
61	// columns/diagonals. s0-s1 are the word offsets to the message words the first
62	// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
63	// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
64	//
65	// Note that to save instructions, the rotations don't happen when the
66	// pseudocode says they should, but rather they are delayed until the values are
67	// used. See the comment above _blake2s_round().
//
// Rotation state: on entry the 'b' registers still need to be rotated right
// by 'brot' bits and the 'd' registers by 'drot' bits; both are assembler
// symbols that the caller sets with .set before expanding this macro. On
// exit the 'b' registers need a further ror 7 and the 'd' registers a
// further ror 8 to hold their true values. The 'a' and 'c' registers always
// hold true values.
//
// Clobbers M_0 and M_1. The two columns/diagonals are interleaved
// instruction by instruction; they are independent of each other.
68	.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
69
70	ldr M_0, [sp, #`32` + `4` * \s0]
71	ldr M_1, [sp, #`32` + `4` * \s2]
72
73	// a += b + m[blake2s_sigma[r][2*i + 0]];
74	add \a0, \a0, \b0, ror #brot
75	add \a1, \a1, \b1, ror #brot
76	add \a0, \a0, M_0
77	add \a1, \a1, M_1
78
79	// d = ror32(d ^ a, 16);
// Only the xor is done here (after fixing up d's pending 'drot'); the
// rotate by 16 is deferred to the next two uses of d.
80	eor \d0, \a0, \d0, ror #drot
81	eor \d1, \a1, \d1, ror #drot
82
83	// c += d;
84	add \c0, \c0, \d0, ror #`16`
85	add \c1, \c1, \d1, ror #`16`
86
87	// b = ror32(b ^ c, 12);
// Only the xor is done here (after fixing up b's pending 'brot'); the
// rotate by 12 is deferred to the next two uses of b.
88	eor \b0, \c0, \b0, ror #brot
89	eor \b1, \c1, \b1, ror #brot
90
91	ldr M_0, [sp, #`32` + `4` * \s1]
92	ldr M_1, [sp, #`32` + `4` * \s3]
93
94	// a += b + m[blake2s_sigma[r][2*i + 1]];
95	add \a0, \a0, \b0, ror #`12`
96	add \a1, \a1, \b1, ror #`12`
97	add \a0, \a0, M_0
98	add \a1, \a1, M_1
99
100	// d = ror32(d ^ a, 8);
// The 'ror #16' below applies d's previously deferred rotate; the new
// rotate by 8 is in turn left pending.
101	eor \d0, \a0, \d0, ror#`16`
102	eor \d1, \a1, \d1, ror#`16`
103
104	// c += d;
105	add \c0, \c0, \d0, ror#`8`
106	add \c1, \c1, \d1, ror#`8`
107
108	// b = ror32(b ^ c, 7);
// The 'ror #12' below applies b's previously deferred rotate; the new
// rotate by 7 is left pending for the next user of b.
109	eor \b0, \c0, \b0, ror#`12`
110	eor \b1, \c1, \b1, ror#`12`
111	.endm
112
113	// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
114	// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
115	// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and
116	// r14 are free to use. The macro arguments s0-s15 give the order in which the
117	// message words are used in this round.
118	//
119	// All rotates are performed using the implicit rotate operand accepted by the
120	// 'add' and 'eor' instructions. This is faster than using explicit rotate
121	// instructions. To make this work, we allow the values in the second and last
122	// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
123	// wrong rotation amount. The rotation amount is then fixed up just in time
124	// when the values are used. 'brot' is the number of bits the values in row 'b'
125	// need to be rotated right to arrive at the correct values, and 'drot'
126	// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
127	// that they end up as (7, 8) after every round.
//
// Stack layout used by the loads and stores below (byte offsets from sp):
//    0: spill slot for v[8..9]     8: v[10..11]     16: v[12..13]
//   24: v[14..15]                 32: message block (read by the
//                                     quarter-round macro)
// On exit v[0..9] are again in r0..r9 and v[10..15] are back on the stack.
128	.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
129	s8, s9, s10, s11, s12, s13, s14, s15
130
131	// Mix first two columns:
132	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
133	__ldrd r10, r11, sp, `16` // load v[12] and v[13]
134	_blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
135	\s0, \s1, \s2, \s3
136	__strd r8, r9, sp, `0` // spill v[8] and v[9]; r8-r9 are reused next
137	__strd r10, r11, sp, `16` // store v[12] and v[13]
138
139	// Mix second two columns:
140	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
141	__ldrd r8, r9, sp, `8` // load v[10] and v[11]
142	__ldrd r10, r11, sp, `24` // load v[14] and v[15]
143	_blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
144	\s4, \s5, \s6, \s7
145	str r10, [sp, #`24`] // store v[14]
146	// v[10], v[11], and v[15] are used below, so no need to store them yet.
147
// Every quarter-round leaves rows 'b' and 'd' pending rotations of 7 and 8
// bits, so record that for the diagonal steps. The symbols keep these values
// after the macro ends, which is what the column steps of the next round
// see; only the first round's column steps see the caller's initial values.
148	.set brot, `7`
149	.set drot, `8`
150
151	// Mix first two diagonals:
152	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
153	ldr r10, [sp, #`16`] // load v[12]
154	_blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
155	\s8, \s9, \s10, \s11
156	__strd r8, r9, sp, `8` // store v[10] and v[11]
157	str r11, [sp, #`28`] // store v[15]
158	str r10, [sp, #`16`] // store v[12]
159
160	// Mix second two diagonals:
161	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
162	__ldrd r8, r9, sp, `0` // load v[8] and v[9]
163	__ldrd r10, r11, sp, `20` // load v[13] and v[14]
164	_blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
165	\s12, \s13, \s14, \s15
166	__strd r10, r11, sp, `20` // store v[13] and v[14]; v[8..9] stay in r8-r9
167	.endm
168
169	//
170	// void blake2s_compress(struct blake2s_state *state,
171	// const u8 *block, size_t nblocks, u32 inc);
172	//
173	// Only the first three fields of struct blake2s_state are used:
174	// u32 h[8]; (inout)
175	// u32 t[2]; (inout)
176	// u32 f[2]; (in)
177	//
// Arguments arrive in r0 (state), r1 (block), r2 (nblocks) and r3 (inc).
// The code accesses h[] at state + 0, t[] at state + 32 and f[] at
// state + 40.
//
// For each 64-byte block: t[0..1] is advanced by 'inc' as a 64-bit counter,
// the block is copied to the stack as sixteen 32-bit words, v[0..15] is
// built from h[], the IV, t[] and f[], ten rounds are run, and the result is
// folded back into h[].
//
// NOTE(review): nblocks is decremented and tested only after a block has
// been processed, so nblocks == 0 on entry is not handled here -- confirm
// that callers never pass it.
//
// Stack frame while the rounds run (byte offsets from sp):
//    0: spill slot for v[8..9]    8: v[10..15]    32: copy of message block
//   96: saved r0 (state), r1 (block), r2 (nblocks), then r4-r11 and lr
178	.align `5`
179	ENTRY(blake2s_compress)
// 12 registers are saved. The saved r0-r2 slots double as the loop's
// (state, block, nblocks) variables and are updated in place below.
// NOTE(review): the even count presumably keeps sp 8-byte aligned -- confirm.
180	push {r0-r2,r4-r11,lr} // keep this an even number
181
182	.Lnext_block:
183	// r0 is 'state'
184	// r1 is 'block'
185	// r3 is 'inc'
186
187	// Load and increment the counter t[0..1].
// t[0] is the low word and t[1] the high word: the carry out of the low
// addition is propagated into t[1]. r10-r11 keep the new t[0..1] until
// v[12..13] are computed below.
188	__ldrd r10, r11, r0, `32`
189	adds r10, r10, r3
190	adc r11, r11, #`0`
191	__strd r10, r11, r0, `32`
192
193	// _blake2s_round is very short on registers, so copy the message block
194	// to the stack to save a register during the rounds. This also has the
195	// advantage that misalignment only needs to be dealt with in one place.
196	sub sp, sp, #`64`
197	mov r12, sp // r12 = destination cursor for the copy
198	tst r1, #`3`
199	bne .Lcopy_block_misaligned
// Aligned case: copy the block as two batches of eight words.
200	ldmia r1!, {r2-r9}
201	_le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
202	stmia r12!, {r2-r9}
203	ldmia r1!, {r2-r9}
204	_le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
205	stmia r12, {r2-r9}
206	.Lcopy_block_done:
// Both copy paths leave r1 advanced by 64 bytes. sp + 64 is the saved r0
// slot, so sp + 68 is the saved 'block' pointer.
207	str r1, [sp, #`68`] // Update message pointer
208
209	// Calculate v[8..15]. Push v[10..15] onto the stack, and leave space
210	// for spilling v[8..9]. Leave v[8..9] in r8-r9.
211	mov r14, r0 // r14 = state
212	adr r12, .Lblake2s_IV
213	ldmia r12!, {r8-r9} // load IV[0..1]
214	__ldrd r0, r1, r14, `40` // load f[0..1]
215	ldm r12, {r2-r7} // load IV[2..7]
216	eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
217	eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
218	eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
219	eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
220	push {r2-r7} // push v[10..15]
221	sub sp, sp, #`8` // leave space for v[8..9]
222
223	// Load h[0..7] == v[0..7].
224	ldm r14, {r0-r7}
225
226	// Execute the rounds. Each round is provided the order in which it
227	// needs to use the message words.
// brot/drot are reset for every block because the previous block's rounds
// left them at (7, 8), while freshly loaded rows carry no pending rotation.
228	.set brot, `0`
229	.set drot, `0`
230	_blake2s_round `0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15`
231	_blake2s_round `14`, `10`, `4`, `8`, `9`, `15`, `13`, `6`, `1`, `12`, `0`, `2`, `11`, `7`, `5`, `3`
232	_blake2s_round `11`, `8`, `12`, `0`, `5`, `2`, `15`, `13`, `10`, `14`, `3`, `6`, `7`, `1`, `9`, `4`
233	_blake2s_round `7`, `9`, `3`, `1`, `13`, `12`, `11`, `14`, `2`, `6`, `5`, `10`, `4`, `0`, `15`, `8`
234	_blake2s_round `9`, `0`, `5`, `7`, `2`, `4`, `10`, `15`, `14`, `1`, `11`, `12`, `6`, `8`, `3`, `13`
235	_blake2s_round `2`, `12`, `6`, `10`, `0`, `11`, `8`, `3`, `4`, `13`, `7`, `5`, `15`, `14`, `1`, `9`
236	_blake2s_round `12`, `5`, `1`, `15`, `14`, `13`, `4`, `10`, `0`, `7`, `6`, `3`, `9`, `2`, `8`, `11`
237	_blake2s_round `13`, `11`, `7`, `14`, `12`, `1`, `3`, `9`, `5`, `0`, `15`, `4`, `8`, `6`, `2`, `10`
238	_blake2s_round `6`, `15`, `14`, `9`, `11`, `3`, `0`, `8`, `12`, `2`, `13`, `7`, `1`, `4`, `10`, `5`
239	_blake2s_round `10`, `2`, `8`, `4`, `7`, `6`, `1`, `5`, `15`, `11`, `9`, `14`, `3`, `12`, `13`, `0`
240
241	// Fold the final state matrix into the hash chaining value:
242	//
243	// for (i = 0; i < 8; i++)
244	// h[i] ^= v[i] ^ v[i + 8];
245	//
// sp + 96 is the saved r0 slot: 8 bytes of spill space, 24 bytes of
// v[10..15] and the 64-byte block copy lie below it.
246	ldr r14, [sp, #`96`] // r14 = &h[0]
247	add sp, sp, #`8` // v[8..9] are already loaded.
248	pop {r10-r11} // load v[10..11]
249	eor r0, r0, r8
250	eor r1, r1, r9
251	eor r2, r2, r10
252	eor r3, r3, r11
253	ldm r14, {r8-r11} // load h[0..3]
254	eor r0, r0, r8
255	eor r1, r1, r9
256	eor r2, r2, r10
257	eor r3, r3, r11
258	stmia r14!, {r0-r3} // store new h[0..3]
259	ldm r14, {r0-r3} // load old h[4..7]
260	pop {r8-r11} // load v[12..15]
// v[4..7] (row 'b', in r4-r7) and v[12..15] (row 'd', in r8-r11) still
// carry the pending rotations left by the last round, so apply brot and
// drot as part of the xor.
261	eor r0, r0, r4, ror #brot
262	eor r1, r1, r5, ror #brot
263	eor r2, r2, r6, ror #brot
264	eor r3, r3, r7, ror #brot
265	eor r0, r0, r8, ror #drot
266	eor r1, r1, r9, ror #drot
267	eor r2, r2, r10, ror #drot
268	eor r3, r3, r11, ror #drot
269	add sp, sp, #`64` // skip copy of message block
270	stm r14, {r0-r3} // store new h[4..7]
271
272	// Advance to the next block, if there is one. Note that if there are
273	// multiple blocks, then 'inc' (the counter increment amount) must be
274	// 64. So we can simply set it to 64 without re-loading it.
275	ldm sp, {r0, r1, r2} // load (state, block, nblocks)
276	mov r3, #`64` // set 'inc'
277	subs r2, r2, #`1` // nblocks--
278	str r2, [sp, #`8`] // write nblocks back to its saved slot
279	bne .Lnext_block // nblocks != 0?
280
// Restore the callee-saved registers and return by popping the saved lr
// into pc.
281	pop {r0-r2,r4-r11,pc}
282
283	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
284	// can't be loaded using ldmia. Copy it to the stack buffer (pointed to
285	// by r12) using an alternative method. r2-r9 are free to use.
286	.Lcopy_block_misaligned:
287	mov r2, #`64` // r2 = bytes left to copy
288	`1`:
289	#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
290	ldr r3, [r1], #`4`
291	_le32_bswap r3, r4
292	#else
// Assemble the word from individual bytes, lowest address in the least
// significant byte, so no separate byte swap is needed on this path.
293	ldrb r3, [r1, #`0`]
294	ldrb r4, [r1, #`1`]
295	ldrb r5, [r1, #`2`]
296	ldrb r6, [r1, #`3`]
297	add r1, r1, #`4`
298	orr r3, r3, r4, lsl #`8`
299	orr r3, r3, r5, lsl #`16`
300	orr r3, r3, r6, lsl #`24`
301	#endif
302	subs r2, r2, #`4`
303	str r3, [r12], #`4` // str leaves the flags from subs intact for bne
304	bne `1b`
305	b .Lcopy_block_done
306	ENDPROC(blake2s_compress)
307

source code of linux/arch/arm/crypto/blake2s-core.S