/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction. This is the most efficient way to implement these
	// rotation amounts with NEON. (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
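	//
	// For a little-endian 64-bit value, rotating right by a multiple of 8
	// bits is just a byte permutation: result byte i is source byte
	// (i + r/8) % 8. So for ror64(x, 24) the table below maps output
	// byte 0 to input byte 3, output byte 1 to input byte 4, and so on.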
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
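	// (These are the same constants as the SHA-512 IV: the first 64 bits
	// of the fractional parts of the square roots of the first 8 primes.)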

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
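//
// For reference, each column/diagonal mix below is one application of the
// BLAKE2b G function (per the BLAKE2 specification, RFC 7693):
//
//	G(v, a, b, c, d, x, y):
//		v[a] += v[b] + x;	v[d] = ror64(v[d] ^ v[a], 32);
//		v[c] += v[d];		v[b] = ror64(v[b] ^ v[c], 24);
//		v[a] += v[b] + y;	v[d] = ror64(v[d] ^ v[a], 16);
//		v[c] += v[d];		v[b] = ror64(v[b] ^ v[c], 63);
//
// where x and y are the two message words selected by blake2b_sigma for this
// (round, column) pair. The code below vectorizes four G applications at once.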
.macro _blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
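	// (vrev64.32 swaps the two 32-bit halves of each 64-bit lane, which
	// is exactly a rotation of a 64-bit value by 32 bits.)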
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0. Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
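	// Note that ror64(x, 63) == (x >> 63) | (x << 1): the vshr.u64 #63
	// puts the old top bit in bit 0, and vsli.u64 #1 shifts x left by one
	// and inserts it into the remaining bits.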
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers. We use the
	// latter approach, as it performs much better on Cortex-A7.
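	//
	// Since q0-q7 hold v[0..15] in order and each qN aliases the pair
	// {d(2N), d(2N+1)}, the 64-bit register dN holds exactly v[N], so
	// each diagonal element can be addressed directly below.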

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
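// A minimal sketch of a hypothetical C-side caller (illustrative only, not
// the actual glue code): kernel code must bracket any use of NEON registers
// with kernel_neon_begin()/kernel_neon_end().
//
//	kernel_neon_begin();
//	blake2b_compress_neon(state, block, nblocks, inc);
//	kernel_neon_end();
//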
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip
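	// For example (hypothetical value), if sp were 0x2007c, then
	// ip = (0x2007c - 32) & ~31 = 0x20040, giving an aligned 32-byte
	// buffer at [0x20040, 0x20060) below the original stack pointer.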

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.32		d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'. Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-q15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers. It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed. (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [BLOCK]!
	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds. Each round is provided the order in which it
	// needs to use the message words.
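	// These orderings are the rows of the standard BLAKE2b message
	// schedule sigma[12][16]; BLAKE2b has 12 rounds, and since sigma has
	// only 10 distinct rows, rounds 10 and 11 reuse rows 0 and 1.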
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, STATE
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block	// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
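	// The carry flag set by the 'adds' above is still valid here, since
	// the intervening bcs and NEON vmov do not modify the APSR flags.
	// t[0..1] is treated as a 128-bit little-endian integer held in
	// (r7, r8, r9, r10), and the adcs/adc chain propagates the carry
	// through the upper words.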
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]	// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)