/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction. This is the most efficient way to implement these
	// rotation amounts with NEON. (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
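	//
	// For a little-endian 64-bit value, rotating right by a multiple of 8
	// bits is just a byte permutation: result byte i is source byte
	// (i + r/8) % 8. So for ror64(x, 24) the table below maps output
	// byte 0 to input byte 3, output byte 1 to input byte 4, and so on.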
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
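	// (These are the same constants as the SHA-512 IV: the first 64 bits
	// of the fractional parts of the square roots of the first 8 primes.)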

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
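//
// For reference, each column/diagonal mix below is one application of the
// BLAKE2b G function (per the BLAKE2 specification, RFC 7693):
//
//	G(v, a, b, c, d, x, y):
//		v[a] += v[b] + x;	v[d] = ror64(v[d] ^ v[a], 32);
//		v[c] += v[d];		v[b] = ror64(v[b] ^ v[c], 24);
//		v[a] += v[b] + y;	v[d] = ror64(v[d] ^ v[a], 16);
//		v[c] += v[d];		v[b] = ror64(v[b] ^ v[c], 63);
//
// where x and y are the two message words selected by blake2b_sigma for this
// (round, column) pair. The code below vectorizes four G applications at once.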
.macro _blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
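	// (vrev64.32 swaps the two 32-bit halves of each 64-bit lane, which
	// is exactly a rotation of a 64-bit value by 32 bits.)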
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0. Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
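	// Note that ror64(x, 63) == (x >> 63) | (x << 1): the vshr.u64 #63
	// puts the old top bit in bit 0, and vsli.u64 #1 shifts x left by one
	// and inserts it into the remaining bits.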
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers. We use the
	// latter approach, as it performs much better on Cortex-A7.
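	//
	// Since q0-q7 hold v[0..15] in order and each qN aliases the pair
	// {d(2N), d(2N+1)}, the 64-bit register dN holds exactly v[N], so
	// each diagonal element can be addressed directly below.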

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
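// A minimal sketch of a hypothetical C-side caller (illustrative only, not
// the actual glue code): kernel code must bracket any use of NEON registers
// with kernel_neon_begin()/kernel_neon_end().
//
//	kernel_neon_begin();
//	blake2b_compress_neon(state, block, nblocks, inc);
//	kernel_neon_end();
//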
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip
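	// For example (hypothetical value), if sp were 0x2007c, then
	// ip = (0x2007c - 32) & ~31 = 0x20040, giving an aligned 32-byte
	// buffer at [0x20040, 0x20060) below the original stack pointer.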

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.32		d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'. Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-q15.
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers. It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed. (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [BLOCK]!
	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds. Each round is provided the order in which it
	// needs to use the message words.
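	// These orderings are the rows of the standard BLAKE2b message
	// schedule sigma[12][16]; BLAKE2b has 12 rounds, and since sigma has
	// only 10 distinct rows, rounds 10 and 11 reuse rows 0 and 1.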
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, STATE
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block	// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
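	// The carry flag set by the 'adds' above is still valid here, since
	// the intervening bcs and NEON vmov do not modify the APSR flags.
	// t[0..1] is treated as a 128-bit little-endian integer held in
	// (r7, r8, r9, r10), and the adcs/adc chain propagates the carry
	// through the upper words.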
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]	// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)