1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * BLAKE2s digest algorithm, ARM scalar implementation |
4 | * |
5 | * Copyright 2020 Google LLC |
6 | * |
7 | * Author: Eric Biggers <ebiggers@google.com> |
8 | */ |
9 | |
10 | #include <linux/linkage.h> |
11 | #include <asm/assembler.h> |
12 | |
13 | // Registers used to hold message words temporarily. There aren't |
14 | // enough ARM registers to hold the whole message block, so we have to |
15 | // load the words on-demand. |
16 | M_0 .req r12 |
17 | M_1 .req r14 |
18 | |
19 | // The BLAKE2s initialization vector |
20 | .Lblake2s_IV: |
21 | .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A |
22 | .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 |
23 | |
24 | .macro __ldrd a, b, src, offset |
25 | #if __LINUX_ARM_ARCH__ >= 6 |
26 | ldrd \a, \b, [\src, #\offset] |
27 | #else |
28 | ldr \a, [\src, #\offset] |
29 | ldr \b, [\src, #\offset + 4] |
30 | #endif |
31 | .endm |
32 | |
33 | .macro __strd a, b, dst, offset |
34 | #if __LINUX_ARM_ARCH__ >= 6 |
35 | strd \a, \b, [\dst, #\offset] |
36 | #else |
37 | str \a, [\dst, #\offset] |
38 | str \b, [\dst, #\offset + 4] |
39 | #endif |
40 | .endm |
41 | |
42 | .macro _le32_bswap a, tmp |
43 | #ifdef __ARMEB__ |
44 | rev_l \a, \tmp |
45 | #endif |
46 | .endm |
47 | |
48 | .macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp |
49 | _le32_bswap \a, \tmp |
50 | _le32_bswap \b, \tmp |
51 | _le32_bswap \c, \tmp |
52 | _le32_bswap \d, \tmp |
53 | _le32_bswap \e, \tmp |
54 | _le32_bswap \f, \tmp |
55 | _le32_bswap \g, \tmp |
56 | _le32_bswap \h, \tmp |
57 | .endm |
58 | |
59 | // Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. |
60 | // (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two |
61 | // columns/diagonals. s0-s1 are the word offsets to the message words the first |
62 | // column/diagonal needs, and likewise s2-s3 for the second column/diagonal. |
63 | // M_0 and M_1 are free to use, and the message block can be found at sp + 32. |
64 | // |
65 | // Note that to save instructions, the rotations don't happen when the |
66 | // pseudocode says they should, but rather they are delayed until the values are |
67 | // used. See the comment above _blake2s_round(). |
68 | .macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3 |
69 | |
70 | ldr M_0, [sp, #32 + 4 * \s0] |
71 | ldr M_1, [sp, #32 + 4 * \s2] |
72 | |
73 | // a += b + m[blake2s_sigma[r][2*i + 0]]; |
74 | add \a0, \a0, \b0, ror #brot |
75 | add \a1, \a1, \b1, ror #brot |
76 | add \a0, \a0, M_0 |
77 | add \a1, \a1, M_1 |
78 | |
79 | // d = ror32(d ^ a, 16); |
80 | eor \d0, \a0, \d0, ror #drot |
81 | eor \d1, \a1, \d1, ror #drot |
82 | |
83 | // c += d; |
84 | add \c0, \c0, \d0, ror #16 |
85 | add \c1, \c1, \d1, ror #16 |
86 | |
87 | // b = ror32(b ^ c, 12); |
88 | eor \b0, \c0, \b0, ror #brot |
89 | eor \b1, \c1, \b1, ror #brot |
90 | |
91 | ldr M_0, [sp, #32 + 4 * \s1] |
92 | ldr M_1, [sp, #32 + 4 * \s3] |
93 | |
94 | // a += b + m[blake2s_sigma[r][2*i + 1]]; |
95 | add \a0, \a0, \b0, ror #12 |
96 | add \a1, \a1, \b1, ror #12 |
97 | add \a0, \a0, M_0 |
98 | add \a1, \a1, M_1 |
99 | |
100 | // d = ror32(d ^ a, 8); |
101 | eor \d0, \a0, \d0, ror#16 |
102 | eor \d1, \a1, \d1, ror#16 |
103 | |
104 | // c += d; |
105 | add \c0, \c0, \d0, ror#8 |
106 | add \c1, \c1, \d1, ror#8 |
107 | |
108 | // b = ror32(b ^ c, 7); |
109 | eor \b0, \c0, \b0, ror#12 |
110 | eor \b1, \c1, \b1, ror#12 |
111 | .endm |
112 | |
113 | // Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] |
114 | // are in r0..r9. The stack pointer points to 8 bytes of scratch space for |
115 | // spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and |
116 | // r14 are free to use. The macro arguments s0-s15 give the order in which the |
117 | // message words are used in this round. |
118 | // |
119 | // All rotates are performed using the implicit rotate operand accepted by the |
120 | // 'add' and 'eor' instructions. This is faster than using explicit rotate |
121 | // instructions. To make this work, we allow the values in the second and last |
122 | // rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the |
123 | // wrong rotation amount. The rotation amount is then fixed up just in time |
124 | // when the values are used. 'brot' is the number of bits the values in row 'b' |
125 | // need to be rotated right to arrive at the correct values, and 'drot' |
126 | // similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such |
127 | // that they end up as (7, 8) after every round. |
128 | .macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \ |
129 | s8, s9, s10, s11, s12, s13, s14, s15 |
130 | |
131 | // Mix first two columns: |
132 | // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]). |
133 | __ldrd r10, r11, sp, 16 // load v[12] and v[13] |
134 | _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \ |
135 | \s0, \s1, \s2, \s3 |
136 | __strd r8, r9, sp, 0 |
137 | __strd r10, r11, sp, 16 |
138 | |
139 | // Mix second two columns: |
140 | // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]). |
141 | __ldrd r8, r9, sp, 8 // load v[10] and v[11] |
142 | __ldrd r10, r11, sp, 24 // load v[14] and v[15] |
143 | _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \ |
144 | \s4, \s5, \s6, \s7 |
145 | str r10, [sp, #24] // store v[14] |
146 | // v[10], v[11], and v[15] are used below, so no need to store them yet. |
147 | |
148 | .set brot, 7 |
149 | .set drot, 8 |
150 | |
151 | // Mix first two diagonals: |
152 | // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]). |
153 | ldr r10, [sp, #16] // load v[12] |
154 | _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \ |
155 | \s8, \s9, \s10, \s11 |
156 | __strd r8, r9, sp, 8 |
157 | str r11, [sp, #28] |
158 | str r10, [sp, #16] |
159 | |
160 | // Mix second two diagonals: |
161 | // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). |
162 | __ldrd r8, r9, sp, 0 // load v[8] and v[9] |
163 | __ldrd r10, r11, sp, 20 // load v[13] and v[14] |
164 | _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \ |
165 | \s12, \s13, \s14, \s15 |
166 | __strd r10, r11, sp, 20 |
167 | .endm |
168 | |
169 | // |
170 | // void blake2s_compress(struct blake2s_state *state, |
171 | // const u8 *block, size_t nblocks, u32 inc); |
172 | // |
173 | // Only the first three fields of struct blake2s_state are used: |
174 | // u32 h[8]; (inout) |
175 | // u32 t[2]; (inout) |
176 | // u32 f[2]; (in) |
177 | // |
178 | .align 5 |
179 | ENTRY(blake2s_compress) |
180 | push {r0-r2,r4-r11,lr} // keep this an even number |
181 | |
182 | .Lnext_block: |
183 | // r0 is 'state' |
184 | // r1 is 'block' |
185 | // r3 is 'inc' |
186 | |
187 | // Load and increment the counter t[0..1]. |
188 | __ldrd r10, r11, r0, 32 |
189 | adds r10, r10, r3 |
190 | adc r11, r11, #0 |
191 | __strd r10, r11, r0, 32 |
192 | |
193 | // _blake2s_round is very short on registers, so copy the message block |
194 | // to the stack to save a register during the rounds. This also has the |
195 | // advantage that misalignment only needs to be dealt with in one place. |
196 | sub sp, sp, #64 |
197 | mov r12, sp |
198 | tst r1, #3 |
199 | bne .Lcopy_block_misaligned |
200 | ldmia r1!, {r2-r9} |
201 | _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 |
202 | stmia r12!, {r2-r9} |
203 | ldmia r1!, {r2-r9} |
204 | _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 |
205 | stmia r12, {r2-r9} |
206 | .Lcopy_block_done: |
207 | str r1, [sp, #68] // Update message pointer |
208 | |
209 | // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space |
210 | // for spilling v[8..9]. Leave v[8..9] in r8-r9. |
211 | mov r14, r0 // r14 = state |
212 | adr r12, .Lblake2s_IV |
213 | ldmia r12!, {r8-r9} // load IV[0..1] |
214 | __ldrd r0, r1, r14, 40 // load f[0..1] |
215 | ldm r12, {r2-r7} // load IV[3..7] |
216 | eor r4, r4, r10 // v[12] = IV[4] ^ t[0] |
217 | eor r5, r5, r11 // v[13] = IV[5] ^ t[1] |
218 | eor r6, r6, r0 // v[14] = IV[6] ^ f[0] |
219 | eor r7, r7, r1 // v[15] = IV[7] ^ f[1] |
220 | push {r2-r7} // push v[9..15] |
221 | sub sp, sp, #8 // leave space for v[8..9] |
222 | |
223 | // Load h[0..7] == v[0..7]. |
224 | ldm r14, {r0-r7} |
225 | |
226 | // Execute the rounds. Each round is provided the order in which it |
227 | // needs to use the message words. |
228 | .set brot, 0 |
229 | .set drot, 0 |
230 | _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
231 | _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 |
232 | _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 |
233 | _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 |
234 | _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 |
235 | _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 |
236 | _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 |
237 | _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 |
238 | _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 |
239 | _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 |
240 | |
241 | // Fold the final state matrix into the hash chaining value: |
242 | // |
243 | // for (i = 0; i < 8; i++) |
244 | // h[i] ^= v[i] ^ v[i + 8]; |
245 | // |
246 | ldr r14, [sp, #96] // r14 = &h[0] |
247 | add sp, sp, #8 // v[8..9] are already loaded. |
248 | pop {r10-r11} // load v[10..11] |
249 | eor r0, r0, r8 |
250 | eor r1, r1, r9 |
251 | eor r2, r2, r10 |
252 | eor r3, r3, r11 |
253 | ldm r14, {r8-r11} // load h[0..3] |
254 | eor r0, r0, r8 |
255 | eor r1, r1, r9 |
256 | eor r2, r2, r10 |
257 | eor r3, r3, r11 |
258 | stmia r14!, {r0-r3} // store new h[0..3] |
259 | ldm r14, {r0-r3} // load old h[4..7] |
260 | pop {r8-r11} // load v[12..15] |
261 | eor r0, r0, r4, ror #brot |
262 | eor r1, r1, r5, ror #brot |
263 | eor r2, r2, r6, ror #brot |
264 | eor r3, r3, r7, ror #brot |
265 | eor r0, r0, r8, ror #drot |
266 | eor r1, r1, r9, ror #drot |
267 | eor r2, r2, r10, ror #drot |
268 | eor r3, r3, r11, ror #drot |
269 | add sp, sp, #64 // skip copy of message block |
270 | stm r14, {r0-r3} // store new h[4..7] |
271 | |
272 | // Advance to the next block, if there is one. Note that if there are |
273 | // multiple blocks, then 'inc' (the counter increment amount) must be |
274 | // 64. So we can simply set it to 64 without re-loading it. |
275 | ldm sp, {r0, r1, r2} // load (state, block, nblocks) |
276 | mov r3, #64 // set 'inc' |
277 | subs r2, r2, #1 // nblocks-- |
278 | str r2, [sp, #8] |
279 | bne .Lnext_block // nblocks != 0? |
280 | |
281 | pop {r0-r2,r4-r11,pc} |
282 | |
283 | // The next message block (pointed to by r1) isn't 4-byte aligned, so it |
284 | // can't be loaded using ldmia. Copy it to the stack buffer (pointed to |
285 | // by r12) using an alternative method. r2-r9 are free to use. |
286 | .Lcopy_block_misaligned: |
287 | mov r2, #64 |
288 | 1: |
289 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
290 | ldr r3, [r1], #4 |
291 | _le32_bswap r3, r4 |
292 | #else |
293 | ldrb r3, [r1, #0] |
294 | ldrb r4, [r1, #1] |
295 | ldrb r5, [r1, #2] |
296 | ldrb r6, [r1, #3] |
297 | add r1, r1, #4 |
298 | orr r3, r3, r4, lsl #8 |
299 | orr r3, r3, r5, lsl #16 |
300 | orr r3, r3, r6, lsl #24 |
301 | #endif |
302 | subs r2, r2, #4 |
303 | str r3, [r12], #4 |
304 | bne 1b |
305 | b .Lcopy_block_done |
306 | ENDPROC(blake2s_compress) |
307 | |