/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction. The alternatives are, more or less:
 *
 * (a) vshl.u32 + vsri.u32		(needs temporary register)
 * (b) vshl.u32 + vshr.u32 + vorr	(needs temporary register)
 * (c) vrev32.16			(16-bit rotations only)
 * (d) vtbl.8 + vtbl.8			(rotations by multiples of 8 bits only,
 *					 needs index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri. Also, it
 * parallelizes better when temporary registers are scarce.
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */
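
/*
 * For reference, method (a) realizes rotl32(x, n) with two instructions,
 * shown here for n = 12 (qd/qs are placeholder destination/source
 * registers, not fixed choices):
 *
 *	vshl.u32	qd, qs, #12	// qd  = x << 12
 *	vsri.u32	qd, qs, #20	// qd |= x >> 20
 *
 * Method (d) rotates by permuting bytes instead: a vtbl.8 lookup with the
 * index vector .Lrol8_table defined below moves each byte of a
 * little-endian 32-bit word up one position, i.e. rotl32(x, 8).
 */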

#include <linux/linkage.h>
#include <asm/cache.h>

	.text
	.fpu	neon
	.align	5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
chacha_permute:

	adr	ip, .Lrol8_table
	vld1.8	{d10}, [ip, :64]
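	// d10 now holds the rol8 index vector; the vtbl.8 pairs below use
	// it to rotate each 32-bit word left by 8 bits.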

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor	q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor	q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor	q3, q3, q0
	vtbl.8	d6, {d6}, d10
	vtbl.8	d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor	q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8	q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8	q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8	q3, q3, q3, #12
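
	// Rows 1-3 are now rotated left by one, two and three words
	// respectively, so the quarter-rounds below operate on the
	// diagonals of the original matrix without further shuffling.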

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor	q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor	q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor	q3, q3, q0
	vtbl.8	d6, {d6}, d10
	vtbl.8	d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor	q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8	q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8	q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8	q3, q3, q3, #4
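
	// The inverse rotations restore the column layout; one double
	// round (a column round plus a diagonal round) is complete.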

	subs	r3, r3, #2
	bne	.Ldoubleround

	bx	lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
	push	{lr}

	// x0..3 = s0..3
	add	ip, r0, #0x20
	vld1.32	{q0-q1}, [r0]
	vld1.32	{q2-q3}, [ip]

	vmov	q8, q0
	vmov	q9, q1
	vmov	q10, q2
	vmov	q11, q3
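
	// q8-q11 hold a copy of the initial state for the final addition
	// of s0..3 below.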

	bl	chacha_permute

	add	ip, r2, #0x20
	vld1.8	{q4-q5}, [r2]
	vld1.8	{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor	q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor	q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor	q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor	q3, q3, q7

	add	ip, r1, #0x20
	vst1.8	{q0-q1}, [r1]
	vst1.8	{q2-q3}, [ip]

	pop	{pc}
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
	push	{lr}

	vld1.32	{q0-q1}, [r0]!
	vld1.32	{q2-q3}, [r0]

	mov	r3, r2
	bl	chacha_permute

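	// HChaCha returns rows 0 and 3 of the permuted state directly;
	// unlike the full block function, the initial state is not added
	// back in.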
	vst1.32	{q0}, [r1]!
	vst1.32	{q3}, [r1]

	pop	{pc}
ENDPROC(hchacha_block_neon)

	.align	4
.Lctrinc:	.word	0, 1, 2, 3
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
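
// Each index in .Lrol8_table selects the source byte one position lower
// (mod 4) within its 32-bit word, so a vtbl.8 lookup with this vector
// rotates every little-endian 32-bit word left by 8 bits.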

	.align	5
ENTRY(chacha_4block_xor_neon)
	push	{r4, lr}
	mov	r4, sp			// preserve the stack pointer
	sub	ip, sp, #0x20		// allocate a 32-byte buffer
	bic	ip, ip, #0x1f		// aligned to 32 bytes
	mov	sp, ip
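
	// The 32-byte buffer is used to spill x8-x9 (q8-q9): all sixteen
	// q registers are occupied by the four-way state, so two of them
	// are cycled through the stack to free up temporaries.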

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	// r3: nrounds

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. The words are
	// re-interleaved before the final addition of the original state
	// and the XORing step.
	//

	// x0..15[0-3] = s0..15[0-3]
	add	ip, r0, #0x20
	vld1.32	{q0-q1}, [r0]
	vld1.32	{q2-q3}, [ip]

	adr	lr, .Lctrinc
	vdup.32	q15, d7[1]
	vdup.32	q14, d7[0]
	vld1.32	{q4}, [lr, :128]
	vdup.32	q13, d6[1]
	vdup.32	q12, d6[0]
	vdup.32	q11, d5[1]
	vdup.32	q10, d5[0]
	vadd.u32	q12, q12, q4	// x12 += counter values 0-3
	vdup.32	q9, d4[1]
	vdup.32	q8, d4[0]
	vdup.32	q7, d3[1]
	vdup.32	q6, d3[0]
	vdup.32	q5, d2[1]
	vdup.32	q4, d2[0]
	vdup.32	q3, d1[1]
	vdup.32	q2, d1[0]
	vdup.32	q1, d0[1]
	vdup.32	q0, d0[0]
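
	// q0-q15 now hold words 0-15 of the state, each broadcast across
	// the four blocks; only the counter word x12 (q12) differs between
	// blocks, thanks to the increments 0-3 added above.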

	adr	ip, .Lrol8_table
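	// Enter the loop past the reload of q8-q9: on the first iteration
	// they still hold x8-x9.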
	b	1f

.Ldoubleround4:
	vld1.32	{q8-q9}, [sp, :256]
1:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor	q12, q12, q0
	veor	q13, q13, q1
	veor	q14, q14, q2
	veor	q15, q15, q3

	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14
	vrev32.16	q15, q15

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32	{q8-q9}, [sp, :256]
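	// x8-x9 are parked on the stack; q8-q9 are now free to serve as
	// scratch registers for the rotations.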

	veor	q8, q4, q8
	veor	q9, q5, q9
	vshl.u32	q4, q8, #12
	vshl.u32	q5, q9, #12
	vsri.u32	q4, q8, #20
	vsri.u32	q5, q9, #20

	veor	q8, q6, q10
	veor	q9, q7, q11
	vshl.u32	q6, q8, #12
	vshl.u32	q7, q9, #12
	vsri.u32	q6, q8, #20
	vsri.u32	q7, q9, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8	{d16}, [ip, :64]
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor	q12, q12, q0
	veor	q13, q13, q1
	veor	q14, q14, q2
	veor	q15, q15, q3

	vtbl.8	d24, {d24}, d16
	vtbl.8	d25, {d25}, d16
	vtbl.8	d26, {d26}, d16
	vtbl.8	d27, {d27}, d16
	vtbl.8	d28, {d28}, d16
	vtbl.8	d29, {d29}, d16
	vtbl.8	d30, {d30}, d16
	vtbl.8	d31, {d31}, d16

	vld1.32	{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32	{q8-q9}, [sp, :256]

	veor	q8, q4, q8
	veor	q9, q5, q9
	vshl.u32	q4, q8, #7
	vshl.u32	q5, q9, #7
	vsri.u32	q4, q8, #25
	vsri.u32	q5, q9, #25

	veor	q8, q6, q10
	veor	q9, q7, q11
	vshl.u32	q6, q8, #7
	vshl.u32	q7, q9, #7
	vsri.u32	q6, q8, #25
	vsri.u32	q7, q9, #25

	vld1.32	{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor	q15, q15, q0
	veor	q12, q12, q1
	veor	q13, q13, q2
	veor	q14, q14, q3

	vrev32.16	q15, q15
	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32	{q8-q9}, [sp, :256]

	veor	q8, q7, q8
	veor	q9, q4, q9
	vshl.u32	q7, q8, #12
	vshl.u32	q4, q9, #12
	vsri.u32	q7, q8, #20
	vsri.u32	q4, q9, #20

	veor	q8, q5, q10
	veor	q9, q6, q11
	vshl.u32	q5, q8, #12
	vshl.u32	q6, q9, #12
	vsri.u32	q5, q8, #20
	vsri.u32	q6, q9, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8	{d16}, [ip, :64]
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor	q15, q15, q0
	veor	q12, q12, q1
	veor	q13, q13, q2
	veor	q14, q14, q3

	vtbl.8	d30, {d30}, d16
	vtbl.8	d31, {d31}, d16
	vtbl.8	d24, {d24}, d16
	vtbl.8	d25, {d25}, d16
	vtbl.8	d26, {d26}, d16
	vtbl.8	d27, {d27}, d16
	vtbl.8	d28, {d28}, d16
	vtbl.8	d29, {d29}, d16

	vld1.32	{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32	{q8-q9}, [sp, :256]

	veor	q8, q7, q8
	veor	q9, q4, q9
	vshl.u32	q7, q8, #7
	vshl.u32	q4, q9, #7
	vsri.u32	q7, q8, #25
	vsri.u32	q4, q9, #25

	veor	q8, q5, q10
	veor	q9, q6, q11
	vshl.u32	q5, q8, #7
	vshl.u32	q6, q9, #7
	vsri.u32	q5, q8, #25
	vsri.u32	q6, q9, #25

	subs	r3, r3, #2
	bne	.Ldoubleround4

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	vld1.32	{q8}, [lr, :128]	// load counter values 0-3
	vzip.32	q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32	q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32	q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32	q6, q7			// => (6 7 6 7) (6 7 6 7)
	vadd.u32	q12, q8		// x12 += counter values 0-3
	vswp	d1, d4
	vswp	d3, d6
	vld1.32	{q8-q9}, [r0]!		// load s0..7
	vswp	d9, d12
	vswp	d11, d14
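
	// Together the vzip.32/vswp pairs transpose each 4x4 matrix of
	// words, turning the word-sliced layout back into rows of four
	// consecutive blocks.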

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.
	vswp	q1, q4

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
	vadd.u32	q0, q0, q8
	vadd.u32	q2, q2, q8
	vadd.u32	q4, q4, q8
	vadd.u32	q3, q3, q8

	// x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
	vadd.u32	q1, q1, q9
	vadd.u32	q6, q6, q9
	vadd.u32	q5, q5, q9
	vadd.u32	q7, q7, q9

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8	{q8-q9}, [r2]!
	veor	q8, q8, q0
	veor	q9, q9, q1
	vst1.8	{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32	{q8-q9}, [sp, :256]
	mov	sp, r4			// restore original stack pointer
	ldr	r4, [r4, #8]		// load number of bytes
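	// The fifth argument (nbytes) was passed on the stack; #8 skips
	// over the {r4, lr} pair pushed on entry.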
	vzip.32	q12, q13		// => (12 13 12 13) (12 13 12 13)
	vzip.32	q14, q15		// => (14 15 14 15) (14 15 14 15)
	vzip.32	q8, q9			// => (8 9 8 9) (8 9 8 9)
	vzip.32	q10, q11		// => (10 11 10 11) (10 11 10 11)
	vld1.32	{q0-q1}, [r0]		// load s8..15
	vswp	d25, d28
	vswp	d27, d30
	vswp	d17, d20
	vswp	d19, d22

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
	vadd.u32	q8, q8, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q9, q9, q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream
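	//
	// r4 counts the bytes that remain. The full 256-byte case falls
	// straight through the load/XOR/store groups below; when fewer
	// bytes remain, the matching conditional branch exits early with
	// the next 32 bytes of keystream in q4-q5 and the just-computed
	// output block still pending in q0-q1.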

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #96
	veor	q0, q0, q8
	veor	q1, q1, q12
	ble	.Lle96
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q2
	veor	q1, q1, q6
	ble	.Lle128
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q10
	veor	q1, q1, q14
	ble	.Lle160
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q4
	veor	q1, q1, q5
	ble	.Lle192
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q9
	veor	q1, q1, q13
	ble	.Lle224
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q3
	veor	q1, q1, q7
	blt	.Llt256
.Lout:
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]
	veor	q0, q0, q11
	veor	q1, q1, q15
	vst1.8	{q0-q1}, [r1]

	pop	{r4, pc}

.Lle192:
	vmov	q4, q9
	vmov	q5, q13

.Lle160:
	// nothing to do

.Lfinalblock:
	// Process the final block if processing fewer than 4 full blocks.
	// Entered with 32 bytes of ChaCha keystream in q4-q5, and the
	// previous 32-byte output block, which still needs to be written
	// at [r1], in q0-q1.
	beq	.Lfullblock

.Lpartialblock:
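	// Entered with r4 equal to minus the number of unused bytes in the
	// 32 bytes of keystream held in q4-q5. Rotate the keystream with
	// vtbl, using a window into the .Lpermute table, so that it lines
	// up with the last 32 bytes of the input, then XOR and store with
	// accesses that overlap the preceding block; the q0-q1 store below
	// rewrites the bytes clobbered by the overlap.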
	adr	lr, .Lpermute + 32
	add	r2, r2, r4
	add	lr, lr, r4
	add	r4, r4, r1

	vld1.8	{q2-q3}, [lr]
	vld1.8	{q6-q7}, [r2]

	add	r4, r4, #32

	vtbl.8	d4, {q4-q5}, d4
	vtbl.8	d5, {q4-q5}, d5
	vtbl.8	d6, {q4-q5}, d6
	vtbl.8	d7, {q4-q5}, d7

	veor	q6, q6, q2
	veor	q7, q7, q3

	vst1.8	{q6-q7}, [r4]		// overlapping stores
	vst1.8	{q0-q1}, [r1]
	pop	{r4, pc}

.Lfullblock:
	vmov	q11, q4
	vmov	q15, q5
	b	.Lout
.Lle96:
	vmov	q4, q2
	vmov	q5, q6
	b	.Lfinalblock
.Lle128:
	vmov	q4, q10
	vmov	q5, q14
	b	.Lfinalblock
.Lle224:
	vmov	q4, q3
	vmov	q5, q7
	b	.Lfinalblock
.Llt256:
	vmov	q4, q11
	vmov	q5, q15
	b	.Lpartialblock
ENDPROC(chacha_4block_xor_neon)

	.align	L1_CACHE_SHIFT
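// Two consecutive copies of the byte indices 0-31. A 32-byte window
// loaded at offset n (0 < n < 32) holds the indices n..31 followed by
// 0..n-1, so a vtbl.8 lookup with it rotates a 32-byte vector left by
// n bytes; .Lpartialblock uses this to align the keystream with the
// final partial block.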
.Lpermute:
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f