1 | /* |
2 | * ChaCha/XChaCha NEON helper functions |
3 | * |
4 | * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org> |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License version 2 as |
8 | * published by the Free Software Foundation. |
9 | * |
10 | * Originally based on: |
11 | * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions |
12 | * |
13 | * Copyright (C) 2015 Martin Willi |
14 | * |
15 | * This program is free software; you can redistribute it and/or modify |
16 | * it under the terms of the GNU General Public License as published by |
17 | * the Free Software Foundation; either version 2 of the License, or |
18 | * (at your option) any later version. |
19 | */ |
20 | |
21 | #include <linux/linkage.h> |
22 | #include <asm/assembler.h> |
23 | #include <asm/cache.h> |
24 | |
25 | .text |
26 | .align 6 |
27 | |
28 | /* |
29 | * chacha_permute - permute one block |
30 | * |
31 | * Permute one 64-byte block where the state matrix is stored in the four NEON |
32 | * registers v0-v3. It performs matrix operations on four words in parallel, |
33 | * but requires shuffling to rearrange the words after each round. |
34 | * |
35 | * The round count is given in w3. |
36 | * |
37 | * Clobbers: w3, x10, v4, v12 |
38 | */ |
39 | SYM_FUNC_START_LOCAL(chacha_permute) |
40 | |
41 | adr_l x10, ROT8 |
42 | ld1 {v12.4s}, [x10] |
43 | |
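	// Each pass through the loop below performs one ChaCha double
	// round: a round on the columns of the state followed by a
	// round on its diagonals. For reference, a single quarter-round
	// in C is roughly (per RFC 7539):
	//
	//	a += b; d ^= a; d = rol32(d, 16);
	//	c += d; b ^= c; b = rol32(b, 12);
	//	a += b; d ^= a; d = rol32(d, 8);
	//	c += d; b ^= c; b = rol32(b, 7);
	//
	// Each NEON instruction below applies one of these steps to all
	// four columns (or diagonals) at once.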
44 | .Ldoubleround: |
45 | // x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
46 | add v0.4s, v0.4s, v1.4s |
47 | eor v3.16b, v3.16b, v0.16b |
48 | rev32 v3.8h, v3.8h |
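	// (a rotate left by 16 just swaps the 16-bit halves of each
	// 32-bit word, so rev32 on .8h lanes does it in one instruction)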
49 | |
50 | // x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
51 | add v2.4s, v2.4s, v3.4s |
52 | eor v4.16b, v1.16b, v2.16b |
53 | shl v1.4s, v4.4s, #12 |
54 | sri v1.4s, v4.4s, #20 |
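	// (NEON has no vector rotate instruction, so rotl32(x, 12) is
	// synthesized as (x << 12) | (x >> 20) using a shl/sri pair)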
55 | |
56 | // x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
57 | add v0.4s, v0.4s, v1.4s |
58 | eor v3.16b, v3.16b, v0.16b |
59 | tbl v3.16b, {v3.16b}, v12.16b |
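	// (a rotate left by 8 moves whole bytes, so a tbl lookup using
	// the ROT8 index vector in v12 does it in a single instruction)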
60 | |
61 | // x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
62 | add v2.4s, v2.4s, v3.4s |
63 | eor v4.16b, v1.16b, v2.16b |
64 | shl v1.4s, v4.4s, #7 |
65 | sri v1.4s, v4.4s, #25 |
66 | |
67 | // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) |
68 | ext v1.16b, v1.16b, v1.16b, #4 |
69 | // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
70 | ext v2.16b, v2.16b, v2.16b, #8 |
71 | // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) |
72 | ext v3.16b, v3.16b, v3.16b, #12 |
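	// Rows 1-3 have now been rotated so that the diagonals of the
	// state matrix line up in columns; the second half of the double
	// round below is therefore the same column operation again.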
73 | |
74 | // x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
75 | add v0.4s, v0.4s, v1.4s |
76 | eor v3.16b, v3.16b, v0.16b |
77 | rev32 v3.8h, v3.8h |
78 | |
79 | // x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
80 | add v2.4s, v2.4s, v3.4s |
81 | eor v4.16b, v1.16b, v2.16b |
82 | shl v1.4s, v4.4s, #12 |
83 | sri v1.4s, v4.4s, #20 |
84 | |
85 | // x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
86 | add v0.4s, v0.4s, v1.4s |
87 | eor v3.16b, v3.16b, v0.16b |
88 | tbl v3.16b, {v3.16b}, v12.16b |
89 | |
90 | // x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
91 | add v2.4s, v2.4s, v3.4s |
92 | eor v4.16b, v1.16b, v2.16b |
93 | shl v1.4s, v4.4s, #7 |
94 | sri v1.4s, v4.4s, #25 |
95 | |
96 | // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) |
97 | ext v1.16b, v1.16b, v1.16b, #12 |
98 | // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
99 | ext v2.16b, v2.16b, v2.16b, #8 |
100 | // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) |
101 | ext v3.16b, v3.16b, v3.16b, #4 |
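	// (inverse rotations: the rows are back in their original order)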
102 | |
103 | subs w3, w3, #2 |
104 | b.ne .Ldoubleround |
105 | |
106 | ret |
107 | SYM_FUNC_END(chacha_permute) |
108 | |
109 | SYM_FUNC_START(chacha_block_xor_neon) |
110 | // x0: Input state matrix, s |
111 | // x1: 1 data block output, o |
112 | // x2: 1 data block input, i |
113 | // w3: nrounds |
114 | |
115 | stp x29, x30, [sp, #-16]! |
116 | mov x29, sp |
117 | |
118 | // x0..3 = s0..3 |
119 | ld1 {v0.4s-v3.4s}, [x0] |
120 | ld1 {v8.4s-v11.4s}, [x0] |
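	// (v8-v11 keep an unmodified copy of the input state for the
	// feed-forward addition after the permutation)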
121 | |
122 | bl chacha_permute |
123 | |
124 | ld1 {v4.16b-v7.16b}, [x2] |
125 | |
126 | // o0 = i0 ^ (x0 + s0) |
127 | add v0.4s, v0.4s, v8.4s |
128 | eor v0.16b, v0.16b, v4.16b |
129 | |
130 | // o1 = i1 ^ (x1 + s1) |
131 | add v1.4s, v1.4s, v9.4s |
132 | eor v1.16b, v1.16b, v5.16b |
133 | |
134 | // o2 = i2 ^ (x2 + s2) |
135 | add v2.4s, v2.4s, v10.4s |
136 | eor v2.16b, v2.16b, v6.16b |
137 | |
138 | // o3 = i3 ^ (x3 + s3) |
139 | add v3.4s, v3.4s, v11.4s |
140 | eor v3.16b, v3.16b, v7.16b |
141 | |
142 | st1 {v0.16b-v3.16b}, [x1] |
143 | |
144 | ldp x29, x30, [sp], #16 |
145 | ret |
146 | SYM_FUNC_END(chacha_block_xor_neon) |
147 | |
148 | SYM_FUNC_START(hchacha_block_neon) |
149 | // x0: Input state matrix, s |
150 | // x1: output (8 32-bit words) |
151 | // w2: nrounds |
152 | |
153 | stp x29, x30, [sp, #-16]! |
154 | mov x29, sp |
155 | |
156 | ld1 {v0.4s-v3.4s}, [x0] |
157 | |
158 | mov w3, w2 |
159 | bl chacha_permute |
160 | |
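	// HChaCha output is words 0-3 and 12-15 of the permuted state;
	// unlike the full block function, there is no feed-forward
	// addition of the input state.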
161 | st1 {v0.4s}, [x1], #16 |
162 | st1 {v3.4s}, [x1] |
163 | |
164 | ldp x29, x30, [sp], #16 |
165 | ret |
166 | SYM_FUNC_END(hchacha_block_neon) |
167 | |
168 | a0 .req w12 |
169 | a1 .req w13 |
170 | a2 .req w14 |
171 | a3 .req w15 |
172 | a4 .req w16 |
173 | a5 .req w17 |
174 | a6 .req w19 |
175 | a7 .req w20 |
176 | a8 .req w21 |
177 | a9 .req w22 |
178 | a10 .req w23 |
179 | a11 .req w24 |
180 | a12 .req w25 |
181 | a13 .req w26 |
182 | a14 .req w27 |
183 | a15 .req w28 |
184 | |
185 | .align 6 |
186 | SYM_FUNC_START(chacha_4block_xor_neon) |
187 | frame_push 10 |
188 | |
189 | // x0: Input state matrix, s |
	// x1: up to 5 data blocks output, o
	// x2: up to 5 data blocks input, i
192 | // w3: nrounds |
193 | // x4: byte count |
194 | |
195 | adr_l x10, .Lpermute |
196 | and x5, x4, #63 |
197 | add x10, x10, x5 |
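	// x10 now points at .Lpermute + (byte count % 64); the tail
	// handling code below loads tbl index vectors from there to
	// extract a partial final block (see the table at the end of
	// this file).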
198 | |
199 | // |
200 | // This function encrypts four consecutive ChaCha blocks by loading |
201 | // the state matrix in NEON registers four times. The algorithm performs |
202 | // each operation on the corresponding word of each state matrix, hence |
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. For the final XOR step, we transpose the
	// matrix by interleaving 32-bit and then 64-bit words, which allows us
	// to do the XOR in NEON registers.
206 | // |
207 | // At the same time, a fifth block is encrypted in parallel using |
	// scalar registers.
209 | // |
	adr_l		x9, CTRINC		// ... and ROT8, which follows it
211 | ld1 {v30.4s-v31.4s}, [x9] |
212 | |
213 | // x0..15[0-3] = s0..3[0..3] |
214 | add x8, x0, #16 |
215 | ld4r { v0.4s- v3.4s}, [x0] |
216 | ld4r { v4.4s- v7.4s}, [x8], #16 |
217 | ld4r { v8.4s-v11.4s}, [x8], #16 |
218 | ld4r {v12.4s-v15.4s}, [x8] |
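	// (ld4r transposes on load: each vN.4s now holds state word N
	// replicated across all four lanes, one lane per NEON block)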
219 | |
220 | mov a0, v0.s[0] |
221 | mov a1, v1.s[0] |
222 | mov a2, v2.s[0] |
223 | mov a3, v3.s[0] |
224 | mov a4, v4.s[0] |
225 | mov a5, v5.s[0] |
226 | mov a6, v6.s[0] |
227 | mov a7, v7.s[0] |
228 | mov a8, v8.s[0] |
229 | mov a9, v9.s[0] |
230 | mov a10, v10.s[0] |
231 | mov a11, v11.s[0] |
232 | mov a12, v12.s[0] |
233 | mov a13, v13.s[0] |
234 | mov a14, v14.s[0] |
235 | mov a15, v15.s[0] |
236 | |
237 | // x12 += counter values 1-4 |
238 | add v12.4s, v12.4s, v30.4s |
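	// (the scalar block keeps counter + 0, so the NEON and scalar
	// code together produce five consecutive blocks)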
239 | |
240 | .Ldoubleround4: |
241 | // x0 += x4, x12 = rotl32(x12 ^ x0, 16) |
242 | // x1 += x5, x13 = rotl32(x13 ^ x1, 16) |
243 | // x2 += x6, x14 = rotl32(x14 ^ x2, 16) |
244 | // x3 += x7, x15 = rotl32(x15 ^ x3, 16) |
245 | add v0.4s, v0.4s, v4.4s |
246 | add a0, a0, a4 |
247 | add v1.4s, v1.4s, v5.4s |
248 | add a1, a1, a5 |
249 | add v2.4s, v2.4s, v6.4s |
250 | add a2, a2, a6 |
251 | add v3.4s, v3.4s, v7.4s |
252 | add a3, a3, a7 |
253 | |
254 | eor v12.16b, v12.16b, v0.16b |
255 | eor a12, a12, a0 |
256 | eor v13.16b, v13.16b, v1.16b |
257 | eor a13, a13, a1 |
258 | eor v14.16b, v14.16b, v2.16b |
259 | eor a14, a14, a2 |
260 | eor v15.16b, v15.16b, v3.16b |
261 | eor a15, a15, a3 |
262 | |
263 | rev32 v12.8h, v12.8h |
264 | ror a12, a12, #16 |
265 | rev32 v13.8h, v13.8h |
266 | ror a13, a13, #16 |
267 | rev32 v14.8h, v14.8h |
268 | ror a14, a14, #16 |
269 | rev32 v15.8h, v15.8h |
270 | ror a15, a15, #16 |
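	// (in the scalar block, rotl32(x, n) is expressed as a rotate
	// right by 32 - n: ror #16 here, ror #20/#24/#25 below for the
	// 12/8/7-bit rotations)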
271 | |
272 | // x8 += x12, x4 = rotl32(x4 ^ x8, 12) |
273 | // x9 += x13, x5 = rotl32(x5 ^ x9, 12) |
274 | // x10 += x14, x6 = rotl32(x6 ^ x10, 12) |
275 | // x11 += x15, x7 = rotl32(x7 ^ x11, 12) |
276 | add v8.4s, v8.4s, v12.4s |
277 | add a8, a8, a12 |
278 | add v9.4s, v9.4s, v13.4s |
279 | add a9, a9, a13 |
280 | add v10.4s, v10.4s, v14.4s |
281 | add a10, a10, a14 |
282 | add v11.4s, v11.4s, v15.4s |
283 | add a11, a11, a15 |
284 | |
285 | eor v16.16b, v4.16b, v8.16b |
286 | eor a4, a4, a8 |
287 | eor v17.16b, v5.16b, v9.16b |
288 | eor a5, a5, a9 |
289 | eor v18.16b, v6.16b, v10.16b |
290 | eor a6, a6, a10 |
291 | eor v19.16b, v7.16b, v11.16b |
292 | eor a7, a7, a11 |
293 | |
294 | shl v4.4s, v16.4s, #12 |
295 | shl v5.4s, v17.4s, #12 |
296 | shl v6.4s, v18.4s, #12 |
297 | shl v7.4s, v19.4s, #12 |
298 | |
299 | sri v4.4s, v16.4s, #20 |
300 | ror a4, a4, #20 |
301 | sri v5.4s, v17.4s, #20 |
302 | ror a5, a5, #20 |
303 | sri v6.4s, v18.4s, #20 |
304 | ror a6, a6, #20 |
305 | sri v7.4s, v19.4s, #20 |
306 | ror a7, a7, #20 |
307 | |
308 | // x0 += x4, x12 = rotl32(x12 ^ x0, 8) |
309 | // x1 += x5, x13 = rotl32(x13 ^ x1, 8) |
310 | // x2 += x6, x14 = rotl32(x14 ^ x2, 8) |
311 | // x3 += x7, x15 = rotl32(x15 ^ x3, 8) |
312 | add v0.4s, v0.4s, v4.4s |
313 | add a0, a0, a4 |
314 | add v1.4s, v1.4s, v5.4s |
315 | add a1, a1, a5 |
316 | add v2.4s, v2.4s, v6.4s |
317 | add a2, a2, a6 |
318 | add v3.4s, v3.4s, v7.4s |
319 | add a3, a3, a7 |
320 | |
321 | eor v12.16b, v12.16b, v0.16b |
322 | eor a12, a12, a0 |
323 | eor v13.16b, v13.16b, v1.16b |
324 | eor a13, a13, a1 |
325 | eor v14.16b, v14.16b, v2.16b |
326 | eor a14, a14, a2 |
327 | eor v15.16b, v15.16b, v3.16b |
328 | eor a15, a15, a3 |
329 | |
330 | tbl v12.16b, {v12.16b}, v31.16b |
331 | ror a12, a12, #24 |
332 | tbl v13.16b, {v13.16b}, v31.16b |
333 | ror a13, a13, #24 |
334 | tbl v14.16b, {v14.16b}, v31.16b |
335 | ror a14, a14, #24 |
336 | tbl v15.16b, {v15.16b}, v31.16b |
337 | ror a15, a15, #24 |
338 | |
339 | // x8 += x12, x4 = rotl32(x4 ^ x8, 7) |
340 | // x9 += x13, x5 = rotl32(x5 ^ x9, 7) |
341 | // x10 += x14, x6 = rotl32(x6 ^ x10, 7) |
342 | // x11 += x15, x7 = rotl32(x7 ^ x11, 7) |
343 | add v8.4s, v8.4s, v12.4s |
344 | add a8, a8, a12 |
345 | add v9.4s, v9.4s, v13.4s |
346 | add a9, a9, a13 |
347 | add v10.4s, v10.4s, v14.4s |
348 | add a10, a10, a14 |
349 | add v11.4s, v11.4s, v15.4s |
350 | add a11, a11, a15 |
351 | |
352 | eor v16.16b, v4.16b, v8.16b |
353 | eor a4, a4, a8 |
354 | eor v17.16b, v5.16b, v9.16b |
355 | eor a5, a5, a9 |
356 | eor v18.16b, v6.16b, v10.16b |
357 | eor a6, a6, a10 |
358 | eor v19.16b, v7.16b, v11.16b |
359 | eor a7, a7, a11 |
360 | |
361 | shl v4.4s, v16.4s, #7 |
362 | shl v5.4s, v17.4s, #7 |
363 | shl v6.4s, v18.4s, #7 |
364 | shl v7.4s, v19.4s, #7 |
365 | |
366 | sri v4.4s, v16.4s, #25 |
367 | ror a4, a4, #25 |
368 | sri v5.4s, v17.4s, #25 |
369 | ror a5, a5, #25 |
370 | sri v6.4s, v18.4s, #25 |
371 | ror a6, a6, #25 |
372 | sri v7.4s, v19.4s, #25 |
373 | ror a7, a7, #25 |
374 | |
375 | // x0 += x5, x15 = rotl32(x15 ^ x0, 16) |
376 | // x1 += x6, x12 = rotl32(x12 ^ x1, 16) |
377 | // x2 += x7, x13 = rotl32(x13 ^ x2, 16) |
378 | // x3 += x4, x14 = rotl32(x14 ^ x3, 16) |
379 | add v0.4s, v0.4s, v5.4s |
380 | add a0, a0, a5 |
381 | add v1.4s, v1.4s, v6.4s |
382 | add a1, a1, a6 |
383 | add v2.4s, v2.4s, v7.4s |
384 | add a2, a2, a7 |
385 | add v3.4s, v3.4s, v4.4s |
386 | add a3, a3, a4 |
387 | |
388 | eor v15.16b, v15.16b, v0.16b |
389 | eor a15, a15, a0 |
390 | eor v12.16b, v12.16b, v1.16b |
391 | eor a12, a12, a1 |
392 | eor v13.16b, v13.16b, v2.16b |
393 | eor a13, a13, a2 |
394 | eor v14.16b, v14.16b, v3.16b |
395 | eor a14, a14, a3 |
396 | |
397 | rev32 v15.8h, v15.8h |
398 | ror a15, a15, #16 |
399 | rev32 v12.8h, v12.8h |
400 | ror a12, a12, #16 |
401 | rev32 v13.8h, v13.8h |
402 | ror a13, a13, #16 |
403 | rev32 v14.8h, v14.8h |
404 | ror a14, a14, #16 |
405 | |
406 | // x10 += x15, x5 = rotl32(x5 ^ x10, 12) |
407 | // x11 += x12, x6 = rotl32(x6 ^ x11, 12) |
408 | // x8 += x13, x7 = rotl32(x7 ^ x8, 12) |
409 | // x9 += x14, x4 = rotl32(x4 ^ x9, 12) |
410 | add v10.4s, v10.4s, v15.4s |
411 | add a10, a10, a15 |
412 | add v11.4s, v11.4s, v12.4s |
413 | add a11, a11, a12 |
414 | add v8.4s, v8.4s, v13.4s |
415 | add a8, a8, a13 |
416 | add v9.4s, v9.4s, v14.4s |
417 | add a9, a9, a14 |
418 | |
419 | eor v16.16b, v5.16b, v10.16b |
420 | eor a5, a5, a10 |
421 | eor v17.16b, v6.16b, v11.16b |
422 | eor a6, a6, a11 |
423 | eor v18.16b, v7.16b, v8.16b |
424 | eor a7, a7, a8 |
425 | eor v19.16b, v4.16b, v9.16b |
426 | eor a4, a4, a9 |
427 | |
428 | shl v5.4s, v16.4s, #12 |
429 | shl v6.4s, v17.4s, #12 |
430 | shl v7.4s, v18.4s, #12 |
431 | shl v4.4s, v19.4s, #12 |
432 | |
433 | sri v5.4s, v16.4s, #20 |
434 | ror a5, a5, #20 |
435 | sri v6.4s, v17.4s, #20 |
436 | ror a6, a6, #20 |
437 | sri v7.4s, v18.4s, #20 |
438 | ror a7, a7, #20 |
439 | sri v4.4s, v19.4s, #20 |
440 | ror a4, a4, #20 |
441 | |
442 | // x0 += x5, x15 = rotl32(x15 ^ x0, 8) |
443 | // x1 += x6, x12 = rotl32(x12 ^ x1, 8) |
444 | // x2 += x7, x13 = rotl32(x13 ^ x2, 8) |
445 | // x3 += x4, x14 = rotl32(x14 ^ x3, 8) |
446 | add v0.4s, v0.4s, v5.4s |
447 | add a0, a0, a5 |
448 | add v1.4s, v1.4s, v6.4s |
449 | add a1, a1, a6 |
450 | add v2.4s, v2.4s, v7.4s |
451 | add a2, a2, a7 |
452 | add v3.4s, v3.4s, v4.4s |
453 | add a3, a3, a4 |
454 | |
455 | eor v15.16b, v15.16b, v0.16b |
456 | eor a15, a15, a0 |
457 | eor v12.16b, v12.16b, v1.16b |
458 | eor a12, a12, a1 |
459 | eor v13.16b, v13.16b, v2.16b |
460 | eor a13, a13, a2 |
461 | eor v14.16b, v14.16b, v3.16b |
462 | eor a14, a14, a3 |
463 | |
464 | tbl v15.16b, {v15.16b}, v31.16b |
465 | ror a15, a15, #24 |
466 | tbl v12.16b, {v12.16b}, v31.16b |
467 | ror a12, a12, #24 |
468 | tbl v13.16b, {v13.16b}, v31.16b |
469 | ror a13, a13, #24 |
470 | tbl v14.16b, {v14.16b}, v31.16b |
471 | ror a14, a14, #24 |
472 | |
473 | // x10 += x15, x5 = rotl32(x5 ^ x10, 7) |
474 | // x11 += x12, x6 = rotl32(x6 ^ x11, 7) |
475 | // x8 += x13, x7 = rotl32(x7 ^ x8, 7) |
476 | // x9 += x14, x4 = rotl32(x4 ^ x9, 7) |
477 | add v10.4s, v10.4s, v15.4s |
478 | add a10, a10, a15 |
479 | add v11.4s, v11.4s, v12.4s |
480 | add a11, a11, a12 |
481 | add v8.4s, v8.4s, v13.4s |
482 | add a8, a8, a13 |
483 | add v9.4s, v9.4s, v14.4s |
484 | add a9, a9, a14 |
485 | |
486 | eor v16.16b, v5.16b, v10.16b |
487 | eor a5, a5, a10 |
488 | eor v17.16b, v6.16b, v11.16b |
489 | eor a6, a6, a11 |
490 | eor v18.16b, v7.16b, v8.16b |
491 | eor a7, a7, a8 |
492 | eor v19.16b, v4.16b, v9.16b |
493 | eor a4, a4, a9 |
494 | |
495 | shl v5.4s, v16.4s, #7 |
496 | shl v6.4s, v17.4s, #7 |
497 | shl v7.4s, v18.4s, #7 |
498 | shl v4.4s, v19.4s, #7 |
499 | |
500 | sri v5.4s, v16.4s, #25 |
501 | ror a5, a5, #25 |
502 | sri v6.4s, v17.4s, #25 |
503 | ror a6, a6, #25 |
504 | sri v7.4s, v18.4s, #25 |
505 | ror a7, a7, #25 |
506 | sri v4.4s, v19.4s, #25 |
507 | ror a4, a4, #25 |
508 | |
509 | subs w3, w3, #2 |
510 | b.ne .Ldoubleround4 |
511 | |
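	// Reload the input state for the feed-forward addition x += s;
	// ld4r transposes it the same way as before (rows s2 and s3 are
	// reloaded further down, once registers free up).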
512 | ld4r {v16.4s-v19.4s}, [x0], #16 |
513 | ld4r {v20.4s-v23.4s}, [x0], #16 |
514 | |
	// x12 += counter values 1-4
516 | add v12.4s, v12.4s, v30.4s |
517 | |
518 | // x0[0-3] += s0[0] |
519 | // x1[0-3] += s0[1] |
520 | // x2[0-3] += s0[2] |
521 | // x3[0-3] += s0[3] |
522 | add v0.4s, v0.4s, v16.4s |
523 | mov w6, v16.s[0] |
524 | mov w7, v17.s[0] |
525 | add v1.4s, v1.4s, v17.4s |
526 | mov w8, v18.s[0] |
527 | mov w9, v19.s[0] |
528 | add v2.4s, v2.4s, v18.4s |
529 | add a0, a0, w6 |
530 | add a1, a1, w7 |
531 | add v3.4s, v3.4s, v19.4s |
532 | add a2, a2, w8 |
533 | add a3, a3, w9 |
534 | CPU_BE( rev a0, a0 ) |
535 | CPU_BE( rev a1, a1 ) |
536 | CPU_BE( rev a2, a2 ) |
537 | CPU_BE( rev a3, a3 ) |
538 | |
539 | ld4r {v24.4s-v27.4s}, [x0], #16 |
540 | ld4r {v28.4s-v31.4s}, [x0] |
541 | |
542 | // x4[0-3] += s1[0] |
543 | // x5[0-3] += s1[1] |
544 | // x6[0-3] += s1[2] |
545 | // x7[0-3] += s1[3] |
546 | add v4.4s, v4.4s, v20.4s |
547 | mov w6, v20.s[0] |
548 | mov w7, v21.s[0] |
549 | add v5.4s, v5.4s, v21.4s |
550 | mov w8, v22.s[0] |
551 | mov w9, v23.s[0] |
552 | add v6.4s, v6.4s, v22.4s |
553 | add a4, a4, w6 |
554 | add a5, a5, w7 |
555 | add v7.4s, v7.4s, v23.4s |
556 | add a6, a6, w8 |
557 | add a7, a7, w9 |
558 | CPU_BE( rev a4, a4 ) |
559 | CPU_BE( rev a5, a5 ) |
560 | CPU_BE( rev a6, a6 ) |
561 | CPU_BE( rev a7, a7 ) |
562 | |
563 | // x8[0-3] += s2[0] |
564 | // x9[0-3] += s2[1] |
565 | // x10[0-3] += s2[2] |
566 | // x11[0-3] += s2[3] |
567 | add v8.4s, v8.4s, v24.4s |
568 | mov w6, v24.s[0] |
569 | mov w7, v25.s[0] |
570 | add v9.4s, v9.4s, v25.4s |
571 | mov w8, v26.s[0] |
572 | mov w9, v27.s[0] |
573 | add v10.4s, v10.4s, v26.4s |
574 | add a8, a8, w6 |
575 | add a9, a9, w7 |
576 | add v11.4s, v11.4s, v27.4s |
577 | add a10, a10, w8 |
578 | add a11, a11, w9 |
579 | CPU_BE( rev a8, a8 ) |
580 | CPU_BE( rev a9, a9 ) |
581 | CPU_BE( rev a10, a10 ) |
582 | CPU_BE( rev a11, a11 ) |
583 | |
584 | // x12[0-3] += s3[0] |
585 | // x13[0-3] += s3[1] |
586 | // x14[0-3] += s3[2] |
587 | // x15[0-3] += s3[3] |
588 | add v12.4s, v12.4s, v28.4s |
589 | mov w6, v28.s[0] |
590 | mov w7, v29.s[0] |
591 | add v13.4s, v13.4s, v29.4s |
592 | mov w8, v30.s[0] |
593 | mov w9, v31.s[0] |
594 | add v14.4s, v14.4s, v30.4s |
595 | add a12, a12, w6 |
596 | add a13, a13, w7 |
597 | add v15.4s, v15.4s, v31.4s |
598 | add a14, a14, w8 |
599 | add a15, a15, w9 |
600 | CPU_BE( rev a12, a12 ) |
601 | CPU_BE( rev a13, a13 ) |
602 | CPU_BE( rev a14, a14 ) |
603 | CPU_BE( rev a15, a15 ) |
604 | |
605 | // interleave 32-bit words in state n, n+1 |
606 | ldp w6, w7, [x2], #64 |
607 | zip1 v16.4s, v0.4s, v1.4s |
608 | ldp w8, w9, [x2, #-56] |
609 | eor a0, a0, w6 |
610 | zip2 v17.4s, v0.4s, v1.4s |
611 | eor a1, a1, w7 |
612 | zip1 v18.4s, v2.4s, v3.4s |
613 | eor a2, a2, w8 |
614 | zip2 v19.4s, v2.4s, v3.4s |
615 | eor a3, a3, w9 |
616 | ldp w6, w7, [x2, #-48] |
617 | zip1 v20.4s, v4.4s, v5.4s |
618 | ldp w8, w9, [x2, #-40] |
619 | eor a4, a4, w6 |
620 | zip2 v21.4s, v4.4s, v5.4s |
621 | eor a5, a5, w7 |
622 | zip1 v22.4s, v6.4s, v7.4s |
623 | eor a6, a6, w8 |
624 | zip2 v23.4s, v6.4s, v7.4s |
625 | eor a7, a7, w9 |
626 | ldp w6, w7, [x2, #-32] |
627 | zip1 v24.4s, v8.4s, v9.4s |
628 | ldp w8, w9, [x2, #-24] |
629 | eor a8, a8, w6 |
630 | zip2 v25.4s, v8.4s, v9.4s |
631 | eor a9, a9, w7 |
632 | zip1 v26.4s, v10.4s, v11.4s |
633 | eor a10, a10, w8 |
634 | zip2 v27.4s, v10.4s, v11.4s |
635 | eor a11, a11, w9 |
636 | ldp w6, w7, [x2, #-16] |
637 | zip1 v28.4s, v12.4s, v13.4s |
638 | ldp w8, w9, [x2, #-8] |
639 | eor a12, a12, w6 |
640 | zip2 v29.4s, v12.4s, v13.4s |
641 | eor a13, a13, w7 |
642 | zip1 v30.4s, v14.4s, v15.4s |
643 | eor a14, a14, w8 |
644 | zip2 v31.4s, v14.4s, v15.4s |
645 | eor a15, a15, w9 |
646 | |
647 | add x3, x2, x4 |
	sub		x3, x3, #128		// start of the final 64 input bytes
649 | |
650 | subs x5, x4, #128 |
651 | csel x2, x2, x3, ge |
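	// If fewer than 128 bytes remain, redirect x2 at the final 64
	// input bytes, so the next 64-byte load overlaps input that was
	// already consumed; the tail code below compensates using tbl
	// shuffles and overlapping stores. The same csel trick repeats
	// at each subsequent 64-byte boundary.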
652 | |
653 | // interleave 64-bit words in state n, n+2 |
654 | zip1 v0.2d, v16.2d, v18.2d |
655 | zip2 v4.2d, v16.2d, v18.2d |
656 | stp a0, a1, [x1], #64 |
657 | zip1 v8.2d, v17.2d, v19.2d |
658 | zip2 v12.2d, v17.2d, v19.2d |
659 | stp a2, a3, [x1, #-56] |
660 | |
661 | subs x6, x4, #192 |
662 | ld1 {v16.16b-v19.16b}, [x2], #64 |
663 | csel x2, x2, x3, ge |
664 | |
665 | zip1 v1.2d, v20.2d, v22.2d |
666 | zip2 v5.2d, v20.2d, v22.2d |
667 | stp a4, a5, [x1, #-48] |
668 | zip1 v9.2d, v21.2d, v23.2d |
669 | zip2 v13.2d, v21.2d, v23.2d |
670 | stp a6, a7, [x1, #-40] |
671 | |
672 | subs x7, x4, #256 |
673 | ld1 {v20.16b-v23.16b}, [x2], #64 |
674 | csel x2, x2, x3, ge |
675 | |
676 | zip1 v2.2d, v24.2d, v26.2d |
677 | zip2 v6.2d, v24.2d, v26.2d |
678 | stp a8, a9, [x1, #-32] |
679 | zip1 v10.2d, v25.2d, v27.2d |
680 | zip2 v14.2d, v25.2d, v27.2d |
681 | stp a10, a11, [x1, #-24] |
682 | |
683 | subs x8, x4, #320 |
684 | ld1 {v24.16b-v27.16b}, [x2], #64 |
685 | csel x2, x2, x3, ge |
686 | |
687 | zip1 v3.2d, v28.2d, v30.2d |
688 | zip2 v7.2d, v28.2d, v30.2d |
689 | stp a12, a13, [x1, #-16] |
690 | zip1 v11.2d, v29.2d, v31.2d |
691 | zip2 v15.2d, v29.2d, v31.2d |
692 | stp a14, a15, [x1, #-8] |
693 | |
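	// Each tbnz below tests the sign bit of (byte count - N), i.e.
	// it branches to the tail code when fewer than N bytes of
	// in/output were requested.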
694 | tbnz x5, #63, .Lt128 |
695 | ld1 {v28.16b-v31.16b}, [x2] |
696 | |
697 | // xor with corresponding input, write to output |
698 | eor v16.16b, v16.16b, v0.16b |
699 | eor v17.16b, v17.16b, v1.16b |
700 | eor v18.16b, v18.16b, v2.16b |
701 | eor v19.16b, v19.16b, v3.16b |
702 | |
703 | tbnz x6, #63, .Lt192 |
704 | |
705 | eor v20.16b, v20.16b, v4.16b |
706 | eor v21.16b, v21.16b, v5.16b |
707 | eor v22.16b, v22.16b, v6.16b |
708 | eor v23.16b, v23.16b, v7.16b |
709 | |
710 | st1 {v16.16b-v19.16b}, [x1], #64 |
711 | tbnz x7, #63, .Lt256 |
712 | |
713 | eor v24.16b, v24.16b, v8.16b |
714 | eor v25.16b, v25.16b, v9.16b |
715 | eor v26.16b, v26.16b, v10.16b |
716 | eor v27.16b, v27.16b, v11.16b |
717 | |
718 | st1 {v20.16b-v23.16b}, [x1], #64 |
719 | tbnz x8, #63, .Lt320 |
720 | |
721 | eor v28.16b, v28.16b, v12.16b |
722 | eor v29.16b, v29.16b, v13.16b |
723 | eor v30.16b, v30.16b, v14.16b |
724 | eor v31.16b, v31.16b, v15.16b |
725 | |
726 | st1 {v24.16b-v27.16b}, [x1], #64 |
727 | st1 {v28.16b-v31.16b}, [x1] |
728 | |
729 | .Lout: frame_pop |
730 | ret |
731 | |
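	// Tail handling: the byte count is not a multiple of 64, so the
	// last (partial) block is produced by loading a tbl index
	// vector from .Lpermute + (count % 64). Out-of-range tbl
	// indices return 0, so the shuffle moves the first (count % 64)
	// keystream bytes to the end of a 64-byte window and zeroes the
	// rest. That window is XORed with the last 64 input bytes and
	// stored at (output + count - 64). The zeroed prefix of the
	// window clobbers bytes belonging to the preceding full block,
	// which is why each tail path (re)stores that full block last.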
732 | // fewer than 192 bytes of in/output |
733 | .Lt192: cbz x5, 1f // exactly 128 bytes? |
734 | ld1 {v28.16b-v31.16b}, [x10] |
735 | add x5, x5, x1 |
736 | tbl v28.16b, {v4.16b-v7.16b}, v28.16b |
737 | tbl v29.16b, {v4.16b-v7.16b}, v29.16b |
738 | tbl v30.16b, {v4.16b-v7.16b}, v30.16b |
739 | tbl v31.16b, {v4.16b-v7.16b}, v31.16b |
740 | |
741 | 0: eor v20.16b, v20.16b, v28.16b |
742 | eor v21.16b, v21.16b, v29.16b |
743 | eor v22.16b, v22.16b, v30.16b |
744 | eor v23.16b, v23.16b, v31.16b |
745 | st1 {v20.16b-v23.16b}, [x5] // overlapping stores |
746 | 1: st1 {v16.16b-v19.16b}, [x1] |
747 | b .Lout |
748 | |
749 | // fewer than 128 bytes of in/output |
750 | .Lt128: ld1 {v28.16b-v31.16b}, [x10] |
751 | add x5, x5, x1 |
752 | sub x1, x1, #64 |
753 | tbl v28.16b, {v0.16b-v3.16b}, v28.16b |
754 | tbl v29.16b, {v0.16b-v3.16b}, v29.16b |
755 | tbl v30.16b, {v0.16b-v3.16b}, v30.16b |
756 | tbl v31.16b, {v0.16b-v3.16b}, v31.16b |
757 | ld1 {v16.16b-v19.16b}, [x1] // reload first output block |
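	// (the overlapping store at 0b clobbers part of the first
	// block's output, already written by the scalar code, so it is
	// reloaded here and stored again afterwards)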
758 | b 0b |
759 | |
760 | // fewer than 256 bytes of in/output |
761 | .Lt256: cbz x6, 2f // exactly 192 bytes? |
762 | ld1 {v4.16b-v7.16b}, [x10] |
763 | add x6, x6, x1 |
764 | tbl v0.16b, {v8.16b-v11.16b}, v4.16b |
765 | tbl v1.16b, {v8.16b-v11.16b}, v5.16b |
766 | tbl v2.16b, {v8.16b-v11.16b}, v6.16b |
767 | tbl v3.16b, {v8.16b-v11.16b}, v7.16b |
768 | |
769 | eor v28.16b, v28.16b, v0.16b |
770 | eor v29.16b, v29.16b, v1.16b |
771 | eor v30.16b, v30.16b, v2.16b |
772 | eor v31.16b, v31.16b, v3.16b |
773 | st1 {v28.16b-v31.16b}, [x6] // overlapping stores |
774 | 2: st1 {v20.16b-v23.16b}, [x1] |
775 | b .Lout |
776 | |
777 | // fewer than 320 bytes of in/output |
778 | .Lt320: cbz x7, 3f // exactly 256 bytes? |
779 | ld1 {v4.16b-v7.16b}, [x10] |
780 | add x7, x7, x1 |
781 | tbl v0.16b, {v12.16b-v15.16b}, v4.16b |
782 | tbl v1.16b, {v12.16b-v15.16b}, v5.16b |
783 | tbl v2.16b, {v12.16b-v15.16b}, v6.16b |
784 | tbl v3.16b, {v12.16b-v15.16b}, v7.16b |
785 | |
786 | eor v28.16b, v28.16b, v0.16b |
787 | eor v29.16b, v29.16b, v1.16b |
788 | eor v30.16b, v30.16b, v2.16b |
789 | eor v31.16b, v31.16b, v3.16b |
790 | st1 {v28.16b-v31.16b}, [x7] // overlapping stores |
791 | 3: st1 {v24.16b-v27.16b}, [x1] |
792 | b .Lout |
793 | SYM_FUNC_END(chacha_4block_xor_neon) |
794 | |
	.section	".rodata", "a", %progbits
796 | .align L1_CACHE_SHIFT |
797 | .Lpermute: |
798 | .set .Li, 0 |
799 | .rept 128 |
800 | .byte (.Li - 64) |
801 | .set .Li, .Li + 1 |
802 | .endr |
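
	// Index vector for a 64-byte sliding window: reading 64 bytes
	// at .Lpermute + n yields the byte values (n - 64) .. (n - 1).
	// The first 64 - n of those are out of range for a 4-register
	// tbl (which therefore returns 0 for them), and the last n
	// select keystream bytes 0 .. n - 1, i.e. the partial block
	// ends up right-aligned in the 64-byte window.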
803 | |
804 | CTRINC: .word 1, 2, 3, 4 |
805 | ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f |
806 | |