/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
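	#
	# For reference, one ChaCha quarter round in scalar C-like form (a
	# sketch of the operation the per-line comments below abbreviate):
	#
	#	a += b; d = rotl32(d ^ a, 16);
	#	c += d; b = rotl32(b ^ c, 12);
	#	a += b; d = rotl32(d ^ a, 8);
	#	c += d; b = rotl32(b ^ c, 7);
	#
	# A double round applies this to the four columns of the 4x4 state
	# matrix and then to its four diagonals; the vpshufd shuffles below
	# rotate the rows so the diagonals line up as columns.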

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

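	# add 0 and 1 to the block counters of the two 128-bit lanes, so
	# lane 1 computes the second block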
	vpaddd	CTR2BL(%rip),%ymm3,%ymm3

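	# keep a copy of the initial state; after the rounds it is added
	# back (x += s) to form the keystream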
	vmovdqa	%ymm0,%ymm8
	vmovdqa	%ymm1,%ymm9
	vmovdqa	%ymm2,%ymm10
	vmovdqa	%ymm3,%ymm11

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3

	sub	$2,%r8d
	jnz	.Ldoubleround

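	# Add the original state back in and xor the result into the data,
	# 16 bytes at a time. Before each chunk the remaining length is
	# checked; a short tail branches to .Lxorpart2, which finishes the
	# last partial 16-byte chunk with a byte-masked load/store.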
	# o0 = i0 ^ (x0 + s0)
	vpaddd	%ymm8,%ymm0,%ymm7
	cmp	$0x10,%rcx
	jl	.Lxorpart2
	vpxord	0x00(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd	%ymm9,%ymm1,%ymm7
	cmp	$0x20,%rcx
	jl	.Lxorpart2
	vpxord	0x10(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd	%ymm10,%ymm2,%ymm7
	cmp	$0x30,%rcx
	jl	.Lxorpart2
	vpxord	0x20(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd	%ymm11,%ymm3,%ymm7
	cmp	$0x40,%rcx
	jl	.Lxorpart2
	vpxord	0x30(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

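	# the high 128-bit lanes extracted above (xmm0..xmm3) hold the
	# keystream for the second block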
	# xor and write second block
	vmovdqa	%xmm0,%xmm7
	cmp	$0x50,%rcx
	jl	.Lxorpart2
	vpxord	0x40(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x40(%rsi)

	vmovdqa	%xmm1,%xmm7
	cmp	$0x60,%rcx
	jl	.Lxorpart2
	vpxord	0x50(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x50(%rsi)

	vmovdqa	%xmm2,%xmm7
	cmp	$0x70,%rcx
	jl	.Lxorpart2
	vpxord	0x60(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x60(%rsi)

	vmovdqa	%xmm3,%xmm7
	cmp	$0x80,%rcx
	jl	.Lxorpart2
	vpxord	0x70(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0xf,%rcx
	jz	.Ldone2
	mov	%rax,%r9
	and	$~0xf,%r9

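	# build a byte mask in %k1 covering the (len & 0xf) remaining bytes:
	# with %rax = 1 and %cl < 16, shld leaves %rax = 1 << %cl, and
	# subtracting 1 sets the low %cl bits. The masked load below reads
	# only those input bytes (zeroing the rest), and the masked store
	# writes only those output bytes.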
	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp	.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, interleaved
	# with the same operations on the other two matrices. Since the
	# required word shuffling has a rather high latency, we can do the
	# arithmetic on two matrix pairs without much slowdown.

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa	%ymm0,%ymm4
	vmovdqa	%ymm1,%ymm5
	vmovdqa	%ymm2,%ymm6
	vmovdqa	%ymm3,%ymm7

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3
	vpaddd	CTR4BL(%rip),%ymm7,%ymm7

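	# ymm0..3 now hold blocks 0-1 and ymm4..7 blocks 2-3 (counters 0/1
	# and 2/3 via CTR2BL/CTR4BL). Save the initial state for the final
	# add; rows 0-2 are identical for both pairs, so one copy serves
	# both, and only the two counter rows are kept separately.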
	vmovdqa	%ymm0,%ymm11
	vmovdqa	%ymm1,%ymm12
	vmovdqa	%ymm2,%ymm13
	vmovdqa	%ymm3,%ymm14
	vmovdqa	%ymm7,%ymm15

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	vpshufd	$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
	vpshufd	$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	vpshufd	$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
	vpshufd	$0x39,%ymm7,%ymm7

	sub	$2,%r8d
	jnz	.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd	%ymm11,%ymm0,%ymm10
	cmp	$0x10,%rcx
	jl	.Lxorpart4
	vpxord	0x00(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd	%ymm12,%ymm1,%ymm10
	cmp	$0x20,%rcx
	jl	.Lxorpart4
	vpxord	0x10(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd	%ymm13,%ymm2,%ymm10
	cmp	$0x30,%rcx
	jl	.Lxorpart4
	vpxord	0x20(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd	%ymm14,%ymm3,%ymm10
	cmp	$0x40,%rcx
	jl	.Lxorpart4
	vpxord	0x30(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm10
	cmp	$0x50,%rcx
	jl	.Lxorpart4
	vpxord	0x40(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x40(%rsi)

	vmovdqa	%xmm1,%xmm10
	cmp	$0x60,%rcx
	jl	.Lxorpart4
	vpxord	0x50(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x50(%rsi)

	vmovdqa	%xmm2,%xmm10
	cmp	$0x70,%rcx
	jl	.Lxorpart4
	vpxord	0x60(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x60(%rsi)

	vmovdqa	%xmm3,%xmm10
	cmp	$0x80,%rcx
	jl	.Lxorpart4
	vpxord	0x70(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd	%ymm11,%ymm4,%ymm10
	cmp	$0x90,%rcx
	jl	.Lxorpart4
	vpxord	0x80(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd	%ymm12,%ymm5,%ymm10
	cmp	$0xa0,%rcx
	jl	.Lxorpart4
	vpxord	0x90(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd	%ymm13,%ymm6,%ymm10
	cmp	$0xb0,%rcx
	jl	.Lxorpart4
	vpxord	0xa0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd	%ymm15,%ymm7,%ymm10
	cmp	$0xc0,%rcx
	jl	.Lxorpart4
	vpxord	0xb0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa	%xmm4,%xmm10
	cmp	$0xd0,%rcx
	jl	.Lxorpart4
	vpxord	0xc0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xc0(%rsi)

	vmovdqa	%xmm5,%xmm10
	cmp	$0xe0,%rcx
	jl	.Lxorpart4
	vpxord	0xd0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xd0(%rsi)

	vmovdqa	%xmm6,%xmm10
	cmp	$0xf0,%rcx
	jl	.Lxorpart4
	vpxord	0xe0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xe0(%rsi)

	vmovdqa	%xmm7,%xmm10
	cmp	$0x100,%rcx
	jl	.Lxorpart4
	vpxord	0xf0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0xf,%rcx
	jz	.Ldone4
	mov	%rax,%r9
	and	$~0xf,%r9

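	# same byte-mask construction as in .Lxorpart2: %k1 covers the
	# (len & 0xf) remaining bytes of the last 16-byte chunk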
	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp	.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. Compared to AVX2, this
	# mostly benefits from the new rotate instructions in VL and the
	# additional registers.
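	#
	# The layout here is word-sliced: each of ymm0..15 holds one state
	# word for all eight blocks (one 32-bit lane per block), so a column
	# or diagonal round works on whole registers and no word shuffles
	# are needed. The extra registers ymm16..31 (available with
	# AVX-512VL) keep the original state for the final add.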

	vzeroupper

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd	CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31

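	# In this layout each double round below is simply the column round
	# followed by the diagonal round written out explicitly; vprold
	# performs each rotate in a single instruction.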
.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4

	sub	$2,%r8d
	jnz	.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpaddd	%ymm16,%ymm0,%ymm0
	vpaddd	%ymm17,%ymm1,%ymm1
	vpaddd	%ymm18,%ymm2,%ymm2
	vpaddd	%ymm19,%ymm3,%ymm3
	vpaddd	%ymm20,%ymm4,%ymm4
	vpaddd	%ymm21,%ymm5,%ymm5
	vpaddd	%ymm22,%ymm6,%ymm6
	vpaddd	%ymm23,%ymm7,%ymm7
	vpaddd	%ymm24,%ymm8,%ymm8
	vpaddd	%ymm25,%ymm9,%ymm9
	vpaddd	%ymm26,%ymm10,%ymm10
	vpaddd	%ymm27,%ymm11,%ymm11
	vpaddd	%ymm28,%ymm12,%ymm12
	vpaddd	%ymm29,%ymm13,%ymm13
	vpaddd	%ymm30,%ymm14,%ymm14
	vpaddd	%ymm31,%ymm15,%ymm15

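	# The next three stages (32-bit, 64-bit and 128-bit interleaves)
	# transpose the word-sliced registers back into eight consecutive
	# 64-byte blocks, which are then xored into the data 32 bytes at a
	# time, with .Lxorpart8 handling a trailing partial 32-byte chunk.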
	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	cmp	$0x0020,%rcx
	jl	.Lxorpart8
	vpxord	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp	$0x0040,%rcx
	jl	.Lxorpart8
	vpxord	0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	cmp	$0x0060,%rcx
	jl	.Lxorpart8
	vpxord	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp	$0x0080,%rcx
	jl	.Lxorpart8
	vpxord	0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp	$0x00a0,%rcx
	jl	.Lxorpart8
	vpxord	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp	$0x00c0,%rcx
	jl	.Lxorpart8
	vpxord	0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	cmp	$0x00e0,%rcx
	jl	.Lxorpart8
	vpxord	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp	$0x0100,%rcx
	jl	.Lxorpart8
	vpxord	0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	cmp	$0x0120,%rcx
	jl	.Lxorpart8
	vpxord	0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	cmp	$0x0140,%rcx
	jl	.Lxorpart8
	vpxord	0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	cmp	$0x0160,%rcx
	jl	.Lxorpart8
	vpxord	0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	cmp	$0x0180,%rcx
	jl	.Lxorpart8
	vpxord	0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	cmp	$0x01a0,%rcx
	jl	.Lxorpart8
	vpxord	0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	cmp	$0x01c0,%rcx
	jl	.Lxorpart8
	vpxord	0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	cmp	$0x01e0,%rcx
	jl	.Lxorpart8
	vpxord	0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	cmp	$0x0200,%rcx
	jl	.Lxorpart8
	vpxord	0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov	%rcx,%rax
	and	$0x1f,%rcx
	jz	.Ldone8
	mov	%rax,%r9
	and	$~0x1f,%r9

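	# as in .Lxorpart2, but with 32-byte granularity: %k1 gets the low
	# (len & 0x1f) bits set and masks the final ymm-sized load and store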
	mov	$1,%rax
	shld	%cl,%rax,%rax
	sub	$1,%rax
	kmovq	%rax,%k1

	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord	%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

	jmp	.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)