/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302
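# ROT8 and ROT16 are vpshufb masks: within each 32-bit lane they reorder the
# bytes so that the lane is rotated left by 8 and 16 bits respectively.  The
# byte shuffle is cheaper than the shift+or pair used for the 7- and 12-bit
# rotations in the rounds below.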

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003
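# CTRINC holds the per-block counter offsets 0..7 for the eight-block
# function, one offset per 32-bit lane.  CTR2BL and CTR4BL hold the 128-bit
# values 0/1 and 2/3; added to broadcast copies of the last state row, they
# bump the block counter (state word 12) of the second, third and fourth
# block.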

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
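	#
	# For orientation, one ChaCha quarter-round in C-like pseudocode (a
	# sketch; rol32() is a generic 32-bit left-rotate, not a symbol used
	# in this file):
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);
	#
	# Each %ymm register below holds one state row of both blocks, block 0
	# in the low 128-bit lane and block 1 in the high lane, so every AVX2
	# instruction advances the same quarter-round step of both blocks.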

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vpaddd CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa %ymm0,%ymm8
	vmovdqa %ymm1,%ymm9
	vmovdqa %ymm2,%ymm10
	vmovdqa %ymm3,%ymm11

	vmovdqa ROT8(%rip),%ymm4
	vmovdqa ROT16(%rip),%ymm5

	mov %rcx,%rax
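	# the byte count stays in %rax; %rcx is reused as the rep movsb
	# count in the partial-block path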

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm6
	vpslld $12,%ymm6,%ymm6
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm7
	vpslld $7,%ymm7,%ymm7
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm6
	vpslld $12,%ymm6,%ymm6
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm7
	vpslld $7,%ymm7,%ymm7
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3

	sub $2,%r8d
	jnz .Ldoubleround

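	# Each 0x10-byte keystream chunk below is XORed with the input and
	# written only if at least that many bytes were requested; on the
	# first failing length check the code branches to the partial-block
	# path with the current keystream chunk still in %xmm7.  The four-
	# and eight-block functions below follow the same pattern with
	# %xmm10 and %ymm0.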
	# o0 = i0 ^ (x0 + s0)
	vpaddd %ymm8,%ymm0,%ymm7
	cmp $0x10,%rax
	jl .Lxorpart2
	vpxor 0x00(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x00(%rsi)
	vextracti128 $1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd %ymm9,%ymm1,%ymm7
	cmp $0x20,%rax
	jl .Lxorpart2
	vpxor 0x10(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x10(%rsi)
	vextracti128 $1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd %ymm10,%ymm2,%ymm7
	cmp $0x30,%rax
	jl .Lxorpart2
	vpxor 0x20(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x20(%rsi)
	vextracti128 $1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd %ymm11,%ymm3,%ymm7
	cmp $0x40,%rax
	jl .Lxorpart2
	vpxor 0x30(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x30(%rsi)
	vextracti128 $1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm7
	cmp $0x50,%rax
	jl .Lxorpart2
	vpxor 0x40(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x40(%rsi)

	vmovdqa %xmm1,%xmm7
	cmp $0x60,%rax
	jl .Lxorpart2
	vpxor 0x50(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x50(%rsi)

	vmovdqa %xmm2,%xmm7
	cmp $0x70,%rax
	jl .Lxorpart2
	vpxor 0x60(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x60(%rsi)

	vmovdqa %xmm3,%xmm7
	cmp $0x80,%rax
	jl .Lxorpart2
	vpxor 0x70(%rdx),%xmm7,%xmm6
	vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone2
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp
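	# bounce buffer: copy the <16 remaining input bytes to the aligned
	# stack slot, XOR them with the keystream chunk in %xmm7, then copy
	# the result out to the destination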

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%xmm7,%xmm7
	vmovdqa %xmm7,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, and
	# sequentially with the same operations on the other two matrices.
	# Because the required word shuffling has rather high latency,
	# interleaving the arithmetic of the two matrix pairs costs little
	# extra time.
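	#
	# Roughly, in C-like pseudocode (a sketch of the interleaving, not
	# literal code from this file; rol32() is a generic 32-bit rotate):
	#
	#	a01 += b01; d01 ^= a01; d01 = rol32(d01, 16);
	#	a23 += b23; d23 ^= a23; d23 = rol32(d23, 16);
	#	/* ... likewise for the 12-, 8- and 7-bit rotation steps ... */
	#
	# where the "01" values (blocks 0/1) live in %ymm0-3 and the "23"
	# values (blocks 2/3) live in %ymm4-7.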

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128 0x00(%rdi),%ymm0
	vbroadcasti128 0x10(%rdi),%ymm1
	vbroadcasti128 0x20(%rdi),%ymm2
	vbroadcasti128 0x30(%rdi),%ymm3

	vmovdqa %ymm0,%ymm4
	vmovdqa %ymm1,%ymm5
	vmovdqa %ymm2,%ymm6
	vmovdqa %ymm3,%ymm7

	vpaddd CTR2BL(%rip),%ymm3,%ymm3
	vpaddd CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa %ymm0,%ymm11
	vmovdqa %ymm1,%ymm12
	vmovdqa %ymm2,%ymm13
	vmovdqa %ymm3,%ymm14
	vmovdqa %ymm7,%ymm15

	vmovdqa ROT8(%rip),%ymm8
	vmovdqa ROT16(%rip),%ymm9

	mov %rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm9,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm1,%ymm1
	vpshufd $0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm3,%ymm3
	vpshufd $0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm9,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $12,%ymm10,%ymm10
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd %ymm1,%ymm0,%ymm0
	vpxor %ymm0,%ymm3,%ymm3
	vpshufb %ymm8,%ymm3,%ymm3

	vpaddd %ymm5,%ymm4,%ymm4
	vpxor %ymm4,%ymm7,%ymm7
	vpshufb %ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd %ymm3,%ymm2,%ymm2
	vpxor %ymm2,%ymm1,%ymm1
	vmovdqa %ymm1,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm10,%ymm1,%ymm1

	vpaddd %ymm7,%ymm6,%ymm6
	vpxor %ymm6,%ymm5,%ymm5
	vmovdqa %ymm5,%ymm10
	vpslld $7,%ymm10,%ymm10
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd $0x93,%ymm1,%ymm1
	vpshufd $0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd $0x4e,%ymm2,%ymm2
	vpshufd $0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd $0x39,%ymm3,%ymm3
	vpshufd $0x39,%ymm7,%ymm7

	sub $2,%r8d
	jnz .Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd %ymm11,%ymm0,%ymm10
	cmp $0x10,%rax
	jl .Lxorpart4
	vpxor 0x00(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x00(%rsi)
	vextracti128 $1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd %ymm12,%ymm1,%ymm10
	cmp $0x20,%rax
	jl .Lxorpart4
	vpxor 0x10(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x10(%rsi)
	vextracti128 $1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd %ymm13,%ymm2,%ymm10
	cmp $0x30,%rax
	jl .Lxorpart4
	vpxor 0x20(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x20(%rsi)
	vextracti128 $1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd %ymm14,%ymm3,%ymm10
	cmp $0x40,%rax
	jl .Lxorpart4
	vpxor 0x30(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x30(%rsi)
	vextracti128 $1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa %xmm0,%xmm10
	cmp $0x50,%rax
	jl .Lxorpart4
	vpxor 0x40(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x40(%rsi)

	vmovdqa %xmm1,%xmm10
	cmp $0x60,%rax
	jl .Lxorpart4
	vpxor 0x50(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x50(%rsi)

	vmovdqa %xmm2,%xmm10
	cmp $0x70,%rax
	jl .Lxorpart4
	vpxor 0x60(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x60(%rsi)

	vmovdqa %xmm3,%xmm10
	cmp $0x80,%rax
	jl .Lxorpart4
	vpxor 0x70(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd %ymm11,%ymm4,%ymm10
	cmp $0x90,%rax
	jl .Lxorpart4
	vpxor 0x80(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x80(%rsi)
	vextracti128 $1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd %ymm12,%ymm5,%ymm10
	cmp $0xa0,%rax
	jl .Lxorpart4
	vpxor 0x90(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0x90(%rsi)
	vextracti128 $1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd %ymm13,%ymm6,%ymm10
	cmp $0xb0,%rax
	jl .Lxorpart4
	vpxor 0xa0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xa0(%rsi)
	vextracti128 $1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd %ymm15,%ymm7,%ymm10
	cmp $0xc0,%rax
	jl .Lxorpart4
	vpxor 0xb0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xb0(%rsi)
	vextracti128 $1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa %xmm4,%xmm10
	cmp $0xd0,%rax
	jl .Lxorpart4
	vpxor 0xc0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xc0(%rsi)

	vmovdqa %xmm5,%xmm10
	cmp $0xe0,%rax
	jl .Lxorpart4
	vpxor 0xd0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xd0(%rsi)

	vmovdqa %xmm6,%xmm10
	cmp $0xf0,%rax
	jl .Lxorpart4
	vpxor 0xe0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xe0(%rsi)

	vmovdqa %xmm7,%xmm10
	cmp $0x100,%rax
	jl .Lxorpart4
	vpxor 0xf0(%rdx),%xmm10,%xmm9
	vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x0f,%r9
	jz .Ldone4
	and $~0x0f,%rax

	mov %rsi,%r11

	lea 8(%rsp),%r10
	sub $0x10,%rsp
	and $~31,%rsp

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%xmm10,%xmm10
	vmovdqa %xmm10,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	lea -8(%r10),%rsp
	jmp .Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix into AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XORing step we transpose the matrix by interleaving 32-, 64- and
	# then 128-bit words, which allows us to do XOR in AVX registers.
	# 8/16-bit word rotation is done with the slightly better performing
	# byte shuffling; 7/12-bit word rotation uses the traditional
	# shift+OR.
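	#
	# Register/lane layout sketch: each %ymm register (or its stack slot
	# for words 0-3) holds one state word of all eight blocks, one block
	# per 32-bit lane, e.g.
	#
	#	%ymm4 = { x4[blk0], x4[blk1], ..., x4[blk7] }
	#
	# so a single vector instruction advances word 4 of all eight blocks
	# at once.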

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea 8(%rsp),%r10
	and $~31, %rsp
	sub $0x80, %rsp
	mov %rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpbroadcastd 0x04(%rdi),%ymm1
	vpbroadcastd 0x08(%rdi),%ymm2
	vpbroadcastd 0x0c(%rdi),%ymm3
	vpbroadcastd 0x10(%rdi),%ymm4
	vpbroadcastd 0x14(%rdi),%ymm5
	vpbroadcastd 0x18(%rdi),%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm7
	vpbroadcastd 0x20(%rdi),%ymm8
	vpbroadcastd 0x24(%rdi),%ymm9
	vpbroadcastd 0x28(%rdi),%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm11
	vpbroadcastd 0x30(%rdi),%ymm12
	vpbroadcastd 0x34(%rdi),%ymm13
	vpbroadcastd 0x38(%rdi),%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa %ymm0,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm3,0x60(%rsp)

	vmovdqa CTRINC(%rip),%ymm1
	vmovdqa ROT8(%rip),%ymm2
	vmovdqa ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	sub $2,%r8d
	jnz .Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpaddd 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpbroadcastd 0x04(%rdi),%ymm0
	vpaddd 0x20(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpbroadcastd 0x08(%rdi),%ymm0
	vpaddd 0x40(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpbroadcastd 0x0c(%rdi),%ymm0
	vpaddd 0x60(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpbroadcastd 0x10(%rdi),%ymm0
	vpaddd %ymm0,%ymm4,%ymm4
	vpbroadcastd 0x14(%rdi),%ymm0
	vpaddd %ymm0,%ymm5,%ymm5
	vpbroadcastd 0x18(%rdi),%ymm0
	vpaddd %ymm0,%ymm6,%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm0
	vpaddd %ymm0,%ymm7,%ymm7
	vpbroadcastd 0x20(%rdi),%ymm0
	vpaddd %ymm0,%ymm8,%ymm8
	vpbroadcastd 0x24(%rdi),%ymm0
	vpaddd %ymm0,%ymm9,%ymm9
	vpbroadcastd 0x28(%rdi),%ymm0
	vpaddd %ymm0,%ymm10,%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm0
	vpaddd %ymm0,%ymm11,%ymm11
	vpbroadcastd 0x30(%rdi),%ymm0
	vpaddd %ymm0,%ymm12,%ymm12
	vpbroadcastd 0x34(%rdi),%ymm0
	vpaddd %ymm0,%ymm13,%ymm13
	vpbroadcastd 0x38(%rdi),%ymm0
	vpaddd %ymm0,%ymm14,%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm0
	vpaddd %ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

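	# The state is word-sliced: each register (or stack slot) holds one
	# state word of all eight blocks, but the output must be
	# block-contiguous.  The three interleave passes below (32-, 64- and
	# then 128-bit) transpose the data so that each 32-byte value ends up
	# holding consecutive keystream bytes of a single block, ready to be
	# XORed against a full ymm load of input.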
	# interleave 32-bit words in state n, n+1
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x20(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa 0x40(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm1,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpckldq %ymm5,%ymm0,%ymm4
	vpunpckhdq %ymm5,%ymm0,%ymm5
	vmovdqa %ymm6,%ymm0
	vpunpckldq %ymm7,%ymm0,%ymm6
	vpunpckhdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpckldq %ymm9,%ymm0,%ymm8
	vpunpckhdq %ymm9,%ymm0,%ymm9
	vmovdqa %ymm10,%ymm0
	vpunpckldq %ymm11,%ymm0,%ymm10
	vpunpckhdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpckldq %ymm13,%ymm0,%ymm12
	vpunpckhdq %ymm13,%ymm0,%ymm13
	vmovdqa %ymm14,%ymm0
	vpunpckldq %ymm15,%ymm0,%ymm14
	vpunpckhdq %ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x40(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x00(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa 0x20(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpcklqdq %ymm6,%ymm0,%ymm4
	vpunpckhqdq %ymm6,%ymm0,%ymm6
	vmovdqa %ymm5,%ymm0
	vpunpcklqdq %ymm7,%ymm0,%ymm5
	vpunpckhqdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpcklqdq %ymm10,%ymm0,%ymm8
	vpunpckhqdq %ymm10,%ymm0,%ymm10
	vmovdqa %ymm9,%ymm0
	vpunpcklqdq %ymm11,%ymm0,%ymm9
	vpunpckhqdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpcklqdq %ymm14,%ymm0,%ymm12
	vpunpckhqdq %ymm14,%ymm0,%ymm14
	vmovdqa %ymm13,%ymm0
	vpunpcklqdq %ymm15,%ymm0,%ymm13
	vpunpckhqdq %ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa 0x00(%rsp),%ymm1
	vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
	cmp $0x0020,%rax
	jl .Lxorpart8
	vpxor 0x0000(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0000(%rsi)
	vperm2i128 $0x31,%ymm4,%ymm1,%ymm4

	vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
	cmp $0x0040,%rax
	jl .Lxorpart8
	vpxor 0x0020(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0020(%rsi)
	vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

	vmovdqa 0x40(%rsp),%ymm1
	vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
	cmp $0x0060,%rax
	jl .Lxorpart8
	vpxor 0x0040(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0040(%rsi)
	vperm2i128 $0x31,%ymm6,%ymm1,%ymm6

	vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
	cmp $0x0080,%rax
	jl .Lxorpart8
	vpxor 0x0060(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0060(%rsi)
	vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

	vmovdqa 0x20(%rsp),%ymm1
	vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
	cmp $0x00a0,%rax
	jl .Lxorpart8
	vpxor 0x0080(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0080(%rsi)
	vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

	vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
	cmp $0x00c0,%rax
	jl .Lxorpart8
	vpxor 0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00a0(%rsi)
	vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

	vmovdqa 0x60(%rsp),%ymm1
	vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
	cmp $0x00e0,%rax
	jl .Lxorpart8
	vpxor 0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00c0(%rsi)
	vperm2i128 $0x31,%ymm7,%ymm1,%ymm7

	vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
	cmp $0x0100,%rax
	jl .Lxorpart8
	vpxor 0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00e0(%rsi)
	vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa %ymm4,%ymm0
	cmp $0x0120,%rax
	jl .Lxorpart8
	vpxor 0x0100(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0100(%rsi)

	vmovdqa %ymm12,%ymm0
	cmp $0x0140,%rax
	jl .Lxorpart8
	vpxor 0x0120(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0120(%rsi)

	vmovdqa %ymm6,%ymm0
	cmp $0x0160,%rax
	jl .Lxorpart8
	vpxor 0x0140(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0140(%rsi)

	vmovdqa %ymm14,%ymm0
	cmp $0x0180,%rax
	jl .Lxorpart8
	vpxor 0x0160(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0160(%rsi)

	vmovdqa %ymm5,%ymm0
	cmp $0x01a0,%rax
	jl .Lxorpart8
	vpxor 0x0180(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0180(%rsi)

	vmovdqa %ymm13,%ymm0
	cmp $0x01c0,%rax
	jl .Lxorpart8
	vpxor 0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01a0(%rsi)

	vmovdqa %ymm7,%ymm0
	cmp $0x01e0,%rax
	jl .Lxorpart8
	vpxor 0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01c0(%rsi)

	vmovdqa %ymm15,%ymm0
	cmp $0x0200,%rax
	jl .Lxorpart8
	vpxor 0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea -8(%r10),%rsp
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov %rax,%r9
	and $0x1f,%r9
	jz .Ldone8
	and $~0x1f,%rax
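	# the 0x80-byte scratch area at %rsp is no longer needed for state
	# words x0..x3, so reuse it as a bounce buffer: copy the <32 tail
	# bytes in, XOR them with the keystream chunk left in %ymm0, then
	# copy the result out to the destination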

	mov %rsi,%r11

	lea (%rdx,%rax),%rsi
	mov %rsp,%rdi
	mov %r9,%rcx
	rep movsb

	vpxor 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)

	mov %rsp,%rsi
	lea (%r11,%rax),%rdi
	mov %r9,%rcx
	rep movsb

	jmp .Ldone8

SYM_FUNC_END(chacha_8block_xor_avx2)

