1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * ChaCha 256-bit cipher algorithm, x64 AVX2 functions |
4 | * |
5 | * Copyright (C) 2015 Martin Willi |
6 | */ |
7 | |
8 | #include <linux/linkage.h> |
9 | |
.section .rodata.cst32.ROT8, "aM", @progbits, 32
11 | .align 32 |
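# vpshufb mask: left-rotates each 32-bit lane by 8 bits (byte k of
# every dword is taken from byte (k+3) mod 4)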
12 | ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 |
13 | .octa 0x0e0d0c0f0a09080b0605040702010003 |
14 | |
.section .rodata.cst32.ROT16, "aM", @progbits, 32
16 | .align 32 |
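# vpshufb mask: left-rotates each 32-bit lane by 16 bits (swaps the
# 16-bit halves of every dword)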
17 | ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 |
18 | .octa 0x0d0c0f0e09080b0a0504070601000302 |
19 | |
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
21 | .align 32 |
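# per-lane block-counter increments 0..7 for the eight-block function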
22 | CTRINC: .octa 0x00000003000000020000000100000000 |
23 | .octa 0x00000007000000060000000500000004 |
24 | |
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
26 | .align 32 |
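# adds 0 resp. 1 to the block counter replicated into the low/high
# 128-bit lane (blocks 0 and 1)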
27 | CTR2BL: .octa 0x00000000000000000000000000000000 |
28 | .octa 0x00000000000000000000000000000001 |
29 | |
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
31 | .align 32 |
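# adds 2 resp. 3 to the replicated block counter (blocks 2 and 3 of
# the four-block function)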
32 | CTR4BL: .octa 0x00000000000000000000000000000002 |
33 | .octa 0x00000000000000000000000000000003 |
34 | |
35 | .text |
36 | |
37 | SYM_FUNC_START(chacha_2block_xor_avx2) |
38 | # %rdi: Input state matrix, s |
39 | # %rsi: up to 2 data blocks output, o |
40 | # %rdx: up to 2 data blocks input, i |
41 | # %rcx: input/output length in bytes |
42 | # %r8d: nrounds |
43 | |
44 | # This function encrypts two ChaCha blocks by loading the state |
45 | # matrix twice across four AVX registers. It performs matrix operations |
46 | # on four words in each matrix in parallel, but requires shuffling to |
47 | # rearrange the words after each round. |
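#
# For reference, one ChaCha quarter-round on words (a, b, c, d) in
# scalar C (an illustrative sketch per RFC 7539, not part of this
# file's build):
#
#	a += b; d ^= a; d = rol32(d, 16);
#	c += d; b ^= c; b = rol32(b, 12);
#	a += b; d ^= a; d = rol32(d, 8);
#	c += d; b ^= c; b = rol32(b, 7);
#
# Each ymm register below holds one four-word state row of both
# blocks, so every vpaddd/vpxor/rotate sequence runs this quarter-
# round on four columns (or diagonals) of both blocks at once.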
48 | |
49 | vzeroupper |
50 | |
# x0..3[0-1] = s0..3
52 | vbroadcasti128 0x00(%rdi),%ymm0 |
53 | vbroadcasti128 0x10(%rdi),%ymm1 |
54 | vbroadcasti128 0x20(%rdi),%ymm2 |
55 | vbroadcasti128 0x30(%rdi),%ymm3 |
56 | |
57 | vpaddd CTR2BL(%rip),%ymm3,%ymm3 |
58 | |
59 | vmovdqa %ymm0,%ymm8 |
60 | vmovdqa %ymm1,%ymm9 |
61 | vmovdqa %ymm2,%ymm10 |
62 | vmovdqa %ymm3,%ymm11 |
63 | |
64 | vmovdqa ROT8(%rip),%ymm4 |
65 | vmovdqa ROT16(%rip),%ymm5 |
66 | |
67 | mov %rcx,%rax |
68 | |
69 | .Ldoubleround: |
70 | |
71 | # x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
72 | vpaddd %ymm1,%ymm0,%ymm0 |
73 | vpxor %ymm0,%ymm3,%ymm3 |
74 | vpshufb %ymm5,%ymm3,%ymm3 |
75 | |
76 | # x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
77 | vpaddd %ymm3,%ymm2,%ymm2 |
78 | vpxor %ymm2,%ymm1,%ymm1 |
79 | vmovdqa %ymm1,%ymm6 |
80 | vpslld $12,%ymm6,%ymm6 |
81 | vpsrld $20,%ymm1,%ymm1 |
82 | vpor %ymm6,%ymm1,%ymm1 |
83 | |
84 | # x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
85 | vpaddd %ymm1,%ymm0,%ymm0 |
86 | vpxor %ymm0,%ymm3,%ymm3 |
87 | vpshufb %ymm4,%ymm3,%ymm3 |
88 | |
89 | # x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
90 | vpaddd %ymm3,%ymm2,%ymm2 |
91 | vpxor %ymm2,%ymm1,%ymm1 |
92 | vmovdqa %ymm1,%ymm7 |
93 | vpslld $7,%ymm7,%ymm7 |
94 | vpsrld $25,%ymm1,%ymm1 |
95 | vpor %ymm7,%ymm1,%ymm1 |
96 | |
97 | # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) |
98 | vpshufd $0x39,%ymm1,%ymm1 |
99 | # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
100 | vpshufd $0x4e,%ymm2,%ymm2 |
101 | # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) |
102 | vpshufd $0x93,%ymm3,%ymm3 |
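# x0..3 now hold the diagonals in their columns, so the same
# quarter-round code can be reused for the diagonal round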
103 | |
104 | # x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
105 | vpaddd %ymm1,%ymm0,%ymm0 |
106 | vpxor %ymm0,%ymm3,%ymm3 |
107 | vpshufb %ymm5,%ymm3,%ymm3 |
108 | |
109 | # x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
110 | vpaddd %ymm3,%ymm2,%ymm2 |
111 | vpxor %ymm2,%ymm1,%ymm1 |
112 | vmovdqa %ymm1,%ymm6 |
113 | vpslld $12,%ymm6,%ymm6 |
114 | vpsrld $20,%ymm1,%ymm1 |
115 | vpor %ymm6,%ymm1,%ymm1 |
116 | |
117 | # x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
118 | vpaddd %ymm1,%ymm0,%ymm0 |
119 | vpxor %ymm0,%ymm3,%ymm3 |
120 | vpshufb %ymm4,%ymm3,%ymm3 |
121 | |
122 | # x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
123 | vpaddd %ymm3,%ymm2,%ymm2 |
124 | vpxor %ymm2,%ymm1,%ymm1 |
125 | vmovdqa %ymm1,%ymm7 |
126 | vpslld $7,%ymm7,%ymm7 |
127 | vpsrld $25,%ymm1,%ymm1 |
128 | vpor %ymm7,%ymm1,%ymm1 |
129 | |
130 | # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) |
131 | vpshufd $0x93,%ymm1,%ymm1 |
132 | # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
133 | vpshufd $0x4e,%ymm2,%ymm2 |
134 | # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) |
135 | vpshufd $0x39,%ymm3,%ymm3 |
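# the inverse shuffles restore column order; one full double round done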
136 | |
137 | sub $2,%r8d |
138 | jnz .Ldoubleround |
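
# Each ymm register now holds row n of block 0 in its low 128-bit
# lane and row n of block 1 in its high lane; the keystream is
# handled 16 bytes at a time: xor/store the low lane, then stash the
# high lane in xmm0..3 for the second block below.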
139 | |
140 | # o0 = i0 ^ (x0 + s0) |
141 | vpaddd %ymm8,%ymm0,%ymm7 |
142 | cmp $0x10,%rax |
143 | jl .Lxorpart2 |
144 | vpxor 0x00(%rdx),%xmm7,%xmm6 |
145 | vmovdqu %xmm6,0x00(%rsi) |
146 | vextracti128 $1,%ymm7,%xmm0 |
147 | # o1 = i1 ^ (x1 + s1) |
148 | vpaddd %ymm9,%ymm1,%ymm7 |
149 | cmp $0x20,%rax |
150 | jl .Lxorpart2 |
151 | vpxor 0x10(%rdx),%xmm7,%xmm6 |
152 | vmovdqu %xmm6,0x10(%rsi) |
153 | vextracti128 $1,%ymm7,%xmm1 |
154 | # o2 = i2 ^ (x2 + s2) |
155 | vpaddd %ymm10,%ymm2,%ymm7 |
156 | cmp $0x30,%rax |
157 | jl .Lxorpart2 |
158 | vpxor 0x20(%rdx),%xmm7,%xmm6 |
159 | vmovdqu %xmm6,0x20(%rsi) |
160 | vextracti128 $1,%ymm7,%xmm2 |
161 | # o3 = i3 ^ (x3 + s3) |
162 | vpaddd %ymm11,%ymm3,%ymm7 |
163 | cmp $0x40,%rax |
164 | jl .Lxorpart2 |
165 | vpxor 0x30(%rdx),%xmm7,%xmm6 |
166 | vmovdqu %xmm6,0x30(%rsi) |
167 | vextracti128 $1,%ymm7,%xmm3 |
168 | |
169 | # xor and write second block |
170 | vmovdqa %xmm0,%xmm7 |
171 | cmp $0x50,%rax |
172 | jl .Lxorpart2 |
173 | vpxor 0x40(%rdx),%xmm7,%xmm6 |
174 | vmovdqu %xmm6,0x40(%rsi) |
175 | |
176 | vmovdqa %xmm1,%xmm7 |
177 | cmp $0x60,%rax |
178 | jl .Lxorpart2 |
179 | vpxor 0x50(%rdx),%xmm7,%xmm6 |
180 | vmovdqu %xmm6,0x50(%rsi) |
181 | |
182 | vmovdqa %xmm2,%xmm7 |
183 | cmp $0x70,%rax |
184 | jl .Lxorpart2 |
185 | vpxor 0x60(%rdx),%xmm7,%xmm6 |
186 | vmovdqu %xmm6,0x60(%rsi) |
187 | |
188 | vmovdqa %xmm3,%xmm7 |
189 | cmp $0x80,%rax |
190 | jl .Lxorpart2 |
191 | vpxor 0x70(%rdx),%xmm7,%xmm6 |
192 | vmovdqu %xmm6,0x70(%rsi) |
193 | |
194 | .Ldone2: |
195 | vzeroupper |
196 | RET |
197 | |
198 | .Lxorpart2: |
199 | # xor remaining bytes from partial register into output |
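# %rax is the requested length; %r9 gets the tail length below 16
# bytes. The tail is bounced through a 32-byte-aligned stack slot:
# copy it in with rep movsb, xor it with the keystream in %xmm7,
# and copy the result back out, so the caller's buffers are never
# accessed out of bounds.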
200 | mov %rax,%r9 |
201 | and $0x0f,%r9 |
202 | jz .Ldone2 |
203 | and $~0x0f,%rax |
204 | |
205 | mov %rsi,%r11 |
206 | |
207 | lea 8(%rsp),%r10 |
208 | sub $0x10,%rsp |
209 | and $~31,%rsp |
210 | |
211 | lea (%rdx,%rax),%rsi |
212 | mov %rsp,%rdi |
213 | mov %r9,%rcx |
214 | rep movsb |
215 | |
216 | vpxor 0x00(%rsp),%xmm7,%xmm7 |
217 | vmovdqa %xmm7,0x00(%rsp) |
218 | |
219 | mov %rsp,%rsi |
220 | lea (%r11,%rax),%rdi |
221 | mov %r9,%rcx |
222 | rep movsb |
223 | |
224 | lea -8(%r10),%rsp |
225 | jmp .Ldone2 |
226 | |
227 | SYM_FUNC_END(chacha_2block_xor_avx2) |
228 | |
229 | SYM_FUNC_START(chacha_4block_xor_avx2) |
230 | # %rdi: Input state matrix, s |
231 | # %rsi: up to 4 data blocks output, o |
232 | # %rdx: up to 4 data blocks input, i |
233 | # %rcx: input/output length in bytes |
234 | # %r8d: nrounds |
235 | |
# This function encrypts four ChaCha blocks by loading the state
# matrix four times across eight AVX registers. It performs matrix
# operations on four words in two matrices in parallel, interleaved
# with the operations on the four words of the other two matrices.
# Since the required word shuffling has a rather high latency, this
# lets us do the arithmetic on two matrix-pairs without much slowdown.
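#
# Register layout: ymm0..3 hold the state rows for blocks 0/1 and
# ymm4..7 the same rows for blocks 2/3; ymm11..15 preserve the
# original state (including the per-block counters) for the final
# feed-forward addition, ymm8/ymm9 hold the rotate masks and ymm10
# is scratch.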
242 | |
243 | vzeroupper |
244 | |
# x0..3[0-3] = s0..3
246 | vbroadcasti128 0x00(%rdi),%ymm0 |
247 | vbroadcasti128 0x10(%rdi),%ymm1 |
248 | vbroadcasti128 0x20(%rdi),%ymm2 |
249 | vbroadcasti128 0x30(%rdi),%ymm3 |
250 | |
251 | vmovdqa %ymm0,%ymm4 |
252 | vmovdqa %ymm1,%ymm5 |
253 | vmovdqa %ymm2,%ymm6 |
254 | vmovdqa %ymm3,%ymm7 |
255 | |
256 | vpaddd CTR2BL(%rip),%ymm3,%ymm3 |
257 | vpaddd CTR4BL(%rip),%ymm7,%ymm7 |
258 | |
259 | vmovdqa %ymm0,%ymm11 |
260 | vmovdqa %ymm1,%ymm12 |
261 | vmovdqa %ymm2,%ymm13 |
262 | vmovdqa %ymm3,%ymm14 |
263 | vmovdqa %ymm7,%ymm15 |
264 | |
265 | vmovdqa ROT8(%rip),%ymm8 |
266 | vmovdqa ROT16(%rip),%ymm9 |
267 | |
268 | mov %rcx,%rax |
269 | |
270 | .Ldoubleround4: |
271 | |
272 | # x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
273 | vpaddd %ymm1,%ymm0,%ymm0 |
274 | vpxor %ymm0,%ymm3,%ymm3 |
275 | vpshufb %ymm9,%ymm3,%ymm3 |
276 | |
277 | vpaddd %ymm5,%ymm4,%ymm4 |
278 | vpxor %ymm4,%ymm7,%ymm7 |
279 | vpshufb %ymm9,%ymm7,%ymm7 |
280 | |
281 | # x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
282 | vpaddd %ymm3,%ymm2,%ymm2 |
283 | vpxor %ymm2,%ymm1,%ymm1 |
284 | vmovdqa %ymm1,%ymm10 |
285 | vpslld $12,%ymm10,%ymm10 |
286 | vpsrld $20,%ymm1,%ymm1 |
287 | vpor %ymm10,%ymm1,%ymm1 |
288 | |
289 | vpaddd %ymm7,%ymm6,%ymm6 |
290 | vpxor %ymm6,%ymm5,%ymm5 |
291 | vmovdqa %ymm5,%ymm10 |
292 | vpslld $12,%ymm10,%ymm10 |
293 | vpsrld $20,%ymm5,%ymm5 |
294 | vpor %ymm10,%ymm5,%ymm5 |
295 | |
296 | # x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
297 | vpaddd %ymm1,%ymm0,%ymm0 |
298 | vpxor %ymm0,%ymm3,%ymm3 |
299 | vpshufb %ymm8,%ymm3,%ymm3 |
300 | |
301 | vpaddd %ymm5,%ymm4,%ymm4 |
302 | vpxor %ymm4,%ymm7,%ymm7 |
303 | vpshufb %ymm8,%ymm7,%ymm7 |
304 | |
305 | # x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
306 | vpaddd %ymm3,%ymm2,%ymm2 |
307 | vpxor %ymm2,%ymm1,%ymm1 |
308 | vmovdqa %ymm1,%ymm10 |
309 | vpslld $7,%ymm10,%ymm10 |
310 | vpsrld $25,%ymm1,%ymm1 |
311 | vpor %ymm10,%ymm1,%ymm1 |
312 | |
313 | vpaddd %ymm7,%ymm6,%ymm6 |
314 | vpxor %ymm6,%ymm5,%ymm5 |
315 | vmovdqa %ymm5,%ymm10 |
316 | vpslld $7,%ymm10,%ymm10 |
317 | vpsrld $25,%ymm5,%ymm5 |
318 | vpor %ymm10,%ymm5,%ymm5 |
319 | |
320 | # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) |
321 | vpshufd $0x39,%ymm1,%ymm1 |
322 | vpshufd $0x39,%ymm5,%ymm5 |
323 | # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
324 | vpshufd $0x4e,%ymm2,%ymm2 |
325 | vpshufd $0x4e,%ymm6,%ymm6 |
326 | # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) |
327 | vpshufd $0x93,%ymm3,%ymm3 |
328 | vpshufd $0x93,%ymm7,%ymm7 |
329 | |
330 | # x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
331 | vpaddd %ymm1,%ymm0,%ymm0 |
332 | vpxor %ymm0,%ymm3,%ymm3 |
333 | vpshufb %ymm9,%ymm3,%ymm3 |
334 | |
335 | vpaddd %ymm5,%ymm4,%ymm4 |
336 | vpxor %ymm4,%ymm7,%ymm7 |
337 | vpshufb %ymm9,%ymm7,%ymm7 |
338 | |
339 | # x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
340 | vpaddd %ymm3,%ymm2,%ymm2 |
341 | vpxor %ymm2,%ymm1,%ymm1 |
342 | vmovdqa %ymm1,%ymm10 |
343 | vpslld $12,%ymm10,%ymm10 |
344 | vpsrld $20,%ymm1,%ymm1 |
345 | vpor %ymm10,%ymm1,%ymm1 |
346 | |
347 | vpaddd %ymm7,%ymm6,%ymm6 |
348 | vpxor %ymm6,%ymm5,%ymm5 |
349 | vmovdqa %ymm5,%ymm10 |
350 | vpslld $12,%ymm10,%ymm10 |
351 | vpsrld $20,%ymm5,%ymm5 |
352 | vpor %ymm10,%ymm5,%ymm5 |
353 | |
354 | # x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
355 | vpaddd %ymm1,%ymm0,%ymm0 |
356 | vpxor %ymm0,%ymm3,%ymm3 |
357 | vpshufb %ymm8,%ymm3,%ymm3 |
358 | |
359 | vpaddd %ymm5,%ymm4,%ymm4 |
360 | vpxor %ymm4,%ymm7,%ymm7 |
361 | vpshufb %ymm8,%ymm7,%ymm7 |
362 | |
363 | # x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
364 | vpaddd %ymm3,%ymm2,%ymm2 |
365 | vpxor %ymm2,%ymm1,%ymm1 |
366 | vmovdqa %ymm1,%ymm10 |
367 | vpslld $7,%ymm10,%ymm10 |
368 | vpsrld $25,%ymm1,%ymm1 |
369 | vpor %ymm10,%ymm1,%ymm1 |
370 | |
371 | vpaddd %ymm7,%ymm6,%ymm6 |
372 | vpxor %ymm6,%ymm5,%ymm5 |
373 | vmovdqa %ymm5,%ymm10 |
374 | vpslld $7,%ymm10,%ymm10 |
375 | vpsrld $25,%ymm5,%ymm5 |
376 | vpor %ymm10,%ymm5,%ymm5 |
377 | |
378 | # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) |
379 | vpshufd $0x93,%ymm1,%ymm1 |
380 | vpshufd $0x93,%ymm5,%ymm5 |
381 | # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
382 | vpshufd $0x4e,%ymm2,%ymm2 |
383 | vpshufd $0x4e,%ymm6,%ymm6 |
384 | # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) |
385 | vpshufd $0x39,%ymm3,%ymm3 |
386 | vpshufd $0x39,%ymm7,%ymm7 |
387 | |
388 | sub $2,%r8d |
389 | jnz .Ldoubleround4 |
390 | |
391 | # o0 = i0 ^ (x0 + s0), first block |
392 | vpaddd %ymm11,%ymm0,%ymm10 |
393 | cmp $0x10,%rax |
394 | jl .Lxorpart4 |
395 | vpxor 0x00(%rdx),%xmm10,%xmm9 |
396 | vmovdqu %xmm9,0x00(%rsi) |
397 | vextracti128 $1,%ymm10,%xmm0 |
398 | # o1 = i1 ^ (x1 + s1), first block |
399 | vpaddd %ymm12,%ymm1,%ymm10 |
400 | cmp $0x20,%rax |
401 | jl .Lxorpart4 |
402 | vpxor 0x10(%rdx),%xmm10,%xmm9 |
403 | vmovdqu %xmm9,0x10(%rsi) |
404 | vextracti128 $1,%ymm10,%xmm1 |
405 | # o2 = i2 ^ (x2 + s2), first block |
406 | vpaddd %ymm13,%ymm2,%ymm10 |
407 | cmp $0x30,%rax |
408 | jl .Lxorpart4 |
409 | vpxor 0x20(%rdx),%xmm10,%xmm9 |
410 | vmovdqu %xmm9,0x20(%rsi) |
411 | vextracti128 $1,%ymm10,%xmm2 |
412 | # o3 = i3 ^ (x3 + s3), first block |
413 | vpaddd %ymm14,%ymm3,%ymm10 |
414 | cmp $0x40,%rax |
415 | jl .Lxorpart4 |
416 | vpxor 0x30(%rdx),%xmm10,%xmm9 |
417 | vmovdqu %xmm9,0x30(%rsi) |
418 | vextracti128 $1,%ymm10,%xmm3 |
419 | |
420 | # xor and write second block |
421 | vmovdqa %xmm0,%xmm10 |
422 | cmp $0x50,%rax |
423 | jl .Lxorpart4 |
424 | vpxor 0x40(%rdx),%xmm10,%xmm9 |
425 | vmovdqu %xmm9,0x40(%rsi) |
426 | |
427 | vmovdqa %xmm1,%xmm10 |
428 | cmp $0x60,%rax |
429 | jl .Lxorpart4 |
430 | vpxor 0x50(%rdx),%xmm10,%xmm9 |
431 | vmovdqu %xmm9,0x50(%rsi) |
432 | |
433 | vmovdqa %xmm2,%xmm10 |
434 | cmp $0x70,%rax |
435 | jl .Lxorpart4 |
436 | vpxor 0x60(%rdx),%xmm10,%xmm9 |
437 | vmovdqu %xmm9,0x60(%rsi) |
438 | |
439 | vmovdqa %xmm3,%xmm10 |
440 | cmp $0x80,%rax |
441 | jl .Lxorpart4 |
442 | vpxor 0x70(%rdx),%xmm10,%xmm9 |
443 | vmovdqu %xmm9,0x70(%rsi) |
444 | |
445 | # o0 = i0 ^ (x0 + s0), third block |
446 | vpaddd %ymm11,%ymm4,%ymm10 |
447 | cmp $0x90,%rax |
448 | jl .Lxorpart4 |
449 | vpxor 0x80(%rdx),%xmm10,%xmm9 |
450 | vmovdqu %xmm9,0x80(%rsi) |
451 | vextracti128 $1,%ymm10,%xmm4 |
452 | # o1 = i1 ^ (x1 + s1), third block |
453 | vpaddd %ymm12,%ymm5,%ymm10 |
454 | cmp $0xa0,%rax |
455 | jl .Lxorpart4 |
456 | vpxor 0x90(%rdx),%xmm10,%xmm9 |
457 | vmovdqu %xmm9,0x90(%rsi) |
458 | vextracti128 $1,%ymm10,%xmm5 |
459 | # o2 = i2 ^ (x2 + s2), third block |
460 | vpaddd %ymm13,%ymm6,%ymm10 |
461 | cmp $0xb0,%rax |
462 | jl .Lxorpart4 |
463 | vpxor 0xa0(%rdx),%xmm10,%xmm9 |
464 | vmovdqu %xmm9,0xa0(%rsi) |
465 | vextracti128 $1,%ymm10,%xmm6 |
466 | # o3 = i3 ^ (x3 + s3), third block |
467 | vpaddd %ymm15,%ymm7,%ymm10 |
468 | cmp $0xc0,%rax |
469 | jl .Lxorpart4 |
470 | vpxor 0xb0(%rdx),%xmm10,%xmm9 |
471 | vmovdqu %xmm9,0xb0(%rsi) |
472 | vextracti128 $1,%ymm10,%xmm7 |
473 | |
474 | # xor and write fourth block |
475 | vmovdqa %xmm4,%xmm10 |
476 | cmp $0xd0,%rax |
477 | jl .Lxorpart4 |
478 | vpxor 0xc0(%rdx),%xmm10,%xmm9 |
479 | vmovdqu %xmm9,0xc0(%rsi) |
480 | |
481 | vmovdqa %xmm5,%xmm10 |
482 | cmp $0xe0,%rax |
483 | jl .Lxorpart4 |
484 | vpxor 0xd0(%rdx),%xmm10,%xmm9 |
485 | vmovdqu %xmm9,0xd0(%rsi) |
486 | |
487 | vmovdqa %xmm6,%xmm10 |
488 | cmp $0xf0,%rax |
489 | jl .Lxorpart4 |
490 | vpxor 0xe0(%rdx),%xmm10,%xmm9 |
491 | vmovdqu %xmm9,0xe0(%rsi) |
492 | |
493 | vmovdqa %xmm7,%xmm10 |
494 | cmp $0x100,%rax |
495 | jl .Lxorpart4 |
496 | vpxor 0xf0(%rdx),%xmm10,%xmm9 |
497 | vmovdqu %xmm9,0xf0(%rsi) |
498 | |
499 | .Ldone4: |
500 | vzeroupper |
501 | RET |
502 | |
503 | .Lxorpart4: |
504 | # xor remaining bytes from partial register into output |
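# same tail-handling scheme as .Lxorpart2 above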
505 | mov %rax,%r9 |
506 | and $0x0f,%r9 |
507 | jz .Ldone4 |
508 | and $~0x0f,%rax |
509 | |
510 | mov %rsi,%r11 |
511 | |
512 | lea 8(%rsp),%r10 |
513 | sub $0x10,%rsp |
514 | and $~31,%rsp |
515 | |
516 | lea (%rdx,%rax),%rsi |
517 | mov %rsp,%rdi |
518 | mov %r9,%rcx |
519 | rep movsb |
520 | |
521 | vpxor 0x00(%rsp),%xmm10,%xmm10 |
522 | vmovdqa %xmm10,0x00(%rsp) |
523 | |
524 | mov %rsp,%rsi |
525 | lea (%r11,%rax),%rdi |
526 | mov %r9,%rcx |
527 | rep movsb |
528 | |
529 | lea -8(%r10),%rsp |
530 | jmp .Ldone4 |
531 | |
532 | SYM_FUNC_END(chacha_4block_xor_avx2) |
533 | |
534 | SYM_FUNC_START(chacha_8block_xor_avx2) |
535 | # %rdi: Input state matrix, s |
536 | # %rsi: up to 8 data blocks output, o |
537 | # %rdx: up to 8 data blocks input, i |
538 | # %rcx: input/output length in bytes |
539 | # %r8d: nrounds |
540 | |
541 | # This function encrypts eight consecutive ChaCha blocks by loading |
542 | # the state matrix in AVX registers eight times. As we need some |
543 | # scratch registers, we save the first four registers on the stack. The |
544 | # algorithm performs each operation on the corresponding word of each |
# state matrix, hence requires no word shuffling. For the final XORing
# step we transpose the matrix by interleaving 32-, 64- and then 128-bit
# words, which allows us to do the XOR in AVX registers. 8/16-bit word
# rotation is done with the slightly better-performing byte shuffling;
# 7/12-bit word rotation uses traditional shift+OR.
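#
# Register/stack layout: ymm4..15 hold state words 4..15, one block
# per 32-bit lane; words 0..3 live in four 32-byte stack slots so
# that ymm0 can serve as scratch, and ymm1..3 hold CTRINC and the
# two rotate masks during the round loop.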
550 | |
551 | vzeroupper |
552 | # 4 * 32 byte stack, 32-byte aligned |
553 | lea 8(%rsp),%r10 |
554 | and $~31, %rsp |
555 | sub $0x80, %rsp |
556 | mov %rcx,%rax |
557 | |
558 | # x0..15[0-7] = s[0..15] |
559 | vpbroadcastd 0x00(%rdi),%ymm0 |
560 | vpbroadcastd 0x04(%rdi),%ymm1 |
561 | vpbroadcastd 0x08(%rdi),%ymm2 |
562 | vpbroadcastd 0x0c(%rdi),%ymm3 |
563 | vpbroadcastd 0x10(%rdi),%ymm4 |
564 | vpbroadcastd 0x14(%rdi),%ymm5 |
565 | vpbroadcastd 0x18(%rdi),%ymm6 |
566 | vpbroadcastd 0x1c(%rdi),%ymm7 |
567 | vpbroadcastd 0x20(%rdi),%ymm8 |
568 | vpbroadcastd 0x24(%rdi),%ymm9 |
569 | vpbroadcastd 0x28(%rdi),%ymm10 |
570 | vpbroadcastd 0x2c(%rdi),%ymm11 |
571 | vpbroadcastd 0x30(%rdi),%ymm12 |
572 | vpbroadcastd 0x34(%rdi),%ymm13 |
573 | vpbroadcastd 0x38(%rdi),%ymm14 |
574 | vpbroadcastd 0x3c(%rdi),%ymm15 |
575 | # x0..3 on stack |
576 | vmovdqa %ymm0,0x00(%rsp) |
577 | vmovdqa %ymm1,0x20(%rsp) |
578 | vmovdqa %ymm2,0x40(%rsp) |
579 | vmovdqa %ymm3,0x60(%rsp) |
580 | |
581 | vmovdqa CTRINC(%rip),%ymm1 |
582 | vmovdqa ROT8(%rip),%ymm2 |
583 | vmovdqa ROT16(%rip),%ymm3 |
584 | |
# x12 += counter values 0-7
586 | vpaddd %ymm1,%ymm12,%ymm12 |
587 | |
588 | .Ldoubleround8: |
589 | # x0 += x4, x12 = rotl32(x12 ^ x0, 16) |
590 | vpaddd 0x00(%rsp),%ymm4,%ymm0 |
591 | vmovdqa %ymm0,0x00(%rsp) |
592 | vpxor %ymm0,%ymm12,%ymm12 |
593 | vpshufb %ymm3,%ymm12,%ymm12 |
594 | # x1 += x5, x13 = rotl32(x13 ^ x1, 16) |
595 | vpaddd 0x20(%rsp),%ymm5,%ymm0 |
596 | vmovdqa %ymm0,0x20(%rsp) |
597 | vpxor %ymm0,%ymm13,%ymm13 |
598 | vpshufb %ymm3,%ymm13,%ymm13 |
599 | # x2 += x6, x14 = rotl32(x14 ^ x2, 16) |
600 | vpaddd 0x40(%rsp),%ymm6,%ymm0 |
601 | vmovdqa %ymm0,0x40(%rsp) |
602 | vpxor %ymm0,%ymm14,%ymm14 |
603 | vpshufb %ymm3,%ymm14,%ymm14 |
604 | # x3 += x7, x15 = rotl32(x15 ^ x3, 16) |
605 | vpaddd 0x60(%rsp),%ymm7,%ymm0 |
606 | vmovdqa %ymm0,0x60(%rsp) |
607 | vpxor %ymm0,%ymm15,%ymm15 |
608 | vpshufb %ymm3,%ymm15,%ymm15 |
609 | |
610 | # x8 += x12, x4 = rotl32(x4 ^ x8, 12) |
611 | vpaddd %ymm12,%ymm8,%ymm8 |
612 | vpxor %ymm8,%ymm4,%ymm4 |
613 | vpslld $12,%ymm4,%ymm0 |
614 | vpsrld $20,%ymm4,%ymm4 |
615 | vpor %ymm0,%ymm4,%ymm4 |
616 | # x9 += x13, x5 = rotl32(x5 ^ x9, 12) |
617 | vpaddd %ymm13,%ymm9,%ymm9 |
618 | vpxor %ymm9,%ymm5,%ymm5 |
619 | vpslld $12,%ymm5,%ymm0 |
620 | vpsrld $20,%ymm5,%ymm5 |
621 | vpor %ymm0,%ymm5,%ymm5 |
622 | # x10 += x14, x6 = rotl32(x6 ^ x10, 12) |
623 | vpaddd %ymm14,%ymm10,%ymm10 |
624 | vpxor %ymm10,%ymm6,%ymm6 |
625 | vpslld $12,%ymm6,%ymm0 |
626 | vpsrld $20,%ymm6,%ymm6 |
627 | vpor %ymm0,%ymm6,%ymm6 |
628 | # x11 += x15, x7 = rotl32(x7 ^ x11, 12) |
629 | vpaddd %ymm15,%ymm11,%ymm11 |
630 | vpxor %ymm11,%ymm7,%ymm7 |
631 | vpslld $12,%ymm7,%ymm0 |
632 | vpsrld $20,%ymm7,%ymm7 |
633 | vpor %ymm0,%ymm7,%ymm7 |
634 | |
635 | # x0 += x4, x12 = rotl32(x12 ^ x0, 8) |
636 | vpaddd 0x00(%rsp),%ymm4,%ymm0 |
637 | vmovdqa %ymm0,0x00(%rsp) |
638 | vpxor %ymm0,%ymm12,%ymm12 |
639 | vpshufb %ymm2,%ymm12,%ymm12 |
640 | # x1 += x5, x13 = rotl32(x13 ^ x1, 8) |
641 | vpaddd 0x20(%rsp),%ymm5,%ymm0 |
642 | vmovdqa %ymm0,0x20(%rsp) |
643 | vpxor %ymm0,%ymm13,%ymm13 |
644 | vpshufb %ymm2,%ymm13,%ymm13 |
645 | # x2 += x6, x14 = rotl32(x14 ^ x2, 8) |
646 | vpaddd 0x40(%rsp),%ymm6,%ymm0 |
647 | vmovdqa %ymm0,0x40(%rsp) |
648 | vpxor %ymm0,%ymm14,%ymm14 |
649 | vpshufb %ymm2,%ymm14,%ymm14 |
650 | # x3 += x7, x15 = rotl32(x15 ^ x3, 8) |
651 | vpaddd 0x60(%rsp),%ymm7,%ymm0 |
652 | vmovdqa %ymm0,0x60(%rsp) |
653 | vpxor %ymm0,%ymm15,%ymm15 |
654 | vpshufb %ymm2,%ymm15,%ymm15 |
655 | |
656 | # x8 += x12, x4 = rotl32(x4 ^ x8, 7) |
657 | vpaddd %ymm12,%ymm8,%ymm8 |
658 | vpxor %ymm8,%ymm4,%ymm4 |
659 | vpslld $7,%ymm4,%ymm0 |
660 | vpsrld $25,%ymm4,%ymm4 |
661 | vpor %ymm0,%ymm4,%ymm4 |
662 | # x9 += x13, x5 = rotl32(x5 ^ x9, 7) |
663 | vpaddd %ymm13,%ymm9,%ymm9 |
664 | vpxor %ymm9,%ymm5,%ymm5 |
665 | vpslld $7,%ymm5,%ymm0 |
666 | vpsrld $25,%ymm5,%ymm5 |
667 | vpor %ymm0,%ymm5,%ymm5 |
668 | # x10 += x14, x6 = rotl32(x6 ^ x10, 7) |
669 | vpaddd %ymm14,%ymm10,%ymm10 |
670 | vpxor %ymm10,%ymm6,%ymm6 |
671 | vpslld $7,%ymm6,%ymm0 |
672 | vpsrld $25,%ymm6,%ymm6 |
673 | vpor %ymm0,%ymm6,%ymm6 |
674 | # x11 += x15, x7 = rotl32(x7 ^ x11, 7) |
675 | vpaddd %ymm15,%ymm11,%ymm11 |
676 | vpxor %ymm11,%ymm7,%ymm7 |
677 | vpslld $7,%ymm7,%ymm0 |
678 | vpsrld $25,%ymm7,%ymm7 |
679 | vpor %ymm0,%ymm7,%ymm7 |
680 | |
681 | # x0 += x5, x15 = rotl32(x15 ^ x0, 16) |
682 | vpaddd 0x00(%rsp),%ymm5,%ymm0 |
683 | vmovdqa %ymm0,0x00(%rsp) |
684 | vpxor %ymm0,%ymm15,%ymm15 |
685 | vpshufb %ymm3,%ymm15,%ymm15 |
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
687 | vpaddd 0x20(%rsp),%ymm6,%ymm0 |
688 | vmovdqa %ymm0,0x20(%rsp) |
689 | vpxor %ymm0,%ymm12,%ymm12 |
690 | vpshufb %ymm3,%ymm12,%ymm12 |
691 | # x2 += x7, x13 = rotl32(x13 ^ x2, 16) |
692 | vpaddd 0x40(%rsp),%ymm7,%ymm0 |
693 | vmovdqa %ymm0,0x40(%rsp) |
694 | vpxor %ymm0,%ymm13,%ymm13 |
695 | vpshufb %ymm3,%ymm13,%ymm13 |
696 | # x3 += x4, x14 = rotl32(x14 ^ x3, 16) |
697 | vpaddd 0x60(%rsp),%ymm4,%ymm0 |
698 | vmovdqa %ymm0,0x60(%rsp) |
699 | vpxor %ymm0,%ymm14,%ymm14 |
700 | vpshufb %ymm3,%ymm14,%ymm14 |
701 | |
702 | # x10 += x15, x5 = rotl32(x5 ^ x10, 12) |
703 | vpaddd %ymm15,%ymm10,%ymm10 |
704 | vpxor %ymm10,%ymm5,%ymm5 |
705 | vpslld $12,%ymm5,%ymm0 |
706 | vpsrld $20,%ymm5,%ymm5 |
707 | vpor %ymm0,%ymm5,%ymm5 |
708 | # x11 += x12, x6 = rotl32(x6 ^ x11, 12) |
709 | vpaddd %ymm12,%ymm11,%ymm11 |
710 | vpxor %ymm11,%ymm6,%ymm6 |
711 | vpslld $12,%ymm6,%ymm0 |
712 | vpsrld $20,%ymm6,%ymm6 |
713 | vpor %ymm0,%ymm6,%ymm6 |
714 | # x8 += x13, x7 = rotl32(x7 ^ x8, 12) |
715 | vpaddd %ymm13,%ymm8,%ymm8 |
716 | vpxor %ymm8,%ymm7,%ymm7 |
717 | vpslld $12,%ymm7,%ymm0 |
718 | vpsrld $20,%ymm7,%ymm7 |
719 | vpor %ymm0,%ymm7,%ymm7 |
720 | # x9 += x14, x4 = rotl32(x4 ^ x9, 12) |
721 | vpaddd %ymm14,%ymm9,%ymm9 |
722 | vpxor %ymm9,%ymm4,%ymm4 |
723 | vpslld $12,%ymm4,%ymm0 |
724 | vpsrld $20,%ymm4,%ymm4 |
725 | vpor %ymm0,%ymm4,%ymm4 |
726 | |
727 | # x0 += x5, x15 = rotl32(x15 ^ x0, 8) |
728 | vpaddd 0x00(%rsp),%ymm5,%ymm0 |
729 | vmovdqa %ymm0,0x00(%rsp) |
730 | vpxor %ymm0,%ymm15,%ymm15 |
731 | vpshufb %ymm2,%ymm15,%ymm15 |
732 | # x1 += x6, x12 = rotl32(x12 ^ x1, 8) |
733 | vpaddd 0x20(%rsp),%ymm6,%ymm0 |
734 | vmovdqa %ymm0,0x20(%rsp) |
735 | vpxor %ymm0,%ymm12,%ymm12 |
736 | vpshufb %ymm2,%ymm12,%ymm12 |
737 | # x2 += x7, x13 = rotl32(x13 ^ x2, 8) |
738 | vpaddd 0x40(%rsp),%ymm7,%ymm0 |
739 | vmovdqa %ymm0,0x40(%rsp) |
740 | vpxor %ymm0,%ymm13,%ymm13 |
741 | vpshufb %ymm2,%ymm13,%ymm13 |
742 | # x3 += x4, x14 = rotl32(x14 ^ x3, 8) |
743 | vpaddd 0x60(%rsp),%ymm4,%ymm0 |
744 | vmovdqa %ymm0,0x60(%rsp) |
745 | vpxor %ymm0,%ymm14,%ymm14 |
746 | vpshufb %ymm2,%ymm14,%ymm14 |
747 | |
748 | # x10 += x15, x5 = rotl32(x5 ^ x10, 7) |
749 | vpaddd %ymm15,%ymm10,%ymm10 |
750 | vpxor %ymm10,%ymm5,%ymm5 |
751 | vpslld $7,%ymm5,%ymm0 |
752 | vpsrld $25,%ymm5,%ymm5 |
753 | vpor %ymm0,%ymm5,%ymm5 |
754 | # x11 += x12, x6 = rotl32(x6 ^ x11, 7) |
755 | vpaddd %ymm12,%ymm11,%ymm11 |
756 | vpxor %ymm11,%ymm6,%ymm6 |
757 | vpslld $7,%ymm6,%ymm0 |
758 | vpsrld $25,%ymm6,%ymm6 |
759 | vpor %ymm0,%ymm6,%ymm6 |
760 | # x8 += x13, x7 = rotl32(x7 ^ x8, 7) |
761 | vpaddd %ymm13,%ymm8,%ymm8 |
762 | vpxor %ymm8,%ymm7,%ymm7 |
763 | vpslld $7,%ymm7,%ymm0 |
764 | vpsrld $25,%ymm7,%ymm7 |
765 | vpor %ymm0,%ymm7,%ymm7 |
766 | # x9 += x14, x4 = rotl32(x4 ^ x9, 7) |
767 | vpaddd %ymm14,%ymm9,%ymm9 |
768 | vpxor %ymm9,%ymm4,%ymm4 |
769 | vpslld $7,%ymm4,%ymm0 |
770 | vpsrld $25,%ymm4,%ymm4 |
771 | vpor %ymm0,%ymm4,%ymm4 |
772 | |
773 | sub $2,%r8d |
774 | jnz .Ldoubleround8 |
775 | |
# x0..15[0-7] += s[0..15]
777 | vpbroadcastd 0x00(%rdi),%ymm0 |
778 | vpaddd 0x00(%rsp),%ymm0,%ymm0 |
779 | vmovdqa %ymm0,0x00(%rsp) |
780 | vpbroadcastd 0x04(%rdi),%ymm0 |
781 | vpaddd 0x20(%rsp),%ymm0,%ymm0 |
782 | vmovdqa %ymm0,0x20(%rsp) |
783 | vpbroadcastd 0x08(%rdi),%ymm0 |
784 | vpaddd 0x40(%rsp),%ymm0,%ymm0 |
785 | vmovdqa %ymm0,0x40(%rsp) |
786 | vpbroadcastd 0x0c(%rdi),%ymm0 |
787 | vpaddd 0x60(%rsp),%ymm0,%ymm0 |
788 | vmovdqa %ymm0,0x60(%rsp) |
789 | vpbroadcastd 0x10(%rdi),%ymm0 |
790 | vpaddd %ymm0,%ymm4,%ymm4 |
791 | vpbroadcastd 0x14(%rdi),%ymm0 |
792 | vpaddd %ymm0,%ymm5,%ymm5 |
793 | vpbroadcastd 0x18(%rdi),%ymm0 |
794 | vpaddd %ymm0,%ymm6,%ymm6 |
795 | vpbroadcastd 0x1c(%rdi),%ymm0 |
796 | vpaddd %ymm0,%ymm7,%ymm7 |
797 | vpbroadcastd 0x20(%rdi),%ymm0 |
798 | vpaddd %ymm0,%ymm8,%ymm8 |
799 | vpbroadcastd 0x24(%rdi),%ymm0 |
800 | vpaddd %ymm0,%ymm9,%ymm9 |
801 | vpbroadcastd 0x28(%rdi),%ymm0 |
802 | vpaddd %ymm0,%ymm10,%ymm10 |
803 | vpbroadcastd 0x2c(%rdi),%ymm0 |
804 | vpaddd %ymm0,%ymm11,%ymm11 |
805 | vpbroadcastd 0x30(%rdi),%ymm0 |
806 | vpaddd %ymm0,%ymm12,%ymm12 |
807 | vpbroadcastd 0x34(%rdi),%ymm0 |
808 | vpaddd %ymm0,%ymm13,%ymm13 |
809 | vpbroadcastd 0x38(%rdi),%ymm0 |
810 | vpaddd %ymm0,%ymm14,%ymm14 |
811 | vpbroadcastd 0x3c(%rdi),%ymm0 |
812 | vpaddd %ymm0,%ymm15,%ymm15 |
813 | |
# x12 += counter values 0-7
815 | vpaddd %ymm1,%ymm12,%ymm12 |
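
# The state now forms a 16x8 dword matrix: row n (a ymm register or a
# 32-byte stack slot) holds word n of all eight blocks, one block per
# lane. The three interleave passes below (32-, 64-, then 128-bit)
# transpose it so that each ymm ends up with 32 consecutive keystream
# bytes of a single block.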
816 | |
817 | # interleave 32-bit words in state n, n+1 |
818 | vmovdqa 0x00(%rsp),%ymm0 |
819 | vmovdqa 0x20(%rsp),%ymm1 |
820 | vpunpckldq %ymm1,%ymm0,%ymm2 |
821 | vpunpckhdq %ymm1,%ymm0,%ymm1 |
822 | vmovdqa %ymm2,0x00(%rsp) |
823 | vmovdqa %ymm1,0x20(%rsp) |
824 | vmovdqa 0x40(%rsp),%ymm0 |
825 | vmovdqa 0x60(%rsp),%ymm1 |
826 | vpunpckldq %ymm1,%ymm0,%ymm2 |
827 | vpunpckhdq %ymm1,%ymm0,%ymm1 |
828 | vmovdqa %ymm2,0x40(%rsp) |
829 | vmovdqa %ymm1,0x60(%rsp) |
830 | vmovdqa %ymm4,%ymm0 |
831 | vpunpckldq %ymm5,%ymm0,%ymm4 |
832 | vpunpckhdq %ymm5,%ymm0,%ymm5 |
833 | vmovdqa %ymm6,%ymm0 |
834 | vpunpckldq %ymm7,%ymm0,%ymm6 |
835 | vpunpckhdq %ymm7,%ymm0,%ymm7 |
836 | vmovdqa %ymm8,%ymm0 |
837 | vpunpckldq %ymm9,%ymm0,%ymm8 |
838 | vpunpckhdq %ymm9,%ymm0,%ymm9 |
839 | vmovdqa %ymm10,%ymm0 |
840 | vpunpckldq %ymm11,%ymm0,%ymm10 |
841 | vpunpckhdq %ymm11,%ymm0,%ymm11 |
842 | vmovdqa %ymm12,%ymm0 |
843 | vpunpckldq %ymm13,%ymm0,%ymm12 |
844 | vpunpckhdq %ymm13,%ymm0,%ymm13 |
845 | vmovdqa %ymm14,%ymm0 |
846 | vpunpckldq %ymm15,%ymm0,%ymm14 |
847 | vpunpckhdq %ymm15,%ymm0,%ymm15 |
848 | |
849 | # interleave 64-bit words in state n, n+2 |
850 | vmovdqa 0x00(%rsp),%ymm0 |
851 | vmovdqa 0x40(%rsp),%ymm2 |
852 | vpunpcklqdq %ymm2,%ymm0,%ymm1 |
853 | vpunpckhqdq %ymm2,%ymm0,%ymm2 |
854 | vmovdqa %ymm1,0x00(%rsp) |
855 | vmovdqa %ymm2,0x40(%rsp) |
856 | vmovdqa 0x20(%rsp),%ymm0 |
857 | vmovdqa 0x60(%rsp),%ymm2 |
858 | vpunpcklqdq %ymm2,%ymm0,%ymm1 |
859 | vpunpckhqdq %ymm2,%ymm0,%ymm2 |
860 | vmovdqa %ymm1,0x20(%rsp) |
861 | vmovdqa %ymm2,0x60(%rsp) |
862 | vmovdqa %ymm4,%ymm0 |
863 | vpunpcklqdq %ymm6,%ymm0,%ymm4 |
864 | vpunpckhqdq %ymm6,%ymm0,%ymm6 |
865 | vmovdqa %ymm5,%ymm0 |
866 | vpunpcklqdq %ymm7,%ymm0,%ymm5 |
867 | vpunpckhqdq %ymm7,%ymm0,%ymm7 |
868 | vmovdqa %ymm8,%ymm0 |
869 | vpunpcklqdq %ymm10,%ymm0,%ymm8 |
870 | vpunpckhqdq %ymm10,%ymm0,%ymm10 |
871 | vmovdqa %ymm9,%ymm0 |
872 | vpunpcklqdq %ymm11,%ymm0,%ymm9 |
873 | vpunpckhqdq %ymm11,%ymm0,%ymm11 |
874 | vmovdqa %ymm12,%ymm0 |
875 | vpunpcklqdq %ymm14,%ymm0,%ymm12 |
876 | vpunpckhqdq %ymm14,%ymm0,%ymm14 |
877 | vmovdqa %ymm13,%ymm0 |
878 | vpunpcklqdq %ymm15,%ymm0,%ymm13 |
879 | vpunpckhqdq %ymm15,%ymm0,%ymm15 |
880 | |
881 | # interleave 128-bit words in state n, n+4 |
882 | # xor/write first four blocks |
883 | vmovdqa 0x00(%rsp),%ymm1 |
884 | vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 |
885 | cmp $0x0020,%rax |
886 | jl .Lxorpart8 |
887 | vpxor 0x0000(%rdx),%ymm0,%ymm0 |
888 | vmovdqu %ymm0,0x0000(%rsi) |
889 | vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 |
890 | |
891 | vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 |
892 | cmp $0x0040,%rax |
893 | jl .Lxorpart8 |
894 | vpxor 0x0020(%rdx),%ymm0,%ymm0 |
895 | vmovdqu %ymm0,0x0020(%rsi) |
896 | vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 |
897 | |
898 | vmovdqa 0x40(%rsp),%ymm1 |
899 | vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 |
900 | cmp $0x0060,%rax |
901 | jl .Lxorpart8 |
902 | vpxor 0x0040(%rdx),%ymm0,%ymm0 |
903 | vmovdqu %ymm0,0x0040(%rsi) |
904 | vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 |
905 | |
906 | vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 |
907 | cmp $0x0080,%rax |
908 | jl .Lxorpart8 |
909 | vpxor 0x0060(%rdx),%ymm0,%ymm0 |
910 | vmovdqu %ymm0,0x0060(%rsi) |
911 | vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 |
912 | |
913 | vmovdqa 0x20(%rsp),%ymm1 |
914 | vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 |
915 | cmp $0x00a0,%rax |
916 | jl .Lxorpart8 |
917 | vpxor 0x0080(%rdx),%ymm0,%ymm0 |
918 | vmovdqu %ymm0,0x0080(%rsi) |
919 | vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 |
920 | |
921 | vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 |
922 | cmp $0x00c0,%rax |
923 | jl .Lxorpart8 |
924 | vpxor 0x00a0(%rdx),%ymm0,%ymm0 |
925 | vmovdqu %ymm0,0x00a0(%rsi) |
926 | vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 |
927 | |
928 | vmovdqa 0x60(%rsp),%ymm1 |
929 | vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 |
930 | cmp $0x00e0,%rax |
931 | jl .Lxorpart8 |
932 | vpxor 0x00c0(%rdx),%ymm0,%ymm0 |
933 | vmovdqu %ymm0,0x00c0(%rsi) |
934 | vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 |
935 | |
936 | vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 |
937 | cmp $0x0100,%rax |
938 | jl .Lxorpart8 |
939 | vpxor 0x00e0(%rdx),%ymm0,%ymm0 |
940 | vmovdqu %ymm0,0x00e0(%rsi) |
941 | vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 |
942 | |
943 | # xor remaining blocks, write to output |
944 | vmovdqa %ymm4,%ymm0 |
945 | cmp $0x0120,%rax |
946 | jl .Lxorpart8 |
947 | vpxor 0x0100(%rdx),%ymm0,%ymm0 |
948 | vmovdqu %ymm0,0x0100(%rsi) |
949 | |
950 | vmovdqa %ymm12,%ymm0 |
951 | cmp $0x0140,%rax |
952 | jl .Lxorpart8 |
953 | vpxor 0x0120(%rdx),%ymm0,%ymm0 |
954 | vmovdqu %ymm0,0x0120(%rsi) |
955 | |
956 | vmovdqa %ymm6,%ymm0 |
957 | cmp $0x0160,%rax |
958 | jl .Lxorpart8 |
959 | vpxor 0x0140(%rdx),%ymm0,%ymm0 |
960 | vmovdqu %ymm0,0x0140(%rsi) |
961 | |
962 | vmovdqa %ymm14,%ymm0 |
963 | cmp $0x0180,%rax |
964 | jl .Lxorpart8 |
965 | vpxor 0x0160(%rdx),%ymm0,%ymm0 |
966 | vmovdqu %ymm0,0x0160(%rsi) |
967 | |
968 | vmovdqa %ymm5,%ymm0 |
969 | cmp $0x01a0,%rax |
970 | jl .Lxorpart8 |
971 | vpxor 0x0180(%rdx),%ymm0,%ymm0 |
972 | vmovdqu %ymm0,0x0180(%rsi) |
973 | |
974 | vmovdqa %ymm13,%ymm0 |
975 | cmp $0x01c0,%rax |
976 | jl .Lxorpart8 |
977 | vpxor 0x01a0(%rdx),%ymm0,%ymm0 |
978 | vmovdqu %ymm0,0x01a0(%rsi) |
979 | |
980 | vmovdqa %ymm7,%ymm0 |
981 | cmp $0x01e0,%rax |
982 | jl .Lxorpart8 |
983 | vpxor 0x01c0(%rdx),%ymm0,%ymm0 |
984 | vmovdqu %ymm0,0x01c0(%rsi) |
985 | |
986 | vmovdqa %ymm15,%ymm0 |
987 | cmp $0x0200,%rax |
988 | jl .Lxorpart8 |
989 | vpxor 0x01e0(%rdx),%ymm0,%ymm0 |
990 | vmovdqu %ymm0,0x01e0(%rsi) |
991 | |
992 | .Ldone8: |
993 | vzeroupper |
994 | lea -8(%r10),%rsp |
995 | RET |
996 | |
997 | .Lxorpart8: |
998 | # xor remaining bytes from partial register into output |
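# as in .Lxorpart2/.Lxorpart4, but with a 32-byte keystream word in
# %ymm0, a sub-32-byte tail, and the function's own 32-byte-aligned
# stack buffer as bounce space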
999 | mov %rax,%r9 |
1000 | and $0x1f,%r9 |
1001 | jz .Ldone8 |
1002 | and $~0x1f,%rax |
1003 | |
1004 | mov %rsi,%r11 |
1005 | |
1006 | lea (%rdx,%rax),%rsi |
1007 | mov %rsp,%rdi |
1008 | mov %r9,%rcx |
1009 | rep movsb |
1010 | |
1011 | vpxor 0x00(%rsp),%ymm0,%ymm0 |
1012 | vmovdqa %ymm0,0x00(%rsp) |
1013 | |
1014 | mov %rsp,%rsi |
1015 | lea (%r11,%rax),%rdi |
1016 | mov %r9,%rcx |
1017 | rep movsb |
1018 | |
1019 | jmp .Ldone8 |
1020 | |
1021 | SYM_FUNC_END(chacha_8block_xor_avx2) |
1022 | |