/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000
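
/*
 * ROT8 and ROT16 are pshufb masks that rotate each 32-bit lane of an XMM
 * register left by 8 and 16 bits respectively; CTRINC holds the per-block
 * counter increments 0, 1, 2, 3 used by the four-block function below.
 *
 * Reference only (not assembled): a minimal C sketch of how such a byte
 * shuffle performs a 32-bit rotation, assuming pshufb semantics of
 * out[i] = in[mask[i] & 15] for masks like the above that never set the
 * zeroing bit.  The helper names are illustrative.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void shuffle_bytes(uint8_t out[16], const uint8_t in[16],
 *				  const uint8_t mask[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			out[i] = in[mask[i] & 15];
 *	}
 *
 *	// Rotate every 32-bit lane left by 8 bits via the ROT8 byte order
 *	// (little-endian lanes, as on x86).
 *	static void rotl8_lanes(uint32_t x[4])
 *	{
 *		static const uint8_t rot8[16] = {
 *			 3,  0,  1,  2,  7,  4,  5,  6,
 *			11,  8,  9, 10, 15, 12, 13, 14,
 *		};
 *		uint8_t tmp[16];
 *
 *		shuffle_bytes(tmp, (const uint8_t *)x, rot8);
 *		memcpy(x, tmp, sizeof(tmp));
 *	}
 */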

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better-performing SSSE3 byte shuffling, while 7/12-bit
 * word rotation uses the traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
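
/*
 * Reference only (not assembled): a minimal C sketch of the row-sliced double
 * round computed below, assuming the state rows a, b, c, d correspond to
 * %xmm0-%xmm3.  Function names are illustrative, not part of the kernel API.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rotl32(uint32_t v, int n)
 *	{
 *		return (v << n) | (v >> (32 - n));
 *	}
 *
 *	// One quarter-round step applied to all four columns at once:
 *	// each of a, b, c, d is one row (four 32-bit words) of the state.
 *	static void quarter_rows(uint32_t a[4], uint32_t b[4],
 *				 uint32_t c[4], uint32_t d[4])
 *	{
 *		for (int i = 0; i < 4; i++) {
 *			a[i] += b[i]; d[i] = rotl32(d[i] ^ a[i], 16);
 *			c[i] += d[i]; b[i] = rotl32(b[i] ^ c[i], 12);
 *			a[i] += b[i]; d[i] = rotl32(d[i] ^ a[i],  8);
 *			c[i] += d[i]; b[i] = rotl32(b[i] ^ c[i],  7);
 *		}
 *	}
 *
 *	// Rotate row r left by n word positions (the pshufd steps below).
 *	static void roll_row(uint32_t r[4], int n)
 *	{
 *		uint32_t t[4];
 *
 *		for (int i = 0; i < 4; i++)
 *			t[i] = r[(i + n) & 3];
 *		for (int i = 0; i < 4; i++)
 *			r[i] = t[i];
 *	}
 *
 *	static void double_round(uint32_t x[4][4])
 *	{
 *		quarter_rows(x[0], x[1], x[2], x[3]);	// column round
 *		roll_row(x[1], 1); roll_row(x[2], 2); roll_row(x[3], 3);
 *		quarter_rows(x[0], x[1], x[2], x[3]);	// diagonal round
 *		roll_row(x[1], 3); roll_row(x[2], 2); roll_row(x[3], 1);
 *	}
 */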
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa	ROT8(%rip),%xmm4
	movdqa	ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm6
	pslld	$12,%xmm6
	psrld	$20,%xmm1
	por	%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm7
	pslld	$7,%xmm7
	psrld	$25,%xmm1
	por	%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd	$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd	$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd	$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm6
	pslld	$12,%xmm6
	psrld	$20,%xmm1
	por	%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
	pshufb	%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm7
	pslld	$7,%xmm7
	psrld	$25,%xmm1
	por	%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd	$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd	$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd	$0x39,%xmm3,%xmm3

	sub	$2,%r8d
	jnz	.Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
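	#
	# Generates one 64-byte keystream block from the state at %rdi and
	# XORs it into the input at %rdx, writing the result to %rsi.  Whole
	# 16-byte chunks are handled inline; a trailing partial chunk is
	# finished at .Lxorpart using the keystream chunk left in %xmm0.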
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu	0x00(%rdi),%xmm0
	movdqu	0x10(%rdi),%xmm1
	movdqu	0x20(%rdi),%xmm2
	movdqu	0x30(%rdi),%xmm3
	movdqa	%xmm0,%xmm8
	movdqa	%xmm1,%xmm9
	movdqa	%xmm2,%xmm10
	movdqa	%xmm3,%xmm11

	mov	%rcx,%rax
	call	chacha_permute

	# o0 = i0 ^ (x0 + s0)
	paddd	%xmm8,%xmm0
	cmp	$0x10,%rax
	jl	.Lxorpart
	movdqu	0x00(%rdx),%xmm4
	pxor	%xmm4,%xmm0
	movdqu	%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd	%xmm9,%xmm1
	movdqa	%xmm1,%xmm0
	cmp	$0x20,%rax
	jl	.Lxorpart
	movdqu	0x10(%rdx),%xmm0
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd	%xmm10,%xmm2
	movdqa	%xmm2,%xmm0
	cmp	$0x30,%rax
	jl	.Lxorpart
	movdqu	0x20(%rdx),%xmm0
	pxor	%xmm2,%xmm0
	movdqu	%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd	%xmm11,%xmm3
	movdqa	%xmm3,%xmm0
	cmp	$0x40,%rax
	jl	.Lxorpart
	movdqu	0x30(%rdx),%xmm0
	pxor	%xmm3,%xmm0
	movdqu	%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
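	# The tail is bounced through an aligned stack buffer: the remaining
	# input bytes are copied in, XORed against the keystream chunk left
	# in %xmm0, and the same number of bytes is copied back out, so
	# nothing past the requested length is written to the destination.

/*
 * Reference only (not assembled): a minimal C sketch of this bounce-buffer
 * tail handling.  The remaining 1..15 bytes are staged in a scratch buffer
 * so a full 16-byte keystream word can be XORed in without ever writing past
 * the requested length; names are illustrative.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void xor_partial_tail(uint8_t *dst, const uint8_t *src,
 *				     size_t tail_len,	// 1..15
 *				     const uint8_t keystream[16])
 *	{
 *		uint8_t buf[16] = { 0 };	// stands in for the stack area
 *
 *		memcpy(buf, src, tail_len);
 *		for (size_t i = 0; i < 16; i++)
 *			buf[i] ^= keystream[i];
 *		memcpy(dst, buf, tail_len);
 *	}
 */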
	mov	%rax,%r9
	and	$0x0f,%r9
	jz	.Ldone
	and	$~0x0f,%rax

	mov	%rsi,%r11

	lea	8(%rsp),%r10
	sub	$0x10,%rsp
	and	$~31,%rsp

	lea	(%rdx,%rax),%rsi
	mov	%rsp,%rdi
	mov	%r9,%rcx
	rep movsb

	pxor	0x00(%rsp),%xmm0
	movdqa	%xmm0,0x00(%rsp)

	mov	%rsp,%rsi
	lea	(%r11,%rax),%rdi
	mov	%r9,%rcx
	rep movsb

	lea	-8(%r10),%rsp
	jmp	.Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
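	#
	# Runs the permutation only (no feed-forward addition of the input
	# state) and writes state words 0..3 and 12..15 as the eight-word
	# output, as used for XChaCha subkey derivation.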
	FRAME_BEGIN

	movdqu	0x00(%rdi),%xmm0
	movdqu	0x10(%rdi),%xmm1
	movdqu	0x20(%rdi),%xmm2
	movdqu	0x30(%rdi),%xmm3

	mov	%edx,%r8d
	call	chacha_permute

	movdqu	%xmm0,0x00(%rsi)
	movdqu	%xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading
	# the state matrix in SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XOR step we transpose the matrix by interleaving 32- and then
	# 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
	# word rotation is done with the slightly better-performing SSSE3
	# byte shuffling, while 7/12-bit word rotation uses the traditional
	# shift+OR.
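
/*
 * Reference only (not assembled): a minimal C sketch of the word-sliced
 * layout used below, assuming v[n][i] plays the role of %xmm"n" (or of its
 * stack slot for n = 0..3): lane i of vector n holds word n of block i.
 * With this layout a quarter-round works on whole vectors and needs no
 * per-round word shuffling.  Names are illustrative.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rotl32(uint32_t v, int n)
 *	{
 *		return (v << n) | (v >> (32 - n));
 *	}
 *
 *	// v[16][4]: sixteen 4-lane vectors, one ChaCha state word each.
 *	static void quarter_round4(uint32_t v[16][4],
 *				   int a, int b, int c, int d)
 *	{
 *		for (int i = 0; i < 4; i++) {	// all four blocks at once
 *			v[a][i] += v[b][i]; v[d][i] = rotl32(v[d][i] ^ v[a][i], 16);
 *			v[c][i] += v[d][i]; v[b][i] = rotl32(v[b][i] ^ v[c][i], 12);
 *			v[a][i] += v[b][i]; v[d][i] = rotl32(v[d][i] ^ v[a][i],  8);
 *			v[c][i] += v[d][i]; v[b][i] = rotl32(v[b][i] ^ v[c][i],  7);
 *		}
 *	}
 *
 *	static void double_round4(uint32_t v[16][4])
 *	{
 *		quarter_round4(v, 0, 4,  8, 12);	// column rounds
 *		quarter_round4(v, 1, 5,  9, 13);
 *		quarter_round4(v, 2, 6, 10, 14);
 *		quarter_round4(v, 3, 7, 11, 15);
 *		quarter_round4(v, 0, 5, 10, 15);	// diagonal rounds
 *		quarter_round4(v, 1, 6, 11, 12);
 *		quarter_round4(v, 2, 7,  8, 13);
 *		quarter_round4(v, 3, 4,  9, 14);
 *	}
 */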

	lea	8(%rsp),%r10
	sub	$0x80,%rsp
	and	$~63,%rsp
	mov	%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq	0x00(%rdi),%xmm1
	pshufd	$0x00,%xmm1,%xmm0
	pshufd	$0x55,%xmm1,%xmm1
	movq	0x08(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	movq	0x10(%rdi),%xmm5
	pshufd	$0x00,%xmm5,%xmm4
	pshufd	$0x55,%xmm5,%xmm5
	movq	0x18(%rdi),%xmm7
	pshufd	$0x00,%xmm7,%xmm6
	pshufd	$0x55,%xmm7,%xmm7
	movq	0x20(%rdi),%xmm9
	pshufd	$0x00,%xmm9,%xmm8
	pshufd	$0x55,%xmm9,%xmm9
	movq	0x28(%rdi),%xmm11
	pshufd	$0x00,%xmm11,%xmm10
	pshufd	$0x55,%xmm11,%xmm11
	movq	0x30(%rdi),%xmm13
	pshufd	$0x00,%xmm13,%xmm12
	pshufd	$0x55,%xmm13,%xmm13
	movq	0x38(%rdi),%xmm15
	pshufd	$0x00,%xmm15,%xmm14
	pshufd	$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa	%xmm0,0x00(%rsp)
	movdqa	%xmm1,0x10(%rsp)
	movdqa	%xmm2,0x20(%rsp)
	movdqa	%xmm3,0x30(%rsp)

	movdqa	CTRINC(%rip),%xmm1
	movdqa	ROT8(%rip),%xmm2
	movdqa	ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd	%xmm1,%xmm12
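	# After this, lane i of x12 holds s[12] + i, so the four blocks being
	# generated use consecutive block counters.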

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd	%xmm12,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm4
	por	%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd	%xmm13,%xmm9
	pxor	%xmm9,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm5
	por	%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd	%xmm14,%xmm10
	pxor	%xmm10,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm6
	por	%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd	%xmm15,%xmm11
	pxor	%xmm11,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm7
	por	%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd	%xmm12,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm4
	por	%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd	%xmm13,%xmm9
	pxor	%xmm9,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm5
	por	%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd	%xmm14,%xmm10
	pxor	%xmm10,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm6
	por	%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd	%xmm15,%xmm11
	pxor	%xmm11,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm7
	por	%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd	%xmm15,%xmm10
	pxor	%xmm10,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm5
	por	%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd	%xmm12,%xmm11
	pxor	%xmm11,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm6
	por	%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd	%xmm13,%xmm8
	pxor	%xmm8,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm7
	por	%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd	%xmm14,%xmm9
	pxor	%xmm9,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$12,%xmm0
	psrld	$20,%xmm4
	por	%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa	0x00(%rsp),%xmm0
	paddd	%xmm5,%xmm0
	movdqa	%xmm0,0x00(%rsp)
	pxor	%xmm0,%xmm15
	pshufb	%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa	0x10(%rsp),%xmm0
	paddd	%xmm6,%xmm0
	movdqa	%xmm0,0x10(%rsp)
	pxor	%xmm0,%xmm12
	pshufb	%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa	0x20(%rsp),%xmm0
	paddd	%xmm7,%xmm0
	movdqa	%xmm0,0x20(%rsp)
	pxor	%xmm0,%xmm13
	pshufb	%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa	0x30(%rsp),%xmm0
	paddd	%xmm4,%xmm0
	movdqa	%xmm0,0x30(%rsp)
	pxor	%xmm0,%xmm14
	pshufb	%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd	%xmm15,%xmm10
	pxor	%xmm10,%xmm5
	movdqa	%xmm5,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm5
	por	%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd	%xmm12,%xmm11
	pxor	%xmm11,%xmm6
	movdqa	%xmm6,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm6
	por	%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd	%xmm13,%xmm8
	pxor	%xmm8,%xmm7
	movdqa	%xmm7,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm7
	por	%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd	%xmm14,%xmm9
	pxor	%xmm9,%xmm4
	movdqa	%xmm4,%xmm0
	pslld	$7,%xmm0
	psrld	$25,%xmm4
	por	%xmm0,%xmm4

	sub	$2,%r8d
	jnz	.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq	0x00(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	0x00(%rsp),%xmm2
	movdqa	%xmm2,0x00(%rsp)
	paddd	0x10(%rsp),%xmm3
	movdqa	%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq	0x08(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	0x20(%rsp),%xmm2
	movdqa	%xmm2,0x20(%rsp)
	paddd	0x30(%rsp),%xmm3
	movdqa	%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq	0x10(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq	0x18(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm6
	paddd	%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq	0x20(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm8
	paddd	%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq	0x28(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm10
	paddd	%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq	0x30(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm12
	paddd	%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq	0x38(%rdi),%xmm3
	pshufd	$0x00,%xmm3,%xmm2
	pshufd	$0x55,%xmm3,%xmm3
	paddd	%xmm2,%xmm14
	paddd	%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd	%xmm1,%xmm12
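
/*
 * Reference only (not assembled): a minimal C sketch of the two-pass
 * transpose below.  Interleaving 32-bit lanes of vectors n and n+1, then
 * 64-bit halves of the results n and n+2, turns the word-sliced vectors
 * back into contiguous per-block rows.  Names are illustrative.
 *
 *	#include <stdint.h>
 *
 *	// One 4x4 group of 32-bit words: in[n][i] = word n of block i,
 *	// out[i][n] = the same word laid out block-contiguously.
 *	static void transpose4x4(uint32_t out[4][4], const uint32_t in[4][4])
 *	{
 *		uint32_t lo[2][4], hi[2][4];
 *
 *		for (int n = 0; n < 4; n += 2) {
 *			// punpckldq / punpckhdq equivalent
 *			lo[n / 2][0] = in[n][0]; lo[n / 2][1] = in[n + 1][0];
 *			lo[n / 2][2] = in[n][1]; lo[n / 2][3] = in[n + 1][1];
 *			hi[n / 2][0] = in[n][2]; hi[n / 2][1] = in[n + 1][2];
 *			hi[n / 2][2] = in[n][3]; hi[n / 2][3] = in[n + 1][3];
 *		}
 *		// punpcklqdq / punpckhqdq equivalent
 *		out[0][0] = lo[0][0]; out[0][1] = lo[0][1];
 *		out[0][2] = lo[1][0]; out[0][3] = lo[1][1];
 *		out[1][0] = lo[0][2]; out[1][1] = lo[0][3];
 *		out[1][2] = lo[1][2]; out[1][3] = lo[1][3];
 *		out[2][0] = hi[0][0]; out[2][1] = hi[0][1];
 *		out[2][2] = hi[1][0]; out[2][3] = hi[1][1];
 *		out[3][0] = hi[0][2]; out[3][1] = hi[0][3];
 *		out[3][2] = hi[1][2]; out[3][3] = hi[1][3];
 *	}
 */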

	# interleave 32-bit words in state n, n+1
	movdqa	0x00(%rsp),%xmm0
	movdqa	0x10(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa	%xmm2,0x00(%rsp)
	movdqa	%xmm0,0x10(%rsp)
	movdqa	0x20(%rsp),%xmm0
	movdqa	0x30(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa	%xmm2,0x20(%rsp)
	movdqa	%xmm0,0x30(%rsp)
	movdqa	%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa	%xmm0,%xmm7
	movdqa	%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa	%xmm0,%xmm9
	movdqa	%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa	%xmm0,%xmm13
	movdqa	%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa	%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa	0x00(%rsp),%xmm0
	movdqa	0x20(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa	%xmm2,0x00(%rsp)
	movdqa	%xmm0,0x20(%rsp)
	movdqa	0x10(%rsp),%xmm0
	movdqa	0x30(%rsp),%xmm1
	movdqa	%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa	%xmm2,0x10(%rsp)
	movdqa	%xmm0,0x30(%rsp)
	movdqa	%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa	%xmm0,%xmm6
	movdqa	%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa	%xmm0,%xmm7
	movdqa	%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa	%xmm0,%xmm10
	movdqa	%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa	%xmm0,%xmm14
	movdqa	%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa	%xmm0,%xmm15

	# xor with corresponding input, write to output
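	# After the transpose, block 0 lives in 0x00(%rsp),%xmm4,%xmm8,%xmm12,
	# block 1 in 0x20(%rsp),%xmm6,%xmm10,%xmm14, block 2 in
	# 0x10(%rsp),%xmm5,%xmm9,%xmm13 and block 3 in
	# 0x30(%rsp),%xmm7,%xmm11,%xmm15.  Each 16-byte chunk is checked
	# against the requested length in %rax; a trailing partial chunk is
	# finished at .Lxorpart4 using the keystream chunk left in %xmm0.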
	movdqa	0x00(%rsp),%xmm0
	cmp	$0x10,%rax
	jl	.Lxorpart4
	movdqu	0x00(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x00(%rsi)

	movdqu	%xmm4,%xmm0
	cmp	$0x20,%rax
	jl	.Lxorpart4
	movdqu	0x10(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x10(%rsi)

	movdqu	%xmm8,%xmm0
	cmp	$0x30,%rax
	jl	.Lxorpart4
	movdqu	0x20(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x20(%rsi)

	movdqu	%xmm12,%xmm0
	cmp	$0x40,%rax
	jl	.Lxorpart4
	movdqu	0x30(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x30(%rsi)

	movdqa	0x20(%rsp),%xmm0
	cmp	$0x50,%rax
	jl	.Lxorpart4
	movdqu	0x40(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x40(%rsi)

	movdqu	%xmm6,%xmm0
	cmp	$0x60,%rax
	jl	.Lxorpart4
	movdqu	0x50(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x50(%rsi)

	movdqu	%xmm10,%xmm0
	cmp	$0x70,%rax
	jl	.Lxorpart4
	movdqu	0x60(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x60(%rsi)

	movdqu	%xmm14,%xmm0
	cmp	$0x80,%rax
	jl	.Lxorpart4
	movdqu	0x70(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x70(%rsi)

	movdqa	0x10(%rsp),%xmm0
	cmp	$0x90,%rax
	jl	.Lxorpart4
	movdqu	0x80(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x80(%rsi)

	movdqu	%xmm5,%xmm0
	cmp	$0xa0,%rax
	jl	.Lxorpart4
	movdqu	0x90(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0x90(%rsi)

	movdqu	%xmm9,%xmm0
	cmp	$0xb0,%rax
	jl	.Lxorpart4
	movdqu	0xa0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xa0(%rsi)

	movdqu	%xmm13,%xmm0
	cmp	$0xc0,%rax
	jl	.Lxorpart4
	movdqu	0xb0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xb0(%rsi)

	movdqa	0x30(%rsp),%xmm0
	cmp	$0xd0,%rax
	jl	.Lxorpart4
	movdqu	0xc0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xc0(%rsi)

	movdqu	%xmm7,%xmm0
	cmp	$0xe0,%rax
	jl	.Lxorpart4
	movdqu	0xd0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xd0(%rsi)

	movdqu	%xmm11,%xmm0
	cmp	$0xf0,%rax
	jl	.Lxorpart4
	movdqu	0xe0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xe0(%rsi)

	movdqu	%xmm15,%xmm0
	cmp	$0x100,%rax
	jl	.Lxorpart4
	movdqu	0xf0(%rdx),%xmm1
	pxor	%xmm1,%xmm0
	movdqu	%xmm0,0xf0(%rsi)

.Ldone4:
	lea	-8(%r10),%rsp
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
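	# Same bounce-buffer scheme as .Lxorpart above, except that %rsp
	# already points at the 64-byte-aligned scratch area set up at
	# function entry (its saved x0..3 words are no longer needed), so no
	# further stack adjustment is required here.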
	mov	%rax,%r9
	and	$0x0f,%r9
	jz	.Ldone4
	and	$~0x0f,%rax

	mov	%rsi,%r11

	lea	(%rdx,%rax),%rsi
	mov	%rsp,%rdi
	mov	%r9,%rcx
	rep movsb

	pxor	0x00(%rsp),%xmm0
	movdqa	%xmm0,0x00(%rsp)

	mov	%rsp,%rsi
	lea	(%r11,%rax),%rdi
	mov	%r9,%rcx
	rep movsb

	jmp	.Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)