1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions |
4 | * |
5 | * Copyright (C) 2015 Martin Willi |
6 | */ |
7 | |
8 | #include <linux/linkage.h> |
9 | #include <asm/frame.h> |
10 | |
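/*
 * ROT8 and ROT16 are pshufb masks that rotate each 32-bit lane left by 8 and
 * 16 bits respectively; CTRINC holds the per-block counter increments 0..3
 * used by the four-way function below.
 */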
11 | .section .rodata.cst16.ROT8, "aM" , @progbits, 16 |
12 | .align 16 |
13 | ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 |
14 | .section .rodata.cst16.ROT16, "aM" , @progbits, 16 |
15 | .align 16 |
16 | ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 |
17 | .section .rodata.cst16.CTRINC, "aM" , @progbits, 16 |
18 | .align 16 |
19 | CTRINC: .octa 0x00000003000000020000000100000000 |
20 | |
21 | .text |
22 | |
23 | /* |
24 | * chacha_permute - permute one block |
25 | * |
26 | * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This |
27 | * function performs matrix operations on four words in parallel, but requires |
 * shuffling to rearrange the words after each round. 8- and 16-bit word
 * rotations are done with the slightly better performing SSSE3 byte shuffle;
 * 7- and 12-bit word rotations use the traditional shift+OR.
31 | * |
32 | * The round count is given in %r8d. |
33 | * |
34 | * Clobbers: %r8d, %xmm4-%xmm7 |
35 | */ |
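/*
 * For reference, each iteration of .Ldoubleround below performs one column
 * round followed by one diagonal round, built from the standard ChaCha
 * quarter-round (RFC 7539) on state words (a, b, c, d):
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 *
 * Here all four quarter-rounds of a round run in parallel, one per 32-bit
 * lane of %xmm0-%xmm3.
 */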
36 | SYM_FUNC_START_LOCAL(chacha_permute) |
37 | |
38 | movdqa ROT8(%rip),%xmm4 |
39 | movdqa ROT16(%rip),%xmm5 |
40 | |
41 | .Ldoubleround: |
42 | # x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
43 | paddd %xmm1,%xmm0 |
44 | pxor %xmm0,%xmm3 |
45 | pshufb %xmm5,%xmm3 |
46 | |
47 | # x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
48 | paddd %xmm3,%xmm2 |
49 | pxor %xmm2,%xmm1 |
50 | movdqa %xmm1,%xmm6 |
51 | pslld $12,%xmm6 |
52 | psrld $20,%xmm1 |
53 | por %xmm6,%xmm1 |
54 | |
55 | # x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
56 | paddd %xmm1,%xmm0 |
57 | pxor %xmm0,%xmm3 |
58 | pshufb %xmm4,%xmm3 |
59 | |
60 | # x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
61 | paddd %xmm3,%xmm2 |
62 | pxor %xmm2,%xmm1 |
63 | movdqa %xmm1,%xmm7 |
64 | pslld $7,%xmm7 |
65 | psrld $25,%xmm1 |
66 | por %xmm7,%xmm1 |
67 | |
68 | # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) |
69 | pshufd $0x39,%xmm1,%xmm1 |
70 | # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
71 | pshufd $0x4e,%xmm2,%xmm2 |
72 | # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) |
73 | pshufd $0x93,%xmm3,%xmm3 |
74 | |
75 | # x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
76 | paddd %xmm1,%xmm0 |
77 | pxor %xmm0,%xmm3 |
78 | pshufb %xmm5,%xmm3 |
79 | |
80 | # x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
81 | paddd %xmm3,%xmm2 |
82 | pxor %xmm2,%xmm1 |
83 | movdqa %xmm1,%xmm6 |
84 | pslld $12,%xmm6 |
85 | psrld $20,%xmm1 |
86 | por %xmm6,%xmm1 |
87 | |
88 | # x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
89 | paddd %xmm1,%xmm0 |
90 | pxor %xmm0,%xmm3 |
91 | pshufb %xmm4,%xmm3 |
92 | |
93 | # x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
94 | paddd %xmm3,%xmm2 |
95 | pxor %xmm2,%xmm1 |
96 | movdqa %xmm1,%xmm7 |
97 | pslld $7,%xmm7 |
98 | psrld $25,%xmm1 |
99 | por %xmm7,%xmm1 |
100 | |
101 | # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) |
102 | pshufd $0x93,%xmm1,%xmm1 |
103 | # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
104 | pshufd $0x4e,%xmm2,%xmm2 |
105 | # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) |
106 | pshufd $0x39,%xmm3,%xmm3 |
107 | |
108 | sub $2,%r8d |
109 | jnz .Ldoubleround |
110 | |
111 | RET |
112 | SYM_FUNC_END(chacha_permute) |
113 | |
114 | SYM_FUNC_START(chacha_block_xor_ssse3) |
115 | # %rdi: Input state matrix, s |
116 | # %rsi: up to 1 data block output, o |
117 | # %rdx: up to 1 data block input, i |
118 | # %rcx: input/output length in bytes |
119 | # %r8d: nrounds |
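	#
	# Generates one 64-byte keystream block from the state at %rdi and
	# XORs it into up to %rcx bytes of input from %rdx, writing the result
	# to %rsi.  A trailing chunk shorter than 16 bytes is handled at
	# .Lxorpart via a bounce buffer on the stack.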
120 | FRAME_BEGIN |
121 | |
122 | # x0..3 = s0..3 |
123 | movdqu 0x00(%rdi),%xmm0 |
124 | movdqu 0x10(%rdi),%xmm1 |
125 | movdqu 0x20(%rdi),%xmm2 |
126 | movdqu 0x30(%rdi),%xmm3 |
127 | movdqa %xmm0,%xmm8 |
128 | movdqa %xmm1,%xmm9 |
129 | movdqa %xmm2,%xmm10 |
130 | movdqa %xmm3,%xmm11 |
131 | |
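	# Keep the length in %rax; %rcx is reused as the rep movsb count in
	# .Lxorpart.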
132 | mov %rcx,%rax |
133 | call chacha_permute |
134 | |
135 | # o0 = i0 ^ (x0 + s0) |
136 | paddd %xmm8,%xmm0 |
137 | cmp $0x10,%rax |
138 | jl .Lxorpart |
139 | movdqu 0x00(%rdx),%xmm4 |
140 | pxor %xmm4,%xmm0 |
141 | movdqu %xmm0,0x00(%rsi) |
142 | # o1 = i1 ^ (x1 + s1) |
143 | paddd %xmm9,%xmm1 |
144 | movdqa %xmm1,%xmm0 |
145 | cmp $0x20,%rax |
146 | jl .Lxorpart |
147 | movdqu 0x10(%rdx),%xmm0 |
148 | pxor %xmm1,%xmm0 |
149 | movdqu %xmm0,0x10(%rsi) |
150 | # o2 = i2 ^ (x2 + s2) |
151 | paddd %xmm10,%xmm2 |
152 | movdqa %xmm2,%xmm0 |
153 | cmp $0x30,%rax |
154 | jl .Lxorpart |
155 | movdqu 0x20(%rdx),%xmm0 |
156 | pxor %xmm2,%xmm0 |
157 | movdqu %xmm0,0x20(%rsi) |
158 | # o3 = i3 ^ (x3 + s3) |
159 | paddd %xmm11,%xmm3 |
160 | movdqa %xmm3,%xmm0 |
161 | cmp $0x40,%rax |
162 | jl .Lxorpart |
163 | movdqu 0x30(%rdx),%xmm0 |
164 | pxor %xmm3,%xmm0 |
165 | movdqu %xmm0,0x30(%rsi) |
166 | |
167 | .Ldone: |
168 | FRAME_END |
169 | RET |
170 | |
171 | .Lxorpart: |
172 | # xor remaining bytes from partial register into output |
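	# %r9 = number of trailing bytes (< 16), %rax = offset of the last,
	# partial 16-byte chunk.  The chunk is bounced through an aligned
	# stack buffer so it can be XORed with the partial keystream left in
	# %xmm0.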
173 | mov %rax,%r9 |
174 | and $0x0f,%r9 |
175 | jz .Ldone |
176 | and $~0x0f,%rax |
177 | |
178 | mov %rsi,%r11 |
179 | |
180 | lea 8(%rsp),%r10 |
181 | sub $0x10,%rsp |
182 | and $~31,%rsp |
183 | |
184 | lea (%rdx,%rax),%rsi |
185 | mov %rsp,%rdi |
186 | mov %r9,%rcx |
187 | rep movsb |
188 | |
189 | pxor 0x00(%rsp),%xmm0 |
190 | movdqa %xmm0,0x00(%rsp) |
191 | |
192 | mov %rsp,%rsi |
193 | lea (%r11,%rax),%rdi |
194 | mov %r9,%rcx |
195 | rep movsb |
196 | |
197 | lea -8(%r10),%rsp |
198 | jmp .Ldone |
199 | |
200 | SYM_FUNC_END(chacha_block_xor_ssse3) |
201 | |
202 | SYM_FUNC_START(hchacha_block_ssse3) |
203 | # %rdi: Input state matrix, s |
204 | # %rsi: output (8 32-bit words) |
205 | # %edx: nrounds |
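	#
	# HChaCha variant: runs the permutation only, without the feed-forward
	# addition of the input state, and emits state words 0..3 and 12..15.
	# This is the construction used to derive XChaCha subkeys.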
206 | FRAME_BEGIN |
207 | |
208 | movdqu 0x00(%rdi),%xmm0 |
209 | movdqu 0x10(%rdi),%xmm1 |
210 | movdqu 0x20(%rdi),%xmm2 |
211 | movdqu 0x30(%rdi),%xmm3 |
212 | |
213 | mov %edx,%r8d |
214 | call chacha_permute |
215 | |
216 | movdqu %xmm0,0x00(%rsi) |
217 | movdqu %xmm3,0x10(%rsi) |
218 | |
219 | FRAME_END |
220 | RET |
221 | SYM_FUNC_END(hchacha_block_ssse3) |
222 | |
223 | SYM_FUNC_START(chacha_4block_xor_ssse3) |
224 | # %rdi: Input state matrix, s |
225 | # %rsi: up to 4 data blocks output, o |
226 | # %rdx: up to 4 data blocks input, i |
227 | # %rcx: input/output length in bytes |
228 | # %r8d: nrounds |
229 | |
	# This function encrypts four consecutive ChaCha blocks by loading
	# the state matrix into SSE registers four times.  As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling.  For the final
	# XORing step we transpose the matrix by interleaving 32- and then
	# 64-bit words, which allows us to do XOR in SSE registers.  8- and
	# 16-bit word rotations are done with the slightly better performing
	# SSSE3 byte shuffle; 7- and 12-bit word rotations use the traditional
	# shift+OR.
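	#
	# During the rounds, the four-way rows x0..x3 live on the stack at
	# 0x00..0x30(%rsp), x4..x15 live in %xmm4..%xmm15, %xmm0 serves as
	# scratch and %xmm1..%xmm3 hold CTRINC, ROT8 and ROT16.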
239 | |
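	# Remember the incoming stack pointer in %r10 (restored at .Ldone4)
	# and carve out a 64-byte aligned scratch area for the x0..x3 rows.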
240 | lea 8(%rsp),%r10 |
241 | sub $0x80,%rsp |
242 | and $~63,%rsp |
243 | mov %rcx,%rax |
244 | |
	# x0..15[0-3] = s0..15: broadcast each 32-bit word of the input state
	# into all four lanes of its own register
246 | movq 0x00(%rdi),%xmm1 |
247 | pshufd $0x00,%xmm1,%xmm0 |
248 | pshufd $0x55,%xmm1,%xmm1 |
249 | movq 0x08(%rdi),%xmm3 |
250 | pshufd $0x00,%xmm3,%xmm2 |
251 | pshufd $0x55,%xmm3,%xmm3 |
252 | movq 0x10(%rdi),%xmm5 |
253 | pshufd $0x00,%xmm5,%xmm4 |
254 | pshufd $0x55,%xmm5,%xmm5 |
255 | movq 0x18(%rdi),%xmm7 |
256 | pshufd $0x00,%xmm7,%xmm6 |
257 | pshufd $0x55,%xmm7,%xmm7 |
258 | movq 0x20(%rdi),%xmm9 |
259 | pshufd $0x00,%xmm9,%xmm8 |
260 | pshufd $0x55,%xmm9,%xmm9 |
261 | movq 0x28(%rdi),%xmm11 |
262 | pshufd $0x00,%xmm11,%xmm10 |
263 | pshufd $0x55,%xmm11,%xmm11 |
264 | movq 0x30(%rdi),%xmm13 |
265 | pshufd $0x00,%xmm13,%xmm12 |
266 | pshufd $0x55,%xmm13,%xmm13 |
267 | movq 0x38(%rdi),%xmm15 |
268 | pshufd $0x00,%xmm15,%xmm14 |
269 | pshufd $0x55,%xmm15,%xmm15 |
270 | # x0..3 on stack |
271 | movdqa %xmm0,0x00(%rsp) |
272 | movdqa %xmm1,0x10(%rsp) |
273 | movdqa %xmm2,0x20(%rsp) |
274 | movdqa %xmm3,0x30(%rsp) |
275 | |
276 | movdqa CTRINC(%rip),%xmm1 |
277 | movdqa ROT8(%rip),%xmm2 |
278 | movdqa ROT16(%rip),%xmm3 |
279 | |
280 | # x12 += counter values 0-3 |
281 | paddd %xmm1,%xmm12 |
282 | |
283 | .Ldoubleround4: |
284 | # x0 += x4, x12 = rotl32(x12 ^ x0, 16) |
285 | movdqa 0x00(%rsp),%xmm0 |
286 | paddd %xmm4,%xmm0 |
287 | movdqa %xmm0,0x00(%rsp) |
288 | pxor %xmm0,%xmm12 |
289 | pshufb %xmm3,%xmm12 |
290 | # x1 += x5, x13 = rotl32(x13 ^ x1, 16) |
291 | movdqa 0x10(%rsp),%xmm0 |
292 | paddd %xmm5,%xmm0 |
293 | movdqa %xmm0,0x10(%rsp) |
294 | pxor %xmm0,%xmm13 |
295 | pshufb %xmm3,%xmm13 |
296 | # x2 += x6, x14 = rotl32(x14 ^ x2, 16) |
297 | movdqa 0x20(%rsp),%xmm0 |
298 | paddd %xmm6,%xmm0 |
299 | movdqa %xmm0,0x20(%rsp) |
300 | pxor %xmm0,%xmm14 |
301 | pshufb %xmm3,%xmm14 |
302 | # x3 += x7, x15 = rotl32(x15 ^ x3, 16) |
303 | movdqa 0x30(%rsp),%xmm0 |
304 | paddd %xmm7,%xmm0 |
305 | movdqa %xmm0,0x30(%rsp) |
306 | pxor %xmm0,%xmm15 |
307 | pshufb %xmm3,%xmm15 |
308 | |
309 | # x8 += x12, x4 = rotl32(x4 ^ x8, 12) |
310 | paddd %xmm12,%xmm8 |
311 | pxor %xmm8,%xmm4 |
312 | movdqa %xmm4,%xmm0 |
313 | pslld $12,%xmm0 |
314 | psrld $20,%xmm4 |
315 | por %xmm0,%xmm4 |
316 | # x9 += x13, x5 = rotl32(x5 ^ x9, 12) |
317 | paddd %xmm13,%xmm9 |
318 | pxor %xmm9,%xmm5 |
319 | movdqa %xmm5,%xmm0 |
320 | pslld $12,%xmm0 |
321 | psrld $20,%xmm5 |
322 | por %xmm0,%xmm5 |
323 | # x10 += x14, x6 = rotl32(x6 ^ x10, 12) |
324 | paddd %xmm14,%xmm10 |
325 | pxor %xmm10,%xmm6 |
326 | movdqa %xmm6,%xmm0 |
327 | pslld $12,%xmm0 |
328 | psrld $20,%xmm6 |
329 | por %xmm0,%xmm6 |
330 | # x11 += x15, x7 = rotl32(x7 ^ x11, 12) |
331 | paddd %xmm15,%xmm11 |
332 | pxor %xmm11,%xmm7 |
333 | movdqa %xmm7,%xmm0 |
334 | pslld $12,%xmm0 |
335 | psrld $20,%xmm7 |
336 | por %xmm0,%xmm7 |
337 | |
338 | # x0 += x4, x12 = rotl32(x12 ^ x0, 8) |
339 | movdqa 0x00(%rsp),%xmm0 |
340 | paddd %xmm4,%xmm0 |
341 | movdqa %xmm0,0x00(%rsp) |
342 | pxor %xmm0,%xmm12 |
343 | pshufb %xmm2,%xmm12 |
344 | # x1 += x5, x13 = rotl32(x13 ^ x1, 8) |
345 | movdqa 0x10(%rsp),%xmm0 |
346 | paddd %xmm5,%xmm0 |
347 | movdqa %xmm0,0x10(%rsp) |
348 | pxor %xmm0,%xmm13 |
349 | pshufb %xmm2,%xmm13 |
350 | # x2 += x6, x14 = rotl32(x14 ^ x2, 8) |
351 | movdqa 0x20(%rsp),%xmm0 |
352 | paddd %xmm6,%xmm0 |
353 | movdqa %xmm0,0x20(%rsp) |
354 | pxor %xmm0,%xmm14 |
355 | pshufb %xmm2,%xmm14 |
356 | # x3 += x7, x15 = rotl32(x15 ^ x3, 8) |
357 | movdqa 0x30(%rsp),%xmm0 |
358 | paddd %xmm7,%xmm0 |
359 | movdqa %xmm0,0x30(%rsp) |
360 | pxor %xmm0,%xmm15 |
361 | pshufb %xmm2,%xmm15 |
362 | |
363 | # x8 += x12, x4 = rotl32(x4 ^ x8, 7) |
364 | paddd %xmm12,%xmm8 |
365 | pxor %xmm8,%xmm4 |
366 | movdqa %xmm4,%xmm0 |
367 | pslld $7,%xmm0 |
368 | psrld $25,%xmm4 |
369 | por %xmm0,%xmm4 |
370 | # x9 += x13, x5 = rotl32(x5 ^ x9, 7) |
371 | paddd %xmm13,%xmm9 |
372 | pxor %xmm9,%xmm5 |
373 | movdqa %xmm5,%xmm0 |
374 | pslld $7,%xmm0 |
375 | psrld $25,%xmm5 |
376 | por %xmm0,%xmm5 |
377 | # x10 += x14, x6 = rotl32(x6 ^ x10, 7) |
378 | paddd %xmm14,%xmm10 |
379 | pxor %xmm10,%xmm6 |
380 | movdqa %xmm6,%xmm0 |
381 | pslld $7,%xmm0 |
382 | psrld $25,%xmm6 |
383 | por %xmm0,%xmm6 |
384 | # x11 += x15, x7 = rotl32(x7 ^ x11, 7) |
385 | paddd %xmm15,%xmm11 |
386 | pxor %xmm11,%xmm7 |
387 | movdqa %xmm7,%xmm0 |
388 | pslld $7,%xmm0 |
389 | psrld $25,%xmm7 |
390 | por %xmm0,%xmm7 |
391 | |
392 | # x0 += x5, x15 = rotl32(x15 ^ x0, 16) |
393 | movdqa 0x00(%rsp),%xmm0 |
394 | paddd %xmm5,%xmm0 |
395 | movdqa %xmm0,0x00(%rsp) |
396 | pxor %xmm0,%xmm15 |
397 | pshufb %xmm3,%xmm15 |
398 | # x1 += x6, x12 = rotl32(x12 ^ x1, 16) |
399 | movdqa 0x10(%rsp),%xmm0 |
400 | paddd %xmm6,%xmm0 |
401 | movdqa %xmm0,0x10(%rsp) |
402 | pxor %xmm0,%xmm12 |
403 | pshufb %xmm3,%xmm12 |
404 | # x2 += x7, x13 = rotl32(x13 ^ x2, 16) |
405 | movdqa 0x20(%rsp),%xmm0 |
406 | paddd %xmm7,%xmm0 |
407 | movdqa %xmm0,0x20(%rsp) |
408 | pxor %xmm0,%xmm13 |
409 | pshufb %xmm3,%xmm13 |
410 | # x3 += x4, x14 = rotl32(x14 ^ x3, 16) |
411 | movdqa 0x30(%rsp),%xmm0 |
412 | paddd %xmm4,%xmm0 |
413 | movdqa %xmm0,0x30(%rsp) |
414 | pxor %xmm0,%xmm14 |
415 | pshufb %xmm3,%xmm14 |
416 | |
417 | # x10 += x15, x5 = rotl32(x5 ^ x10, 12) |
418 | paddd %xmm15,%xmm10 |
419 | pxor %xmm10,%xmm5 |
420 | movdqa %xmm5,%xmm0 |
421 | pslld $12,%xmm0 |
422 | psrld $20,%xmm5 |
423 | por %xmm0,%xmm5 |
424 | # x11 += x12, x6 = rotl32(x6 ^ x11, 12) |
425 | paddd %xmm12,%xmm11 |
426 | pxor %xmm11,%xmm6 |
427 | movdqa %xmm6,%xmm0 |
428 | pslld $12,%xmm0 |
429 | psrld $20,%xmm6 |
430 | por %xmm0,%xmm6 |
431 | # x8 += x13, x7 = rotl32(x7 ^ x8, 12) |
432 | paddd %xmm13,%xmm8 |
433 | pxor %xmm8,%xmm7 |
434 | movdqa %xmm7,%xmm0 |
435 | pslld $12,%xmm0 |
436 | psrld $20,%xmm7 |
437 | por %xmm0,%xmm7 |
438 | # x9 += x14, x4 = rotl32(x4 ^ x9, 12) |
439 | paddd %xmm14,%xmm9 |
440 | pxor %xmm9,%xmm4 |
441 | movdqa %xmm4,%xmm0 |
442 | pslld $12,%xmm0 |
443 | psrld $20,%xmm4 |
444 | por %xmm0,%xmm4 |
445 | |
446 | # x0 += x5, x15 = rotl32(x15 ^ x0, 8) |
447 | movdqa 0x00(%rsp),%xmm0 |
448 | paddd %xmm5,%xmm0 |
449 | movdqa %xmm0,0x00(%rsp) |
450 | pxor %xmm0,%xmm15 |
451 | pshufb %xmm2,%xmm15 |
452 | # x1 += x6, x12 = rotl32(x12 ^ x1, 8) |
453 | movdqa 0x10(%rsp),%xmm0 |
454 | paddd %xmm6,%xmm0 |
455 | movdqa %xmm0,0x10(%rsp) |
456 | pxor %xmm0,%xmm12 |
457 | pshufb %xmm2,%xmm12 |
458 | # x2 += x7, x13 = rotl32(x13 ^ x2, 8) |
459 | movdqa 0x20(%rsp),%xmm0 |
460 | paddd %xmm7,%xmm0 |
461 | movdqa %xmm0,0x20(%rsp) |
462 | pxor %xmm0,%xmm13 |
463 | pshufb %xmm2,%xmm13 |
464 | # x3 += x4, x14 = rotl32(x14 ^ x3, 8) |
465 | movdqa 0x30(%rsp),%xmm0 |
466 | paddd %xmm4,%xmm0 |
467 | movdqa %xmm0,0x30(%rsp) |
468 | pxor %xmm0,%xmm14 |
469 | pshufb %xmm2,%xmm14 |
470 | |
471 | # x10 += x15, x5 = rotl32(x5 ^ x10, 7) |
472 | paddd %xmm15,%xmm10 |
473 | pxor %xmm10,%xmm5 |
474 | movdqa %xmm5,%xmm0 |
475 | pslld $7,%xmm0 |
476 | psrld $25,%xmm5 |
477 | por %xmm0,%xmm5 |
478 | # x11 += x12, x6 = rotl32(x6 ^ x11, 7) |
479 | paddd %xmm12,%xmm11 |
480 | pxor %xmm11,%xmm6 |
481 | movdqa %xmm6,%xmm0 |
482 | pslld $7,%xmm0 |
483 | psrld $25,%xmm6 |
484 | por %xmm0,%xmm6 |
485 | # x8 += x13, x7 = rotl32(x7 ^ x8, 7) |
486 | paddd %xmm13,%xmm8 |
487 | pxor %xmm8,%xmm7 |
488 | movdqa %xmm7,%xmm0 |
489 | pslld $7,%xmm0 |
490 | psrld $25,%xmm7 |
491 | por %xmm0,%xmm7 |
492 | # x9 += x14, x4 = rotl32(x4 ^ x9, 7) |
493 | paddd %xmm14,%xmm9 |
494 | pxor %xmm9,%xmm4 |
495 | movdqa %xmm4,%xmm0 |
496 | pslld $7,%xmm0 |
497 | psrld $25,%xmm4 |
498 | por %xmm0,%xmm4 |
499 | |
500 | sub $2,%r8d |
501 | jnz .Ldoubleround4 |
502 | |
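	# Feed-forward: add the original input state (plus the per-block
	# counter increments for x12) back into the permuted state, block by
	# block, rebroadcasting each state word as above.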
503 | # x0[0-3] += s0[0] |
504 | # x1[0-3] += s0[1] |
505 | movq 0x00(%rdi),%xmm3 |
506 | pshufd $0x00,%xmm3,%xmm2 |
507 | pshufd $0x55,%xmm3,%xmm3 |
508 | paddd 0x00(%rsp),%xmm2 |
509 | movdqa %xmm2,0x00(%rsp) |
510 | paddd 0x10(%rsp),%xmm3 |
511 | movdqa %xmm3,0x10(%rsp) |
512 | # x2[0-3] += s0[2] |
513 | # x3[0-3] += s0[3] |
514 | movq 0x08(%rdi),%xmm3 |
515 | pshufd $0x00,%xmm3,%xmm2 |
516 | pshufd $0x55,%xmm3,%xmm3 |
517 | paddd 0x20(%rsp),%xmm2 |
518 | movdqa %xmm2,0x20(%rsp) |
519 | paddd 0x30(%rsp),%xmm3 |
520 | movdqa %xmm3,0x30(%rsp) |
521 | |
522 | # x4[0-3] += s1[0] |
523 | # x5[0-3] += s1[1] |
524 | movq 0x10(%rdi),%xmm3 |
525 | pshufd $0x00,%xmm3,%xmm2 |
526 | pshufd $0x55,%xmm3,%xmm3 |
527 | paddd %xmm2,%xmm4 |
528 | paddd %xmm3,%xmm5 |
529 | # x6[0-3] += s1[2] |
530 | # x7[0-3] += s1[3] |
531 | movq 0x18(%rdi),%xmm3 |
532 | pshufd $0x00,%xmm3,%xmm2 |
533 | pshufd $0x55,%xmm3,%xmm3 |
534 | paddd %xmm2,%xmm6 |
535 | paddd %xmm3,%xmm7 |
536 | |
537 | # x8[0-3] += s2[0] |
538 | # x9[0-3] += s2[1] |
539 | movq 0x20(%rdi),%xmm3 |
540 | pshufd $0x00,%xmm3,%xmm2 |
541 | pshufd $0x55,%xmm3,%xmm3 |
542 | paddd %xmm2,%xmm8 |
543 | paddd %xmm3,%xmm9 |
544 | # x10[0-3] += s2[2] |
545 | # x11[0-3] += s2[3] |
546 | movq 0x28(%rdi),%xmm3 |
547 | pshufd $0x00,%xmm3,%xmm2 |
548 | pshufd $0x55,%xmm3,%xmm3 |
549 | paddd %xmm2,%xmm10 |
550 | paddd %xmm3,%xmm11 |
551 | |
552 | # x12[0-3] += s3[0] |
553 | # x13[0-3] += s3[1] |
554 | movq 0x30(%rdi),%xmm3 |
555 | pshufd $0x00,%xmm3,%xmm2 |
556 | pshufd $0x55,%xmm3,%xmm3 |
557 | paddd %xmm2,%xmm12 |
558 | paddd %xmm3,%xmm13 |
559 | # x14[0-3] += s3[2] |
560 | # x15[0-3] += s3[3] |
561 | movq 0x38(%rdi),%xmm3 |
562 | pshufd $0x00,%xmm3,%xmm2 |
563 | pshufd $0x55,%xmm3,%xmm3 |
564 | paddd %xmm2,%xmm14 |
565 | paddd %xmm3,%xmm15 |
566 | |
567 | # x12 += counter values 0-3 |
568 | paddd %xmm1,%xmm12 |
569 | |
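	# Transpose each group of four rows ({x0..x3} on the stack, {x4..x7},
	# {x8..x11}, {x12..x15} in registers) from one-state-word-per-register
	# to one-block-per-register layout, so that 16 consecutive keystream
	# bytes of a block can be XORed at a time: interleave 32-bit words of
	# rows n and n+1, then 64-bit words of rows n and n+2.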
570 | # interleave 32-bit words in state n, n+1 |
571 | movdqa 0x00(%rsp),%xmm0 |
572 | movdqa 0x10(%rsp),%xmm1 |
573 | movdqa %xmm0,%xmm2 |
574 | punpckldq %xmm1,%xmm2 |
575 | punpckhdq %xmm1,%xmm0 |
576 | movdqa %xmm2,0x00(%rsp) |
577 | movdqa %xmm0,0x10(%rsp) |
578 | movdqa 0x20(%rsp),%xmm0 |
579 | movdqa 0x30(%rsp),%xmm1 |
580 | movdqa %xmm0,%xmm2 |
581 | punpckldq %xmm1,%xmm2 |
582 | punpckhdq %xmm1,%xmm0 |
583 | movdqa %xmm2,0x20(%rsp) |
584 | movdqa %xmm0,0x30(%rsp) |
585 | movdqa %xmm4,%xmm0 |
586 | punpckldq %xmm5,%xmm4 |
587 | punpckhdq %xmm5,%xmm0 |
588 | movdqa %xmm0,%xmm5 |
589 | movdqa %xmm6,%xmm0 |
590 | punpckldq %xmm7,%xmm6 |
591 | punpckhdq %xmm7,%xmm0 |
592 | movdqa %xmm0,%xmm7 |
593 | movdqa %xmm8,%xmm0 |
594 | punpckldq %xmm9,%xmm8 |
595 | punpckhdq %xmm9,%xmm0 |
596 | movdqa %xmm0,%xmm9 |
597 | movdqa %xmm10,%xmm0 |
598 | punpckldq %xmm11,%xmm10 |
599 | punpckhdq %xmm11,%xmm0 |
600 | movdqa %xmm0,%xmm11 |
601 | movdqa %xmm12,%xmm0 |
602 | punpckldq %xmm13,%xmm12 |
603 | punpckhdq %xmm13,%xmm0 |
604 | movdqa %xmm0,%xmm13 |
605 | movdqa %xmm14,%xmm0 |
606 | punpckldq %xmm15,%xmm14 |
607 | punpckhdq %xmm15,%xmm0 |
608 | movdqa %xmm0,%xmm15 |
609 | |
610 | # interleave 64-bit words in state n, n+2 |
611 | movdqa 0x00(%rsp),%xmm0 |
612 | movdqa 0x20(%rsp),%xmm1 |
613 | movdqa %xmm0,%xmm2 |
614 | punpcklqdq %xmm1,%xmm2 |
615 | punpckhqdq %xmm1,%xmm0 |
616 | movdqa %xmm2,0x00(%rsp) |
617 | movdqa %xmm0,0x20(%rsp) |
618 | movdqa 0x10(%rsp),%xmm0 |
619 | movdqa 0x30(%rsp),%xmm1 |
620 | movdqa %xmm0,%xmm2 |
621 | punpcklqdq %xmm1,%xmm2 |
622 | punpckhqdq %xmm1,%xmm0 |
623 | movdqa %xmm2,0x10(%rsp) |
624 | movdqa %xmm0,0x30(%rsp) |
625 | movdqa %xmm4,%xmm0 |
626 | punpcklqdq %xmm6,%xmm4 |
627 | punpckhqdq %xmm6,%xmm0 |
628 | movdqa %xmm0,%xmm6 |
629 | movdqa %xmm5,%xmm0 |
630 | punpcklqdq %xmm7,%xmm5 |
631 | punpckhqdq %xmm7,%xmm0 |
632 | movdqa %xmm0,%xmm7 |
633 | movdqa %xmm8,%xmm0 |
634 | punpcklqdq %xmm10,%xmm8 |
635 | punpckhqdq %xmm10,%xmm0 |
636 | movdqa %xmm0,%xmm10 |
637 | movdqa %xmm9,%xmm0 |
638 | punpcklqdq %xmm11,%xmm9 |
639 | punpckhqdq %xmm11,%xmm0 |
640 | movdqa %xmm0,%xmm11 |
641 | movdqa %xmm12,%xmm0 |
642 | punpcklqdq %xmm14,%xmm12 |
643 | punpckhqdq %xmm14,%xmm0 |
644 | movdqa %xmm0,%xmm14 |
645 | movdqa %xmm13,%xmm0 |
646 | punpcklqdq %xmm15,%xmm13 |
647 | punpckhqdq %xmm15,%xmm0 |
648 | movdqa %xmm0,%xmm15 |
649 | |
650 | # xor with corresponding input, write to output |
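	# The keystream is consumed 16 bytes at a time in output order
	# (block 0 at 0x00..0x3f, block 1 at 0x40..0x7f, ...); as soon as
	# fewer than 16 bytes of the requested length remain, the partially
	# needed chunk in %xmm0 is finished at .Lxorpart4.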
651 | movdqa 0x00(%rsp),%xmm0 |
652 | cmp $0x10,%rax |
653 | jl .Lxorpart4 |
654 | movdqu 0x00(%rdx),%xmm1 |
655 | pxor %xmm1,%xmm0 |
656 | movdqu %xmm0,0x00(%rsi) |
657 | |
658 | movdqu %xmm4,%xmm0 |
659 | cmp $0x20,%rax |
660 | jl .Lxorpart4 |
661 | movdqu 0x10(%rdx),%xmm1 |
662 | pxor %xmm1,%xmm0 |
663 | movdqu %xmm0,0x10(%rsi) |
664 | |
665 | movdqu %xmm8,%xmm0 |
666 | cmp $0x30,%rax |
667 | jl .Lxorpart4 |
668 | movdqu 0x20(%rdx),%xmm1 |
669 | pxor %xmm1,%xmm0 |
670 | movdqu %xmm0,0x20(%rsi) |
671 | |
672 | movdqu %xmm12,%xmm0 |
673 | cmp $0x40,%rax |
674 | jl .Lxorpart4 |
675 | movdqu 0x30(%rdx),%xmm1 |
676 | pxor %xmm1,%xmm0 |
677 | movdqu %xmm0,0x30(%rsi) |
678 | |
679 | movdqa 0x20(%rsp),%xmm0 |
680 | cmp $0x50,%rax |
681 | jl .Lxorpart4 |
682 | movdqu 0x40(%rdx),%xmm1 |
683 | pxor %xmm1,%xmm0 |
684 | movdqu %xmm0,0x40(%rsi) |
685 | |
686 | movdqu %xmm6,%xmm0 |
687 | cmp $0x60,%rax |
688 | jl .Lxorpart4 |
689 | movdqu 0x50(%rdx),%xmm1 |
690 | pxor %xmm1,%xmm0 |
691 | movdqu %xmm0,0x50(%rsi) |
692 | |
693 | movdqu %xmm10,%xmm0 |
694 | cmp $0x70,%rax |
695 | jl .Lxorpart4 |
696 | movdqu 0x60(%rdx),%xmm1 |
697 | pxor %xmm1,%xmm0 |
698 | movdqu %xmm0,0x60(%rsi) |
699 | |
700 | movdqu %xmm14,%xmm0 |
701 | cmp $0x80,%rax |
702 | jl .Lxorpart4 |
703 | movdqu 0x70(%rdx),%xmm1 |
704 | pxor %xmm1,%xmm0 |
705 | movdqu %xmm0,0x70(%rsi) |
706 | |
707 | movdqa 0x10(%rsp),%xmm0 |
708 | cmp $0x90,%rax |
709 | jl .Lxorpart4 |
710 | movdqu 0x80(%rdx),%xmm1 |
711 | pxor %xmm1,%xmm0 |
712 | movdqu %xmm0,0x80(%rsi) |
713 | |
714 | movdqu %xmm5,%xmm0 |
715 | cmp $0xa0,%rax |
716 | jl .Lxorpart4 |
717 | movdqu 0x90(%rdx),%xmm1 |
718 | pxor %xmm1,%xmm0 |
719 | movdqu %xmm0,0x90(%rsi) |
720 | |
721 | movdqu %xmm9,%xmm0 |
722 | cmp $0xb0,%rax |
723 | jl .Lxorpart4 |
724 | movdqu 0xa0(%rdx),%xmm1 |
725 | pxor %xmm1,%xmm0 |
726 | movdqu %xmm0,0xa0(%rsi) |
727 | |
728 | movdqu %xmm13,%xmm0 |
729 | cmp $0xc0,%rax |
730 | jl .Lxorpart4 |
731 | movdqu 0xb0(%rdx),%xmm1 |
732 | pxor %xmm1,%xmm0 |
733 | movdqu %xmm0,0xb0(%rsi) |
734 | |
735 | movdqa 0x30(%rsp),%xmm0 |
736 | cmp $0xd0,%rax |
737 | jl .Lxorpart4 |
738 | movdqu 0xc0(%rdx),%xmm1 |
739 | pxor %xmm1,%xmm0 |
740 | movdqu %xmm0,0xc0(%rsi) |
741 | |
742 | movdqu %xmm7,%xmm0 |
743 | cmp $0xe0,%rax |
744 | jl .Lxorpart4 |
745 | movdqu 0xd0(%rdx),%xmm1 |
746 | pxor %xmm1,%xmm0 |
747 | movdqu %xmm0,0xd0(%rsi) |
748 | |
749 | movdqu %xmm11,%xmm0 |
750 | cmp $0xf0,%rax |
751 | jl .Lxorpart4 |
752 | movdqu 0xe0(%rdx),%xmm1 |
753 | pxor %xmm1,%xmm0 |
754 | movdqu %xmm0,0xe0(%rsi) |
755 | |
756 | movdqu %xmm15,%xmm0 |
757 | cmp $0x100,%rax |
758 | jl .Lxorpart4 |
759 | movdqu 0xf0(%rdx),%xmm1 |
760 | pxor %xmm1,%xmm0 |
761 | movdqu %xmm0,0xf0(%rsi) |
762 | |
763 | .Ldone4: |
764 | lea -8(%r10),%rsp |
765 | RET |
766 | |
767 | .Lxorpart4: |
768 | # xor remaining bytes from partial register into output |
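	# As in .Lxorpart above, but the 64-byte aligned scratch area already
	# set up at %rsp is reused as the bounce buffer for the trailing
	# (< 16 byte) chunk held in %xmm0.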
769 | mov %rax,%r9 |
770 | and $0x0f,%r9 |
771 | jz .Ldone4 |
772 | and $~0x0f,%rax |
773 | |
774 | mov %rsi,%r11 |
775 | |
776 | lea (%rdx,%rax),%rsi |
777 | mov %rsp,%rdi |
778 | mov %r9,%rcx |
779 | rep movsb |
780 | |
781 | pxor 0x00(%rsp),%xmm0 |
782 | movdqa %xmm0,0x00(%rsp) |
783 | |
784 | mov %rsp,%rsi |
785 | lea (%r11,%rax),%rdi |
786 | mov %r9,%rcx |
787 | rep movsb |
788 | |
789 | jmp .Ldone4 |
790 | |
791 | SYM_FUNC_END(chacha_4block_xor_ssse3) |
792 | |