| 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
| 2 | /* |
| 3 | * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions |
| 4 | * |
| 5 | * Copyright (C) 2015 Martin Willi |
| 6 | */ |
| 7 | |
| 8 | #include <linux/linkage.h> |
| 9 | #include <asm/frame.h> |
| 10 | |
/*
 * Constants shared by the SSSE3 ChaCha routines below. Each one is a single
 * 16-byte vector, 16-byte aligned because it is fetched with movdqa, and
 * placed in its own mergeable .rodata.cst16.* section.
 */

	/*
	 * ROT8: pshufb control mask. Within every 32-bit lane, result bytes
	 * 0,1,2,3 are taken from source bytes 3,0,1,2, i.e. rotl32(x, 8).
	 */
	.section	.rodata.cst16.ROT8, "aM", @progbits, 16
	.align 16
ROT8:
	.byte	0x03, 0x00, 0x01, 0x02
	.byte	0x07, 0x04, 0x05, 0x06
	.byte	0x0b, 0x08, 0x09, 0x0a
	.byte	0x0f, 0x0c, 0x0d, 0x0e

	/*
	 * ROT16: pshufb control mask. Within every 32-bit lane, result bytes
	 * 0,1,2,3 are taken from source bytes 2,3,0,1, i.e. rotl32(x, 16).
	 */
	.section	.rodata.cst16.ROT16, "aM", @progbits, 16
	.align 16
ROT16:
	.byte	0x02, 0x03, 0x00, 0x01
	.byte	0x06, 0x07, 0x04, 0x05
	.byte	0x0a, 0x0b, 0x08, 0x09
	.byte	0x0e, 0x0f, 0x0c, 0x0d

	/*
	 * CTRINC: per-lane offsets 0,1,2,3 (lane 0 first) that the four-block
	 * routine adds to the broadcast state word 12.
	 */
	.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
	.align 16
CTRINC:
	.long	0, 1, 2, 3

	.text
| 22 | |
/*
 * chacha_ssse3_round - one ChaCha round over the rows held in %xmm0-%xmm3
 *
 * The four 32-bit lanes of each register are processed in parallel, so a
 * single expansion performs four quarter-rounds. Rotations by 16 and 8 bits
 * are byte shuffles (pshufb), rotations by 12 and 7 bits are shift+OR.
 *
 * Expects: %xmm4 = ROT8 mask, %xmm5 = ROT16 mask
 * Scratch: %xmm6 (rotate by 12), %xmm7 (rotate by 7)
 */
.macro chacha_ssse3_round
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1
.endm

/*
 * chacha_permute - apply the ChaCha permutation to one 64-byte block
 *
 * In/out:   %xmm0-%xmm3  rows 0-3 of the state matrix, permuted in place
 * In:       %r8d         round count
 * Clobbers: %r8d, %xmm4-%xmm7
 *
 * Every loop iteration runs two rounds: one on the rows as they are, then one
 * after rows 1-3 have been lane-rotated by 1, 2 and 3 positions. A second set
 * of lane rotations undoes the first, so the register layout is the same at
 * the top of every iteration and on return.
 *
 * The loop subtracts 2 from %r8d and exits only when the result is exactly
 * zero.
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa		ROT16(%rip),%xmm5
	movdqa		ROT8(%rip),%xmm4

.Lpermute_doubleround:
	# first round of the pair, rows in their natural layout
	chacha_ssse3_round

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# second round of the pair, on the lane-rotated rows
	chacha_ssse3_round

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	sub		$2,%r8d
	jnz		.Lpermute_doubleround

	RET
SYM_FUNC_END(chacha_permute)
| 113 | |
/*
 * chacha_block_xor_ssse3 - XOR up to one 64-byte block with ChaCha keystream
 *
 * The keystream block is permute(s) + s, computed one 16-byte row at a time.
 * For every complete 16-byte chunk covered by the length, the input chunk is
 * XORed with the matching keystream row and stored to the output. A trailing
 * chunk of 1-15 bytes is bounced through a small stack buffer so that neither
 * the input nor the output is touched beyond the requested length. No more
 * than 64 bytes are processed, whatever the length.
 *
 * The state at %rdi is only read; nothing is written back to it.
 *
 * The length is a byte count and is compared unsigned (jb). For any length
 * below 2^63 this matches a signed comparison; a value with the top bit set
 * is treated as "at least one full block" rather than being sent to the tail
 * path with a meaningless offset.
 *
 * Clobbers: %rax, %r8d, %xmm0-%xmm11, flags;
 *           on the partial-chunk path also %rcx, %rsi, %rdi, %r9-%r11
 */
SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3
	# keep s0..3 in xmm8..11 for the feed-forward addition after the rounds
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	# park the length in %rax; %rcx is needed as the rep movsb count below
	mov		%rcx,%rax
	call		chacha_permute

	# At each branch to .Lxorpart, %xmm0 holds the keystream row for the
	# chunk that is about to be processed.

	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jb		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jb		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jb		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jb		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
	# %r9 = number of trailing bytes (length mod 16); nothing to do when 0
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	# %rax = offset of the partial chunk (length rounded down to 16)
	and		$~0x0f,%rax

	# rep movsb needs %rsi/%rdi, so remember the output pointer in %r11
	mov		%rsi,%r11

	# carve an aligned bounce buffer of at least 16 bytes out of the stack;
	# %r10 remembers the old stack pointer (+8) so it can be restored
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the trailing input bytes into the buffer
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the keystream row; only the first %r9
	# bytes of the result are copied out
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	# copy the result bytes from the buffer to the output
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	# drop the bounce buffer
	lea		-8(%r10),%rsp
	jmp		.Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)
| 201 | |
/*
 * hchacha_block_ssse3 - permute a state matrix and emit rows 0 and 3
 *
 *   %rdi: Input state matrix, s
 *   %rsi: output (8 32-bit words)
 *   %edx: nrounds
 *
 * The state is loaded, run through chacha_permute, and the first and last
 * rows of the permuted state are stored back to back at the output. Unlike
 * the block functions, the original state is not added back in afterwards.
 *
 * Clobbers: %r8d, %xmm0-%xmm7, flags
 */
SYM_FUNC_START(hchacha_block_ssse3)
	FRAME_BEGIN

	# chacha_permute takes its round count in %r8d
	mov		%edx,%r8d

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3

	call		chacha_permute

	# output words 0-3 = permuted row 0, words 4-7 = permuted row 3
	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)
| 222 | |
/*
 * chacha_4block_xor_ssse3 - XOR up to four 64-byte blocks with ChaCha keystream
 *
 * Register and stack usage while the rounds run:
 *   0x00-0x3f(%rsp)  x0..x3 (one 16-byte slot each, 64-byte aligned area)
 *   %xmm4-%xmm15     x4..x15
 *   %xmm0            scratch
 *   %xmm1            CTRINC, %xmm2 = ROT8 mask, %xmm3 = ROT16 mask
 *   %rax             length in bytes
 *   %r10             caller stack pointer + 8, used to restore %rsp
 *
 * Lane j of every xN belongs to block j; lane j of x12 has j added to the
 * state word, so the four lanes hold four consecutive values of word 12.
 *
 * The length is a byte count and is compared unsigned (jb). For any length
 * below 2^63 this matches a signed comparison; a value with the top bit set
 * is treated as "at least four full blocks" rather than being sent to the
 * tail path with a meaningless offset. No more than 256 bytes are processed.
 *
 * The state at %rdi is only read; nothing is written back to it.
 *
 * Clobbers: %rax, %r8d, %r10, %xmm0-%xmm15, flags;
 *           on the partial-chunk path also %rcx, %rsi, %rdi, %r9, %r11
 */
SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading
	# the state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For final XORing step
	# we transpose the matrix by interleaving 32- and then 64-bit words,
	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
	# done with the slightly better performing SSSE3 byte shuffling,
	# 7/12-bit word rotation uses traditional shift+OR.

	# reserve a 64-byte aligned scratch area; %r10 lets us undo it later
	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
	# park the length in %rax; %rcx is needed as the rep movsb count below
	mov		%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	# each movq fetches two state words, pshufd broadcasts one per register
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	# Two rounds per iteration: the first pairs words (0,4,8,12),
	# (1,5,9,13), (2,6,10,14), (3,7,11,15); the second pairs (0,5,10,15),
	# (1,6,11,12), (2,7,8,13), (3,4,9,14). The loop exits only when
	# %r8d reaches exactly zero.
.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# Feed-forward: add the original state words back in. %xmm2/%xmm3 no
	# longer need to hold the rotate masks and are reused as broadcast
	# temporaries; %xmm1 still holds CTRINC.

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	# Transpose from "one word of four blocks per register" to "four
	# consecutive words of one block per register", in two steps.

	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# Keystream layout after the transpose (words 0-3, 4-7, 8-11, 12-15):
	#   block 0: 0x00(%rsp), %xmm4, %xmm8,  %xmm12
	#   block 1: 0x20(%rsp), %xmm6, %xmm10, %xmm14
	#   block 2: 0x10(%rsp), %xmm5, %xmm9,  %xmm13
	#   block 3: 0x30(%rsp), %xmm7, %xmm11, %xmm15
	#
	# Each step below puts the next keystream chunk in %xmm0, then checks
	# (unsigned) whether the length covers the whole chunk. If not,
	# .Lxorpart4 handles the remaining 0-15 bytes using %xmm0.

	# xor with corresponding input, write to output
	movdqa		0x00(%rsp),%xmm0
	cmp		$0x10,%rax
	jb		.Lxorpart4
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)

	movdqu		%xmm4,%xmm0
	cmp		$0x20,%rax
	jb		.Lxorpart4
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)

	movdqu		%xmm8,%xmm0
	cmp		$0x30,%rax
	jb		.Lxorpart4
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x20(%rsi)

	movdqu		%xmm12,%xmm0
	cmp		$0x40,%rax
	jb		.Lxorpart4
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	cmp		$0x50,%rax
	jb		.Lxorpart4
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)

	movdqu		%xmm6,%xmm0
	cmp		$0x60,%rax
	jb		.Lxorpart4
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x50(%rsi)

	movdqu		%xmm10,%xmm0
	cmp		$0x70,%rax
	jb		.Lxorpart4
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x60(%rsi)

	movdqu		%xmm14,%xmm0
	cmp		$0x80,%rax
	jb		.Lxorpart4
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	cmp		$0x90,%rax
	jb		.Lxorpart4
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)

	movdqu		%xmm5,%xmm0
	cmp		$0xa0,%rax
	jb		.Lxorpart4
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x90(%rsi)

	movdqu		%xmm9,%xmm0
	cmp		$0xb0,%rax
	jb		.Lxorpart4
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xa0(%rsi)

	movdqu		%xmm13,%xmm0
	cmp		$0xc0,%rax
	jb		.Lxorpart4
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	cmp		$0xd0,%rax
	jb		.Lxorpart4
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)

	movdqu		%xmm7,%xmm0
	cmp		$0xe0,%rax
	jb		.Lxorpart4
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xd0(%rsi)

	movdqu		%xmm11,%xmm0
	cmp		$0xf0,%rax
	jb		.Lxorpart4
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xe0(%rsi)

	movdqu		%xmm15,%xmm0
	cmp		$0x100,%rax
	jb		.Lxorpart4
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xf0(%rsi)

.Ldone4:
	# release the scratch area reserved on entry
	lea		-8(%r10),%rsp
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	# %r9 = number of trailing bytes (length mod 16); nothing to do when 0
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	# %rax = offset of the partial chunk (length rounded down to 16)
	and		$~0x0f,%rax

	# rep movsb needs %rsi/%rdi, so remember the output pointer in %r11
	mov		%rsi,%r11

	# copy the trailing input bytes to the bottom of the scratch area; the
	# keystream chunk needed here is already in %xmm0, so overwriting the
	# slot at 0x00(%rsp) is harmless at this point
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole 16-byte slot with the keystream chunk; only the first
	# %r9 bytes of the result are copied out
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	# copy the result bytes from the scratch area to the output
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)
| 792 | |