| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
| 4 | */ |
| 5 | |
| 6 | #include <linux/linkage.h> |
| 7 | #include <asm/frame.h> |
| 8 | |
.section	.rodata, "a"
.balign	16
/*
 * The 16 bytes "expand 32-byte k", written as four little-endian 32-bit
 * words. Kept 16-byte aligned because it is loaded with movaps.
 */
CONSTANTS:
	.long		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
.text
| 13 | |
/*
 * Minimal SSE2 ChaCha20 block generator. All working state lives in
 * registers and nothing is spilled to the stack. The nonce is fixed at
 * zero; the caller provides the key and an 8-byte block counter, and the
 * advanced counter is stored back before returning. The block count is only
 * tested after a block has been produced, so it must be at least one.
 *
 *	rdi: destination buffer for the output bytes
 *	rsi: 32-byte key
 *	rdx: 8-byte counter, read on entry and updated on exit
 *	rcx: number of 64-byte blocks to generate (non-zero)
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* Argument registers. */
.set	dst,		%rdi
.set	key_in,		%rsi
.set	ctr,		%rdx
.set	blocks_left,	%rcx
/* Double-round countdown; low byte of the %rax used to stage ctr_inc. */
.set	rounds_left,	%al
/* The xmm registers are not callee-saved, so none are preserved here. */
.set	scratch,	%xmm0
.set	row0,		%xmm1
.set	row1,		%xmm2
.set	row2,		%xmm3
.set	row3,		%xmm4
.set	init0,		%xmm5
.set	init1,		%xmm6
.set	init2,		%xmm7
.set	init3,		%xmm8
.set	ctr_inc,	%xmm9

/* \reg = rotl32(\reg, \bits) in every 32-bit lane. Clobbers scratch. */
.macro	chacha_rotl32	reg, bits
	movdqa		\reg,scratch
	pslld		$\bits,scratch
	psrld		$(32-\bits),\reg
	por		scratch,\reg
.endm

/* \acc += \addend, then \target = rotl32(\target ^ \acc, \bits). */
.macro	chacha_mix	acc, addend, target, bits
	paddd		\addend,\acc
	pxor		\acc,\target
	chacha_rotl32	\target, \bits
.endm

/* The four add/xor/rotate steps applied across row0..row3. */
.macro	chacha_rows
	chacha_mix	row0, row1, row3, 16
	chacha_mix	row2, row3, row1, 12
	chacha_mix	row0, row1, row3, 8
	chacha_mix	row2, row3, row1, 7
.endm

/* \row += \init, then store the 16-byte result at \offset(dst). */
.macro	chacha_emit	init, row, offset
	paddd		\init,\row
	movups		\row,\offset(dst)
.endm

	/* init0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),init0
	/* init1,init2 = low and high 16 bytes of the key */
	movups		0x00(key_in),init1
	movups		0x10(key_in),init2
	/* init3 = counter in the low 8 bytes, zero nonce in the high 8 */
	movq		0x00(ctr),init3
	/* ctr_inc = 1 in the low 8 bytes, 0 in the high 8 */
	movq		$1,%rax
	movq		%rax,ctr_inc

.Lnext_block:
	/* Every block starts from a fresh copy of the initial state. */
	movdqa		init0,row0
	movdqa		init1,row1
	movdqa		init2,row2
	movdqa		init3,row3

	/* Ten passes of the double round below. */
	movb		$10,rounds_left
.Ldouble_round:
	/* First half: mix the rows as loaded. */
	chacha_rows

	/* row1[0,1,2,3] = row1[1,2,3,0] */
	pshufd		$0x39,row1,row1
	/* row2[0,1,2,3] = row2[2,3,0,1] */
	pshufd		$0x4e,row2,row2
	/* row3[0,1,2,3] = row3[3,0,1,2] */
	pshufd		$0x93,row3,row3

	/* Second half: same mixing on the rotated rows. */
	chacha_rows

	/* Undo the lane rotation: row1[0,1,2,3] = row1[3,0,1,2] */
	pshufd		$0x93,row1,row1
	/* row2[0,1,2,3] = row2[2,3,0,1] */
	pshufd		$0x4e,row2,row2
	/* row3[0,1,2,3] = row3[1,2,3,0] */
	pshufd		$0x39,row3,row3

	decb		rounds_left
	jnz		.Ldouble_round

	/* Output block = permuted rows + initial state, 16 bytes at a time. */
	chacha_emit	init0, row0, 0x00
	chacha_emit	init1, row1, 0x10
	chacha_emit	init2, row2, 0x20
	chacha_emit	init3, row3, 0x30

	/* Advance the 64-bit block counter held in the low half of init3. */
	paddq		ctr_inc,init3

	/* Move on to the next 64-byte slot; stop once no blocks remain. */
	addq		$64,dst
	decq		blocks_left
	jnz		.Lnext_block

	/* Hand the advanced counter back to the caller. */
	movq		init3,0x00(ctr)

	/*
	 * Scrub the registers that held key material or keystream, in case
	 * nothing else overwrites them afterwards.
	 */
.irp	reg, row0, row1, row2, row3, init1, init2, scratch
	pxor		\reg,\reg
.endr

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
| 179 | |