/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
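
/*
 * SSSE3 and AVX-512VL implementations of the BLAKE2s compression
 * function. Both entry points take the same arguments, matching the
 * C prototype used by the kernel's BLAKE2s code (System V AMD64 ABI):
 *
 *   %rdi: struct blake2s_state *state (h, then the counter t and flags f)
 *   %rsi: const u8 *block             (nblocks * 64 bytes of message)
 *   %rdx: size_t nblocks              (number of 64-byte blocks)
 *   %rcx: u32 inc                     (counter increment per block)
 */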

#include <linux/linkage.h>

.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
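/* BLAKE2s initialization vector: the SHA-256 initial hash values. */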
IV:     .octa 0xA54FF53A3C6EF372BB67AE856A09E667
        .octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
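/* pshufb mask rotating each 32-bit lane right by 16 bits. */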
ROT16:  .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
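/* pshufb mask rotating each 32-bit lane right by 8 bits. */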
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
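/*
 * BLAKE2s message schedule: ten rounds of byte indices into the
 * 16-word message block. The indices are pre-shuffled relative to the
 * reference sigma table so that each group of four feeds one
 * four-lane G evaluation of the vectorized column/diagonal steps.
 */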
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
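/*
 * The same schedule as 32-bit indices for vpermi2d. Because the round
 * loop below permutes the message registers in place (%ymm6/%ymm7 are
 * overwritten with the permuted words each round), rows after the
 * first select from the previous round's word order rather than the
 * original message order, which is why they differ from SIGMA above.
 */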
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
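/*
 * SSSE3 variant. Register use within the loops:
 *   %xmm0,%xmm1   rows 0-1 of the state matrix (the chaining value h)
 *   %xmm2,%xmm3   rows 2-3 (IV and IV ^ (t, f))
 *   %xmm4-%xmm8   message-word gathers and rotate scratch
 *   %xmm10,%xmm11 copy of the input h for the final feed-forward
 *   %xmm12,%xmm13 pshufb rotation masks (ror 16 and ror 8)
 *   %xmm14        t (64-bit counter) and f (finalization flags)
 *   %xmm15        the 64-bit counter increment
 */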
SYM_FUNC_START(blake2s_compress_ssse3)
        testq %rdx,%rdx
        je .Lendofloop
        movdqu (%rdi),%xmm0
        movdqu 0x10(%rdi),%xmm1
        movdqa ROT16(%rip),%xmm12
        movdqa ROR328(%rip),%xmm13
        movdqu 0x20(%rdi),%xmm14
        movq %rcx,%xmm15
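        /* %r8 marks the end of SIGMA: 10 rounds * 16 bytes = 0xa0. */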
        leaq SIGMA+0xa0(%rip),%r8
        jmp .Lbeginofloop
.align 32
.Lbeginofloop:
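        /*
         * Per block: save the chaining value, advance the counter by
         * inc, and build rows 2-3 of the state matrix from the IV,
         * with row 3 XORed against the counter and flag words.
         */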
        movdqa %xmm0,%xmm10
        movdqa %xmm1,%xmm11
        paddq %xmm15,%xmm14
        movdqa IV(%rip),%xmm2
        movdqa %xmm14,%xmm3
        pxor IV+0x10(%rip),%xmm3
        leaq SIGMA(%rip),%rcx
.Lroundloop:
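        /* Gather the four message words for the first column-step G half. */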
        movzbl (%rcx),%eax
        movd (%rsi,%rax,4),%xmm4
        movzbl 0x1(%rcx),%eax
        movd (%rsi,%rax,4),%xmm5
        movzbl 0x2(%rcx),%eax
        movd (%rsi,%rax,4),%xmm6
        movzbl 0x3(%rcx),%eax
        movd (%rsi,%rax,4),%xmm7
        punpckldq %xmm5,%xmm4
        punpckldq %xmm7,%xmm6
        punpcklqdq %xmm6,%xmm4
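        /*
         * First G half: a += m + b; d = ror32(d ^ a, 16); c += d;
         * b = ror32(b ^ c, 12). The 16-bit rotate uses pshufb; the
         * 12-bit rotate is composed from shifts, since SSSE3 has no
         * vector rotate instruction.
         */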
        paddd %xmm4,%xmm0
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm12,%xmm3
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm8
        psrld $0xc,%xmm1
        pslld $0x14,%xmm8
        por %xmm8,%xmm1
        movzbl 0x4(%rcx),%eax
        movd (%rsi,%rax,4),%xmm5
        movzbl 0x5(%rcx),%eax
        movd (%rsi,%rax,4),%xmm6
        movzbl 0x6(%rcx),%eax
        movd (%rsi,%rax,4),%xmm7
        movzbl 0x7(%rcx),%eax
        movd (%rsi,%rax,4),%xmm4
        punpckldq %xmm6,%xmm5
        punpckldq %xmm4,%xmm7
        punpcklqdq %xmm7,%xmm5
        paddd %xmm5,%xmm0
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm13,%xmm3
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm8
        psrld $0x7,%xmm1
        pslld $0x19,%xmm8
        por %xmm8,%xmm1
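        /* Diagonalize: rotate rows 0, 2 and 3 so G operates on the diagonals. */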
        pshufd $0x93,%xmm0,%xmm0
        pshufd $0x4e,%xmm3,%xmm3
        pshufd $0x39,%xmm2,%xmm2
        movzbl 0x8(%rcx),%eax
        movd (%rsi,%rax,4),%xmm6
        movzbl 0x9(%rcx),%eax
        movd (%rsi,%rax,4),%xmm7
        movzbl 0xa(%rcx),%eax
        movd (%rsi,%rax,4),%xmm4
        movzbl 0xb(%rcx),%eax
        movd (%rsi,%rax,4),%xmm5
        punpckldq %xmm7,%xmm6
        punpckldq %xmm5,%xmm4
        punpcklqdq %xmm4,%xmm6
        paddd %xmm6,%xmm0
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm12,%xmm3
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm8
        psrld $0xc,%xmm1
        pslld $0x14,%xmm8
        por %xmm8,%xmm1
        movzbl 0xc(%rcx),%eax
        movd (%rsi,%rax,4),%xmm7
        movzbl 0xd(%rcx),%eax
        movd (%rsi,%rax,4),%xmm4
        movzbl 0xe(%rcx),%eax
        movd (%rsi,%rax,4),%xmm5
        movzbl 0xf(%rcx),%eax
        movd (%rsi,%rax,4),%xmm6
        punpckldq %xmm4,%xmm7
        punpckldq %xmm6,%xmm5
        punpcklqdq %xmm5,%xmm7
        paddd %xmm7,%xmm0
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm13,%xmm3
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm8
        psrld $0x7,%xmm1
        pslld $0x19,%xmm8
        por %xmm8,%xmm1
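        /* Undiagonalize: rotate the rows back into column order. */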
        pshufd $0x39,%xmm0,%xmm0
        pshufd $0x4e,%xmm3,%xmm3
        pshufd $0x93,%xmm2,%xmm2
        addq $0x10,%rcx
        cmpq %r8,%rcx
        jnz .Lroundloop
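        /* Feed-forward: h[i] = h[i] ^ v[i] ^ v[i+8]. */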
        pxor %xmm2,%xmm0
        pxor %xmm3,%xmm1
        pxor %xmm10,%xmm0
        pxor %xmm11,%xmm1
        addq $0x40,%rsi
        decq %rdx
        jnz .Lbeginofloop
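        /* Write the new chaining value and the updated counter/flags back. */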
        movdqu %xmm0,(%rdi)
        movdqu %xmm1,0x10(%rdi)
        movdqu %xmm14,0x20(%rdi)
.Lendofloop:
        RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
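/*
 * AVX-512VL variant: same interface as above, but vprord provides the
 * 32-bit rotates directly and vpermi2d performs the whole message
 * schedule, so no per-word gathers are needed.
 */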
SYM_FUNC_START(blake2s_compress_avx512)
        vmovdqu (%rdi),%xmm0
        vmovdqu 0x10(%rdi),%xmm1
        vmovdqu 0x20(%rdi),%xmm4
        vmovq %rcx,%xmm5
        vmovdqa IV(%rip),%xmm14
        vmovdqa IV+16(%rip),%xmm15
        jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
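        /*
         * Per block: save the chaining value, advance the counter,
         * build rows 2-3 from the IV, and load the whole 64-byte
         * message block into %ymm6-%ymm7.
         */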
        vmovdqa %xmm0,%xmm10
        vmovdqa %xmm1,%xmm11
        vpaddq %xmm5,%xmm4,%xmm4
        vmovdqa %xmm14,%xmm2
        vpxor %xmm15,%xmm4,%xmm3
        vmovdqu (%rsi),%ymm6
        vmovdqu 0x20(%rsi),%ymm7
        addq $0x40,%rsi
        leaq SIGMA2(%rip),%rax
        movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
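        /*
         * Permute the message words for this round with vpermi2d per
         * the current SIGMA2 row, then run both halves of the column
         * and diagonal steps.
         */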
        addq $0x40,%rax
        vmovdqa -0x40(%rax),%ymm8
        vmovdqa -0x20(%rax),%ymm9
        vpermi2d %ymm7,%ymm6,%ymm8
        vpermi2d %ymm7,%ymm6,%ymm9
        vmovdqa %ymm8,%ymm6
        vmovdqa %ymm9,%ymm7
        vpaddd %xmm8,%xmm0,%xmm0
        vpaddd %xmm1,%xmm0,%xmm0
        vpxor %xmm0,%xmm3,%xmm3
        vprord $0x10,%xmm3,%xmm3
        vpaddd %xmm3,%xmm2,%xmm2
        vpxor %xmm2,%xmm1,%xmm1
        vprord $0xc,%xmm1,%xmm1
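        /* The high 128-bit lane holds the words for the second G half. */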
        vextracti128 $0x1,%ymm8,%xmm8
        vpaddd %xmm8,%xmm0,%xmm0
        vpaddd %xmm1,%xmm0,%xmm0
        vpxor %xmm0,%xmm3,%xmm3
        vprord $0x8,%xmm3,%xmm3
        vpaddd %xmm3,%xmm2,%xmm2
        vpxor %xmm2,%xmm1,%xmm1
        vprord $0x7,%xmm1,%xmm1
        vpshufd $0x93,%xmm0,%xmm0
        vpshufd $0x4e,%xmm3,%xmm3
        vpshufd $0x39,%xmm2,%xmm2
        vpaddd %xmm9,%xmm0,%xmm0
        vpaddd %xmm1,%xmm0,%xmm0
        vpxor %xmm0,%xmm3,%xmm3
        vprord $0x10,%xmm3,%xmm3
        vpaddd %xmm3,%xmm2,%xmm2
        vpxor %xmm2,%xmm1,%xmm1
        vprord $0xc,%xmm1,%xmm1
        vextracti128 $0x1,%ymm9,%xmm9
        vpaddd %xmm9,%xmm0,%xmm0
        vpaddd %xmm1,%xmm0,%xmm0
        vpxor %xmm0,%xmm3,%xmm3
        vprord $0x8,%xmm3,%xmm3
        vpaddd %xmm3,%xmm2,%xmm2
        vpxor %xmm2,%xmm1,%xmm1
        vprord $0x7,%xmm1,%xmm1
        vpshufd $0x39,%xmm0,%xmm0
        vpshufd $0x4e,%xmm3,%xmm3
        vpshufd $0x93,%xmm2,%xmm2
        decb %cl
        jne .Lblake2s_compress_avx512_roundloop
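        /* Feed-forward: h[i] = h[i] ^ v[i] ^ v[i+8]. */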
        vpxor %xmm10,%xmm0,%xmm0
        vpxor %xmm11,%xmm1,%xmm1
        vpxor %xmm2,%xmm0,%xmm0
        vpxor %xmm3,%xmm1,%xmm1
        decq %rdx
        jne .Lblake2s_compress_avx512_mainloop
        vmovdqu %xmm0,(%rdi)
        vmovdqu %xmm1,0x10(%rdi)
        vmovdqu %xmm4,0x20(%rdi)
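        /* vzeroupper avoids AVX-to-SSE transition penalties in the caller. */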
        vzeroupper
        RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */