1/* SPDX-License-Identifier: GPL-2.0 OR MIT */
2/*
3 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
4 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
5 */
6
7#include <linux/linkage.h>
8
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
/*
 * BLAKE2s initialization vector (the SHA-256 initial hash values),
 * stored as two 128-bit lanes: h0..h3 in the first .octa, h4..h7 in
 * the second.  The first lane seeds row 2 of the working state; the
 * second lane is xored with the t/f counters to form row 3.
 */
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
    .octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
/* pshufb mask: rotate each 32-bit lane right by 16 bits. */
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
/* pshufb mask: rotate each 32-bit lane right by 8 bits. */
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
/* Mergeable section with 160-byte entities (one full 10-round schedule). */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
/*
 * Message schedule for the SSSE3 code: 10 rounds x 16 byte indices into
 * the 16-dword message block.
 * NOTE(review): these rows are not the textbook BLAKE2s sigma
 * permutation; they appear pre-permuted to match the column/diagonal
 * gather order of the vectorized G function below — confirm against the
 * schedule generator before editing any entry.
 */
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
/*
 * Message schedule for the AVX-512 code: 10 rounds x 16 dword indices.
 * Each 64-byte row is consumed as two 32-byte ymm vpermi2d selectors
 * (first and second halves of the round's message permutation).
 * NOTE(review): likewise a pre-permuted variant of sigma, not the
 * reference ordering — confirm before editing.
 */
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */
47
.text
/*
 * void blake2s_compress_ssse3(struct blake2s_state *state,
 *				const u8 *block, size_t nblocks, u32 inc);
 *
 * SysV AMD64: rdi = state, rsi = message blocks, rdx = nblocks,
 * rcx = inc (bytes added to the 64-bit block counter per block).
 *
 * Compresses nblocks consecutive 64-byte blocks into the state.
 * Register roles inside the loop:
 *   xmm0/xmm1   = v[0..3] / v[4..7]   (rows 0-1, also the hash words h)
 *   xmm2/xmm3   = v[8..11] / v[12..15] (rows 2-3)
 *   xmm10/xmm11 = saved h for the final feed-forward
 *   xmm12/xmm13 = pshufb masks for ror16 / ror8
 *   xmm14       = { t0, t1, f0, f1 }; xmm15 = inc
 *
 * NOTE(review): state layout assumed from the loads/stores below —
 * h[0..7] at offset 0x00, t/f counters at 0x20; confirm against
 * struct blake2s_state.
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq %rdx,%rdx			# nothing to do for nblocks == 0
	je .Lendofloop
	movdqu (%rdi),%xmm0		# xmm0 = h[0..3]
	movdqu 0x10(%rdi),%xmm1		# xmm1 = h[4..7]
	movdqa ROT16(%rip),%xmm12	# byte-shuffle mask: ror 16
	movdqa ROR328(%rip),%xmm13	# byte-shuffle mask: ror 8
	movdqu 0x20(%rdi),%xmm14	# xmm14 = { t0, t1, f0, f1 }
	movq %rcx,%xmm15		# xmm15 = inc in the low qword
	leaq SIGMA+0xa0(%rip),%r8	# r8 = end of SIGMA (10 rows * 16 B)
	jmp .Lbeginofloop
	.align 32
.Lbeginofloop:
	movdqa %xmm0,%xmm10		# save h for the feed-forward
	movdqa %xmm1,%xmm11
	paddq %xmm15,%xmm14		# t += inc (64-bit add in low qword)
	movdqa IV(%rip),%xmm2		# row 2 = IV[0..3]
	movdqa %xmm14,%xmm3
	pxor IV+0x10(%rip),%xmm3	# row 3 = { t, f } ^ IV[4..7]
	leaq SIGMA(%rip),%rcx		# rcx = sigma row for round 0
.Lroundloop:
	/* Gather m[sigma[0..3]] into xmm4 and run the first G half:
	 * a += m; a += b; d = ror16(d ^ a); c += d; b = ror12(b ^ c). */
	movzbl (%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0x1(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0x2(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x3(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	punpckldq %xmm5,%xmm4
	punpckldq %xmm7,%xmm6
	punpcklqdq %xmm6,%xmm4		# xmm4 = 4 gathered message words
	paddd %xmm4,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm12,%xmm3		# d = ror16(d) via byte shuffle
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0xc,%xmm1
	pslld $0x14,%xmm8
	por %xmm8,%xmm1			# b = ror12(b) via shift/shift/or
	/* Gather m[sigma[4..7]] and run the second G half:
	 * a += m; a += b; d = ror8(d ^ a); c += d; b = ror7(b ^ c). */
	movzbl 0x4(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0x5(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x6(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0x7(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	punpckldq %xmm6,%xmm5
	punpckldq %xmm4,%xmm7
	punpcklqdq %xmm7,%xmm5
	paddd %xmm5,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm13,%xmm3		# d = ror8(d)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0x7,%xmm1
	pslld $0x19,%xmm8
	por %xmm8,%xmm1			# b = ror7(b)
	/* Diagonalize: rotate rows so the diagonal G's line up as columns. */
	pshufd $0x93,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x39,%xmm2,%xmm2
	/* Diagonal step, first G half with m[sigma[8..11]]. */
	movzbl 0x8(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x9(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xa(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xb(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	punpckldq %xmm7,%xmm6
	punpckldq %xmm5,%xmm4
	punpcklqdq %xmm4,%xmm6
	paddd %xmm6,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm12,%xmm3		# d = ror16(d)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0xc,%xmm1
	pslld $0x14,%xmm8
	por %xmm8,%xmm1			# b = ror12(b)
	/* Diagonal step, second G half with m[sigma[12..15]]. */
	movzbl 0xc(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xd(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xe(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0xf(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	punpckldq %xmm4,%xmm7
	punpckldq %xmm6,%xmm5
	punpcklqdq %xmm5,%xmm7
	paddd %xmm7,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm13,%xmm3		# d = ror8(d)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0x7,%xmm1
	pslld $0x19,%xmm8
	por %xmm8,%xmm1			# b = ror7(b)
	/* Undiagonalize (inverse of the rotation above). */
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x93,%xmm2,%xmm2
	addq $0x10,%rcx			# next sigma row
	cmpq %r8,%rcx
	jnz .Lroundloop			# 10 rounds total
	/* Feed-forward: h = old_h ^ rows(0,1) ^ rows(2,3). */
	pxor %xmm2,%xmm0
	pxor %xmm3,%xmm1
	pxor %xmm10,%xmm0
	pxor %xmm11,%xmm1
	addq $0x40,%rsi			# advance to the next 64-byte block
	decq %rdx
	jnz .Lbeginofloop
	/* Write back the hash words and the updated t/f counters. */
	movdqu %xmm0,(%rdi)
	movdqu %xmm1,0x10(%rdi)
	movdqu %xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
176
#ifdef CONFIG_AS_AVX512
/*
 * void blake2s_compress_avx512(struct blake2s_state *state,
 *				const u8 *block, size_t nblocks, u32 inc);
 *
 * Same contract and argument registers as blake2s_compress_ssse3
 * (rdi = state, rsi = blocks, rdx = nblocks, rcx = inc), using AVX-512VL:
 * vpermi2d pre-permutes the whole 16-word message per round and vprord
 * replaces the shuffle/shift rotate sequences.
 *
 * Register roles: xmm0/xmm1 = rows 0-1 (h), xmm2/xmm3 = rows 2-3,
 * xmm4 = t/f counters, xmm5 = inc, ymm6/ymm7 = message words,
 * ymm8/ymm9 = per-round permuted message halves, xmm10/xmm11 = saved h.
 *
 * NOTE(review): unlike the SSSE3 version there is no nblocks == 0
 * early-out — the dec/jne loop requires nblocks >= 1.  Confirm callers
 * never pass 0.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu (%rdi),%xmm0		# xmm0 = h[0..3]
	vmovdqu 0x10(%rdi),%xmm1	# xmm1 = h[4..7]
	vmovdqu 0x20(%rdi),%xmm4	# xmm4 = { t0, t1, f0, f1 }
	vmovq %rcx,%xmm5		# xmm5 = inc in the low qword
	vmovdqa IV(%rip),%xmm14
	vmovdqa IV+16(%rip),%xmm15
	jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa %xmm0,%xmm10		# save h for the feed-forward
	vmovdqa %xmm1,%xmm11
	vpaddq %xmm5,%xmm4,%xmm4	# t += inc
	vmovdqa %xmm14,%xmm2		# row 2 = IV[0..3]
	vpxor %xmm15,%xmm4,%xmm3	# row 3 = { t, f } ^ IV[4..7]
	vmovdqu (%rsi),%ymm6		# ymm6/ymm7 = the 16 message words
	vmovdqu 0x20(%rsi),%ymm7
	addq $0x40,%rsi			# advance to the next block now
	leaq SIGMA2(%rip),%rax
	movb $0xa,%cl			# 10 rounds
.Lblake2s_compress_avx512_roundloop:
	addq $0x40,%rax			# next 64-byte SIGMA2 row
	vmovdqa -0x40(%rax),%ymm8	# selectors for first message half
	vmovdqa -0x20(%rax),%ymm9	# selectors for second message half
	vpermi2d %ymm7,%ymm6,%ymm8	# permute message per this round's sigma
	vpermi2d %ymm7,%ymm6,%ymm9
	vmovdqa %ymm8,%ymm6		# permuted words become next round's input
	vmovdqa %ymm9,%ymm7
	/* Column step: G(a,b,c,d) with the two quarters of ymm8. */
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3	# d = ror16(d)
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1		# b = ror12(b)
	vextracti128 $0x1,%ymm8,%xmm8	# high half of the permuted words
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3		# d = ror8(d)
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1		# b = ror7(b)
	/* Diagonalize rows, run the diagonal step with ymm9, undo. */
	vpshufd $0x93,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x39,%xmm2,%xmm2
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3	# d = ror16(d)
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1		# b = ror12(b)
	vextracti128 $0x1,%ymm9,%xmm9
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3		# d = ror8(d)
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1		# b = ror7(b)
	vpshufd $0x39,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x93,%xmm2,%xmm2
	decb %cl
	jne .Lblake2s_compress_avx512_roundloop
	/* Feed-forward: h = old_h ^ rows(0,1) ^ rows(2,3). */
	vpxor %xmm10,%xmm0,%xmm0
	vpxor %xmm11,%xmm1,%xmm1
	vpxor %xmm2,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	decq %rdx
	jne .Lblake2s_compress_avx512_mainloop
	/* Write back the hash words and the updated t/f counters. */
	vmovdqu %xmm0,(%rdi)
	vmovdqu %xmm1,0x10(%rdi)
	vmovdqu %xmm4,0x20(%rdi)
	vzeroupper			# ABI: clear upper ymm state before SSE/C code
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
257

/* Source: linux/arch/x86/crypto/blake2s-core.S */