1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | * x86_64/AVX2 assembler optimized version of Serpent |
4 | * |
5 | * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> |
6 | * |
7 | * Based on AVX assembler implementation of Serpent by: |
8 | * Copyright © 2012 Johannes Goetzfried |
9 | * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> |
10 | */ |
11 | |
12 | #include <linux/linkage.h> |
13 | #include <asm/frame.h> |
14 | #include "glue_helper-asm-avx2.S" |
15 | |
16 | .file "serpent-avx2-asm_64.S" |
17 | |
18 | .section .rodata.cst16.bswap128_mask, "aM" , @progbits, 16 |
19 | .align 16 |
20 | .Lbswap128_mask: /* 16-byte table holding the byte indices 15 down to 0, i.e. reversed byte order; presumably a byte-shuffle mask for swapping a 128-bit value - its user is outside this view (confirm) */ |
21 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
22 | |
23 | .text |
24 | |
25 | #define CTX %rdi /* presumably the cipher context pointer (%rdi is the 1st integer argument in the SysV ABI); its uses are outside this view */ |
26 | |
27 | #define RNOT %ymm0 /* XOR operand of the S-box macros; presumably preloaded with all-ones so the XOR acts as a NOT - its initialisation is not visible here */ |
28 | #define tp %ymm1 /* scratch register used by the S-box macros; some second halves read the value left by the first half */ |
29 | |
30 | #define RA1 %ymm2 /* RA..RE with suffix 1 or 2: ten working registers ymm2-ymm11; presumably two independent groups of cipher state (confirm at the call sites) */ |
31 | #define RA2 %ymm3 |
32 | #define RB1 %ymm4 |
33 | #define RB2 %ymm5 |
34 | #define RC1 %ymm6 |
35 | #define RC2 %ymm7 |
36 | #define RD1 %ymm8 |
37 | #define RD2 %ymm9 |
38 | #define RE1 %ymm10 |
39 | #define RE2 %ymm11 |
40 | |
41 | #define RK0 %ymm12 /* RK0-RK3: ymm12-ymm15; presumably hold round-key words (their uses are not visible here) */ |
42 | #define RK1 %ymm13 |
43 | #define RK2 %ymm14 |
44 | #define RK3 %ymm15 |
45 | |
46 | #define RK0x %xmm12 /* RKnx: the xmm (low 128-bit) names of the same registers as RKn */ |
47 | #define RK1x %xmm13 |
48 | #define RK2x %xmm14 |
49 | #define RK3x %xmm15 |
50 | |
51 | #define S0_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of Serpent S-box 0 (going by the name; confirm). Reads x0-x3; x4 and tp are written before they are read. */ \ |
52 | vpor x0, x3, tp; /* tp = x3 OR x0 */ \ |
53 | vpxor x3, x0, x0; \ |
54 | vpxor x2, x3, x4; /* x4 = x3 XOR x2 (first write of x4) */ \ |
55 | vpxor RNOT, x4, x4; /* x4 = x4 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
56 | vpxor x1, tp, x3; /* x3 = tp XOR x1 */ \ |
57 | vpand x0, x1, x1; \ |
58 | vpxor x4, x1, x1; \ |
59 | vpxor x0, x2, x2; |
60 | #define S0_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of Serpent S-box 0 (confirm). Reads x4 before writing it, so it relies on the x4 left by S0_1; tp is not used. */ \ |
61 | vpxor x3, x0, x0; \ |
62 | vpor x0, x4, x4; \ |
63 | vpxor x2, x0, x0; \ |
64 | vpand x1, x2, x2; \ |
65 | vpxor x2, x3, x3; \ |
66 | vpxor RNOT, x1, x1; /* x1 = x1 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
67 | vpxor x4, x2, x2; \ |
68 | vpxor x2, x1, x1; |
69 | |
70 | #define S1_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of Serpent S-box 1 (going by the name; confirm). Reads x0-x3; x4 and tp are written before they are read. */ \ |
71 | vpxor x0, x1, tp; /* tp = x1 XOR x0 */ \ |
72 | vpxor x3, x0, x0; \ |
73 | vpxor RNOT, x3, x3; /* x3 = x3 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
74 | vpand tp, x1, x4; /* x4 = x1 AND tp (first write of x4) */ \ |
75 | vpor tp, x0, x0; \ |
76 | vpxor x2, x3, x3; \ |
77 | vpxor x3, x0, x0; \ |
78 | vpxor x3, tp, x1; /* x1 = tp XOR x3 */ |
79 | #define S1_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of Serpent S-box 1 (confirm). Reads x4 before writing it, so it relies on the x4 left by S1_1; tp is not used. */ \ |
80 | vpxor x4, x3, x3; \ |
81 | vpor x4, x1, x1; \ |
82 | vpxor x2, x4, x4; \ |
83 | vpand x0, x2, x2; \ |
84 | vpxor x1, x2, x2; \ |
85 | vpor x0, x1, x1; \ |
86 | vpxor RNOT, x0, x0; /* x0 = x0 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
87 | vpxor x2, x0, x0; \ |
88 | vpxor x1, x4, x4; |
89 | |
90 | #define S2_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of Serpent S-box 2 (going by the name; confirm). Reads x0-x3; tp is written before it is read; x4 is not touched. */ \ |
91 | vpxor RNOT, x3, x3; /* x3 = x3 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
92 | vpxor x0, x1, x1; \ |
93 | vpand x2, x0, tp; /* tp = x0 AND x2 */ \ |
94 | vpxor x3, tp, tp; \ |
95 | vpor x0, x3, x3; \ |
96 | vpxor x1, x2, x2; \ |
97 | vpxor x1, x3, x3; \ |
98 | vpand tp, x1, x1; |
99 | #define S2_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of Serpent S-box 2 (confirm). Reads tp before writing it, so it relies on the tp left by S2_1; x4 is only written here. */ \ |
100 | vpxor x2, tp, tp; /* first use of tp is a read: the incoming tp must be intact */ \ |
101 | vpand x3, x2, x2; \ |
102 | vpor x1, x3, x3; \ |
103 | vpxor RNOT, tp, tp; /* tp = tp XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
104 | vpxor tp, x3, x3; \ |
105 | vpxor tp, x0, x4; /* x4 = x0 XOR tp (only write of x4) */ \ |
106 | vpxor x2, tp, x0; /* x0 = tp XOR x2 */ \ |
107 | vpor x2, x1, x1; |
108 | |
109 | #define S3_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of Serpent S-box 3 (going by the name; confirm). Reads x0-x3; x4 and tp are written before they are read. */ \ |
110 | vpxor x3, x1, tp; /* tp = x1 XOR x3 */ \ |
111 | vpor x0, x3, x3; \ |
112 | vpand x0, x1, x4; /* x4 = x1 AND x0 (first write of x4) */ \ |
113 | vpxor x2, x0, x0; \ |
114 | vpxor tp, x2, x2; \ |
115 | vpand x3, tp, x1; /* x1 = tp AND x3 */ \ |
116 | vpxor x3, x2, x2; \ |
117 | vpor x4, x0, x0; \ |
118 | vpxor x3, x4, x4; |
119 | #define S3_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of Serpent S-box 3 (confirm). Every op updates its destination in place; reads x4 before writing it, so it relies on the x4 left by S3_1; tp and RNOT are not used. */ \ |
120 | vpxor x0, x1, x1; \ |
121 | vpand x3, x0, x0; \ |
122 | vpand x4, x3, x3; \ |
123 | vpxor x2, x3, x3; \ |
124 | vpor x1, x4, x4; \ |
125 | vpand x1, x2, x2; \ |
126 | vpxor x3, x4, x4; \ |
127 | vpxor x3, x0, x0; \ |
128 | vpxor x2, x3, x3; |
129 | |
130 | #define S4_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of Serpent S-box 4 (going by the name; confirm). Reads x0-x3; x4 and tp are written before they are read. */ \ |
131 | vpand x0, x3, tp; /* tp = x3 AND x0 */ \ |
132 | vpxor x3, x0, x0; \ |
133 | vpxor x2, tp, tp; \ |
134 | vpor x3, x2, x2; \ |
135 | vpxor x1, x0, x0; \ |
136 | vpxor tp, x3, x4; /* x4 = x3 XOR tp (first write of x4) */ \ |
137 | vpor x0, x2, x2; \ |
138 | vpxor x1, x2, x2; |
139 | #define S4_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of Serpent S-box 4 (confirm). Reads x4 and tp before writing them, so it relies on the values left by S4_1. */ \ |
140 | vpand x0, x1, x1; \ |
141 | vpxor x4, x1, x1; \ |
142 | vpand x2, x4, x4; \ |
143 | vpxor tp, x2, x2; /* tp is read here without being set in this macro */ \ |
144 | vpxor x0, x4, x4; \ |
145 | vpor x1, tp, x3; /* x3 = tp OR x1 (the incoming x3 is not read by this macro) */ \ |
146 | vpxor RNOT, x1, x1; /* x1 = x1 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
147 | vpxor x0, x3, x3; |
148 | |
149 | #define S5_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of Serpent S-box 5 (going by the name; confirm). Reads x0-x3; x4 and tp are written before they are read. */ \ |
150 | vpor x0, x1, tp; /* tp = x1 OR x0 */ \ |
151 | vpxor tp, x2, x2; \ |
152 | vpxor RNOT, x3, x3; /* x3 = x3 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
153 | vpxor x0, x1, x4; /* x4 = x1 XOR x0 (first write of x4) */ \ |
154 | vpxor x2, x0, x0; \ |
155 | vpand x4, tp, x1; /* x1 = tp AND x4 */ \ |
156 | vpor x3, x4, x4; \ |
157 | vpxor x0, x4, x4; |
158 | #define S5_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of Serpent S-box 5 (confirm). Every op updates its destination in place; x4 is only read (so it relies on the x4 left by S5_1); tp and RNOT are not used. */ \ |
159 | vpand x3, x0, x0; \ |
160 | vpxor x3, x1, x1; \ |
161 | vpxor x2, x3, x3; \ |
162 | vpxor x1, x0, x0; \ |
163 | vpand x4, x2, x2; \ |
164 | vpxor x2, x1, x1; \ |
165 | vpand x0, x2, x2; \ |
166 | vpxor x2, x3, x3; |
167 | |
168 | #define S6_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of Serpent S-box 6 (going by the name; confirm). Reads x0-x3; tp is written before it is read; x4 is written but not read here. */ \ |
169 | vpxor x0, x3, x3; \ |
170 | vpxor x2, x1, tp; /* tp = x1 XOR x2 */ \ |
171 | vpxor x0, x2, x2; \ |
172 | vpand x3, x0, x0; \ |
173 | vpor x3, tp, tp; \ |
174 | vpxor RNOT, x1, x4; /* x4 = x1 XOR RNOT (first write of x4); NOT x1 if RNOT is all-ones (set outside this view) */ \ |
175 | vpxor tp, x0, x0; \ |
176 | vpxor x2, tp, x1; /* x1 = tp XOR x2 */ |
177 | #define S6_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of Serpent S-box 6 (confirm). Every op updates its destination in place; reads x4 before writing it, so it relies on the x4 left by S6_1; tp and RNOT are not used. */ \ |
178 | vpxor x4, x3, x3; \ |
179 | vpxor x0, x4, x4; \ |
180 | vpand x0, x2, x2; \ |
181 | vpxor x1, x4, x4; \ |
182 | vpxor x3, x2, x2; \ |
183 | vpand x1, x3, x3; \ |
184 | vpxor x0, x3, x3; \ |
185 | vpxor x2, x1, x1; |
186 | |
187 | #define S7_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of Serpent S-box 7 (going by the name; confirm). Reads x0-x3; tp is written before it is read; x4 is written but not read here. */ \ |
188 | vpxor RNOT, x1, tp; /* tp = x1 XOR RNOT; NOT x1 if RNOT is all-ones (set outside this view) */ \ |
189 | vpxor RNOT, x0, x0; /* x0 = x0 XOR RNOT; same assumption about RNOT */ \ |
190 | vpand x2, tp, x1; /* x1 = tp AND x2 */ \ |
191 | vpxor x3, x1, x1; \ |
192 | vpor tp, x3, x3; \ |
193 | vpxor x2, tp, x4; /* x4 = tp XOR x2 (first write of x4) */ \ |
194 | vpxor x3, x2, x2; \ |
195 | vpxor x0, x3, x3; \ |
196 | vpor x1, x0, x0; |
197 | #define S7_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of Serpent S-box 7 (confirm). Every op updates its destination in place; reads x4 before writing it, so it relies on the x4 left by S7_1; tp and RNOT are not used. */ \ |
198 | vpand x0, x2, x2; \ |
199 | vpxor x4, x0, x0; \ |
200 | vpxor x3, x4, x4; \ |
201 | vpand x0, x3, x3; \ |
202 | vpxor x1, x4, x4; \ |
203 | vpxor x4, x2, x2; \ |
204 | vpxor x1, x3, x3; \ |
205 | vpor x0, x4, x4; \ |
206 | vpxor x1, x4, x4; |
207 | |
208 | #define SI0_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of the inverse of Serpent S-box 0 (going by the name; confirm). Reads x0-x3; tp is written before it is read; x4 is written but not read here. */ \ |
209 | vpxor x0, x1, x1; \ |
210 | vpor x1, x3, tp; /* tp = x3 OR x1 */ \ |
211 | vpxor x1, x3, x4; /* x4 = x3 XOR x1 (first write of x4) */ \ |
212 | vpxor RNOT, x0, x0; /* x0 = x0 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
213 | vpxor tp, x2, x2; \ |
214 | vpxor x0, tp, x3; /* x3 = tp XOR x0 */ \ |
215 | vpand x1, x0, x0; \ |
216 | vpxor x2, x0, x0; |
217 | #define SI0_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of the inverse of Serpent S-box 0 (confirm). Every op updates its destination in place; reads x4 before writing it, so it relies on the x4 left by SI0_1; tp and RNOT are not used. */ \ |
218 | vpand x3, x2, x2; \ |
219 | vpxor x4, x3, x3; \ |
220 | vpxor x3, x2, x2; \ |
221 | vpxor x3, x1, x1; \ |
222 | vpand x0, x3, x3; \ |
223 | vpxor x0, x1, x1; \ |
224 | vpxor x2, x0, x0; \ |
225 | vpxor x3, x4, x4; |
226 | |
227 | #define SI1_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of the inverse of Serpent S-box 1 (going by the name; confirm). Reads x0-x3; x4 is written before it is read; tp is written but not read here; x0 is only read. */ \ |
228 | vpxor x3, x1, x1; \ |
229 | vpxor x2, x0, tp; /* tp = x0 XOR x2 */ \ |
230 | vpxor RNOT, x2, x2; /* x2 = x2 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
231 | vpor x1, x0, x4; /* x4 = x0 OR x1 (first write of x4) */ \ |
232 | vpxor x3, x4, x4; \ |
233 | vpand x1, x3, x3; \ |
234 | vpxor x2, x1, x1; \ |
235 | vpand x4, x2, x2; |
236 | #define SI1_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of the inverse of Serpent S-box 1 (confirm). Reads x4 and tp before writing them, so it relies on the values left by SI1_1; RNOT is not used. */ \ |
237 | vpxor x1, x4, x4; \ |
238 | vpor x3, x1, x1; \ |
239 | vpxor tp, x3, x3; /* tp is read here without being set in this macro */ \ |
240 | vpxor tp, x2, x2; \ |
241 | vpor x4, tp, x0; /* x0 = tp OR x4 (the incoming x0 is not read by this macro) */ \ |
242 | vpxor x4, x2, x2; \ |
243 | vpxor x0, x1, x1; \ |
244 | vpxor x1, x4, x4; |
245 | |
246 | #define SI2_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of the inverse of Serpent S-box 2 (going by the name; confirm). Reads x0-x3; tp is written before it is read; x4 is written but not read here. */ \ |
247 | vpxor x1, x2, x2; \ |
248 | vpxor RNOT, x3, tp; /* tp = x3 XOR RNOT; NOT x3 if RNOT is all-ones (set outside this view) */ \ |
249 | vpor x2, tp, tp; \ |
250 | vpxor x3, x2, x2; \ |
251 | vpxor x0, x3, x4; /* x4 = x3 XOR x0 (first write of x4) */ \ |
252 | vpxor x1, tp, x3; /* x3 = tp XOR x1 */ \ |
253 | vpor x2, x1, x1; \ |
254 | vpxor x0, x2, x2; |
255 | #define SI2_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of the inverse of Serpent S-box 2 (confirm). Every op updates its destination in place; reads x4 before writing it, so it relies on the x4 left by SI2_1; tp and RNOT are not used. */ \ |
256 | vpxor x4, x1, x1; \ |
257 | vpor x3, x4, x4; \ |
258 | vpxor x3, x2, x2; \ |
259 | vpxor x2, x4, x4; \ |
260 | vpand x1, x2, x2; \ |
261 | vpxor x3, x2, x2; \ |
262 | vpxor x4, x3, x3; \ |
263 | vpxor x0, x4, x4; |
264 | |
265 | #define SI3_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of the inverse of Serpent S-box 3 (going by the name; confirm). Reads x0-x3; tp is written before it is read; x4 is written but not read here; RNOT is not used. */ \ |
266 | vpxor x1, x2, x2; \ |
267 | vpand x2, x1, tp; /* tp = x1 AND x2 */ \ |
268 | vpxor x0, tp, tp; \ |
269 | vpor x1, x0, x0; \ |
270 | vpxor x3, x1, x4; /* x4 = x1 XOR x3 (first write of x4) */ \ |
271 | vpxor x3, x0, x0; \ |
272 | vpor tp, x3, x3; \ |
273 | vpxor x2, tp, x1; /* x1 = tp XOR x2 */ |
274 | #define SI3_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of the inverse of Serpent S-box 3 (confirm). Every op updates its destination in place; reads x4 before writing it, so it relies on the x4 left by SI3_1; tp and RNOT are not used. */ \ |
275 | vpxor x3, x1, x1; \ |
276 | vpxor x2, x0, x0; \ |
277 | vpxor x3, x2, x2; \ |
278 | vpand x1, x3, x3; \ |
279 | vpxor x0, x1, x1; \ |
280 | vpand x2, x0, x0; \ |
281 | vpxor x3, x4, x4; \ |
282 | vpxor x0, x3, x3; \ |
283 | vpxor x1, x0, x0; |
284 | |
285 | #define SI4_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of the inverse of Serpent S-box 4 (going by the name; confirm). Reads x0-x3; x4 and tp are written before they are read. */ \ |
286 | vpxor x3, x2, x2; \ |
287 | vpand x1, x0, tp; /* tp = x0 AND x1 */ \ |
288 | vpxor x2, tp, tp; \ |
289 | vpor x3, x2, x2; \ |
290 | vpxor RNOT, x0, x4; /* x4 = x0 XOR RNOT (first write of x4); NOT x0 if RNOT is all-ones (set outside this view) */ \ |
291 | vpxor tp, x1, x1; \ |
292 | vpxor x2, tp, x0; /* x0 = tp XOR x2 */ \ |
293 | vpand x4, x2, x2; |
294 | #define SI4_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of the inverse of Serpent S-box 4 (confirm). Every op updates its destination in place; reads x4 before writing it, so it relies on the x4 left by SI4_1; tp and RNOT are not used. */ \ |
295 | vpxor x0, x2, x2; \ |
296 | vpor x4, x0, x0; \ |
297 | vpxor x3, x0, x0; \ |
298 | vpand x2, x3, x3; \ |
299 | vpxor x3, x4, x4; \ |
300 | vpxor x1, x3, x3; \ |
301 | vpand x0, x1, x1; \ |
302 | vpxor x1, x4, x4; \ |
303 | vpxor x3, x0, x0; |
304 | |
305 | #define SI5_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of the inverse of Serpent S-box 5 (going by the name; confirm). Reads x0-x3; tp is written before it is read; x4 is not touched. */ \ |
306 | vpor x2, x1, tp; /* tp = x1 OR x2 */ \ |
307 | vpxor x1, x2, x2; \ |
308 | vpxor x3, tp, tp; \ |
309 | vpand x1, x3, x3; \ |
310 | vpxor x3, x2, x2; \ |
311 | vpor x0, x3, x3; \ |
312 | vpxor RNOT, x0, x0; /* x0 = x0 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
313 | vpxor x2, x3, x3; \ |
314 | vpor x0, x2, x2; |
315 | #define SI5_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of the inverse of Serpent S-box 5 (confirm). Reads tp without setting it, so it relies on the tp left by SI5_1; x4 is first written here; RNOT is not used. */ \ |
316 | vpxor tp, x1, x4; /* x4 = x1 XOR tp (first write of x4; tp comes from outside this macro) */ \ |
317 | vpxor x4, x2, x2; \ |
318 | vpand x0, x4, x4; \ |
319 | vpxor tp, x0, x0; \ |
320 | vpxor x3, tp, x1; /* x1 = tp XOR x3 */ \ |
321 | vpand x2, x0, x0; \ |
322 | vpxor x3, x2, x2; \ |
323 | vpxor x2, x0, x0; \ |
324 | vpxor x4, x2, x2; \ |
325 | vpxor x3, x4, x4; |
326 | |
327 | #define SI6_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of the inverse of Serpent S-box 6 (going by the name; confirm). Reads x0-x3; tp is written before it is read; x4 and RNOT are not touched. */ \ |
328 | vpxor x2, x0, x0; \ |
329 | vpand x3, x0, tp; /* tp = x0 AND x3 */ \ |
330 | vpxor x3, x2, x2; \ |
331 | vpxor x2, tp, tp; \ |
332 | vpxor x1, x3, x3; \ |
333 | vpor x0, x2, x2; \ |
334 | vpxor x3, x2, x2; \ |
335 | vpand tp, x3, x3; |
336 | #define SI6_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of the inverse of Serpent S-box 6 (confirm). Reads tp before writing it, so it relies on the tp left by SI6_1; x4 is first written here. */ \ |
337 | vpxor RNOT, tp, tp; /* tp = tp XOR RNOT; a NOT if RNOT is all-ones (set outside this view); the incoming tp must be intact */ \ |
338 | vpxor x1, x3, x3; \ |
339 | vpand x2, x1, x1; \ |
340 | vpxor tp, x0, x4; /* x4 = x0 XOR tp (first write of x4) */ \ |
341 | vpxor x4, x3, x3; \ |
342 | vpxor x2, x4, x4; \ |
343 | vpxor x1, tp, x0; /* x0 = tp XOR x1 */ \ |
344 | vpxor x0, x2, x2; |
345 | |
346 | #define SI7_1(x0, x1, x2, x3, x4) /* Presumably the 1st half of the inverse of Serpent S-box 7 (going by the name; confirm). Reads x0-x3; x4 and tp are written before they are read. */ \ |
347 | vpand x0, x3, tp; /* tp = x3 AND x0 */ \ |
348 | vpxor x2, x0, x0; \ |
349 | vpor x3, x2, x2; \ |
350 | vpxor x1, x3, x4; /* x4 = x3 XOR x1 (first write of x4) */ \ |
351 | vpxor RNOT, x0, x0; /* x0 = x0 XOR RNOT; a NOT if RNOT is all-ones (set outside this view) */ \ |
352 | vpor tp, x1, x1; \ |
353 | vpxor x0, x4, x4; \ |
354 | vpand x2, x0, x0; \ |
355 | vpxor x1, x0, x0; |
356 | #define SI7_2(x0, x1, x2, x3, x4) /* Presumably the 2nd half of the inverse of Serpent S-box 7 (confirm). Reads tp and x4 without first setting them, so it relies on the values left by SI7_1; RNOT is not used. */ \ |
357 | vpand x2, x1, x1; \ |
358 | vpxor x2, tp, x3; /* x3 = tp XOR x2 (tp comes from outside this macro; the incoming x3 is not read) */ \ |
359 | vpxor x3, x4, x4; \ |
360 | vpand x3, x2, x2; \ |
361 | vpor x0, x3, x3; \ |
362 | vpxor x4, x1, x1; \ |
363 | vpxor x4, x3, x3; \ |
364 | vpand x0, x4, x4; \ |
365 | vpxor x2, x4, x4; |
366 | |
367 | #define get_key(i,j,t) \ |
368 | vpbroadcastd ( |
---|