/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
/* CTX: register in which the entry points below receive the ctx argument. */
#define CTX %rdi

/*
 * BV8 - pack eight single-bit values into one byte constant.
 *
 * a0 lands in bit 0 (least significant), a7 in bit 7. Only the lowest bit
 * of each argument is used (each is masked with 1).
 */
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
        ( (((a0) & 1) << 0) | \
          (((a1) & 1) << 1) | \
          (((a2) & 1) << 2) | \
          (((a3) & 1) << 3) | \
          (((a4) & 1) << 4) | \
          (((a5) & 1) << 5) | \
          (((a6) & 1) << 6) | \
          (((a7) & 1) << 7) )

/*
 * BM8X8 - pack eight row bytes into one 64-bit constant.
 *
 * l0 is placed in the most significant byte (bits 56..63) and l7 in the
 * least significant byte (bits 0..7). The arguments are not masked, so each
 * must already fit in 8 bits (the callers below pass BV8() values).
 */
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
        ( ((l7) << (0 * 8)) | \
          ((l6) << (1 * 8)) | \
          ((l5) << (2 * 8)) | \
          ((l4) << (3 * 8)) | \
          ((l3) << (4 * 8)) | \
          ((l2) << (5 * 8)) | \
          ((l1) << (6 * 8)) | \
          ((l0) << (7 * 8)) )

/*
 * inc_le128 - add 1 to a 128-bit value held as two 64-bit lanes.
 *
 * x:         value to increment (updated in place); the low qword is the
 *            low half of the 128-bit number
 * minus_one: register holding -1 in the low qword and 0 in the high qword
 *            (this is what the caller in this file builds)
 * tmp:       scratch register, clobbered
 *
 * The low lane is incremented by subtracting -1. A carry is needed exactly
 * when the low lane was all-ones before the increment; the compare result
 * for that lane is moved into the high lane and subtracted there.
 */
#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp;   /* low lane: all-ones iff x.lo == -1 */ \
        vpsubq minus_one, x, x;       /* x.lo += 1, x.hi -= 0 */ \
        vpslldq $8, tmp, tmp;         /* move low-lane flag to high lane */ \
        vpsubq tmp, x, x;             /* x.hi += 1 when the low lane wrapped */

/*
 * filter_8bit - apply a byte-wise mapping given as two 16-entry tables.
 *
 * x:        input bytes, replaced by the result
 * lo_t:     table indexed by the low nibble of each byte
 * hi_t:     table indexed by the high nibble of each byte
 * mask4bit: register with the nibble mask in every byte (the caller in
 *           this file broadcasts .L0f0f0f0f into it)
 * tmp0:     scratch register, clobbered
 *
 * Result per byte: lo_t[b & 0x0f] ^ hi_t[b >> 4].
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0;      /* tmp0 = low nibbles */ \
        vpandn x, mask4bit, x;        /* x = high nibbles, still in place */ \
        vpsrld $4, x, x;              /* move high nibbles down */ \
        \
        vpshufb tmp0, lo_t, tmp0;     /* table lookup on low nibbles */ \
        vpshufb x, hi_t, x;           /* table lookup on high nibbles */ \
        vpxor tmp0, x, x;

/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit elements.
 *
 * x0..x3: the four rows (one register each); on exit they hold the four
 *         columns of the input, i.e. new x<i> = { x0[i], x1[i], x2[i], x3[i] }
 * t1, t2: scratch registers, clobbered
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;

/*
 * byteslice_16x16b - rearrange sixteen 16-byte registers into byte-sliced
 * form.
 *
 * a0..d3:   the sixteen registers, transformed in place
 * st0, st1: two 16-byte memory operands used as spill slots
 *
 * The rearrangement is built from three stages: 4x4 dword transposes over
 * groups of four registers, a byte shuffle of every register with
 * .Lshufb_16x16b, and a second round of 4x4 dword transposes over the
 * regrouped registers. All sixteen registers are live, so each transpose
 * stage first spills two of them to st0/st1 to obtain the two temporaries
 * transpose_4x4 needs, and reloads them afterwards.
 *
 * The statement order is load-bearing: st0/st1 are reused several times and
 * a0 doubles as the shuffle-mask register in the middle stage.
 */
#define byteslice_16x16b(a0, b0, c0, d0, \
                         a1, b1, c1, d1, \
                         a2, b2, c2, d2, \
                         a3, b3, c3, d3, \
                         st0, st1) \
        /* spill d2/d3, use them as temporaries for the a* and b* groups */ \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        /* spill a0/a1, use them as temporaries for the c* and d* groups */ \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        /* a0 holds the shuffle mask; the real a0 stays in st0 for now */ \
        vmovdqu .Lshufb_16x16b(%rip), a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        /* shuffle the spilled a0 last, overwriting the mask with it */ \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */

/*
 * debyteslice_16x16b - counterpart of byteslice_16x16b, used on the way out.
 *
 * a0..d3:   the sixteen registers, transformed in place
 * st0, st1: two 16-byte memory operands used as spill slots
 *
 * The instruction sequence is the same as byteslice_16x16b except for the
 * final stage, where the 4x4 dword transposes take their rows in the order
 * (c, d, a, b) instead of (a, b, c, d).
 *
 * The statement order is load-bearing: st0/st1 are reused several times and
 * a0 doubles as the shuffle-mask register in the middle stage.
 */
#define debyteslice_16x16b(a0, b0, c0, d0, \
                           a1, b1, c1, d1, \
                           a2, b2, c2, d2, \
                           a3, b3, c3, d3, \
                           st0, st1) \
        /* spill d2/d3, use them as temporaries for the a* and b* groups */ \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        /* spill a0/a1, use them as temporaries for the c* and d* groups */ \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        /* a0 holds the shuffle mask; the real a0 stays in st0 for now */ \
        vmovdqu .Lshufb_16x16b(%rip), a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        /* shuffle the spilled a0 last, overwriting the mask with it */ \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(c0, d0, a0, b0, d2, d3); \
        transpose_4x4(c1, d1, a1, b1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(c2, d2, a2, b2, b0, b1); \
        transpose_4x4(c3, d3, a3, b3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */

/*
 * inpack16_pre - load sixteen consecutive 16-byte blocks into registers.
 *
 * x0..x7, y0..y7: destination registers; x0 receives the block at offset 0,
 *                 y7 the block at offset 15 * 16
 * rio:            base address of the 256 input bytes (unaligned loads)
 *
 * Only loads are performed here; no key material is mixed in.
 */
#define inpack16_pre(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     rio) \
        vmovdqu (0 * 16)(rio), x0; \
        vmovdqu (1 * 16)(rio), x1; \
        vmovdqu (2 * 16)(rio), x2; \
        vmovdqu (3 * 16)(rio), x3; \
        vmovdqu (4 * 16)(rio), x4; \
        vmovdqu (5 * 16)(rio), x5; \
        vmovdqu (6 * 16)(rio), x6; \
        vmovdqu (7 * 16)(rio), x7; \
        vmovdqu (8 * 16)(rio), y0; \
        vmovdqu (9 * 16)(rio), y1; \
        vmovdqu (10 * 16)(rio), y2; \
        vmovdqu (11 * 16)(rio), y3; \
        vmovdqu (12 * 16)(rio), y4; \
        vmovdqu (13 * 16)(rio), y5; \
        vmovdqu (14 * 16)(rio), y6; \
        vmovdqu (15 * 16)(rio), y7;

/*
 * inpack16_post - byteslice the loaded blocks and store them to temporary
 * memory.
 *
 * x0..x7, y0..y7: the sixteen loaded blocks; byte-sliced in place
 * mem_ab:         base register of a 128-byte area that receives x0..x7
 * mem_cd:         base register of a 128-byte area that receives y0..y7
 *
 * The first 16 bytes of each area double as the spill slots of
 * byteslice_16x16b before being overwritten by the final stores. The
 * registers still hold the byte-sliced data afterwards.
 */
#define inpack16_post(x0, x1, x2, x3, \
                      x4, x5, x6, x7, \
                      y0, y1, y2, y3, \
                      y4, y5, y6, y7, \
                      mem_ab, mem_cd) \
        byteslice_16x16b(x0, x1, x2, x3, \
                         x4, x5, x6, x7, \
                         y0, y1, y2, y3, \
                         y4, y5, y6, y7, \
                         (mem_ab), (mem_cd)); \
        \
        vmovdqu x0, 0 * 16(mem_ab); \
        vmovdqu x1, 1 * 16(mem_ab); \
        vmovdqu x2, 2 * 16(mem_ab); \
        vmovdqu x3, 3 * 16(mem_ab); \
        vmovdqu x4, 4 * 16(mem_ab); \
        vmovdqu x5, 5 * 16(mem_ab); \
        vmovdqu x6, 6 * 16(mem_ab); \
        vmovdqu x7, 7 * 16(mem_ab); \
        vmovdqu y0, 0 * 16(mem_cd); \
        vmovdqu y1, 1 * 16(mem_cd); \
        vmovdqu y2, 2 * 16(mem_cd); \
        vmovdqu y3, 3 * 16(mem_cd); \
        vmovdqu y4, 4 * 16(mem_cd); \
        vmovdqu y5, 5 * 16(mem_cd); \
        vmovdqu y6, 6 * 16(mem_cd); \
        vmovdqu y7, 7 * 16(mem_cd);

/*
 * write_output - store sixteen registers as consecutive 16-byte blocks.
 *
 * x0..x7, y0..y7: source registers; x0 goes to offset 0, y7 to 15 * 16
 * mem:            base register of the 256-byte destination (unaligned
 *                 stores)
 *
 * The last line deliberately carries no line continuation, so the macro
 * body ends with the final store.
 */
#define write_output(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem) \
        vmovdqu x0, 0 * 16(mem); \
        vmovdqu x1, 1 * 16(mem); \
        vmovdqu x2, 2 * 16(mem); \
        vmovdqu x3, 3 * 16(mem); \
        vmovdqu x4, 4 * 16(mem); \
        vmovdqu x5, 5 * 16(mem); \
        vmovdqu x6, 6 * 16(mem); \
        vmovdqu x7, 7 * 16(mem); \
        vmovdqu y0, 8 * 16(mem); \
        vmovdqu y1, 9 * 16(mem); \
        vmovdqu y2, 10 * 16(mem); \
        vmovdqu y3, 11 * 16(mem); \
        vmovdqu y4, 12 * 16(mem); \
        vmovdqu y5, 13 * 16(mem); \
        vmovdqu y6, 14 * 16(mem); \
        vmovdqu y7, 15 * 16(mem);

/*
 * aria_store_state_8way - spill eight state registers to temporary memory.
 *
 * x0..x7:  registers to store
 * mem_tmp: base register of the temporary area
 * idx:     index of the first 16-byte slot; x<k> is written to slot idx + k
 */
#define aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, idx) \
        vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
        vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
        vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
        vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
        vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
        vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
        vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
        vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

/*
 * aria_load_state_8way - reload eight state registers from temporary memory.
 *
 * x0..x7:  registers to fill
 * mem_tmp: base register of the temporary area
 * idx:     index of the first 16-byte slot; x<k> is read from slot idx + k
 */
#define aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, idx) \
        vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
        vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
        vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
        vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
        vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
        vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
        vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
        vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

/*
 * aria_ark_8way - XOR eight round-key bytes into eight byte-sliced state
 * registers (AddRoundKey).
 *
 * x0..x7: state registers, updated in place
 * t0:     scratch; receives the broadcast 32-bit key word
 * t1:     byte-shuffle control; every caller in this file passes a zeroed
 *         register, which makes each vpshufb replicate byte 0 of its source
 *         into all 16 bytes
 * t2:     scratch; receives the replicated key byte
 * rk:     base register of the round-key array
 * idx:    byte offset inside the 16-byte round key (callers use 0 or 8)
 * round:  round-key index; each round key occupies 16 bytes
 *
 * Two 32-bit words are read, at offsets idx and idx + 4 of round key
 * `round`. From each word the bytes are taken from the most significant
 * (shift by 24) down to the least significant (no shift): x0..x3 get the
 * four bytes of the first word, x4..x7 those of the second.
 */
#define aria_ark_8way(x0, x1, x2, x3, \
                      x4, x5, x6, x7, \
                      t0, t1, t2, rk, \
                      idx, round) \
        /* AddRoundKey */ \
        vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
        vpsrld $24, t0, t2; \
        vpshufb t1, t2, t2; \
        vpxor t2, x0, x0; \
        vpsrld $16, t0, t2; \
        vpshufb t1, t2, t2; \
        vpxor t2, x1, x1; \
        vpsrld $8, t0, t2; \
        vpshufb t1, t2, t2; \
        vpxor t2, x2, x2; \
        vpshufb t1, t0, t2; \
        vpxor t2, x3, x3; \
        vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
        vpsrld $24, t0, t2; \
        vpshufb t1, t2, t2; \
        vpxor t2, x4, x4; \
        vpsrld $16, t0, t2; \
        vpshufb t1, t2, t2; \
        vpxor t2, x5, x5; \
        vpsrld $8, t0, t2; \
        vpshufb t1, t2, t2; \
        vpxor t2, x6, x6; \
        vpshufb t1, t0, t2; \
        vpxor t2, x7, x7;

#ifdef CONFIG_AS_GFNI
/*
 * aria_sbox_8way_gfni - substitution layer for eight byte-sliced registers
 * using the GFNI affine instructions.
 *
 * x0..x7: state registers, substituted in place
 * t0..t4: scratch; loaded with the bit-matrices (aligned loads, so the
 *         tables must be 16-byte aligned)
 * t5..t7: accepted for signature parity with aria_sbox_8way; not used
 *
 * Per register pair:
 *   x0, x4: inverse in GF(2^8) followed by the AES affine matrix/constant
 *   x1, x5: inverse in GF(2^8) followed by the S2 matrix/constant
 *   x2, x6: AES inverse-affine matrix/constant, then inverse in GF(2^8)
 *           (the second step uses the identity matrix with constant 0)
 *   x3, x7: X2 matrix/constant, then inverse in GF(2^8) (identity matrix,
 *           constant 0)
 *
 * The body intentionally has no trailing ';' - the invocation supplies it.
 */
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            t0, t1, t2, t3, \
                            t4, t5, t6, t7) \
        vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
        vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
        vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
        vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
        vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
        vgf2p8affineinvqb $0, t2, x2, x2; \
        vgf2p8affineinvqb $0, t2, x6, x6; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
        vgf2p8affineinvqb $0, t2, x3, x3; \
        vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */

/*
 * aria_sbox_8way - substitution layer for eight byte-sliced registers using
 * the AES-NI last-round instructions.
 *
 * x0..x7: state registers, substituted in place
 * t0..t6: scratch; loaded with shuffle masks, the nibble mask and the
 *         lookup tables (t0 is later reused as filter_8bit scratch)
 * t7:     used as the round key of vaesenclast/vaesdeclast; every caller in
 *         this file passes a zeroed register so the key addition is a no-op
 *
 * vaesenclast/vaesdeclast also permute bytes (ShiftRows / InvShiftRows);
 * the vpshufb with .Linv_shift_row / .Lshift_row cancels that permutation
 * so only the byte substitution remains.
 *
 * Per register pair:
 *   x0, x4: AES SubBytes
 *   x1, x5: AES SubBytes, then the combined "inverse affine and S2" filter
 *   x2, x6: AES InvSubBytes
 *   x3, x7: pre-shuffled with .Lshift_row, then the combined "X2 and
 *           forward affine" filter, then AES InvSubBytes (whose
 *           InvShiftRows undoes the pre-shuffle)
 */
#define aria_sbox_8way(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       t0, t1, t2, t3, \
                       t4, t5, t6, t7) \
        vmovdqa .Linv_shift_row(%rip), t0; \
        vmovdqa .Lshift_row(%rip), t1; \
        vbroadcastss .L0f0f0f0f(%rip), t6; \
        vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
        vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
        vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
        vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
        \
        vaesenclast t7, x0, x0; \
        vaesenclast t7, x4, x4; \
        vaesenclast t7, x1, x1; \
        vaesenclast t7, x5, x5; \
        vaesdeclast t7, x2, x2; \
        vaesdeclast t7, x6, x6; \
        \
        /* AES inverse shift rows */ \
        vpshufb t0, x0, x0; \
        vpshufb t0, x4, x4; \
        vpshufb t0, x1, x1; \
        vpshufb t0, x5, x5; \
        /* AES shift rows */ \
        vpshufb t1, x3, x3; \
        vpshufb t1, x7, x7; \
        vpshufb t1, x2, x2; \
        vpshufb t1, x6, x6; \
        \
        /* affine transformation for S2 */ \
        filter_8bit(x1, t2, t3, t6, t0); \
        /* affine transformation for S2 */ \
        filter_8bit(x5, t2, t3, t6, t0); \
        \
        /* affine transformation for X2 */ \
        filter_8bit(x3, t4, t5, t6, t0); \
        /* affine transformation for X2 */ \
        filter_8bit(x7, t4, t5, t6, t0); \
        vaesdeclast t7, x3, x3; \
        vaesdeclast t7, x7, x7;

/*
 * aria_diff_m - mix the four byte slices of one 32-bit word.
 *
 * x0..x3: the four byte slices of the word, updated in place
 * t0..t3: scratch registers, clobbered
 *
 * In word terms this is X ^= rotr32(X, 8) followed by
 * X = T ^ rotr32(X, 16), expressed on byte slices as plain XORs:
 *
 *   x0' = x0 ^ x1 ^ x2
 *   x1' = x0 ^ x1 ^ x3
 *   x2' = x0 ^ x2 ^ x3
 *   x3' = x1 ^ x2 ^ x3
 */
#define aria_diff_m(x0, x1, x2, x3, \
                    t0, t1, t2, t3) \
        /* T = rotr32(X, 8); */ \
        /* X ^= T */ \
        vpxor x0, x3, t0; \
        vpxor x1, x0, t1; \
        vpxor x2, x1, t2; \
        vpxor x3, x2, t3; \
        /* X = T ^ rotr(X, 16); */ \
        vpxor t2, x0, x0; \
        vpxor x1, t3, t3; \
        vpxor t0, x2, x2; \
        vpxor t1, x3, x1; \
        vmovdqu t3, x3;

/*
 * aria_diff_word - word-level diffusion across the four state words.
 *
 * The sixteen registers are treated as four words of four byte slices each:
 *   t0 = x0..x3, t1 = x4..x7, t2 = y0..y3, t3 = y4..y7
 * All sixteen are updated in place; no scratch registers are needed.
 *
 * Sequence performed (each step on all four slices of the word):
 *   t1 ^= t2; t2 ^= t3; t0 ^= t1; t3 ^= t1; t2 ^= t0; t1 ^= t2;
 */
#define aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7) \
        /* t1 ^= t2; */ \
        vpxor y0, x4, x4; \
        vpxor y1, x5, x5; \
        vpxor y2, x6, x6; \
        vpxor y3, x7, x7; \
        \
        /* t2 ^= t3; */ \
        vpxor y4, y0, y0; \
        vpxor y5, y1, y1; \
        vpxor y6, y2, y2; \
        vpxor y7, y3, y3; \
        \
        /* t0 ^= t1; */ \
        vpxor x4, x0, x0; \
        vpxor x5, x1, x1; \
        vpxor x6, x2, x2; \
        vpxor x7, x3, x3; \
        \
        /* t3 ^= t1; */ \
        vpxor x4, y4, y4; \
        vpxor x5, y5, y5; \
        vpxor x6, y6, y6; \
        vpxor x7, y7, y7; \
        \
        /* t2 ^= t0; */ \
        vpxor x0, y0, y0; \
        vpxor x1, y1, y1; \
        vpxor x2, y2, y2; \
        vpxor x3, y3, y3; \
        \
        /* t1 ^= t2; */ \
        vpxor y0, x4, x4; \
        vpxor y1, x5, x5; \
        vpxor y2, x6, x6; \
        vpxor y3, x7, x7;

/*
 * aria_fe - one "even" round on sixteen byte-sliced registers (AES-NI S-box
 * variant).
 *
 * x0..x7:  on entry, the state half that is keyed with round-key bytes
 *          8..15; the other half is expected in mem_tmp slots 0..7
 * y0..y7:  scratch on entry; y7 is zeroed first because aria_ark_8way needs
 *          a zero shuffle control and aria_sbox_8way a zero AES round key
 * mem_tmp: base register of the 256-byte temporary area
 * rk:      base register of the round-key array
 * round:   round-key index
 *
 * Each half goes through AddRoundKey, the S-box layer (called with the
 * slices rotated to x2, x3, x0, x1 / x6, x7, x4, x5, which selects the
 * S-box assignment for this round type) and aria_diff_m. The halves are
 * then combined by aria_diff_word, the byte permutation "aria_diff_byte"
 * (done by renaming registers only) and a second aria_diff_word.
 *
 * On exit the x registers have been stored, permuted, to mem_tmp slots
 * 0..7 and the y registers hold the other half of the state.
 */
#define aria_fe(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round) \
        vpxor y7, y7, y7; \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 8, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 0, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T3 = ABCD -> BADC \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
         * T0 = ABCD -> CDAB \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
         * T1 = ABCD -> DCBA \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
         */ \
        aria_diff_word(x2, x3, x0, x1, \
                       x7, x6, x5, x4, \
                       y0, y1, y2, y3, \
                       y5, y4, y7, y6); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);

/*
 * aria_fo - one "odd" round on sixteen byte-sliced registers (AES-NI S-box
 * variant).
 *
 * x0..x7:  on entry, the state half that is keyed with round-key bytes
 *          8..15; the other half is expected in mem_tmp slots 0..7
 * y0..y7:  scratch on entry; y7 is zeroed first because aria_ark_8way needs
 *          a zero shuffle control and aria_sbox_8way a zero AES round key
 * mem_tmp: base register of the 256-byte temporary area
 * rk:      base register of the round-key array
 * round:   round-key index
 *
 * Same structure as aria_fe, with two differences: the S-box layer is
 * called with the slices in natural order (x0..x7), and the register
 * renaming that implements "aria_diff_byte" permutes T1, T2 and T3 instead
 * of T3, T0 and T1.
 *
 * On exit the x registers have been stored, permuted, to mem_tmp slots
 * 0..7 and the y registers hold the other half of the state.
 */
#define aria_fo(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round) \
        vpxor y7, y7, y7; \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 8, round); \
        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 0, round); \
        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T1 = ABCD -> BADC \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
         * T2 = ABCD -> CDAB \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
         * T3 = ABCD -> DCBA \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
         */ \
        aria_diff_word(x0, x1, x2, x3, \
                       x5, x4, x7, x6, \
                       y2, y3, y0, y1, \
                       y7, y6, y5, y4); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);

/*
 * aria_ff - final round on sixteen byte-sliced registers (AES-NI S-box
 * variant).
 *
 * x0..x7:     on entry, the state half that is keyed with round-key bytes
 *             8..15; the other half is expected in mem_tmp slots 0..7
 * y0..y7:     scratch on entry; y7 is zeroed first because aria_ark_8way
 *             needs a zero shuffle control and aria_sbox_8way a zero AES
 *             round key
 * mem_tmp:    base register of the 256-byte temporary area
 * rk:         base register of the round-key array
 * round:      index of the round key applied before the S-box layer
 * last_round: index of the round key applied after the S-box layer
 *
 * Each half goes through AddRoundKey(round), the S-box layer and
 * AddRoundKey(last_round); there is no diffusion step.
 *
 * On exit x0..x7 hold the half keyed with bytes 0..7 and y0..y7 hold the
 * half keyed with bytes 8..15 (reloaded from mem_tmp slots 8..15).
 */
#define aria_ff(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round, last_round) \
        vpxor y7, y7, y7; \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 8, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 8, last_round); \
        \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 0, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 0, last_round); \
        \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8);

#ifdef CONFIG_AS_GFNI
/*
 * aria_fe_gfni - one "even" round on sixteen byte-sliced registers, using
 * the GFNI S-box layer.
 *
 * x0..x7:  on entry, the state half that is keyed with round-key bytes
 *          8..15; the other half is expected in mem_tmp slots 0..7
 * y0..y7:  scratch on entry; y7 is zeroed first because aria_ark_8way needs
 *          a zero shuffle control
 * mem_tmp: base register of the 256-byte temporary area
 * rk:      base register of the round-key array
 * round:   round-key index
 *
 * Each half goes through AddRoundKey, the S-box layer (called with the
 * slices rotated to x2, x3, x0, x1 / x6, x7, x4, x5) and aria_diff_m. The
 * halves are then combined by aria_diff_word, the byte permutation
 * "aria_diff_byte" (done by renaming registers only) and a second
 * aria_diff_word.
 *
 * On exit the x registers have been stored, permuted, to mem_tmp slots
 * 0..7 and the y registers hold the other half of the state.
 */
#define aria_fe_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round) \
        vpxor y7, y7, y7; \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T3 = ABCD -> BADC \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
         * T0 = ABCD -> CDAB \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
         * T1 = ABCD -> DCBA \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
         */ \
        aria_diff_word(x2, x3, x0, x1, \
                       x7, x6, x5, x4, \
                       y0, y1, y2, y3, \
                       y5, y4, y7, y6); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);

/*
 * aria_fo_gfni - one "odd" round on sixteen byte-sliced registers, using
 * the GFNI S-box layer.
 *
 * Parameters and exit state are as for aria_fe_gfni. The differences are
 * that the S-box layer is called with the slices in natural order
 * (x0..x7), and that the register renaming implementing "aria_diff_byte"
 * permutes T1, T2 and T3 instead of T3, T0 and T1.
 */
#define aria_fo_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round) \
        vpxor y7, y7, y7; \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T1 = ABCD -> BADC \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
         * T2 = ABCD -> CDAB \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
         * T3 = ABCD -> DCBA \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
         */ \
        aria_diff_word(x0, x1, x2, x3, \
                       x5, x4, x7, x6, \
                       y2, y3, y0, y1, \
                       y7, y6, y5, y4); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);

/*
 * aria_ff_gfni - final round on sixteen byte-sliced registers, using the
 * GFNI S-box layer.
 *
 * x0..x7:     on entry, the state half that is keyed with round-key bytes
 *             8..15; the other half is expected in mem_tmp slots 0..7
 * y0..y7:     scratch on entry; y7 is zeroed first because aria_ark_8way
 *             needs a zero shuffle control
 * mem_tmp:    base register of the 256-byte temporary area
 * rk:         base register of the round-key array
 * round:      index of the round key applied before the S-box layer
 * last_round: index of the round key applied after the S-box layer
 *
 * Each half goes through AddRoundKey(round), the S-box layer and
 * AddRoundKey(last_round); there is no diffusion step.
 *
 * On exit x0..x7 hold the half keyed with bytes 0..7 and y0..y7 hold the
 * half keyed with bytes 8..15 (reloaded from mem_tmp slots 8..15).
 */
#define aria_ff_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round, last_round) \
        vpxor y7, y7, y7; \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 8, last_round); \
        \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, y7, y2, rk, 0, last_round); \
        \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8);

#endif /* CONFIG_AS_GFNI */

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section        .rodata.cst16, "aM", @progbits, 16
.align 16

/* Four byte indices with stride 4, starting at idx: idx, 4+idx, 8+idx, 12+idx */
#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

/*
 * Byte-shuffle control used by (de)byteslice_16x16b: gathers bytes
 * 0,4,8,12 then 1,5,9,13 then 2,6,10,14 then 3,7,11,15 of the source.
 */
.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/* Forward shift row; used by aria_sbox_8way around AESDECLAST */
.Lshift_row:
        .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
        .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap (reverses all 16 bytes) */
.Lbswap128_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 *
 * Stored as two 16-entry nibble lookup tables for filter_8bit: the "lo"
 * table is indexed by the low nibble of a byte, the "hi" table by the
 * high nibble, and the two looked-up values are XORed together.
 */
.Ltf_lo__inv_aff__and__s2:
        .octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
        .octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 *
 * Stored as two 16-entry nibble lookup tables for filter_8bit: the "lo"
 * table is indexed by the low nibble of a byte, the "hi" table by the
 * high nibble, and the two looked-up values are XORed together.
 */
.Ltf_lo__x2__and__fwd_aff:
        .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
        .octa 0x3F893781E95FE1576CDA64D2BA0CB204

#ifdef CONFIG_AS_GFNI
/*
 * Bit-matrices and additive constants for the GFNI affine instructions used
 * by aria_sbox_8way_gfni. Each matrix is built with BM8X8() from eight BV8()
 * rows and emitted twice, so that one table fills a full 16-byte register
 * and keeps the 16-byte element size this mergeable section requires.
 */

/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                    BV8(1, 1, 0, 0, 0, 1, 1, 1),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 0, 0, 1),
                    BV8(1, 1, 1, 1, 1, 0, 0, 0),
                    BV8(0, 1, 1, 1, 1, 1, 0, 0),
                    BV8(0, 0, 1, 1, 1, 1, 1, 0),
                    BV8(0, 0, 0, 1, 1, 1, 1, 1))
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                    BV8(1, 1, 0, 0, 0, 1, 1, 1),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 0, 0, 1),
                    BV8(1, 1, 1, 1, 1, 0, 0, 0),
                    BV8(0, 1, 1, 1, 1, 1, 0, 0),
                    BV8(0, 0, 1, 1, 1, 1, 1, 0),
                    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 0),
                    BV8(0, 1, 0, 0, 1, 0, 0, 1),
                    BV8(1, 0, 1, 0, 0, 1, 0, 0),
                    BV8(0, 1, 0, 1, 0, 0, 1, 0),
                    BV8(0, 0, 1, 0, 1, 0, 0, 1),
                    BV8(1, 0, 0, 1, 0, 1, 0, 0),
                    BV8(0, 1, 0, 0, 1, 0, 1, 0))
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 0),
                    BV8(0, 1, 0, 0, 1, 0, 0, 1),
                    BV8(1, 0, 1, 0, 0, 1, 0, 0),
                    BV8(0, 1, 0, 1, 0, 0, 1, 0),
                    BV8(0, 0, 1, 0, 1, 0, 0, 1),
                    BV8(1, 0, 0, 1, 0, 1, 0, 0),
                    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                    BV8(0, 0, 1, 1, 1, 1, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 1),
                    BV8(1, 1, 0, 0, 0, 0, 1, 1),
                    BV8(0, 1, 0, 0, 0, 0, 1, 1),
                    BV8(1, 1, 0, 0, 1, 1, 1, 0),
                    BV8(0, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 1, 1, 0))
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                    BV8(0, 0, 1, 1, 1, 1, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 1),
                    BV8(1, 1, 0, 0, 0, 0, 1, 1),
                    BV8(0, 1, 0, 0, 0, 0, 1, 1),
                    BV8(1, 1, 0, 0, 1, 1, 1, 0),
                    BV8(0, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 1, 1, 0),
                    BV8(0, 0, 0, 0, 1, 0, 1, 0),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 0),
                    BV8(0, 1, 1, 0, 1, 0, 1, 1),
                    BV8(1, 0, 1, 1, 1, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 1))
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 1, 1, 0),
                    BV8(0, 0, 0, 0, 1, 0, 1, 0),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 0),
                    BV8(0, 1, 1, 0, 1, 0, 1, 1),
                    BV8(1, 0, 1, 1, 1, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                    BV8(0, 1, 0, 0, 0, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 0, 0, 0),
                    BV8(0, 0, 0, 1, 0, 0, 0, 0),
                    BV8(0, 0, 0, 0, 1, 0, 0, 0),
                    BV8(0, 0, 0, 0, 0, 1, 0, 0),
                    BV8(0, 0, 0, 0, 0, 0, 1, 0),
                    BV8(0, 0, 0, 0, 0, 0, 0, 1))
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                    BV8(0, 1, 0, 0, 0, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 0, 0, 0),
                    BV8(0, 0, 0, 1, 0, 0, 0, 0),
                    BV8(0, 0, 0, 0, 1, 0, 0, 0),
                    BV8(0, 0, 0, 0, 0, 1, 0, 0),
                    BV8(0, 0, 0, 0, 0, 0, 1, 0),
                    BV8(0, 0, 0, 0, 0, 0, 0, 1))
#endif /* CONFIG_AS_GFNI */

/*
 * 4-bit mask: 0x0f in every byte. aria_sbox_8way broadcasts it into a
 * register and hands it to filter_8bit as the nibble mask. It lives in its
 * own mergeable 4-byte-element section.
 */
.section        .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
        .long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
        /* input:
         *      %rdi: ctx, CTX (only the round count is read here)
         *      %r9: rk
         *      %rsi: dst (also used as 256 bytes of temporary storage)
         *      %rdx: src (not referenced in this routine)
         *      %xmm0..%xmm15: 16 blocks, byte-sliced here by inpack16_post
         * output:
         *      %rax: dst
         *      %xmm1, %xmm0, %xmm3, %xmm2, %xmm4..%xmm15: the 16 result
         *      blocks, in that order (this is the order the callers pass
         *      to write_output)
         * clobbers:
         *      %r8 (set to dst + 8 * 16), flags
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 16(%rax), %r8;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                      %xmm15, %rax, %r8);
        /* round keys 0..10 are applied for every supported round count */
        aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 0);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 1);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 2);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 3);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 4);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 5);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 6);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 7);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 8);
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 9);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 10);
        /* 12 rounds: finish with round keys 11 and 12 */
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_192;
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11, 12);
        jmp .Laria_end;
.Laria_192:
        /* more than 12 rounds: two more full rounds first */
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 12);
        /* 14 rounds: finish with round keys 13 and 14 */
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_256;
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 13, 14);
        jmp .Laria_end;
.Laria_256:
        /* any other round count: finish with round keys 15 and 16 */
        aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 13);
        aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
                %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                %rax, %r9, 14);
        aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 15, 16);
.Laria_end:
        /* (%rax) and (%r8) serve as the two 16-byte spill slots */
        debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
                           %xmm9, %xmm13, %xmm0, %xmm5,
                           %xmm10, %xmm14, %xmm3, %xmm6,
                           %xmm11, %xmm15, %xmm2, %xmm7,
                           (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

/*
 * aria_aesni_avx_encrypt_16way - process 16 blocks (256 bytes) with the
 * encryption round keys of ctx.
 *
 * Loads the 16 blocks from src, runs __aria_aesni_avx_crypt_16way with
 * %r9 pointing at the encryption key schedule, and stores the 16 result
 * blocks to dst (returned by the helper in %rax).
 */
SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx);

        call __aria_aesni_avx_crypt_16way;

        /* register order matches the helper's output order */
        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

/*
 * aria_aesni_avx_decrypt_16way - process 16 blocks (256 bytes) with the
 * decryption round keys of ctx.
 *
 * Identical to aria_aesni_avx_encrypt_16way except that %r9 points at the
 * decryption key schedule.
 */
SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx);

        call __aria_aesni_avx_crypt_16way;

        /* register order matches the helper's output order */
        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: keystream
         *      %r8: iv (big endian, 128bit)
         * output:
         *      %xmm0..%xmm15: counter blocks IV, IV + 1, ..., IV + 15
         *                     (big endian)
         *      (%r8): updated to IV + 16
         * side effects:
         *      the first 8 * 16 bytes of the keystream buffer are used as
         *      scratch for counter blocks 0..7
         * Only %rcx and %r8 are referenced here; all xmm registers are
         * overwritten.
         */

        FRAME_BEGIN
        /* load IV and byteswap */
        vmovdqu (%r8), %xmm8;

        vmovdqa .Lbswap128_mask(%rip), %xmm1;
        vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

        vpcmpeqd %xmm0, %xmm0, %xmm0;
        vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

        /* construct IVs: %xmm8 already holds IV, build IV + 1 .. IV + 7 */
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm9;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm10;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm11;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm12;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm13;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm14;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm15;
        /* park blocks 0..7 in the keystream buffer to free the registers */
        vmovdqu %xmm8, (0 * 16)(%rcx);
        vmovdqu %xmm9, (1 * 16)(%rcx);
        vmovdqu %xmm10, (2 * 16)(%rcx);
        vmovdqu %xmm11, (3 * 16)(%rcx);
        vmovdqu %xmm12, (4 * 16)(%rcx);
        vmovdqu %xmm13, (5 * 16)(%rcx);
        vmovdqu %xmm14, (6 * 16)(%rcx);
        vmovdqu %xmm15, (7 * 16)(%rcx);

        /* build IV + 8 .. IV + 15 */
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm8;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm9;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm10;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm11;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm12;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm13;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm14;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm15;
        /* IV + 16 is written back as the next IV */
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm4;
        vmovdqu %xmm4, (%r8);

        /* reload blocks 0..7 */
        vmovdqu (0 * 16)(%rcx), %xmm0;
        vmovdqu (1 * 16)(%rcx), %xmm1;
        vmovdqu (2 * 16)(%rcx), %xmm2;
        vmovdqu (3 * 16)(%rcx), %xmm3;
        vmovdqu (4 * 16)(%rcx), %xmm4;
        vmovdqu (5 * 16)(%rcx), %xmm5;
        vmovdqu (6 * 16)(%rcx), %xmm6;
        vmovdqu (7 * 16)(%rcx), %xmm7;

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/*
	 * CTR mode over 16 blocks: generate 16 counter blocks, run them
	 * through the cipher core with the encryption key, then XOR the
	 * result with src and write it to dst.
	 *
	 * input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	/*
	 * Keep the real dst/src in %r10/%r11 and point the core's dst/src
	 * arguments at the keystream buffer instead.  %r10/%r11 are read
	 * again after the call, so the core is expected to leave them alone.
	 */
	movq %rsi, %r10;
	movq %rdx, %r11;
	movq %rcx, %rsi;
	movq %rcx, %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	/*
	 * XOR with the 16 source blocks.  Source block i is paired with the
	 * i-th register of the list given to write_output below, hence the
	 * 1, 0, 3, 2 order of the first four registers.
	 */
	vpxor  (0 * 16)(%r11),  %xmm1,  %xmm1;
	vpxor  (1 * 16)(%r11),  %xmm0,  %xmm0;
	vpxor  (2 * 16)(%r11),  %xmm3,  %xmm3;
	vpxor  (3 * 16)(%r11),  %xmm2,  %xmm2;
	vpxor  (4 * 16)(%r11),  %xmm4,  %xmm4;
	vpxor  (5 * 16)(%r11),  %xmm5,  %xmm5;
	vpxor  (6 * 16)(%r11),  %xmm6,  %xmm6;
	vpxor  (7 * 16)(%r11),  %xmm7,  %xmm7;
	vpxor  (8 * 16)(%r11),  %xmm8,  %xmm8;
	vpxor  (9 * 16)(%r11),  %xmm9,  %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;

	/* result goes to the caller's dst saved in %r10 */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
1142
1143#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/*
	 * GFNI cipher core shared by the 16-way GFNI entry points.
	 *
	 * input:
	 *      %rdi: ctx (CTX); only ARIA_CTX_rounds(CTX) is read here
	 *      %r9: rk, passed to every round macro
	 *      %rsi: dst; copied to %rax, and dst + 8 * 16 is kept in %r8.
	 *            Both are handed to the macros below as memory operands.
	 *      %rdx: src (not named in this routine)
	 *      %xmm0..%xmm15: 16 byte-sliced blocks
	 * output:
	 *      %rax: dst, which the encrypt/decrypt wrappers give to
	 *            write_output after this call
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq (8 * 16)(%rsi), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14, %xmm15,
		      %rax, %r8);

	/*
	 * Rounds alternate between aria_fo_gfni and aria_fe_gfni; the last
	 * macro argument is the round-key index.
	 */

	/* round key 0 */
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	/* round key 1 */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 1);
	/* round key 2 */
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	/* round key 3 */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 3);
	/* round key 4 */
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	/* round key 5 */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 5);
	/* round key 6 */
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	/* round key 7 */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 7);
	/* round key 8 */
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	/* round key 9 */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 9);
	/* round key 10 */
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);

	/* rounds == 12: finish with round keys 11 and 12 */
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;

.Laria_gfni_192:
	/* round key 11 */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 11);
	/* round key 12 */
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);

	/* rounds == 14: finish with round keys 13 and 14 */
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;

.Laria_gfni_256:
	/* round key 13 */
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 13);
	/* round key 14 */
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	/* any other round count: finish with round keys 15 and 16 */
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax, %r9, 15, 16);

.Laria_gfni_end:
	/* all three paths meet here to undo the byte-sliced layout */
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
1268
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/*
	 * Encrypt 16 blocks with the GFNI core, using the encryption
	 * round keys of the context.
	 *
	 * input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	/* %r9 = round keys for the core */
	leaq ARIA_CTX_enc_key(CTX), %r9;

	/* bring the 16 source blocks at %rdx into %xmm0..%xmm15 */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	/* the core copies dst (%rsi) into %rax; the result is written there */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
1293
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/*
	 * Decrypt 16 blocks with the GFNI core.  Same flow as the GFNI
	 * encrypt entry point, but the core is given the decryption
	 * round keys of the context.
	 *
	 * input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	/* %r9 = round keys for the core */
	leaq ARIA_CTX_dec_key(CTX), %r9;

	/* bring the 16 source blocks at %rdx into %xmm0..%xmm15 */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	/* the core copies dst (%rsi) into %rax; the result is written there */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
1318
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/*
	 * CTR mode over 16 blocks using the GFNI core: generate 16 counter
	 * blocks, run them through the core with the encryption key, then
	 * XOR the result with src and write it to dst.
	 *
	 * input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	/*
	 * Keep the real dst/src in %r10/%r11 and point the core's dst/src
	 * arguments at the keystream buffer instead.  The core below names
	 * neither %r10 nor %r11.
	 */
	movq %rsi, %r10;
	movq %rdx, %r11;
	movq %rcx, %rsi;
	movq %rcx, %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	/*
	 * XOR with the 16 source blocks.  Source block i is paired with the
	 * i-th register of the list given to write_output below, hence the
	 * 1, 0, 3, 2 order of the first four registers.
	 */
	vpxor  (0 * 16)(%r11),  %xmm1,  %xmm1;
	vpxor  (1 * 16)(%r11),  %xmm0,  %xmm0;
	vpxor  (2 * 16)(%r11),  %xmm3,  %xmm3;
	vpxor  (3 * 16)(%r11),  %xmm2,  %xmm2;
	vpxor  (4 * 16)(%r11),  %xmm4,  %xmm4;
	vpxor  (5 * 16)(%r11),  %xmm5,  %xmm5;
	vpxor  (6 * 16)(%r11),  %xmm6,  %xmm6;
	vpxor  (7 * 16)(%r11),  %xmm7,  %xmm7;
	vpxor  (8 * 16)(%r11),  %xmm8,  %xmm8;
	vpxor  (9 * 16)(%r11),  %xmm9,  %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;

	/* result goes to the caller's dst saved in %r10 */
	write_output(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
1362#endif /* CONFIG_AS_GFNI */
1363

/* Source: linux/arch/x86/crypto/aria-aesni-avx-asm_64.S */