/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi

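/*
 * The ymmN_x aliases name the low 128-bit half (xmmN) of each ymm
 * register.  They are used with the "##_x" token pasting in the macros
 * below so that AES-NI instructions, which only take xmm operands, can
 * be applied to one 128-bit lane of a ymm register at a time.
 */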
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

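/*
 * BV8() packs eight bits into one byte (bit 0 first); BM8X8() packs
 * eight such rows into a 64-bit GF(2) bit matrix with row 0 ending up
 * in the most significant byte, which matches the matrix operand layout
 * expected by vgf2p8affineqb/vgf2p8affineinvqb in the GFNI path below.
 */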
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

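/*
 * Increment a little-endian 128-bit counter in each 128-bit lane of x.
 * minus_one must hold -1 in the low qword and 0 in the high qword of
 * every lane: the first vpsubq adds one to the low qword, and the
 * compare / byte-shift pair propagates a carry into the high qword
 * when the low qword wraps around.
 */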
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

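/*
 * Apply an 8-bit -> 8-bit mapping to every byte of x by splitting each
 * byte into its two nibbles and combining two 16-entry vpshufb lookups:
 * lo_t is indexed by the low nibble, hi_t by the high nibble, and the
 * results are XORed.  mask4bit must contain 0x0f in every byte.
 */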
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

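/*
 * Transpose a 4x4 matrix of 32-bit words spread across x0..x3
 * (independently within each 128-bit lane), using t1/t2 as scratch.
 */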
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

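/*
 * byteslice_16x16b / debyteslice_16x16b transpose a 16x16 byte matrix,
 * working on each 128-bit lane independently: on input each register
 * holds one 16-byte block per lane, on output register n holds byte n
 * of all 16 blocks handled by that lane (and vice versa for the
 * inverse).  st0 and st1 are memory scratch slots used to free up
 * registers during the transposes.
 */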
#define byteslice_16x16b(a0, b0, c0, d0, \
		a1, b1, c1, d1, \
		a2, b2, c2, d2, \
		a3, b3, c3, d3, \
		st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
		a1, b1, c1, d1, \
		a2, b2, c2, d2, \
		a3, b3, c3, d3, \
		st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks into registers (no pre-whitening here; round keys are added by aria_ark_8way) */
#define inpack16_pre(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		rio) \
	vmovdqu (0 * 32)(rio), x0; \
	vmovdqu (1 * 32)(rio), x1; \
	vmovdqu (2 * 32)(rio), x2; \
	vmovdqu (3 * 32)(rio), x3; \
	vmovdqu (4 * 32)(rio), x4; \
	vmovdqu (5 * 32)(rio), x5; \
	vmovdqu (6 * 32)(rio), x6; \
	vmovdqu (7 * 32)(rio), x7; \
	vmovdqu (8 * 32)(rio), y0; \
	vmovdqu (9 * 32)(rio), y1; \
	vmovdqu (10 * 32)(rio), y2; \
	vmovdqu (11 * 32)(rio), y3; \
	vmovdqu (12 * 32)(rio), y4; \
	vmovdqu (13 * 32)(rio), y5; \
	vmovdqu (14 * 32)(rio), y6; \
	vmovdqu (15 * 32)(rio), y7;

/* byteslice the loaded blocks and store them to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

#define write_output(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem) \
	vmovdqu x0, 0 * 32(mem); \
	vmovdqu x1, 1 * 32(mem); \
	vmovdqu x2, 2 * 32(mem); \
	vmovdqu x3, 3 * 32(mem); \
	vmovdqu x4, 4 * 32(mem); \
	vmovdqu x5, 5 * 32(mem); \
	vmovdqu x6, 6 * 32(mem); \
	vmovdqu x7, 7 * 32(mem); \
	vmovdqu y0, 8 * 32(mem); \
	vmovdqu y1, 9 * 32(mem); \
	vmovdqu y2, 10 * 32(mem); \
	vmovdqu y3, 11 * 32(mem); \
	vmovdqu y4, 12 * 32(mem); \
	vmovdqu y5, 13 * 32(mem); \
	vmovdqu y6, 14 * 32(mem); \
	vmovdqu y7, 15 * 32(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		mem_tmp, idx) \
	vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
	vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
	vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
	vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
	vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
	vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
	vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		mem_tmp, idx) \
	vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
	vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
	vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
	vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
	vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
	vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
	vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;

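/*
 * AddRoundKey for 8 byte-sliced registers: one byte of the 16-byte round
 * key at (rk + round * 16 + idx) is broadcast into every lane of t0 and
 * XORed into the matching state register.  The byte order within each
 * 32-bit word (3,2,1,0 then 7,6,5,4) matches how the round keys are laid
 * out in memory relative to the byte-sliced state.
 */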
#define aria_ark_8way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		t0, rk, idx, round) \
	/* AddRoundKey */ \
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
	vpxor t0, x0, x0; \
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
	vpxor t0, x1, x1; \
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
	vpxor t0, x2, x2; \
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
	vpxor t0, x3, x3; \
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
	vpxor t0, x4, x4; \
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
	vpxor t0, x5, x5; \
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
	vpxor t0, x6, x6; \
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
	vpxor t0, x7, x7;

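/*
 * GFNI implementation of ARIA's S-box layer.  Each vgf2p8affineqb /
 * vgf2p8affineinvqb applies one of ARIA's four 8-bit S-boxes as a GF(2^8)
 * affine transform, optionally composed with the field inverse, using the
 * bit matrices and constants defined in .rodata below.  Roughly: x0/x4
 * get S1 (the AES S-box), x1/x5 get S2, x2/x6 get S1^-1 and x3/x7 get
 * S2^-1.
 */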
#ifdef CONFIG_AS_GFNI
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		t0, t1, t2, t3, \
		t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
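
/*
 * AES-NI fallback for the S-box layer.  vaesenclast/vaesdeclast with an
 * all-zero round key yield raw SubBytes/InvSubBytes; the (Inv)ShiftRows
 * permutation baked into those instructions is compensated with vpshufb
 * using .Linv_shift_row/.Lshift_row, and the extra affine maps needed for
 * ARIA's S2 and S2^-1 are applied with filter_8bit.  The vextracti128 /
 * vinserti128 pairs exist because the AES-NI instructions only operate on
 * 128-bit xmm registers.
 */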
#define aria_sbox_8way(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		t0, t1, t2, t3, \
		t4, t5, t6, t7) \
	vpxor t7, t7, t7; \
	vpxor t6, t6, t6; \
	vbroadcasti128 .Linv_shift_row(%rip), t0; \
	vbroadcasti128 .Lshift_row(%rip), t1; \
	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
	\
	vextracti128 $1, x0, t6##_x; \
	vaesenclast t7##_x, x0##_x, x0##_x; \
	vaesenclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x0, x0; \
	\
	vextracti128 $1, x4, t6##_x; \
	vaesenclast t7##_x, x4##_x, x4##_x; \
	vaesenclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x4, x4; \
	\
	vextracti128 $1, x1, t6##_x; \
	vaesenclast t7##_x, x1##_x, x1##_x; \
	vaesenclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x1, x1; \
	\
	vextracti128 $1, x5, t6##_x; \
	vaesenclast t7##_x, x5##_x, x5##_x; \
	vaesenclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x5, x5; \
	\
	vextracti128 $1, x2, t6##_x; \
	vaesdeclast t7##_x, x2##_x, x2##_x; \
	vaesdeclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	\
	vextracti128 $1, x6, t6##_x; \
	vaesdeclast t7##_x, x6##_x, x6##_x; \
	vaesdeclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x6, x6; \
	\
	vpbroadcastd .L0f0f0f0f(%rip), t6; \
	\
	/* AES inverse shift rows */ \
	vpshufb t0, x0, x0; \
	vpshufb t0, x4, x4; \
	vpshufb t0, x1, x1; \
	vpshufb t0, x5, x5; \
	vpshufb t1, x3, x3; \
	vpshufb t1, x7, x7; \
	vpshufb t1, x2, x2; \
	vpshufb t1, x6, x6; \
	\
	/* affine transformation for S2 */ \
	filter_8bit(x1, t2, t3, t6, t0); \
	/* affine transformation for S2 */ \
	filter_8bit(x5, t2, t3, t6, t0); \
	\
	/* affine transformation for X2 */ \
	filter_8bit(x3, t4, t5, t6, t0); \
	/* affine transformation for X2 */ \
	filter_8bit(x7, t4, t5, t6, t0); \
	\
	vpxor t6, t6, t6; \
	vextracti128 $1, x3, t6##_x; \
	vaesdeclast t7##_x, x3##_x, x3##_x; \
	vaesdeclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x3, x3; \
	\
	vextracti128 $1, x7, t6##_x; \
	vaesdeclast t7##_x, x7##_x, x7##_x; \
	vaesdeclast t7##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x7, x7; \

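/*
 * aria_diff_m operates on one byte-sliced 32-bit column held in x0..x3.
 * Because the data is byte-sliced, the 32-bit rotations of the reference
 * formula (see the inline comments) reduce to picking registers in a
 * rotated order, leaving only XORs; effectively each output byte becomes
 * the XOR of three of the four input bytes.  aria_diff_word below then
 * mixes whole 32-bit words (groups of four registers) across the state,
 * following the t0..t3 word-level XOR pattern of the reference code.
 */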
#define aria_diff_m(x0, x1, x2, x3, \
		t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxor x0, x3, t0; \
	vpxor x1, x0, t1; \
	vpxor x2, x1, t2; \
	vpxor x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxor t2, x0, x0; \
	vpxor x1, t3, t3; \
	vpxor t0, x2, x2; \
	vpxor t1, x3, x1; \
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxor y4, y0, y0; \
	vpxor y5, y1, y1; \
	vpxor y6, y2, y2; \
	vpxor y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxor x4, x0, x0; \
	vpxor x5, x1, x1; \
	vpxor x6, x2, x2; \
	vpxor x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxor x4, y4, y4; \
	vpxor x5, y5, y5; \
	vpxor x6, y6, y6; \
	vpxor x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxor x0, y0, y0; \
	vpxor x1, y1, y1; \
	vpxor x2, y2, y2; \
	vpxor x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7;

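/*
 * aria_fe / aria_fo implement one even / odd ARIA round over all 32
 * blocks: AddRoundKey (aria_ark_8way), the S-box layer (note the
 * different register order passed to the S-box macro for odd and even
 * rounds), and the diffusion layer split into aria_diff_m, aria_diff_word
 * and a byte/word permutation that is realized purely by the register
 * reordering documented in the aria_diff_byte() comments.  The 16-register
 * state is processed 8 registers at a time, with mem_tmp holding the half
 * that is currently not in registers.
 */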
#define aria_fe(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

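/*
 * aria_ff is the final round: it applies the round key, the S-box layer
 * and then the last round key, but no diffusion layer.
 */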
#define aria_ff(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);
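
/*
 * GFNI variants of the round macros: same structure as aria_fe/aria_fo/
 * aria_ff above, with the S-box layer done by aria_sbox_8way_gfni.
 */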
#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);
#endif /* CONFIG_AS_GFNI */

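/*
 * vpshufb mask for (de)byteslice_16x16b: within each 16-byte lane it
 * gathers bytes 0,4,8,12, then 1,5,9,13, ..., i.e. a 4x4 byte transpose.
 */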
.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 * 1 1 0 0 0 0 0 1     x0     0
 * 0 1 0 0 1 0 0 0     x1     0
 * 1 1 0 0 1 1 1 1     x2     0
 * 0 1 1 0 1 0 0 1     x3     1
 * 0 1 0 0 1 1 0 0  *  x4  +  0
 * 0 1 0 1 1 0 0 0     x5     0
 * 0 0 0 0 0 1 0 1     x6     0
 * 1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 * 1 0 1 1 0 0 0 1     x0     0
 * 0 1 1 1 1 0 1 1     x1     0
 * 0 0 0 1 1 0 1 0     x2     1
 * 0 1 0 0 0 1 0 0     x3     0
 * 0 0 1 1 1 0 1 1  *  x4  +  0
 * 0 1 0 0 1 0 0 0     x5     0
 * 1 1 0 1 0 0 1 1     x6     0
 * 0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

#ifdef CONFIG_AS_GFNI
.section .rodata.cst8, "aM", @progbits, 8
.align 8
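/*
 * Bit matrices for vgf2p8affineqb/vgf2p8affineinvqb.  Each .quad packs
 * eight BV8() rows via BM8X8(); the matching tf_*_const values are the
 * affine constants passed as the immediate operand of those instructions.
 */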
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

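/*
 * Shared encryption/decryption core.  The caller loads 32 input blocks
 * into %ymm0..%ymm15 (inpack16_pre) and points %r9 at the expanded key;
 * this routine byteslices the state into the 512-byte area at dst (%rsi),
 * which doubles as per-round scratch, runs 12, 14 or 16 rounds depending
 * on ARIA_CTX_rounds, and leaves the de-bytesliced result in the ymm
 * registers for the caller to write out from %rax (= dst).
 */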
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
	/* input:
	 * %r9: rk
	 * %rsi: dst
	 * %rdx: src
	 * %ymm0..%ymm15: 32 input blocks (byte-sliced inside this routine)
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 0);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 1);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 2);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 3);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 4);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 5);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 6);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 7);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 8);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 9);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 14);
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)

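/*
 * ECB-style entry points: load 32 blocks from src, run the shared core
 * with the encryption or decryption key schedule, and store the result
 * to dst.  The register order passed to write_output matches the output
 * ordering produced by the final de-byteslice.
 */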
SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)

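/*
 * Generate 32 counter blocks for CTR mode.  The 128-bit big-endian IV at
 * %r8 is byteswapped, incremented 32 times and written back (+32); the
 * first 16 counter blocks are staged in the keystream buffer at %rcx and
 * reloaded into %ymm0..%ymm7, while %ymm8..%ymm15 end up holding counters
 * +16..+31, so all 32 blocks sit in registers on return.  The fast path
 * adds two per vector with vpsubq; the slow path is taken when the low 64
 * bits of the counter are about to overflow and increments one block at a
 * time with inc_le128.
 */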
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	movq 8(%r8), %r11;
	bswapq %r11;

	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

	.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	vextracti128 $1, %ymm3, %xmm3;
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

	.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)

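/*
 * CTR mode: generate the keystream counters, encrypt them with the shared
 * core (reusing the keystream buffer as the core's dst/scratch area), then
 * XOR the encrypted counters with src and write the result to the original
 * dst.  dst and src are parked in %r10/%r11 across the inner calls because
 * %rsi/%rdx are repointed at the keystream buffer.
 */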
SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)

#ifdef CONFIG_AS_GFNI
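/*
 * GFNI entry points: identical control flow to the AES-NI functions above,
 * but every round uses the aria_*_gfni macros for the S-box layer.
 */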
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* input:
	 * %r9: rk
	 * %rsi: dst
	 * %rdx: src
	 * %ymm0..%ymm15: 32 input blocks (byte-sliced inside this routine)
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
#endif /* CONFIG_AS_GFNI */