/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

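/*
 * BV8() packs eight bits (a0 = bit 0) into one byte and BM8X8() packs
 * eight such rows into a 64-bit value, row l0 in the most significant
 * byte.  The results are used below as the bit-matrix operands of the
 * GFNI vgf2p8affineqb/vgf2p8affineinvqb instructions.
 */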
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)	\
	( (((a0) & 1) << 0) |	\
	  (((a1) & 1) << 1) |	\
	  (((a2) & 1) << 2) |	\
	  (((a3) & 1) << 3) |	\
	  (((a4) & 1) << 4) |	\
	  (((a5) & 1) << 5) |	\
	  (((a6) & 1) << 6) |	\
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)	\
	( ((l7) << (0 * 8)) |	\
	  ((l6) << (1 * 8)) |	\
	  ((l5) << (2 * 8)) |	\
	  ((l4) << (3 * 8)) |	\
	  ((l3) << (4 * 8)) |	\
	  ((l2) << (5 * 8)) |	\
	  ((l1) << (6 * 8)) |	\
	  ((l0) << (7 * 8)) )

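/*
 * Add 1 to the little-endian 128-bit counter held in each 128-bit lane
 * of x.  minus_one must contain all-ones qwords; the compare, the byte
 * shift and the second subtract propagate the carry from the low qword
 * into the high qword.
 */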
#define inc_le128(x, minus_one, tmp)	\
	vpcmpeqq minus_one, x, tmp;	\
	vpsubq minus_one, x, x;	\
	vpslldq $8, tmp, tmp;	\
	vpsubq tmp, x, x;

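/*
 * Apply an 8-bit transform expressed as two 16-entry nibble lookup
 * tables: lo_t is indexed by the low nibble, hi_t by the high nibble,
 * and the two lookups are XORed.  mask4bit must hold 0x0f in every
 * byte (.L0f0f0f0f below).
 */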
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;	\
	vpandn x, mask4bit, x;	\
	vpsrld $4, x, x;	\
	\
	vpshufb tmp0, lo_t, tmp0;	\
	vpshufb x, hi_t, x;	\
	vpxor tmp0, x, x;

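/*
 * Transpose a 4x4 matrix of 32-bit words spread over x0..x3; each
 * 128-bit lane is transposed independently.  t1 and t2 are clobbered.
 */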
#define transpose_4x4(x0, x1, x2, x3, t1, t2)	\
	vpunpckhdq x1, x0, t2;	\
	vpunpckldq x1, x0, x0;	\
	\
	vpunpckldq x3, x2, t1;	\
	vpunpckhdq x3, x2, x2;	\
	\
	vpunpckhqdq t1, x0, x1;	\
	vpunpcklqdq t1, x0, x0;	\
	\
	vpunpckhqdq x2, t2, x3;	\
	vpunpcklqdq x2, t2, x2;

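/*
 * byteslice_16x16b() transposes the 16x16-byte state held in sixteen
 * registers so that each register ends up holding a single byte
 * position of every block; debyteslice_16x16b() below undoes it.  The
 * two 128-bit lanes are handled independently, so the 32 blocks are
 * processed as two groups of 16.  st0/st1 are memory scratch slots.
 */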
#define byteslice_16x16b(a0, b0, c0, d0,	\
			 a1, b1, c1, d1,	\
			 a2, b2, c2, d2,	\
			 a3, b3, c3, d3,	\
			 st0, st1)	\
	vmovdqu d2, st0;	\
	vmovdqu d3, st1;	\
	transpose_4x4(a0, a1, a2, a3, d2, d3);	\
	transpose_4x4(b0, b1, b2, b3, d2, d3);	\
	vmovdqu st0, d2;	\
	vmovdqu st1, d3;	\
	\
	vmovdqu a0, st0;	\
	vmovdqu a1, st1;	\
	transpose_4x4(c0, c1, c2, c3, a0, a1);	\
	transpose_4x4(d0, d1, d2, d3, a0, a1);	\
	\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;	\
	vpshufb a0, a2, a2;	\
	vpshufb a0, a3, a3;	\
	vpshufb a0, b0, b0;	\
	vpshufb a0, b1, b1;	\
	vpshufb a0, b2, b2;	\
	vpshufb a0, b3, b3;	\
	vpshufb a0, a1, a1;	\
	vpshufb a0, c0, c0;	\
	vpshufb a0, c1, c1;	\
	vpshufb a0, c2, c2;	\
	vpshufb a0, c3, c3;	\
	vpshufb a0, d0, d0;	\
	vpshufb a0, d1, d1;	\
	vpshufb a0, d2, d2;	\
	vpshufb a0, d3, d3;	\
	vmovdqu d3, st1;	\
	vmovdqu st0, d3;	\
	vpshufb a0, d3, a0;	\
	vmovdqu d2, st0;	\
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3);	\
	transpose_4x4(a1, b1, c1, d1, d2, d3);	\
	vmovdqu st0, d2;	\
	vmovdqu st1, d3;	\
	\
	vmovdqu b0, st0;	\
	vmovdqu b1, st1;	\
	transpose_4x4(a2, b2, c2, d2, b0, b1);	\
	transpose_4x4(a3, b3, c3, d3, b0, b1);	\
	vmovdqu st0, b0;	\
	vmovdqu st1, b1;	\
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,	\
			   a1, b1, c1, d1,	\
			   a2, b2, c2, d2,	\
			   a3, b3, c3, d3,	\
			   st0, st1)	\
	vmovdqu d2, st0;	\
	vmovdqu d3, st1;	\
	transpose_4x4(a0, a1, a2, a3, d2, d3);	\
	transpose_4x4(b0, b1, b2, b3, d2, d3);	\
	vmovdqu st0, d2;	\
	vmovdqu st1, d3;	\
	\
	vmovdqu a0, st0;	\
	vmovdqu a1, st1;	\
	transpose_4x4(c0, c1, c2, c3, a0, a1);	\
	transpose_4x4(d0, d1, d2, d3, a0, a1);	\
	\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;	\
	vpshufb a0, a2, a2;	\
	vpshufb a0, a3, a3;	\
	vpshufb a0, b0, b0;	\
	vpshufb a0, b1, b1;	\
	vpshufb a0, b2, b2;	\
	vpshufb a0, b3, b3;	\
	vpshufb a0, a1, a1;	\
	vpshufb a0, c0, c0;	\
	vpshufb a0, c1, c1;	\
	vpshufb a0, c2, c2;	\
	vpshufb a0, c3, c3;	\
	vpshufb a0, d0, d0;	\
	vpshufb a0, d1, d1;	\
	vpshufb a0, d2, d2;	\
	vpshufb a0, d3, d3;	\
	vmovdqu d3, st1;	\
	vmovdqu st0, d3;	\
	vpshufb a0, d3, a0;	\
	vmovdqu d2, st0;	\
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3);	\
	transpose_4x4(c1, d1, a1, b1, d2, d3);	\
	vmovdqu st0, d2;	\
	vmovdqu st1, d3;	\
	\
	vmovdqu b0, st0;	\
	vmovdqu b1, st1;	\
	transpose_4x4(c2, d2, a2, b2, b0, b1);	\
	transpose_4x4(c3, d3, a3, b3, b0, b1);	\
	vmovdqu st0, b0;	\
	vmovdqu st1, b1;	\
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     rio)	\
	vmovdqu (0 * 32)(rio), x0;	\
	vmovdqu (1 * 32)(rio), x1;	\
	vmovdqu (2 * 32)(rio), x2;	\
	vmovdqu (3 * 32)(rio), x3;	\
	vmovdqu (4 * 32)(rio), x4;	\
	vmovdqu (5 * 32)(rio), x5;	\
	vmovdqu (6 * 32)(rio), x6;	\
	vmovdqu (7 * 32)(rio), x7;	\
	vmovdqu (8 * 32)(rio), y0;	\
	vmovdqu (9 * 32)(rio), y1;	\
	vmovdqu (10 * 32)(rio), y2;	\
	vmovdqu (11 * 32)(rio), y3;	\
	vmovdqu (12 * 32)(rio), y4;	\
	vmovdqu (13 * 32)(rio), y5;	\
	vmovdqu (14 * 32)(rio), y6;	\
	vmovdqu (15 * 32)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,	\
		      x4, x5, x6, x7,	\
		      y0, y1, y2, y3,	\
		      y4, y5, y6, y7,	\
		      mem_ab, mem_cd)	\
	byteslice_16x16b(x0, x1, x2, x3,	\
			 x4, x5, x6, x7,	\
			 y0, y1, y2, y3,	\
			 y4, y5, y6, y7,	\
			 (mem_ab), (mem_cd));	\
	\
	vmovdqu x0, 0 * 32(mem_ab);	\
	vmovdqu x1, 1 * 32(mem_ab);	\
	vmovdqu x2, 2 * 32(mem_ab);	\
	vmovdqu x3, 3 * 32(mem_ab);	\
	vmovdqu x4, 4 * 32(mem_ab);	\
	vmovdqu x5, 5 * 32(mem_ab);	\
	vmovdqu x6, 6 * 32(mem_ab);	\
	vmovdqu x7, 7 * 32(mem_ab);	\
	vmovdqu y0, 0 * 32(mem_cd);	\
	vmovdqu y1, 1 * 32(mem_cd);	\
	vmovdqu y2, 2 * 32(mem_cd);	\
	vmovdqu y3, 3 * 32(mem_cd);	\
	vmovdqu y4, 4 * 32(mem_cd);	\
	vmovdqu y5, 5 * 32(mem_cd);	\
	vmovdqu y6, 6 * 32(mem_cd);	\
	vmovdqu y7, 7 * 32(mem_cd);

#define write_output(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem)	\
	vmovdqu x0, 0 * 32(mem);	\
	vmovdqu x1, 1 * 32(mem);	\
	vmovdqu x2, 2 * 32(mem);	\
	vmovdqu x3, 3 * 32(mem);	\
	vmovdqu x4, 4 * 32(mem);	\
	vmovdqu x5, 5 * 32(mem);	\
	vmovdqu x6, 6 * 32(mem);	\
	vmovdqu x7, 7 * 32(mem);	\
	vmovdqu y0, 8 * 32(mem);	\
	vmovdqu y1, 9 * 32(mem);	\
	vmovdqu y2, 10 * 32(mem);	\
	vmovdqu y3, 11 * 32(mem);	\
	vmovdqu y4, 12 * 32(mem);	\
	vmovdqu y5, 13 * 32(mem);	\
	vmovdqu y6, 14 * 32(mem);	\
	vmovdqu y7, 15 * 32(mem);

#define aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, idx)	\
	vmovdqu x0, ((idx + 0) * 32)(mem_tmp);	\
	vmovdqu x1, ((idx + 1) * 32)(mem_tmp);	\
	vmovdqu x2, ((idx + 2) * 32)(mem_tmp);	\
	vmovdqu x3, ((idx + 3) * 32)(mem_tmp);	\
	vmovdqu x4, ((idx + 4) * 32)(mem_tmp);	\
	vmovdqu x5, ((idx + 5) * 32)(mem_tmp);	\
	vmovdqu x6, ((idx + 6) * 32)(mem_tmp);	\
	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, idx)	\
	vmovdqu ((idx + 0) * 32)(mem_tmp), x0;	\
	vmovdqu ((idx + 1) * 32)(mem_tmp), x1;	\
	vmovdqu ((idx + 2) * 32)(mem_tmp), x2;	\
	vmovdqu ((idx + 3) * 32)(mem_tmp), x3;	\
	vmovdqu ((idx + 4) * 32)(mem_tmp), x4;	\
	vmovdqu ((idx + 5) * 32)(mem_tmp), x5;	\
	vmovdqu ((idx + 6) * 32)(mem_tmp), x6;	\
	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;

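/*
 * AddRoundKey for eight byte-sliced registers: each vpbroadcastb
 * replicates one byte of the 16-byte round key (rounds are stored 16
 * bytes apart in rk) across the whole register before it is XORed in.
 */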
#define aria_ark_8way(x0, x1, x2, x3,	\
		      x4, x5, x6, x7,	\
		      t0, rk, idx, round)	\
	/* AddRoundKey */	\
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
	vpxor t0, x0, x0;	\
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
	vpxor t0, x1, x1;	\
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
	vpxor t0, x2, x2;	\
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
	vpxor t0, x3, x3;	\
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
	vpxor t0, x4, x4;	\
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
	vpxor t0, x5, x5;	\
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
	vpxor t0, x6, x6;	\
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
	vpxor t0, x7, x7;

#ifdef CONFIG_AS_GFNI
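/*
 * ARIA substitution layer using GFNI.  S1 (the AES S-box) and S2 are
 * computed directly with vgf2p8affineinvqb and the AES/S2 bit-matrices;
 * the inverse boxes X1 and X2 apply the inverse affine first with
 * vgf2p8affineqb and then use an identity-matrix vgf2p8affineinvqb
 * purely for the GF(2^8) inversion.
 */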
#define aria_sbox_8way_gfni(x0, x1, x2, x3,	\
			    x4, x5, x6, x7,	\
			    t0, t1, t2, t3,	\
			    t4, t5, t6, t7)	\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;	\
	vgf2p8affineinvqb $0, t2, x6, x6;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;	\
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
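
/*
 * AES-NI fallback for the substitution layer.  AESENCLAST/AESDECLAST
 * with an all-zero round key isolate SubBytes/InvSubBytes (giving S1
 * and X1); since those instructions only work on 128-bit registers,
 * each ymm is processed one lane at a time via vextracti128 and
 * vinserti128.  The vpshufb with .Linv_shift_row/.Lshift_row undoes
 * the (Inv)ShiftRows the AES instructions also perform, and
 * filter_8bit applies the combined affine transforms that turn the
 * AES boxes into ARIA's S2 and X2.
 */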
#define aria_sbox_8way(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       t0, t1, t2, t3,	\
		       t4, t5, t6, t7)	\
	vpxor t7, t7, t7;	\
	vpxor t6, t6, t6;	\
	vbroadcasti128 .Linv_shift_row(%rip), t0;	\
	vbroadcasti128 .Lshift_row(%rip), t1;	\
	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
	\
	vextracti128 $1, x0, t6##_x;	\
	vaesenclast t7##_x, x0##_x, x0##_x;	\
	vaesenclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x0, x0;	\
	\
	vextracti128 $1, x4, t6##_x;	\
	vaesenclast t7##_x, x4##_x, x4##_x;	\
	vaesenclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x4, x4;	\
	\
	vextracti128 $1, x1, t6##_x;	\
	vaesenclast t7##_x, x1##_x, x1##_x;	\
	vaesenclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x1, x1;	\
	\
	vextracti128 $1, x5, t6##_x;	\
	vaesenclast t7##_x, x5##_x, x5##_x;	\
	vaesenclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x5, x5;	\
	\
	vextracti128 $1, x2, t6##_x;	\
	vaesdeclast t7##_x, x2##_x, x2##_x;	\
	vaesdeclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x2, x2;	\
	\
	vextracti128 $1, x6, t6##_x;	\
	vaesdeclast t7##_x, x6##_x, x6##_x;	\
	vaesdeclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x6, x6;	\
	\
	vpbroadcastd .L0f0f0f0f(%rip), t6;	\
	\
	/* AES inverse shift rows */	\
	vpshufb t0, x0, x0;	\
	vpshufb t0, x4, x4;	\
	vpshufb t0, x1, x1;	\
	vpshufb t0, x5, x5;	\
	vpshufb t1, x3, x3;	\
	vpshufb t1, x7, x7;	\
	vpshufb t1, x2, x2;	\
	vpshufb t1, x6, x6;	\
	\
	/* affine transformation for S2 */	\
	filter_8bit(x1, t2, t3, t6, t0);	\
	/* affine transformation for S2 */	\
	filter_8bit(x5, t2, t3, t6, t0);	\
	\
	/* affine transformation for X2 */	\
	filter_8bit(x3, t4, t5, t6, t0);	\
	/* affine transformation for X2 */	\
	filter_8bit(x7, t4, t5, t6, t0);	\
	\
	vpxor t6, t6, t6;	\
	vextracti128 $1, x3, t6##_x;	\
	vaesdeclast t7##_x, x3##_x, x3##_x;	\
	vaesdeclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x3, x3;	\
	\
	vextracti128 $1, x7, t6##_x;	\
	vaesdeclast t7##_x, x7##_x, x7##_x;	\
	vaesdeclast t7##_x, t6##_x, t6##_x;	\
	vinserti128 $1, t6##_x, x7, x7;

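/*
 * Diffusion helpers.  In byte-sliced form the 32-bit rotations of the
 * original aria_diff_m (rotr32 by 8 and 16) reduce to picking a
 * different byte register, so the whole step becomes the XOR chains
 * below.
 */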
#define aria_diff_m(x0, x1, x2, x3,	\
		    t0, t1, t2, t3)	\
	/* T = rotr32(X, 8); */	\
	/* X ^= T */	\
	vpxor x0, x3, t0;	\
	vpxor x1, x0, t1;	\
	vpxor x2, x1, t2;	\
	vpxor x3, x2, t3;	\
	/* X = T ^ rotr(X, 16); */	\
	vpxor t2, x0, x0;	\
	vpxor x1, t3, t3;	\
	vpxor t0, x2, x2;	\
	vpxor t1, x3, x1;	\
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7)	\
	/* t1 ^= t2; */	\
	vpxor y0, x4, x4;	\
	vpxor y1, x5, x5;	\
	vpxor y2, x6, x6;	\
	vpxor y3, x7, x7;	\
	\
	/* t2 ^= t3; */	\
	vpxor y4, y0, y0;	\
	vpxor y5, y1, y1;	\
	vpxor y6, y2, y2;	\
	vpxor y7, y3, y3;	\
	\
	/* t0 ^= t1; */	\
	vpxor x4, x0, x0;	\
	vpxor x5, x1, x1;	\
	vpxor x6, x2, x2;	\
	vpxor x7, x3, x3;	\
	\
	/* t3 ^= t1; */	\
	vpxor x4, y4, y4;	\
	vpxor x5, y5, y5;	\
	vpxor x6, y6, y6;	\
	vpxor x7, y7, y7;	\
	\
	/* t2 ^= t0; */	\
	vpxor x0, y0, y0;	\
	vpxor x1, y1, y1;	\
	vpxor x2, y2, y2;	\
	vpxor x3, y3, y3;	\
	\
	/* t1 ^= t2; */	\
	vpxor y0, x4, x4;	\
	vpxor y1, x5, x5;	\
	vpxor y2, x6, x6;	\
	vpxor y3, x7, x7;

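/*
 * aria_fo()/aria_fe() implement one odd/even ARIA round (AddRoundKey,
 * substitution layer, diffusion layer) on sixteen byte-sliced
 * registers, working on one half while the other half is parked in
 * mem_tmp.  aria_ff() is the final round, where the diffusion layer is
 * replaced by a second AddRoundKey with last_round's key.
 */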
#define aria_fe(x0, x1, x2, x3,	\
		x4, x5, x6, x7,	\
		y0, y1, y2, y3,	\
		y4, y5, y6, y7,	\
		mem_tmp, rk, round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
	\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 0);	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7);	\
	/* aria_diff_byte()	\
	 * T3 = ABCD -> BADC	\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB	\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA	\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */	\
	aria_diff_word(x2, x3, x0, x1,	\
		       x7, x6, x5, x4,	\
		       y0, y1, y2, y3,	\
		       y5, y4, y7, y6);	\
	aria_store_state_8way(x3, x2, x1, x0,	\
			      x6, x7, x4, x5,	\
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3,	\
		x4, x5, x6, x7,	\
		y0, y1, y2, y3,	\
		y4, y5, y6, y7,	\
		mem_tmp, rk, round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
	\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 0);	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7);	\
	/* aria_diff_byte()	\
	 * T1 = ABCD -> BADC	\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB	\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA	\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x5, x4, x7, x6,	\
		       y2, y3, y0, y1,	\
		       y7, y6, y5, y4);	\
	aria_store_state_8way(x3, x2, x1, x0,	\
			      x6, x7, x4, x5,	\
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3,	\
		x4, x5, x6, x7,	\
		y0, y1, y2, y3,	\
		y4, y5, y6, y7,	\
		mem_tmp, rk, round, last_round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);	\
	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
	\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);	\
	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);
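
/*
 * GFNI variants of the round macros: identical control flow, only the
 * substitution layer differs.
 */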
#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem_tmp, rk, round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
			    x6, x7, x4, x5,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
	\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
			    x6, x7, x4, x5,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 0);	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7);	\
	/* aria_diff_byte()	\
	 * T3 = ABCD -> BADC	\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB	\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA	\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */	\
	aria_diff_word(x2, x3, x0, x1,	\
		       x7, x6, x5, x4,	\
		       y0, y1, y2, y3,	\
		       y5, y4, y7, y6);	\
	aria_store_state_8way(x3, x2, x1, x0,	\
			      x6, x7, x4, x5,	\
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem_tmp, rk, round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3,	\
			    x4, x5, x6, x7,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
	\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3,	\
			    x4, x5, x6, x7,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 0);	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7);	\
	/* aria_diff_byte()	\
	 * T1 = ABCD -> BADC	\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB	\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA	\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */	\
	aria_diff_word(x0, x1, x2, x3,	\
		       x5, x4, x7, x6,	\
		       y2, y3, y0, y1,	\
		       y7, y6, y5, y4);	\
	aria_store_state_8way(x3, x2, x1, x0,	\
			      x6, x7, x4, x5,	\
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem_tmp, rk, round, last_round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);	\
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
			    x6, x7, x4, x5,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);	\
	\
	aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, 8);	\
	\
	aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, 0);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);	\
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1,	\
			    x6, x7, x4, x5,	\
			    y0, y1, y2, y3,	\
			    y4, y5, y6, y7);	\
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);	\
	\
	aria_load_state_8way(y0, y1, y2, y3,	\
			     y4, y5, y6, y7,	\
			     mem_tmp, 8);
#endif /* CONFIG_AS_GFNI */

.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx)	\
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

#ifdef CONFIG_AS_GFNI
.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%ymm0..%ymm15: byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 0);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 1);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 2);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 3);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 4);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 5);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 6);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 7);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 8);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 9);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 14);
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)

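/*
 * Generate 32 big-endian counter blocks: on return %ymm0..%ymm15 hold
 * all 32 blocks (the first 16 are also stored in the keystream buffer)
 * and the IV at (%r8) has been advanced by 32.  The fast path adds two
 * per 128-bit lane with a vpsubq of the -2 constant; when the low
 * 64-bit counter is about to wrap, the slow path falls back to
 * inc_le128 so the carry propagates into the high qword.
 */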
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	movq 8(%r8), %r11;
	bswapq %r11;

	vbroadcasti128 .Lbswap128_mask(%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	vextracti128 $1, %ymm3, %xmm3;
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

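	/*
	 * The keystream buffer becomes both src and dst for the cipher
	 * pass; the caller's dst/src move to %r10/%r11 for the final
	 * XOR with the input blocks.
	 */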
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)

#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%ymm0..%ymm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
#endif /* CONFIG_AS_GFNI */