/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km	0
#define kr	(12*4*4)
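
/*
 * Key material layout assumed here (matching struct cast6_ctx on the C
 * side): 12 quad-rounds x 4 rounds of 32-bit masking keys Km at offset 0,
 * followed by one byte of rotation key Kr per round (only the low 5 bits
 * are used), hence kr starts at 12*4*4 bytes.
 */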

/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %r15

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


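/*
 * lookup_32bit: take a 32-bit value held in the low half of 'src', index
 * the four CAST s-boxes with its four bytes and fold the results into
 * 'dst' using op1/op2/op3 (xor/add/sub, depending on which round function
 * is being built).  The interleave_op hook (shr_next or dummy) advances
 * the 64-bit source register to its upper 32-bit half in parallel with
 * the last table lookup, so two lanes can be processed back to back.
 */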
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	leaq s1(%rip), RID2; \
	movl (RID2,RID1,4), dst ## d; \
	movzbl src ## bl, RID2d; \
	leaq s2(%rip), RID1; \
	op1 (RID1,RID2,4), dst ## d; \
	shrq $16, src; \
	movzbl src ## bh, RID1d; \
	leaq s3(%rip), RID2; \
	op2 (RID2,RID1,4), dst ## d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	leaq s4(%rip), RID1; \
	op3 (RID1,RID2,4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16, reg;

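/*
 * F_head mixes the masking key RKM into one 128-bit vector (four blocks'
 * worth of one 32-bit word) with op0 (add/xor/sub for f1/f2/f3) and
 * rotates each lane left by the 5-bit rotation key (RKRF holds the left
 * shift count, RKRR the complementary right shift count), then spills the
 * lanes to general purpose registers.  F_tail performs the four s-box
 * lookups per lane in scalar code and reassembles the vector.  F_2 runs
 * the whole round function on both register groups (8 blocks) and xors
 * the result into the destination words.
 */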
#define F_head(a, x, gi1, gi2, op0) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vmovq x, gi1; \
	vpextrq $1, x, gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32, RFS1; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

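/*
 * get_round_keys broadcasts the 32-bit masking key Km[nn] from the context
 * into all lanes of RKM and derives the shift counts for the key-dependent
 * rotation from the lowest byte of RKR: RKRF = Kr & 0x1f (left shift),
 * RKRR = 32 - RKRF (right shift).  RKR is then shifted down one byte so
 * the next call sees the next round's rotation key.
 */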
#define get_round_keys(nn) \
	vbroadcastss (km+(4*(nn)))(CTX), RKM; \
	vpand R1ST, RKR, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	vpsrldq $1, RKR, RKR;

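/*
 * Q is the forward CAST6 quad-round and QBAR its inverse: four rounds that
 * xor f1(D) into C, f2(C) into B, f3(B) into A and f1(A) into D, either in
 * that order (Q) or undone in reverse order (QBAR).  n is the quad-round
 * index, so round keys 4*n+0 .. 4*n+3 are consumed.
 */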
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

#define shuffle(mask) \
	vpshufb mask(%rip), RKR, RKR;

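/*
 * preload_rkr loads the 16 rotation-key bytes covering the next four
 * quad-rounds into RKR.  Because get_round_keys always consumes the lowest
 * byte of RKR, the optional shuffle mask pre-arranges the bytes in exactly
 * the order the following Q/QBAR sequence will use them (QBAR walks its
 * four round keys backwards, and decryption also walks the quad-rounds
 * backwards).
 */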
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss .L16_mask(%rip), RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor (kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);

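/*
 * transpose_4x4 transposes four xmm registers viewed as a 4x4 matrix of
 * 32-bit words.  inpack_blocks byte-swaps each word (CAST6 operates on
 * big-endian words) and transposes, so that afterwards RA/RB/RC/RD each
 * hold the A/B/C/D word of four different blocks; outunpack_blocks is the
 * inverse transform applied before the results are stored.
 */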
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;

.section .rodata.cst16, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section .rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section .rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section .rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */
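	/*
	 * CAST6 encryption: 12 quad-rounds over 8 blocks in parallel, the
	 * first six forward (Q), the last six inverse (QBAR), with a fresh
	 * batch of rotation keys preloaded every four quad-rounds.
	 */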

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */
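	/*
	 * Decryption runs the same quad-rounds with the key schedule walked
	 * backwards: what encryption computed with Q is undone with QBAR
	 * here and vice versa, so the rotation-key shuffle masks reverse
	 * both the per-round and the per-quad-round order.
	 */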

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask(%rip), RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

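/*
 * The functions below are the entry points called from the C glue code.
 * They rely on the load_8way/store_8way/store_cbc_8way helpers pulled in
 * from glue_helper-asm-avx.S: eight consecutive 16-byte blocks are loaded,
 * run through the 8-way core above, and stored; store_cbc_8way additionally
 * xors the preceding ciphertext blocks into the output for CBC chaining.
 */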
SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)