/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 3-way parallel algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */
7
8#include <linux/linkage.h>
9#include <linux/cfi_types.h>
10
.file "twofish-x86_64-asm-3way.S"
.text

/*
 * Byte offsets into the crypto context that CTX points to.
 *
 *   s0..s3  bases of four lookup tables, 1024 bytes apart; do16bit_ror
 *           indexes them with a byte value scaled by 4
 *   w       base of the 32-bit words xored into the data by inpack3 and
 *           outunpack3
 *   k       base of the 32-bit words added at the end of every round
 *
 * NOTE(review): these values have to mirror the layout of the C-side
 * context structure, which is not visible from this file -- confirm
 * against its definition before changing either side.
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128
21
/**********************************************************************
  3-way twofish: register and stack-slot aliases
 **********************************************************************/

/* Pointer operands. */
#define CTX	%rdi		/* crypto context: base for s0..s3, w and k */
#define RIO	%rdx		/* data pointer used by inpack3/outunpack3 */

/*
 * "ab" half of each of the three lanes.  Views of the same register:
 * 64-bit, 32-bit (d), bits 0-7 (bl) and bits 8-15 (bh).
 */
#define RAB0	%rax
#define RAB0d	%eax
#define RAB0bl	%al
#define RAB0bh	%ah

#define RAB1	%rbx
#define RAB1d	%ebx
#define RAB1bl	%bl
#define RAB1bh	%bh

#define RAB2	%rcx
#define RAB2d	%ecx
#define RAB2bl	%cl
#define RAB2bh	%ch

/*
 * "cd" half of each lane.  Before and after the rounds it lives in a
 * register (RCDn).  push_cd() pushes RCD2, RCD1, RCD0 in that order, which
 * puts lane n at the CDn stack slot for the duration of the rounds.
 */
#define RCD0	%r8
#define RCD1	%r9
#define RCD2	%r10

#define CD0	0x0(%rsp)
#define CD1	0x8(%rsp)
#define CD2	0x10(%rsp)

/*
 * Per-lane accumulators, used only during the rounds.  RXn is the same
 * register as RCDn, so the cd halves must already be on the stack
 * (push_cd) when the rounds start.
 */
#define RX0	%r8
#define RX0d	%r8d
#define RX1	%r9
#define RX1d	%r9d
#define RX2	%r10
#define RX2d	%r10d

#define RY0	%r11
#define RY0d	%r11d
#define RY1	%r12
#define RY1d	%r12d
#define RY2	%r13
#define RY2d	%r13d

/*
 * Scratch registers for the rounds.  RT0 is the same register as RIO, and
 * RT1 is %rsi, the register in which the entry points receive dst; both
 * pointers are therefore overwritten while the rounds run.
 */
#define RT0	%rdx
#define RT0d	%edx

#define RT1	%rsi
#define RT1d	%esi
#define RT1bl	%sil
77
/*
 * do16bit_ror - two table lookups keyed by the low 16 bits of a register
 *
 *   nbits           rotate count applied to src after its bytes are read
 *   op_lo, op_hi    instruction stems (mov or xor) used to fold the lookup
 *                   for the low byte and for the second byte into acc; the
 *                   "l" (32-bit) suffix is appended here
 *   tab_lo, tab_hi  table offsets inside the context for those two bytes
 *   idx_hi, idx_lo  registers that receive the zero-extended second and
 *                   low byte; both are clobbered
 *   src             64-bit source register; rotated right by nbits
 *   acc             register whose 32-bit view accumulates the result
 *
 * idx_lo may name the same register as acc (g1g2_3 does this in its first
 * stage): idx_lo is consumed by the first lookup, which is also the first
 * instruction that writes acc.
 */
#define do16bit_ror(nbits, op_lo, op_hi, tab_lo, tab_hi, \
		    idx_hi, idx_lo, src, acc) \
	movzbl src ## bl,			idx_lo ## d; \
	movzbl src ## bh,			idx_hi ## d; \
	rorq $(nbits),				src; \
	op_lo ## l tab_lo(CTX, idx_lo, 4),	acc ## d; \
	op_hi ## l tab_hi(CTX, idx_hi, 4),	acc ## d;
84
/*
 * swap_ab_with_cd - exchange a register with a second operand
 *
 *   reg      register operand
 *   slot     the operand it is exchanged with (g1g2_3 passes a CDn stack
 *            slot here)
 *   scratch  register used to hold the old slot value; clobbered
 */
#define swap_ab_with_cd(reg, slot, scratch) \
	movq slot,		scratch; \
	movq reg,		slot; \
	movq scratch,		reg;
89
/*
 * Combined G1 & G2 function for all three lanes.  Reordered with the help
 * of rotates so that the plain moves come first.
 *
 *   cur        name stem of the registers holding the half that is read
 *              (cur ## 0..2)
 *   oth        name stem of the operands holding the other half
 *              (oth ## 0..2); swapped with cur at the end
 *   xt0..xt3   table offsets used for the xa accumulators
 *   yt0..yt3   table offsets used for the ya accumulators
 *   xa, ya     name stems of the accumulator registers (xa ## 0..2, ...)
 *
 * Per lane, four do16bit_ror steps walk through the eight bytes of cur:
 * bytes 0,1 go through xt0,xt1 and bytes 2,3 through xt2,xt3 into xa;
 * bytes 4,5 go through yt1,yt2 and bytes 6,7 through yt3,yt0 into ya.
 * The rotate counts add up to 128 (32 + 48 + 32 + 16), so cur holds its
 * original value again when it is swapped with oth.
 *
 * Clobbers RT0 and RT1.
 */
#define g1g2_3(cur, oth, xt0, xt1, xt2, xt3, yt0, yt1, yt2, yt3, xa, ya) \
	/* first half: the lookups that start each accumulator (mov) */ \
	do16bit_ror(32, mov, xor, xt0, xt1, RT0, xa ## 0, cur ## 0, xa ## 0); \
	do16bit_ror(48, mov, xor, yt1, yt2, RT0, ya ## 0, cur ## 0, ya ## 0); \
	\
	do16bit_ror(32, mov, xor, xt0, xt1, RT0, xa ## 1, cur ## 1, xa ## 1); \
	do16bit_ror(48, mov, xor, yt1, yt2, RT0, ya ## 1, cur ## 1, ya ## 1); \
	\
	do16bit_ror(32, mov, xor, xt0, xt1, RT0, xa ## 2, cur ## 2, xa ## 2); \
	do16bit_ror(48, mov, xor, yt1, yt2, RT0, ya ## 2, cur ## 2, ya ## 2); \
	\
	/* second half: remaining lookups (xor), then exchange the halves */ \
	do16bit_ror(32, xor, xor, xt2, xt3, RT0, RT1, cur ## 0, xa ## 0); \
	do16bit_ror(16, xor, xor, yt3, yt0, RT0, RT1, cur ## 0, ya ## 0); \
	swap_ab_with_cd(cur ## 0, oth ## 0, RT0); \
	\
	do16bit_ror(32, xor, xor, xt2, xt3, RT0, RT1, cur ## 1, xa ## 1); \
	do16bit_ror(16, xor, xor, yt3, yt0, RT0, RT1, cur ## 1, ya ## 1); \
	swap_ab_with_cd(cur ## 1, oth ## 1, RT0); \
	\
	do16bit_ror(32, xor, xor, xt2, xt3, RT0, RT1, cur ## 2, xa ## 2); \
	do16bit_ror(16, xor, xor, yt3, yt0, RT0, RT1, cur ## 2, ya ## 2); \
	swap_ab_with_cd(cur ## 2, oth ## 2, RT0);
117
/*
 * enc_round_end - mix the two accumulators into one 64-bit half
 *
 *   reg   64-bit register to update (lo = bits 0-31, hi = bits 32-63)
 *   fx    first accumulator register; clobbered
 *   fy    second accumulator register; clobbered
 *   rnd   round number, selecting the words at k+8*rnd and k+8*rnd+4
 *
 * With X = fx + fy + k[2*rnd] and Y = fx + 2*fy + k[2*rnd+1] (32-bit
 * arithmetic), reg becomes
 *	lo = ror32(lo ^ X, 1)
 *	hi = rol32(hi, 1) ^ Y
 * The final orq relies on the upper 32 bits of fx being zero, which holds
 * because fx was last written by 32-bit instructions.
 */
#define enc_round_end(reg, fx, fy, rnd) \
	addl fy ## d,				fx ## d; \
	addl fx ## d,				fy ## d; \
	addl k+4*(2*(rnd))(CTX),		fx ## d; \
	xorl reg ## d,				fx ## d; \
	addl k+4*(2*(rnd)+1)(CTX),		fy ## d; \
	shrq $32,				reg; \
	roll $1,				reg ## d; \
	xorl fy ## d,				reg ## d; \
	shlq $32,				reg; \
	rorl $1,				fx ## d; \
	orq fx,					reg;
130
/*
 * dec_round_end - decryption counterpart of enc_round_end
 *
 *   reg   64-bit register to update (lo = bits 0-31, hi = bits 32-63)
 *   fx    first accumulator register; clobbered
 *   fy    second accumulator register; clobbered
 *   rnd   round number, selecting the words at k+8*rnd and k+8*rnd+4
 *
 * With X = fx + fy + k[2*rnd] and Y = fx + 2*fy + k[2*rnd+1] (32-bit
 * arithmetic), reg becomes
 *	lo = ror32(lo ^ Y, 1)
 *	hi = rol32(hi, 1) ^ X
 * The final orq relies on the upper 32 bits of fy being zero, which holds
 * because fy was last written by 32-bit instructions.
 */
#define dec_round_end(reg, fx, fy, rnd) \
	addl fy ## d,				fx ## d; \
	addl fx ## d,				fy ## d; \
	addl k+4*(2*(rnd))(CTX),		fx ## d; \
	addl k+4*(2*(rnd)+1)(CTX),		fy ## d; \
	xorl reg ## d,				fy ## d; \
	shrq $32,				reg; \
	roll $1,				reg ## d; \
	xorl fx ## d,				reg ## d; \
	shlq $32,				reg; \
	rorl $1,				fy ## d; \
	orq fy,					reg;
143
/*
 * encrypt_round3 - one encryption round on all three lanes
 *
 *   cur   name stem of the lane registers (cur ## 0..2)
 *   oth   name stem of the operands holding the other half of each lane
 *   rnd   round number handed to enc_round_end
 *
 * g1g2_3 fills RX0..2 / RY0..2 from the current register contents and then
 * swaps the registers with oth, so enc_round_end is applied to the half
 * that was in oth when the round started.
 */
#define encrypt_round3(cur, oth, rnd) \
	g1g2_3(cur, oth, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(cur ## 0, RX0, RY0, rnd); \
	enc_round_end(cur ## 1, RX1, RY1, rnd); \
	enc_round_end(cur ## 2, RX2, RY2, rnd);
150
/*
 * decrypt_round3 - one decryption round on all three lanes
 *
 *   cur   name stem of the lane registers (cur ## 0..2)
 *   oth   name stem of the operands holding the other half of each lane
 *   rnd   round number handed to dec_round_end
 *
 * Compared with encrypt_round3, the table offsets are passed to g1g2_3 in a
 * rotated order and the RY/RX accumulator stems are exchanged.  g1g2_3
 * swaps the registers with oth before dec_round_end runs, so the update is
 * applied to the half that was in oth when the round started.
 */
#define decrypt_round3(cur, oth, rnd) \
	g1g2_3(cur, oth, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(cur ## 0, RX0, RY0, rnd); \
	dec_round_end(cur ## 1, RX1, RY1, rnd); \
	dec_round_end(cur ## 2, RX2, RY2, rnd);
157
/*
 * encrypt_cycle3 - two consecutive encryption rounds (2*n and 2*n+1)
 *
 *   ab   name stem of the lane registers
 *   cd   name stem of the operands holding the other half of each lane
 *   n    cycle number
 *
 * Each round swaps the two halves, so after one cycle the registers hold
 * the same half as before it.
 *
 * n is parenthesized so that an expression argument (for example 1+1) is
 * evaluated as a unit before the multiplication.
 */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);
161
/*
 * decrypt_cycle3 - two consecutive decryption rounds (2*n+1, then 2*n)
 *
 *   ba   name stem of the lane registers
 *   dc   name stem of the operands holding the other half of each lane
 *   n    cycle number
 *
 * Each round swaps the two halves, so after one cycle the registers hold
 * the same half as before it.
 *
 * n is parenthesized so that an expression argument (for example 1+1) is
 * evaluated as a unit before the multiplication.
 */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));
165
/*
 * push_cd - spill the three cd registers to the stack.  RCD0 is pushed
 * last, so afterwards lane n is found at the CDn slot.
 */
#define push_cd() \
	pushq RCD2;		/* ends up at CD2 */ \
	pushq RCD1;		/* ends up at CD1 */ \
	pushq RCD0;		/* ends up at CD0 */
170
/*
 * pop_cd - reload the three cd registers from the stack, in the reverse
 * order of push_cd().
 */
#define pop_cd() \
	popq RCD0;		/* from CD0 */ \
	popq RCD1;		/* from CD1 */ \
	popq RCD2;		/* from CD2 */
175
/*
 * inpack3 - load one 64-bit half of three consecutive 16-byte blocks
 *
 *   in   register holding the input pointer
 *   n    offset of the half inside a block, in 32-bit words
 *   xy   name stem of the destination registers (xy ## 0..2)
 *   m    index, in 32-bit words from w, of the 64-bit value xored into
 *        each loaded half; the same value is used for all three blocks
 *
 * m is parenthesized like n, so that an expression argument is evaluated
 * as a unit before the multiplication.
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;
185
/*
 * outunpack3 - write one 64-bit half of three consecutive 16-byte blocks
 *
 *   op   instruction stem used for the store (callers pass mov to store,
 *        or xor to xor into the destination); the "q" suffix is appended
 *   out  register holding the output pointer
 *   n    offset of the half inside a block, in 32-bit words
 *   xy   name stem of the source registers (xy ## 0..2); modified
 *   m    index, in 32-bit words from w, of the 64-bit value xored into
 *        each half before it is written
 *
 * m is parenthesized like n, so that an expression argument is evaluated
 * as a unit before the multiplication.
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);
195
/*
 * inpack_enc3 - load three blocks from RIO for encryption: the first eight
 * bytes of each block go to RAB0..2 (xored with the words at w+0), the
 * second eight bytes to RCD0..2 (xored with the words at w+8).
 */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0);	/* bytes 0..7 of each block */ \
	inpack3(RIO, 2, RCD, 2);	/* bytes 8..15 of each block */
199
/*
 * outunpack_enc3 - write the three encrypted blocks through RIO
 *
 *   store_op   mov to store the result, xor to xor it into the destination
 *
 * RAB0..2 are written to the second eight bytes of each block (xored with
 * the words at w+24), RCD0..2 to the first eight bytes (xored with the
 * words at w+16).
 */
#define outunpack_enc3(store_op) \
	outunpack3(store_op, RIO, 2, RAB, 6);	/* bytes 8..15 */ \
	outunpack3(store_op, RIO, 0, RCD, 4);	/* bytes 0..7 */
203
/*
 * inpack_dec3 - load three blocks from RIO for decryption: the first eight
 * bytes of each block go to RAB0..2 (xored with the words at w+16), the
 * second eight bytes to RCD0..2 (xored with the words at w+24).  Every
 * loaded register is then rotated by 32 bits, which exchanges its two
 * 32-bit words.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4);	/* bytes 0..7 of each block */ \
	rorq $32,		RAB0; \
	rorq $32,		RAB1; \
	rorq $32,		RAB2; \
	inpack3(RIO, 2, RCD, 6);	/* bytes 8..15 of each block */ \
	rorq $32,		RCD0; \
	rorq $32,		RCD1; \
	rorq $32,		RCD2;
213
/*
 * outunpack_dec3 - store the three decrypted blocks through RIO.  Every
 * register is first rotated by 32 bits, which exchanges its two 32-bit
 * words.  RCD0..2 are stored to the first eight bytes of each block (xored
 * with the words at w+0), RAB0..2 to the second eight bytes (xored with the
 * words at w+8).
 */
#define outunpack_dec3() \
	rorq $32,		RCD0; \
	rorq $32,		RCD1; \
	rorq $32,		RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0);	/* bytes 0..7 of each block */ \
	rorq $32,		RAB0; \
	rorq $32,		RAB1; \
	rorq $32,		RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);	/* bytes 8..15 of each block */
223
/*
 * __twofish_enc_blk_3way - encrypt three consecutive 16-byte blocks
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src, RIO
 *	%rcx: bool, if true: xor output
 *
 * %rbx, %r12 and %r13 are saved on entry and restored before returning.
 * Only the low byte of the bool argument is tested.
 */
SYM_TYPED_FUNC_START(__twofish_enc_blk_3way)
	pushq %r13
	pushq %r12
	pushq %rbx

	/* %rcx doubles as RAB2 and %rsi as RT1, so park both arguments */
	pushq %rcx			/* bool xor */
	pushq %rsi			/* dst */

	inpack_enc3()

	/* the cd halves stay in the CD0..CD2 stack slots during the rounds */
	push_cd()
	encrypt_cycle3(RAB, CD, 0)
	encrypt_cycle3(RAB, CD, 1)
	encrypt_cycle3(RAB, CD, 2)
	encrypt_cycle3(RAB, CD, 3)
	encrypt_cycle3(RAB, CD, 4)
	encrypt_cycle3(RAB, CD, 5)
	encrypt_cycle3(RAB, CD, 6)
	encrypt_cycle3(RAB, CD, 7)
	pop_cd()

	/* from here on RIO points at the destination */
	popq RIO			/* dst */
	popq RT1			/* bool xor */

	testb RT1bl, RT1bl
	jnz .L__enc_blk3_xor_out

	/* plain store */
	outunpack_enc3(mov)

	popq %rbx
	popq %r12
	popq %r13
	RET

.L__enc_blk3_xor_out:
	/* xor the result into what dst already holds */
	outunpack_enc3(xor)

	popq %rbx
	popq %r12
	popq %r13
	RET
SYM_FUNC_END(__twofish_enc_blk_3way)
272
/*
 * twofish_dec_blk_3way - decrypt three consecutive 16-byte blocks
 *
 * input:
 *	%rdi: ctx, CTX
 *	%rsi: dst
 *	%rdx: src, RIO
 *
 * %rbx, %r12 and %r13 are saved on entry and restored before returning.
 */
SYM_TYPED_FUNC_START(twofish_dec_blk_3way)
	pushq %r13
	pushq %r12
	pushq %rbx

	/* %rsi doubles as RT1 during the rounds, so park the argument */
	pushq %rsi			/* dst */

	inpack_dec3()

	/* the cd halves stay in the CD0..CD2 stack slots during the rounds */
	push_cd()
	decrypt_cycle3(RAB, CD, 7)
	decrypt_cycle3(RAB, CD, 6)
	decrypt_cycle3(RAB, CD, 5)
	decrypt_cycle3(RAB, CD, 4)
	decrypt_cycle3(RAB, CD, 3)
	decrypt_cycle3(RAB, CD, 2)
	decrypt_cycle3(RAB, CD, 1)
	decrypt_cycle3(RAB, CD, 0)
	pop_cd()

	/* from here on RIO points at the destination */
	popq RIO			/* dst */

	outunpack_dec3()

	popq %rbx
	popq %r12
	popq %r13
	RET
SYM_FUNC_END(twofish_dec_blk_3way)
307

/* Source: linux/arch/x86/crypto/twofish-x86_64-asm_64-3way.S */