/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

#define a_offset 0
#define b_offset 4
#define c_offset 8
#define d_offset 12

/* Layout of the crypto context struct */

#define s0 0 /* S0 array: 256 words (S1-S3 likewise) */
#define s1 1024 /* S1 array */
#define s2 2048 /* S2 array */
#define s3 3072 /* S3 array */
#define w 4096 /* 8 whitening keys (words) */
#define k 4128 /* round keys 1-32 (words) */
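
/*
 * These offsets mirror the layout of struct twofish_ctx (see
 * include/crypto/twofish.h); as a reference sketch:
 *
 *	struct twofish_ctx {
 *		u32 s[4][256];	key-dependent S-box tables (4 KiB)
 *		u32 w[8];	input/output whitening keys
 *		u32 k[32];	round keys
 *	};
 */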

/* define a few register aliases to allow macro substitution */

#define R0 %rax
#define R0D %eax
#define R0B %al
#define R0H %ah

#define R1 %rbx
#define R1D %ebx
#define R1B %bl
#define R1H %bh

#define R2 %rcx
#define R2D %ecx
#define R2B %cl
#define R2H %ch

#define R3 %rdx
#define R3D %edx
#define R3B %dl
#define R3H %dh
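
/*
 * Inside the round macros the "##" token pasting selects a
 * sub-register from these aliases: with a = R0, "a ## D" expands
 * to R0D (%eax), "a ## B" to R0B (%al) and "a ## H" to R0H (%ah).
 */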

/* performs input whitening (XOR with w[0..3]) */
#define input_whitening(src,context,offset)\
	xor w+offset(context), src;

/* performs output whitening (XOR with w[4..7]) */
#define output_whitening(src,context,offset)\
	xor w+16+offset(context), src;
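
/*
 * Example expansion (for illustration only):
 * input_whitening(R1,%r11,a_offset) becomes "xor w+0(%r11), R1",
 * XORing one 64-bit half of the block with two whitening words
 * (w[0] and w[1]) at once.
 */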

/*
 * a: input register containing a (already rotated left 16)
 * b: input register containing b
 * c: input register containing c
 * d: input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 */
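/*
 * Reference sketch of the round in C-like pseudocode, with the
 * fused rotations written out (g() is the lookup through the
 * key-dependent tables s0..s3, k[] are the round keys; the macro's
 * round argument is the byte offset 8*r into k[]):
 *
 *	t0 = g(a);				// built in %r9d
 *	t1 = g(rol32(b, 8));			// built in %r8d
 *	c  = ror32(c ^ (t0 + t1 + k[2*r]), 1);
 *	d  = rol32(d, 1) ^ (t0 + 2*t1 + k[2*r + 1]);
 *
 * The rol $15 on c below is that ror $1 fused with the rol $16 the
 * next round expects of its 'a' input; likewise b leaves the round
 * rotated left by 1, ready to serve as the next round's 'd'.
 */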
#define encrypt_round(a,b,c,d,round)\
	movzx b ## B, %edi;\
	mov s1(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	mov s2(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	ror $16, b ## D;\
	xor s2(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	ror $16, a ## D;\
	xor s3(%r11,%rdi,4),%r9d;\
	movzx b ## B, %edi;\
	xor s3(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	xor (%r11,%rdi,4), %r9d;\
	movzx b ## H, %edi;\
	ror $15, b ## D;\
	xor (%r11,%rdi,4), %r8d;\
	movzx a ## H, %edi;\
	xor s1(%r11,%rdi,4),%r9d;\
	add %r8d, %r9d;\
	add %r9d, %r8d;\
	add k+round(%r11), %r9d;\
	xor %r9d, c ## D;\
	rol $15, c ## D;\
	add k+4+round(%r11),%r8d;\
	xor %r8d, d ## D;

/*
 * a: input register containing a (already rotated left 16)
 * b: input register containing b
 * c: input register containing c
 * d: input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round, a and b are packed for the output whitening
 */
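/*
 * In addition to the normal round (see the sketch above), this
 * packs %r10 = ((u64)b << 32) | a, i.e. the first output quadword,
 * for the caller to whiten and store; c takes a plain ror $1 since
 * no following round needs the fused rol $16.
 */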
#define encrypt_last_round(a,b,c,d,round)\
	mov b ## D, %r10d;\
	shl $32, %r10;\
	movzx b ## B, %edi;\
	mov s1(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	mov s2(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	ror $16, b ## D;\
	xor s2(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	ror $16, a ## D;\
	xor s3(%r11,%rdi,4),%r9d;\
	movzx b ## B, %edi;\
	xor s3(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	xor (%r11,%rdi,4), %r9d;\
	xor a, %r10;\
	movzx b ## H, %edi;\
	xor (%r11,%rdi,4), %r8d;\
	movzx a ## H, %edi;\
	xor s1(%r11,%rdi,4),%r9d;\
	add %r8d, %r9d;\
	add %r9d, %r8d;\
	add k+round(%r11), %r9d;\
	xor %r9d, c ## D;\
	ror $1, c ## D;\
	add k+4+round(%r11),%r8d;\
	xor %r8d, d ## D

/*
 * a: input register containing a
 * b: input register containing b (already rotated left 16)
 * c: input register containing c (already rol $1)
 * d: input register containing d
 * operations on a and b are interleaved to increase performance
 */
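/*
 * Reference sketch of the inverse round in C-like pseudocode,
 * rotations written out (same conventions as the encrypt sketch):
 *
 *	t0 = g(a);				// built in %r9d
 *	t1 = g(rol32(b, 8));			// built in %r8d
 *	c  = rol32(c, 1) ^ (t0 + t1 + k[2*r]);
 *	d  = ror32(d ^ (t0 + 2*t1 + k[2*r + 1]), 1);
 *
 * The rol $15 on d below fuses the ror $1 with the rol $16 the
 * next round expects of its 'b' input; a leaves the round rotated
 * left by 1, ready to serve as the next round's 'c'.
 */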
#define decrypt_round(a,b,c,d,round)\
	movzx a ## B, %edi;\
	mov (%r11,%rdi,4), %r9d;\
	movzx b ## B, %edi;\
	mov s3(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	ror $16, a ## D;\
	xor s1(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	ror $16, b ## D;\
	xor (%r11,%rdi,4), %r8d;\
	movzx a ## B, %edi;\
	xor s2(%r11,%rdi,4),%r9d;\
	movzx b ## B, %edi;\
	xor s1(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	ror $15, a ## D;\
	xor s3(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	xor s2(%r11,%rdi,4),%r8d;\
	add %r8d, %r9d;\
	add %r9d, %r8d;\
	add k+round(%r11), %r9d;\
	xor %r9d, c ## D;\
	add k+4+round(%r11),%r8d;\
	xor %r8d, d ## D;\
	rol $15, d ## D;

/*
 * a: input register containing a
 * b: input register containing b (already rotated left 16)
 * c: input register containing c (already rol $1)
 * d: input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round, a and b are packed for the output whitening
 */
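/*
 * As in encrypt_last_round, %r10 is packed with ((u64)b << 32) | a
 * once b has been rotated back to its natural orientation; the
 * "xor a, %r10" relies on the upper half of a already being zero
 * (all round updates are 32-bit operations). d takes the plain
 * ror $1.
 */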
#define decrypt_last_round(a,b,c,d,round)\
	movzx a ## B, %edi;\
	mov (%r11,%rdi,4), %r9d;\
	movzx b ## B, %edi;\
	mov s3(%r11,%rdi,4),%r8d;\
	movzx b ## H, %edi;\
	ror $16, b ## D;\
	xor (%r11,%rdi,4), %r8d;\
	movzx a ## H, %edi;\
	mov b ## D, %r10d;\
	shl $32, %r10;\
	xor a, %r10;\
	ror $16, a ## D;\
	xor s1(%r11,%rdi,4),%r9d;\
	movzx b ## B, %edi;\
	xor s1(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	xor s2(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	xor s2(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	xor s3(%r11,%rdi,4),%r9d;\
	add %r8d, %r9d;\
	add %r9d, %r8d;\
	add k+round(%r11), %r9d;\
	xor %r9d, c ## D;\
	add k+4+round(%r11),%r8d;\
	xor %r8d, d ## D;\
	ror $1, d ## D;

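/*
 * twofish_enc_blk - encrypt one 16-byte block.
 * Expected C-side declaration (sketch; the actual prototype lives
 * in the glue code):
 *
 *	asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx,
 *					u8 *dst, const u8 *src);
 *
 * The $1 left in %eax at the end is not used by callers that
 * declare the function void.
 */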
SYM_FUNC_START(twofish_enc_blk)
	pushq R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* the ctx address is moved to %r11 to free %rdi: %rdi can be
	   encoded without a REX prefix, which the 8-bit high-byte
	   (x ## H) operations require of their other operand */
	mov %rdi, %r11

	movq (R3), R1
	movq 8(R3), R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
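
	/* split the two quadwords into the four working words and
	   pre-rotate them into the form encrypt_round expects:
	   a (R0) rotated left 16, d (R3) already rotated left 1 */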
	mov R1D, R0D
	rol $16, R0D
	shr $32, R1
	mov R3D, R2D
	shr $32, R3
	rol $1, R3D

	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,1*8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

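	/* encrypt_last_round left the first output quadword (b:a,
	   still unwhitened) packed in %r10 */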
	output_whitening(%r10,%r11,a_offset)
	movq %r10, (%rsi)

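	/* pack c (R0) and d (R1) into the second output quadword;
	   the 32-bit round ops left their upper halves zeroed, so
	   shl + xor suffices */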
	shl $32, R1
	xor R0, R1

	output_whitening(R1,%r11,c_offset)
	movq R1, 8(%rsi)

	popq R1
	movl $1,%eax
	RET
SYM_FUNC_END(twofish_enc_blk)

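/*
 * twofish_dec_blk - decrypt one 16-byte block.
 * Expected C-side declaration (sketch, mirroring the encrypt side):
 *
 *	asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx,
 *					u8 *dst, const u8 *src);
 */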
SYM_FUNC_START(twofish_dec_blk)
	pushq R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* the ctx address is moved to %r11 to free %rdi: %rdi can be
	   encoded without a REX prefix, which the 8-bit high-byte
	   (x ## H) operations require of their other operand */
	mov %rdi, %r11

	movq (R3), R1
	movq 8(R3), R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
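
	/* split into the four working words and pre-rotate them into
	   the form decrypt_round expects: b (R1) rotated left 16,
	   c (R2) already rotated left 1 */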
	mov R1D, R0D
	shr $32, R1
	rol $16, R1D
	mov R3D, R2D
	shr $32, R3
	rol $1, R2D

	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

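	/* recombine and store exactly as in the encrypt path, but
	   undo the *input* whitening (w[0..3]) here */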
	input_whitening(%r10,%r11,a_offset)
	movq %r10, (%rsi)

	shl $32, R1
	xor R0, R1

	input_whitening(R1,%r11,c_offset)
	movq R1, 8(%rsi)

	popq R1
	movl $1,%eax
	RET
SYM_FUNC_END(twofish_dec_blk)