/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

#define a_offset 0
#define b_offset 4
#define c_offset 8
#define d_offset 12

/* Layout of the crypto context struct */

#define s0 0 /* S0 array: 256 words (S1-S3 likewise) */
#define s1 1024 /* S1 array */
#define s2 2048 /* S2 array */
#define s3 3072 /* S3 array */
#define w 4096 /* 8 whitening keys (words) */
#define k 4128 /* round keys 1-32 (words) */
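
/*
 * These offsets mirror the layout of struct twofish_ctx (see
 * include/crypto/twofish.h); as a reference sketch:
 *
 *	struct twofish_ctx {
 *		u32 s[4][256];	key-dependent S-box tables (4 KiB)
 *		u32 w[8];	input/output whitening keys
 *		u32 k[32];	round keys
 *	};
 */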

/* define a few register aliases to allow macro substitution */

#define R0 %rax
#define R0D %eax
#define R0B %al
#define R0H %ah

#define R1 %rbx
#define R1D %ebx
#define R1B %bl
#define R1H %bh

#define R2 %rcx
#define R2D %ecx
#define R2B %cl
#define R2H %ch

#define R3 %rdx
#define R3D %edx
#define R3B %dl
#define R3H %dh
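
/*
 * Inside the round macros the "##" token pasting selects a
 * sub-register from these aliases: with a = R0, "a ## D" expands
 * to R0D (%eax), "a ## B" to R0B (%al) and "a ## H" to R0H (%ah).
 */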

/* performs input whitening (XOR with w[0..3]) */
#define input_whitening(src,context,offset)\
	xor w+offset(context), src;

/* performs output whitening (XOR with w[4..7]) */
#define output_whitening(src,context,offset)\
	xor w+16+offset(context), src;
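
/*
 * Example expansion (for illustration only):
 * input_whitening(R1,%r11,a_offset) becomes "xor w+0(%r11), R1",
 * XORing one 64-bit half of the block with two whitening words
 * (w[0] and w[1]) at once.
 */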

/*
 * a: input register containing a (already rotated left 16)
 * b: input register containing b
 * c: input register containing c
 * d: input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 */
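/*
 * Reference sketch of the round in C-like pseudocode, with the
 * fused rotations written out (g() is the lookup through the
 * key-dependent tables s0..s3, k[] are the round keys; the macro's
 * round argument is the byte offset 8*r into k[]):
 *
 *	t0 = g(a);				// built in %r9d
 *	t1 = g(rol32(b, 8));			// built in %r8d
 *	c  = ror32(c ^ (t0 + t1 + k[2*r]), 1);
 *	d  = rol32(d, 1) ^ (t0 + 2*t1 + k[2*r + 1]);
 *
 * The rol $15 on c below is that ror $1 fused with the rol $16 the
 * next round expects of its 'a' input; likewise b leaves the round
 * rotated left by 1, ready to serve as the next round's 'd'.
 */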
#define encrypt_round(a,b,c,d,round)\
	movzx b ## B, %edi;\
	mov s1(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	mov s2(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	ror $16, b ## D;\
	xor s2(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	ror $16, a ## D;\
	xor s3(%r11,%rdi,4),%r9d;\
	movzx b ## B, %edi;\
	xor s3(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	xor (%r11,%rdi,4), %r9d;\
	movzx b ## H, %edi;\
	ror $15, b ## D;\
	xor (%r11,%rdi,4), %r8d;\
	movzx a ## H, %edi;\
	xor s1(%r11,%rdi,4),%r9d;\
	add %r8d, %r9d;\
	add %r9d, %r8d;\
	add k+round(%r11), %r9d;\
	xor %r9d, c ## D;\
	rol $15, c ## D;\
	add k+4+round(%r11),%r8d;\
	xor %r8d, d ## D;

/*
 * a: input register containing a (already rotated left 16)
 * b: input register containing b
 * c: input register containing c
 * d: input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round, a and b are packed for the output whitening
 */
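/*
 * In addition to the normal round (see the sketch above), this
 * packs %r10 = ((u64)b << 32) | a, i.e. the first output quadword,
 * for the caller to whiten and store; c takes a plain ror $1 since
 * no following round needs the fused rol $16.
 */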
#define encrypt_last_round(a,b,c,d,round)\
	mov b ## D, %r10d;\
	shl $32, %r10;\
	movzx b ## B, %edi;\
	mov s1(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	mov s2(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	ror $16, b ## D;\
	xor s2(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	ror $16, a ## D;\
	xor s3(%r11,%rdi,4),%r9d;\
	movzx b ## B, %edi;\
	xor s3(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	xor (%r11,%rdi,4), %r9d;\
	xor a, %r10;\
	movzx b ## H, %edi;\
	xor (%r11,%rdi,4), %r8d;\
	movzx a ## H, %edi;\
	xor s1(%r11,%rdi,4),%r9d;\
	add %r8d, %r9d;\
	add %r9d, %r8d;\
	add k+round(%r11), %r9d;\
	xor %r9d, c ## D;\
	ror $1, c ## D;\
	add k+4+round(%r11),%r8d;\
	xor %r8d, d ## D

/*
 * a: input register containing a
 * b: input register containing b (already rotated left 16)
 * c: input register containing c (already rol $1)
 * d: input register containing d
 * operations on a and b are interleaved to increase performance
 */
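/*
 * Reference sketch of the inverse round in C-like pseudocode,
 * rotations written out (same conventions as the encrypt sketch):
 *
 *	t0 = g(a);				// built in %r9d
 *	t1 = g(rol32(b, 8));			// built in %r8d
 *	c  = rol32(c, 1) ^ (t0 + t1 + k[2*r]);
 *	d  = ror32(d ^ (t0 + 2*t1 + k[2*r + 1]), 1);
 *
 * The rol $15 on d below fuses the ror $1 with the rol $16 the
 * next round expects of its 'b' input; a leaves the round rotated
 * left by 1, ready to serve as the next round's 'c'.
 */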
#define decrypt_round(a,b,c,d,round)\
	movzx a ## B, %edi;\
	mov (%r11,%rdi,4), %r9d;\
	movzx b ## B, %edi;\
	mov s3(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	ror $16, a ## D;\
	xor s1(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	ror $16, b ## D;\
	xor (%r11,%rdi,4), %r8d;\
	movzx a ## B, %edi;\
	xor s2(%r11,%rdi,4),%r9d;\
	movzx b ## B, %edi;\
	xor s1(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	ror $15, a ## D;\
	xor s3(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	xor s2(%r11,%rdi,4),%r8d;\
	add %r8d, %r9d;\
	add %r9d, %r8d;\
	add k+round(%r11), %r9d;\
	xor %r9d, c ## D;\
	add k+4+round(%r11),%r8d;\
	xor %r8d, d ## D;\
	rol $15, d ## D;

/*
 * a: input register containing a
 * b: input register containing b (already rotated left 16)
 * c: input register containing c (already rol $1)
 * d: input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round, a and b are packed for the output whitening
 */
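/*
 * As in encrypt_last_round, %r10 is packed with ((u64)b << 32) | a
 * once b has been rotated back to its natural orientation; the
 * "xor a, %r10" relies on the upper half of a already being zero
 * (all round updates are 32-bit operations). d takes the plain
 * ror $1.
 */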
#define decrypt_last_round(a,b,c,d,round)\
	movzx a ## B, %edi;\
	mov (%r11,%rdi,4), %r9d;\
	movzx b ## B, %edi;\
	mov s3(%r11,%rdi,4),%r8d;\
	movzx b ## H, %edi;\
	ror $16, b ## D;\
	xor (%r11,%rdi,4), %r8d;\
	movzx a ## H, %edi;\
	mov b ## D, %r10d;\
	shl $32, %r10;\
	xor a, %r10;\
	ror $16, a ## D;\
	xor s1(%r11,%rdi,4),%r9d;\
	movzx b ## B, %edi;\
	xor s1(%r11,%rdi,4),%r8d;\
	movzx a ## B, %edi;\
	xor s2(%r11,%rdi,4),%r9d;\
	movzx b ## H, %edi;\
	xor s2(%r11,%rdi,4),%r8d;\
	movzx a ## H, %edi;\
	xor s3(%r11,%rdi,4),%r9d;\
	add %r8d, %r9d;\
	add %r9d, %r8d;\
	add k+round(%r11), %r9d;\
	xor %r9d, c ## D;\
	add k+4+round(%r11),%r8d;\
	xor %r8d, d ## D;\
	ror $1, d ## D;

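/*
 * twofish_enc_blk - encrypt one 16-byte block.
 * Expected C-side declaration (sketch; the actual prototype lives
 * in the glue code):
 *
 *	asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx,
 *					u8 *dst, const u8 *src);
 *
 * The $1 left in %eax at the end is not used by callers that
 * declare the function void.
 */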
SYM_FUNC_START(twofish_enc_blk)
	pushq R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* the ctx address is moved to %r11 to free %rdi: %rdi can be
	   encoded without a REX prefix, which the 8-bit high-byte
	   (x ## H) operations require of their other operand */
	mov %rdi, %r11

	movq (R3), R1
	movq 8(R3), R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
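
	/* split the two quadwords into the four working words and
	   pre-rotate them into the form encrypt_round expects:
	   a (R0) rotated left 16, d (R3) already rotated left 1 */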
	mov R1D, R0D
	rol $16, R0D
	shr $32, R1
	mov R3D, R2D
	shr $32, R3
	rol $1, R3D

	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,1*8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

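	/* encrypt_last_round left the first output quadword (b:a,
	   still unwhitened) packed in %r10 */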
	output_whitening(%r10,%r11,a_offset)
	movq %r10, (%rsi)

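	/* pack c (R0) and d (R1) into the second output quadword;
	   the 32-bit round ops left their upper halves zeroed, so
	   shl + xor suffices */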
	shl $32, R1
	xor R0, R1

	output_whitening(R1,%r11,c_offset)
	movq R1, 8(%rsi)

	popq R1
	movl $1,%eax
	RET
SYM_FUNC_END(twofish_enc_blk)

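/*
 * twofish_dec_blk - decrypt one 16-byte block.
 * Expected C-side declaration (sketch, mirroring the encrypt side):
 *
 *	asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx,
 *					u8 *dst, const u8 *src);
 */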
SYM_FUNC_START(twofish_dec_blk)
	pushq R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* the ctx address is moved to %r11 to free %rdi: %rdi can be
	   encoded without a REX prefix, which the 8-bit high-byte
	   (x ## H) operations require of their other operand */
	mov %rdi, %r11

	movq (R3), R1
	movq 8(R3), R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
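
	/* split into the four working words and pre-rotate them into
	   the form decrypt_round expects: b (R1) rotated left 16,
	   c (R2) already rotated left 1 */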
	mov R1D, R0D
	shr $32, R1
	rol $16, R1D
	mov R3D, R2D
	shr $32, R3
	rol $1, R2D

	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

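	/* recombine and store exactly as in the encrypt path, but
	   undo the *input* whitening (w[0..3]) here */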
	input_whitening(%r10,%r11,a_offset)
	movq %r10, (%rsi)

	shl $32, R1
	xor R0, R1

	input_whitening(R1,%r11,c_offset)
	movq R1, 8(%rsi)

	popq R1
	movl $1,%eax
	RET
SYM_FUNC_END(twofish_dec_blk)