aes-neonbs-core.S source code [linux/arch/arm/crypto/aes-neonbs-core.S]

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */
8
/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
17
18	#include <linux/linkage.h>
19	#include <asm/assembler.h>
20
/*
 * Section/FPU selection and register aliases.
 *
 * rounds / bskey name the two core registers that the 8-way cipher
 * routines in this file use for the round counter and for the pointer
 * into the converted key schedule.
 *
 * qNl / qNh name the low and high 64-bit halves (d2N / d2N+1) of each
 * quad register qN.  Macros that take a q register name as an argument
 * append 'l' or 'h' to it to address one half.
 */
	.text
	.fpu		neon

	rounds		.req	ip
	bskey		.req	r4

	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15
	q8l		.req	d16
	q8h		.req	d17
	q9l		.req	d18
	q9h		.req	d19
	q10l		.req	d20
	q10h		.req	d21
	q11l		.req	d22
	q11h		.req	d23
	q12l		.req	d24
	q12h		.req	d25
	q13l		.req	d26
	q13h		.req	d27
	q14l		.req	d28
	q14h		.req	d29
	q15l		.req	d30
	q15h		.req	d31
59
/*
 * __tbl out, tbl, in [, tmp]
 *
 * 16-byte table lookup: each byte of 'in' selects a byte of the 16-byte
 * table 'tbl', and the result is written to 'out'.  Implemented as two
 * 8-byte VTBLs, one per half of out/in.
 *
 * When out and tbl are the same register, the first VTBL overwrites the
 * low half of the table, so a copy is taken in 'tmp' beforehand and the
 * second VTBL reads the copy.  Omitting 'tmp' in that case is an
 * assembly-time error.
 */
	.macro		__tbl, out, tbl, in, tmp
	.ifc		\out, \tbl
	.ifb		\tmp
	.error		__tbl needs temp register if out == tbl
	.endif
	vmov		\tmp, \out
	.endif
	vtbl.8		\out\()l, {\tbl}, \in\()l
	.ifc		\out, \tbl
	vtbl.8		\out\()h, {\tmp}, \in\()h
	.else
	vtbl.8		\out\()h, {\tbl}, \in\()h
	.endif
	.endm
74
/*
 * __ldr out, sym
 *
 * Load the 16 bytes at 'sym' into quad register 'out' using two 64-bit
 * VLDRs (low half from sym, high half from sym + 8).
 */
	.macro		__ldr, out, sym
	vldr		\out\()l, \sym
	vldr		\out\()h, \sym + 8
	.endm
79
/*
 * in_bs_ch b0..b7
 *
 * XOR-only (hence GF(2)-linear), in-place transformation of the eight
 * registers b0..b7.  It is the first stage of the sbox macro; going by
 * the name it is presumably the input basis change of the S-box from
 * the referenced paper -- confirm against the paper before relying on
 * that.  The instruction order matters: later XORs consume values
 * produced by earlier ones.
 */
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	veor		\b2, \b2, \b1
	veor		\b5, \b5, \b6
	veor		\b3, \b3, \b0
	veor		\b6, \b6, \b2
	veor		\b5, \b5, \b0
	veor		\b6, \b6, \b3
	veor		\b3, \b3, \b7
	veor		\b7, \b7, \b5
	veor		\b3, \b3, \b4
	veor		\b4, \b4, \b5
	veor		\b2, \b2, \b7
	veor		\b3, \b3, \b1
	veor		\b1, \b1, \b5
	.endm
95
/*
 * out_bs_ch b0..b7
 *
 * XOR-only, in-place transformation of the eight registers b0..b7,
 * used as the last stage of the sbox macro (presumably the output
 * basis change of the S-box, per the name).  Order-dependent.
 */
	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	veor		\b0, \b0, \b6
	veor		\b1, \b1, \b4
	veor		\b4, \b4, \b6
	veor		\b2, \b2, \b0
	veor		\b6, \b6, \b1
	veor		\b1, \b1, \b5
	veor		\b5, \b5, \b3
	veor		\b3, \b3, \b7
	veor		\b7, \b7, \b5
	veor		\b2, \b2, \b5
	veor		\b4, \b4, \b7
	.endm
109
/*
 * inv_in_bs_ch b6, b1, b2, b4, b7, b0, b3, b5
 *
 * XOR-only, in-place transformation used as the first stage of the
 * inv_sbox macro.  Note that the formal parameters are declared in a
 * permuted order, so the positional arguments of a call do not map to
 * b0..b7 in sequence.  Order-dependent.
 */
	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	veor		\b1, \b1, \b7
	veor		\b4, \b4, \b7
	veor		\b7, \b7, \b5
	veor		\b1, \b1, \b3
	veor		\b2, \b2, \b5
	veor		\b3, \b3, \b7
	veor		\b6, \b6, \b1
	veor		\b2, \b2, \b0
	veor		\b5, \b5, \b3
	veor		\b4, \b4, \b6
	veor		\b0, \b0, \b6
	veor		\b1, \b1, \b4
	.endm
124
/*
 * inv_out_bs_ch b6, b5, b0, b3, b7, b1, b4, b2
 *
 * XOR-only, in-place transformation used as the last stage of the
 * inv_sbox macro.  As with inv_in_bs_ch, the formal parameters are
 * declared in a permuted order.  Order-dependent.
 */
	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	veor		\b1, \b1, \b5
	veor		\b2, \b2, \b7
	veor		\b3, \b3, \b1
	veor		\b4, \b4, \b5
	veor		\b7, \b7, \b5
	veor		\b3, \b3, \b4
	veor		\b5, \b5, \b0
	veor		\b3, \b3, \b7
	veor		\b6, \b6, \b2
	veor		\b2, \b2, \b1
	veor		\b6, \b6, \b3
	veor		\b3, \b3, \b0
	veor		\b5, \b5, \b6
	.endm
140
/*
 * mul_gf4 x0, x1, y0, y1, t0, t1
 *
 * Bitwise (AND/XOR) product of the two-register operand x0:x1 with the
 * two-register operand y0:y1; going by the name this is a bit-sliced
 * multiplication in GF(2^2).
 *
 *   in/out:   x0, x1 (overwritten with the product)
 *   in only:  y0, y1
 *   clobbers: t0, t1
 */
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	veor		\t0, \y0, \y1
	vand		\t0, \t0, \x0
	veor		\x0, \x0, \x1
	vand		\t1, \x1, \y0
	vand		\x0, \x0, \y1
	veor		\x1, \t1, \t0
	veor		\x0, \x0, \t1
	.endm
150
/*
 * mul_gf4_n_gf4 x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
 *
 * Two independent two-register products computed with their
 * instructions interleaved: x0:x1 by y0:y1 (scratch t0) and x2:x3 by
 * y2:y3 (scratch t1).  The two halves do not combine their terms in
 * the same way; per the name, one is presumably a plain GF(2^2)
 * product and the other a product additionally scaled by a constant
 * 'N' -- confirm against the referenced paper.
 *
 *   in/out:   x0..x3
 *   in only:  y0..y3
 *   clobbers: t0, t1
 */
	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	veor		\t0, \y0, \y1
	veor		\t1, \y2, \y3
	vand		\t0, \t0, \x0
	vand		\t1, \t1, \x2
	veor		\x0, \x0, \x1
	veor		\x2, \x2, \x3
	vand		\x1, \x1, \y0
	vand		\x3, \x3, \y2
	vand		\x0, \x0, \y1
	vand		\x2, \x2, \y3
	veor		\x1, \x1, \x0
	veor		\x2, \x2, \x3
	veor		\x0, \x0, \t0
	veor		\x3, \x3, \t1
	.endm
167
/*
 * mul_gf16_2 x0..x7, y0..y3, t0..t3
 *
 * Multiplies each of the two four-register operands x0..x3 and x4..x7
 * by the common four-register operand y0..y3, built from the mul_gf4
 * and mul_gf4_n_gf4 primitives (per the name: two GF(2^4) products).
 *
 * y0 and y1 are temporarily XORed with y2 and y3 for the middle part
 * of the computation; the second pair of XORs undoes this, so y0..y3
 * hold their original values on exit.
 *
 *   in/out:   x0..x7
 *   clobbers: t0..t3
 */
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	veor		\t0, \x0, \x2
	veor		\t1, \x1, \x3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	veor		\y0, \y0, \y2
	veor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	veor		\x0, \x0, \t0
	veor		\x2, \x2, \t0
	veor		\x1, \x1, \t1
	veor		\x3, \x3, \t1
	veor		\t0, \x4, \x6
	veor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	veor		\y0, \y0, \y2
	veor		\y1, \y1, \y3
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	veor		\x4, \x4, \t0
	veor		\x6, \x6, \t0
	veor		\x5, \x5, \t1
	veor		\x7, \x7, \t1
	.endm
191
/*
 * inv_gf256 x0..x7, t0..t3, s0..s3
 *
 * Non-linear core shared by the sbox and inv_sbox macros; per the name
 * it computes the bit-sliced multiplicative inverse in GF(2^8).  The
 * first part derives a four-register value from x0..x7 using
 * AND/OR/XOR and bit-select (VBSL) operations; the final mul_gf16_2
 * then multiplies both halves of x0..x7 by that value.
 *
 *   in/out:   x0..x7
 *   clobbers: t0..t3, s0..s3
 *
 * The sequence is strictly order-dependent; do not reschedule.
 */
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	veor		\t3, \x4, \x6
	veor		\t0, \x5, \x7
	veor		\t1, \x1, \x3
	veor		\s1, \x7, \x6
	veor		\s0, \x0, \x2
	veor		\s3, \t3, \t0
	vorr		\t2, \t0, \t1
	vand		\s2, \t3, \s0
	vorr		\t3, \t3, \s0
	veor		\s0, \s0, \t1
	vand		\t0, \t0, \t1
	veor		\t1, \x3, \x2
	vand		\s3, \s3, \s0
	vand		\s1, \s1, \t1
	veor		\t1, \x4, \x5
	veor		\s0, \x1, \x0
	veor		\t3, \t3, \s1
	veor		\t2, \t2, \s1
	vand		\s1, \t1, \s0
	vorr		\t1, \t1, \s0
	veor		\t3, \t3, \s3
	veor		\t0, \t0, \s1
	veor		\t2, \t2, \s2
	veor		\t1, \t1, \s3
	veor		\t0, \t0, \s2
	vand		\s0, \x7, \x3
	veor		\t1, \t1, \s2
	vand		\s1, \x6, \x2
	vand		\s2, \x5, \x1
	vorr		\s3, \x4, \x0
	veor		\t3, \t3, \s0
	veor		\t1, \t1, \s2
	veor		\s0, \t0, \s3
	veor		\t2, \t2, \s1
	vand		\s2, \t3, \t1
	veor		\s1, \t2, \s2
	veor		\s3, \s0, \s2
	vbsl		\s1, \t1, \s0
	vmvn		\t0, \s0
	vbsl		\s0, \s1, \s3
	vbsl		\t0, \s1, \s3
	vbsl		\s3, \t3, \t2
	veor		\t3, \t3, \t2
	vand		\s2, \s0, \s3
	veor		\t1, \t1, \t0
	veor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm
243
/*
 * sbox b0..b7, t0..t3, s0..s3
 *
 * Bit-sliced forward S-box on b0..b7, composed of in_bs_ch, inv_gf256
 * and out_bs_ch.  The register permutations between the three stages
 * are part of the algorithm.  The results stay in the same eight
 * registers but in a permuted order: the callers in this file continue
 * with the order b0, b1, b4, b6, b3, b7, b2, b5.
 *
 *   clobbers: t0..t3, s0..s3
 */
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
	.endm
251
/*
 * inv_sbox b0..b7, t0..t3, s0..s3
 *
 * Bit-sliced inverse S-box on b0..b7, composed of inv_in_bs_ch,
 * inv_gf256 and inv_out_bs_ch.  The results stay in the same eight
 * registers but in a permuted order: the callers in this file continue
 * with the order b0, b1, b6, b4, b2, b7, b3, b5.
 *
 *   clobbers: t0..t3, s0..s3
 */
	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.endm
259
/*
 * shift_rows x0..x7, t0..t3, mask
 *
 * Round-key addition followed by a byte permutation, for all eight
 * bit planes:
 *
 *   xN = permute(xN ^ key[N], mask)
 *
 * The eight 16-byte key planes are read from [bskey] in four 32-byte
 * loads with post-increment, so bskey advances by 128 bytes.  The
 * ':256' qualifier is an alignment hint: bskey must be 32-byte aligned
 * here.  'mask' holds the byte indices used for the permutation.
 *
 *   clobbers: t0..t3
 */
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				    t0, t1, t2, t3, mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t0, \t0, \x0
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t1, \t1, \x1
	__tbl		\x0, \t0, \mask
	veor		\t2, \t2, \x2
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t3, \t3, \x3
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t0, \t0, \x4
	veor		\t1, \t1, \x5
	__tbl		\x4, \t0, \mask
	veor		\t2, \t2, \x6
	__tbl		\x5, \t1, \mask
	veor		\t3, \t3, \x7
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.endm
283
/*
 * inv_shift_rows x0..x7, t0..t3, mask
 *
 * Permute the bytes of each of x0..x7 in place using the byte indices
 * in 'mask'.  Because source and destination are the same register,
 * __tbl needs a scratch register; t0..t3 are used for that and are
 * clobbered.  Unlike shift_rows, no key material is read here.
 */
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
					t0, t1, t2, t3, mask
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.endm
295
/*
 * mix_cols x0..x7, t0..t7 [, inv]
 *
 * Bit-sliced column mixing, expressed purely as XORs of the input
 * planes with copies of themselves rotated by 12 and by 8 bytes
 * (VEXT with the same register for both sources is a byte rotation).
 *
 * The optional 'inv' argument only selects how the final results are
 * assigned to x2..x6:
 *   - blank:     layout used by the encryption round loop
 *   - non-blank: layout used when invoked from inv_mix_cols
 * x0, x1 and x7 are produced identically in both variants.
 *
 *   in/out:   x0..x7
 *   clobbers: t0..t7
 */
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	veor		\x0, \x0, \t0
	vext.8		\t2, \x2, \x2, #12
	veor		\x1, \x1, \t1
	vext.8		\t3, \x3, \x3, #12
	veor		\x2, \x2, \t2
	vext.8		\t4, \x4, \x4, #12
	veor		\x3, \x3, \t3
	vext.8		\t5, \x5, \x5, #12
	veor		\x4, \x4, \t4
	vext.8		\t6, \x6, \x6, #12
	veor		\x5, \x5, \t5
	vext.8		\t7, \x7, \x7, #12
	veor		\x6, \x6, \t6
	veor		\t1, \t1, \x0
	veor.8		\x7, \x7, \t7
	vext.8		\x0, \x0, \x0, #8
	veor		\t2, \t2, \x1
	veor		\t0, \t0, \x7
	veor		\t1, \t1, \x7
	vext.8		\x1, \x1, \x1, #8
	veor		\t5, \t5, \x4
	veor		\x0, \x0, \t0
	veor		\t6, \t6, \x5
	veor		\x1, \x1, \t1
	vext.8		\t0, \x4, \x4, #8
	veor		\t4, \t4, \x3
	vext.8		\t1, \x5, \x5, #8
	veor		\t7, \t7, \x6
	vext.8		\x4, \x3, \x3, #8
	veor		\t3, \t3, \x2
	vext.8		\x5, \x7, \x7, #8
	veor		\t4, \t4, \x7
	vext.8		\x3, \x6, \x6, #8
	veor		\t3, \t3, \x7
	vext.8		\x6, \x2, \x2, #8
	veor		\x7, \t1, \t5
	.ifb		\inv
	veor		\x2, \t0, \t4
	veor		\x4, \x4, \t3
	veor		\x5, \x5, \t7
	veor		\x3, \x3, \t6
	veor		\x6, \x6, \t2
	.else
	veor		\t3, \t3, \x4
	veor		\x5, \x5, \t7
	veor		\x2, \x3, \t6
	veor		\x3, \t0, \t4
	veor		\x4, \x6, \t2
	vmov		\x6, \t3
	.endif
	.endm
351
/*
 * inv_mix_cols x0..x7, t0..t7
 *
 * Decryption-side round step:
 *   1. XOR the eight bit planes with 128 bytes of round key read from
 *      [bskey] (32-byte aligned, per the ':256' hints).
 *   2. Pre-mix the planes using copies rotated by 8 bytes.
 *   3. Finish with mix_cols in its 'inv' output layout.
 *
 * Three of the four key loads post-increment bskey (+96 in total); the
 * following 'sub #224' therefore leaves bskey 128 bytes BELOW its
 * value on entry, i.e. the key schedule is walked backwards.
 *
 *   in/out:   x0..x7, bskey
 *   clobbers: t0..t7
 */
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\x0, \x0, \t0
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\x1, \x1, \t1
	vld1.8		{\t4-\t5}, [bskey, :256]!
	veor		\x2, \x2, \t2
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224
	veor		\x3, \x3, \t3
	veor		\x4, \x4, \t4
	veor		\x5, \x5, \t5
	veor		\x6, \x6, \t6
	veor		\x7, \x7, \t7
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	veor		\t0, \t0, \x0
	vext.8		\t1, \x1, \x1, #8
	veor		\t6, \t6, \x6
	vext.8		\t2, \x2, \x2, #8
	veor		\t7, \t7, \x7
	vext.8		\t3, \x3, \x3, #8
	veor		\t1, \t1, \x1
	vext.8		\t4, \x4, \x4, #8
	veor		\t2, \t2, \x2
	vext.8		\t5, \x5, \x5, #8
	veor		\t3, \t3, \x3
	veor		\t4, \t4, \x4
	veor		\t5, \t5, \x5
	veor		\x0, \x0, \t6
	veor		\x1, \x1, \t6
	veor		\x2, \x2, \t0
	veor		\x4, \x4, \t2
	veor		\x3, \x3, \t1
	veor		\x1, \x1, \t7
	veor		\x2, \x2, \t7
	veor		\x4, \x4, \t6
	veor		\x5, \x5, \t3
	veor		\x3, \x3, \t6
	veor		\x6, \x6, \t4
	veor		\x4, \x4, \t7
	veor		\x5, \x5, \t7
	veor		\x7, \x7, \t5
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm
400
/*
 * swapmove_2x a0, b0, a1, b1, n, mask, t0, t1
 *
 * Two interleaved instances of the 'swapmove' bit exchange.  For each
 * pair (a, b), within every 64-bit lane:
 *
 *   t  = ((b >> n) ^ a) & mask
 *   a ^= t
 *   b ^= t << n
 *
 * which swaps the bits of 'a' selected by mask with the bits of 'b'
 * selected by (mask << n).
 *
 *   in/out:   a0, b0, a1, b1
 *   clobbers: t0, t1
 */
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	veor		\t0, \t0, \a0
	veor		\t1, \t1, \a1
	vand		\t0, \t0, \mask
	vand		\t1, \t1, \mask
	veor		\a0, \a0, \t0
	vshl.s64	\t0, \t0, #\n
	veor		\a1, \a1, \t1
	vshl.s64	\t1, \t1, #\n
	veor		\b0, \b0, \t0
	veor		\b1, \b1, \t1
	.endm
415
/*
 * bitslice x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
 *
 * Transpose bits between the eight registers in three swapmove passes
 * (shift 1 / mask 0x55, shift 2 / mask 0x33, shift 4 / mask 0x0f).
 * The same macro is used both to enter and to leave the bit-sliced
 * representation in the cipher routines of this file.
 *
 * Note that the formal parameters are declared in descending order
 * (x7 first), so the first positional argument is x7.
 *
 *   in/out:   x0..x7
 *   clobbers: t0..t3 (t0/t1 hold the masks, t2/t3 are swapmove scratch)
 */
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	vmov.i8		\t0, #0x55
	vmov.i8		\t1, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	vmov.i8		\t0, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm
427
/*
 * Byte permutation applied to every inner round key before it is
 * expanded into bit planes.
 */
	.align		4
M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 *
	 * Convert an expanded AES key schedule into the layout consumed
	 * by aesbs_encrypt8/aesbs_decrypt8:
	 *
	 *   out +   0: round 0 key, copied unchanged       (16 bytes)
	 *   then     : rounds - 1 inner round keys, each expanded to
	 *              eight 16-byte bit planes             (128 bytes each)
	 *   finally  : last round key XORed with 0x63 in every byte
	 *                                                   (16 bytes)
	 *
	 * In:  r0 = out, r1 = rk, r2 = rounds
	 * Reads rounds + 1 round keys (16 bytes each) from rk.
	 * The stores carry alignment hints (:128 / :256), so out must be
	 * aligned accordingly.
	 * Modifies r0-r2, q0-q15 and the flags.
	 * NOTE(review): q4-q7 are callee-saved under the AAPCS; presumably
	 * the callers account for this -- confirm.
	 */
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8,  #0x01		// bit masks
	vmov.i8		q9,  #0x02
	vmov.i8		q10, #0x04
	vmov.i8		q11, #0x08
	vmov.i8		q12, #0x10
	vmov.i8		q13, #0x20
	__ldr		q14, M0

	sub		r2, r2, #1		// rounds - 1 inner round keys
	vst1.8		{q7}, [r0, :128]!	// save round 0 key

.Lkey_loop:
	__tbl		q7, q15, q14		// q7 = current key, bytes permuted by M0
	vmov.i8		q6, #0x40		// masks for bits 6 and 7 are rebuilt
	vmov.i8		q15, #0x80		// each pass: q6/q15 get overwritten below

	// expand bit N of every key byte into a 0x00/0xff byte of plane N
	vtst.8		q0, q7, q8
	vtst.8		q1, q7, q9
	vtst.8		q2, q7, q10
	vtst.8		q3, q7, q11
	vtst.8		q4, q7, q12
	vtst.8		q5, q7, q13
	vtst.8		q6, q7, q6
	vtst.8		q7, q7, q15
	vld1.32		{q15}, [r1]!		// load next round key
	// invert planes 0, 1, 5 and 6: the bit positions set in 0x63
	vmvn		q0, q0
	vmvn		q1, q1
	vmvn		q5, q5
	vmvn		q6, q6

	subs		r2, r2, #1
	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!
	bne		.Lkey_loop

	vmov.i8		q7, #0x63		// compose .L63
	veor		q15, q15, q7		// fold 0x63 into the last round key
	vst1.8		{q15}, [r0, :128]
	bx		lr
ENDPROC(aesbs_convert_key)
480
/*
 * Byte permutation applied to the eight input blocks after the
 * round 0 key has been added (consumed by aesbs_encrypt8 below).
 */
	.align		4
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01

	/*
	 * aesbs_encrypt8 - encrypt eight blocks in parallel
	 *
	 * In:   q0-q7  = eight 16-byte input blocks
	 *       bskey  = key schedule as written by aesbs_convert_key
	 *       rounds = number of AES rounds
	 * Out:  the eight results, in the register order
	 *       q0, q1, q4, q6, q3, q7, q2, q5
	 * Modifies q8-q15, bskey, rounds and the flags.
	 *
	 * Internal routine: returns with 'bx lr', no stack usage.
	 */
aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key
	__ldr		q8, M0SR

	veor		q10, q0, q9		// xor with round0 key
	veor		q11, q1, q9
	__tbl		q0, q10, q8
	veor		q12, q2, q9
	__tbl		q1, q11, q8
	veor		q13, q3, q9
	__tbl		q2, q12, q8
	veor		q14, q4, q9
	__tbl		q3, q13, q8
	veor		q15, q5, q9
	__tbl		q4, q14, q8
	veor		q10, q6, q9
	__tbl		q5, q15, q8
	veor		q11, q7, q9
	__tbl		q6, q10, q8
	__tbl		q7, q11, q8

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
	b		.Lenc_sbox

	/*
	 * Permutation masks for shift_rows: SR for the inner rounds,
	 * SRM0 for the last pass through shift_rows.
	 */
	.align		5
SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d

.Lenc_last:
	__ldr		q12, SRM0
.Lenc_loop:
	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
.Lenc_sbox:
	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
			q13, q14, q15
	subs		rounds, rounds, #1
	bcc		.Lenc_done		// counter was already 0: final S-box done

	// the NEON instructions below leave the flags from 'subs' intact
	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
			q13, q14, q15

	beq		.Lenc_last		// counter reached 0: one more pass, using SRM0
	__ldr		q12, SR
	b		.Lenc_loop

.Lenc_done:
	vld1.8		{q12}, [bskey, :128]	// last round key

	// leave the bit-sliced representation again
	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11

	veor		q0, q0, q12
	veor		q1, q1, q12
	veor		q4, q4, q12
	veor		q6, q6, q12
	veor		q3, q3, q12
	veor		q7, q7, q12
	veor		q2, q2, q12
	veor		q5, q5, q12
	bx		lr
ENDPROC(aesbs_encrypt8)
546
/*
 * Byte permutation applied to the eight input blocks after the first
 * key addition of the decryption (consumed by aesbs_decrypt8 below).
 */
	.align		4
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509

	/*
	 * aesbs_decrypt8 - decrypt eight blocks in parallel
	 *
	 * In:   q0-q7  = eight 16-byte input blocks
	 *       bskey  = key schedule as written by aesbs_convert_key
	 *       rounds = number of AES rounds
	 * Out:  the eight results, in the register order
	 *       q0, q1, q6, q4, q2, q7, q3, q5
	 * Modifies q8-q15, bskey, rounds and the flags.
	 *
	 * The key schedule is consumed back to front: the entry code
	 * points bskey at the final 16-byte key (offset 128 * rounds - 112),
	 * then at the last 128-byte bit-sliced round key, and
	 * inv_mix_cols steps it back by 128 bytes per round.
	 */
aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128
	__ldr		q8, M0ISR

	veor		q10, q0, q9		// xor with round0 key
	veor		q11, q1, q9
	__tbl		q0, q10, q8
	veor		q12, q2, q9
	__tbl		q1, q11, q8
	veor		q13, q3, q9
	__tbl		q2, q12, q8
	veor		q14, q4, q9
	__tbl		q3, q13, q8
	veor		q15, q5, q9
	__tbl		q4, q14, q8
	veor		q10, q6, q9
	__tbl		q5, q15, q8
	veor		q11, q7, q9
	__tbl		q6, q10, q8
	__tbl		q7, q11, q8

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
	b		.Ldec_sbox

	/*
	 * Permutation masks for inv_shift_rows: ISR for the inner rounds,
	 * ISRM0 for the last pass through inv_shift_rows.
	 */
	.align		5
ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d

.Ldec_last:
	__ldr		q12, ISRM0
.Ldec_loop:
	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
.Ldec_sbox:
	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
			q13, q14, q15
	subs		rounds, rounds, #1
	bcc		.Ldec_done		// counter was already 0: final S-box done

	// the NEON instructions below leave the flags from 'subs' intact
	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
			q13, q14, q15

	beq		.Ldec_last		// counter reached 0: one more pass, using ISRM0
	__ldr		q12, ISR
	b		.Ldec_loop

.Ldec_done:
	add		bskey, bskey, #112	// bskey is 112 below the schedule start here
	vld1.8		{q12}, [bskey, :128]	// last round key

	// leave the bit-sliced representation again
	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11

	veor		q0, q0, q12
	veor		q1, q1, q12
	veor		q6, q6, q12
	veor		q4, q4, q12
	veor		q2, q2, q12
	veor		q7, q7, q12
	veor		q3, q3, q12
	veor		q5, q5, q12
	bx		lr
ENDPROC(aesbs_decrypt8)
616
617	/*
618	* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
619	* int blocks)
620	* aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
621	* int blocks)
622	*/
623	.macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
624	push {r4-r6, lr}
625	ldr r5, [sp, #`16`] // number of blocks
626
627	`99`: adr ip, `0f`
628	and lr, r5, #`7`
629	cmp r5, #`8`
630	sub ip, ip, lr, lsl #`2`
631	movlt pc, ip // computed goto if blocks < 8
632
633	vld1`.8` {q0}, [r1]!
634	vld1`.8` {q1}, [r1]!
635	vld1`.8` {q2}, [r1]!
636	vld1`.8` {q3}, [r1]!
637	vld1`.8` {q4}, [r1]!
638	vld1`.8` {q5}, [r1]!
639	vld1`.8` {q6}, [r1]!
640	vld1`.8` {q7}, [r1]!
641
642	`0`: mov bskey, r2
643	mov rounds, r3
644	bl \do8
645
646	adr ip, `1f`
647	and lr, r5, #`7`
648	cmp r5, #`8`
649	sub ip, ip, lr, lsl #`2`
650	movlt pc, ip // computed goto if blocks < 8
651
652	vst1`.8` {\o0}, [r0]!
653	vst1`.8` {\o1}, [r0]!
654	vst1`.8` {\o2}, [r0]!
655	vst1`.8` {\o3}, [r0]!
656	vst1`.8` {\o4}, [r0]!
657	vst1`.8` {\o5}, [r0]!
658	vst1`.8` {\o6}, [r0]!
659	vst1`.8` {\o7}, [r0]!
660
661	`1`: subs r5, r5, #`8`
662	bgt `99b`
663
664	pop {r4-r6, pc}
665	.endm
666
/*
 * aesbs_ecb_encrypt - ECB encryption entry point.  The body is the
 * __ecb_crypt expansion around aesbs_encrypt8; the register list is
 * the order in which aesbs_encrypt8 returns blocks 0..7.
 */
	.align		4
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)
671
/*
 * aesbs_ecb_decrypt - ECB decryption entry point.  The body is the
 * __ecb_crypt expansion around aesbs_decrypt8; the register list is
 * the order in which aesbs_decrypt8 returns blocks 0..7.
 */
	.align		4
ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
676
677	/*
678	* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
679	* int rounds, int blocks, u8 iv[])
680	*/
681	.align `4`
682	ENTRY(aesbs_cbc_decrypt)
683	mov ip, sp
684	push {r4-r6, lr}
685	ldm ip, {r5-r6} // load args 4-5
686
687	`99`: adr ip, `0f`
688	and lr, r5, #`7`
689	cmp r5, #`8`
690	sub ip, ip, lr, lsl #`2`
691	mov lr, r1
692	movlt pc, ip // computed goto if blocks < 8
693
694	vld1`.8` {q0}, [lr]!
695	vld1`.8` {q1}, [lr]!
696	vld1`.8` {q2}, [lr]!
697	vld1`.8` {q3}, [lr]!
698	vld1`.8` {q4}, [lr]!
699	vld1`.8` {q5}, [lr]!
700	vld1`.8` {q6}, [lr]!
701	vld1`.8` {q7}, [lr]
702
703	`0`: mov bskey, r2
704	mov rounds, r3
705	bl aesbs_decrypt8
706
707	vld1`.8` {q8}, [r6]
708	vmov q9, q8
709	vmov q10, q8
710	vmov q11, q8
711	vmov q12, q8
712	vmov q13, q8
713	vmov q14, q8
714	vmov q15, q8
715
716	adr ip, `1f`
717	and lr, r5, #`7`
718	cmp r5, #`8`
719	sub ip, ip, lr, lsl #`2`
720	movlt pc, ip // computed goto if blocks < 8
721
722	vld1`.8` {q9}, [r1]!
723	vld1`.8` {q10}, [r1]!
724	vld1`.8` {q11}, [r1]!
725	vld1`.8` {q12}, [r1]!
726	vld1`.8` {q13}, [r1]!
727	vld1`.8` {q14}, [r1]!
728	vld1`.8` {q15}, [r1]!
729	W(nop)
730
731	`1`: adr ip, `2f`
732	sub ip, ip, lr, lsl #`3`
733	movlt pc, ip // computed goto if blocks < 8
734
735	veor q0, q0, q8
736	vst1`.8` {q0}, [r0]!
737	veor q1, q1, q9
738	vst1`.8` {q1}, [r0]!
739	veor q6, q6, q10
740	vst1`.8` {q6}, [r0]!
741	veor q4, q4, q11
742	vst1`.8` {q4}, [r0]!
743	veor q2, q2, q12
744	vst1`.8` {q2}, [r0]!
745	veor q7, q7, q13
746	vst1`.8` {q7}, [r0]!
747	veor q3, q3, q14
748	vst1`.8` {q3}, [r0]!
749	veor q5, q5, q15
750	vld1`.8` {q8}, [r1]! // load next round's iv
751	`2`: vst1`.8` {q5}, [r0]!
752
753	subs r5, r5, #`8`
754	vst1`.8` {q8}, [r6] // store next round's iv
755	bgt `99b`
756
757	pop {r4-r6, pc}
758	ENDPROC(aesbs_cbc_decrypt)
759
/*
 * next_ctr q
 *
 * Emit the current 128-bit counter into q and then increment it.
 *
 * The counter is held in r7:r8:r9:r10 as four native-endian words, r7
 * being the most and r10 the least significant.  q receives the value
 * from BEFORE the increment (both VMOVs read their source registers
 * ahead of the add that modifies them), byte-swapped per 32-bit word
 * by VREV32 so that it is laid out big-endian in memory order.
 *
 * The VMOV in the middle of the carry chain does not touch the flags.
 * Modifies r7-r10 and the flags.
 */
	.macro		next_ctr, q
	vmov		\q\()h, r9, r10
	adds		r10, r10, #1
	adcs		r9, r9, #0
	vmov		\q\()l, r7, r8
	adcs		r8, r8, #0
	adc		r7, r7, #0
	vrev32.8	\q, \q
	.endm
769
770	/*
771	* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
772	* int rounds, int bytes, u8 ctr[])
773	*/
774	ENTRY(aesbs_ctr_encrypt)
775	mov ip, sp
776	push {r4-r10, lr}
777
778	ldm ip, {r5, r6} // load args 4-5
779	vld1`.8` {q0}, [r6] // load counter
780	vrev32`.8` q1, q0
781	vmov r9, r10, d3
782	vmov r7, r8, d2
783
784	adds r10, r10, #`1`
785	adcs r9, r9, #`0`
786	adcs r8, r8, #`0`
787	adc r7, r7, #`0`
788
789	`99`: vmov q1, q0
790	sub lr, r5, #`1`
791	vmov q2, q0
792	adr ip, `0f`
793	vmov q3, q0
794	and lr, lr, #`112`
795	vmov q4, q0
796	cmp r5, #`112`
797	vmov q5, q0
798	sub ip, ip, lr, lsl #`1`
799	vmov q6, q0
800	add ip, ip, lr, lsr #`2`
801	vmov q7, q0
802	movle pc, ip // computed goto if bytes < 112
803
804	next_ctr q1
805	next_ctr q2
806	next_ctr q3
807	next_ctr q4
808	next_ctr q5
809	next_ctr q6
810	next_ctr q7
811
812	`0`: mov bskey, r2
813	mov rounds, r3
814	bl aesbs_encrypt8
815
816	adr ip, `1f`
817	sub lr, r5, #`1`
818	cmp r5, #`128`
819	bic lr, lr, #`15`
820	ands r4, r5, #`15` // preserves C flag
821	teqcs r5, r5 // set Z flag if not last iteration
822	sub ip, ip, lr, lsr #`2`
823	rsb r4, r4, #`16`
824	movcc pc, ip // computed goto if bytes < 128
825
826	vld1`.8` {q8}, [r1]!
827	vld1`.8` {q9}, [r1]!
828	vld1`.8` {q10}, [r1]!
829	vld1`.8` {q11}, [r1]!
830	vld1`.8` {q12}, [r1]!
831	vld1`.8` {q13}, [r1]!
832	vld1`.8` {q14}, [r1]!
833	`1`: subne r1, r1, r4
834	vld1`.8` {q15}, [r1]!
835
836	add ip, ip, #`2f` - `1b`
837
838	veor q0, q0, q8
839	veor q1, q1, q9
840	veor q4, q4, q10
841	veor q6, q6, q11
842	veor q3, q3, q12
843	veor q7, q7, q13
844	veor q2, q2, q14
845	bne `3f`
846	veor q5, q5, q15
847
848	movcc pc, ip // computed goto if bytes < 128
849
850	vst1`.8` {q0}, [r0]!
851	vst1`.8` {q1}, [r0]!
852	vst1`.8` {q4}, [r0]!
853	vst1`.8` {q6}, [r0]!
854	vst1`.8` {q3}, [r0]!
855	vst1`.8` {q7}, [r0]!
856	vst1`.8` {q2}, [r0]!
857	`2`: subne r0, r0, r4
858	vst1`.8` {q5}, [r0]!
859
860	next_ctr q0
861
862	subs r5, r5, #`128`
863	bgt `99b`
864
865	vst1`.8` {q0}, [r6]
866	pop {r4-r10, pc}
867
868	`3`: adr lr, .Lpermute_table + `16`
869	cmp r5, #`16` // Z flag remains cleared
870	sub lr, lr, r4
871	vld1`.8` {q8-q9}, [lr]
872	vtbl`.8` d16, {q5}, d16
873	vtbl`.8` d17, {q5}, d17
874	veor q5, q8, q15
875	bcc `4f` // have to reload prev if R5 < 16
876	vtbx`.8` d10, {q2}, d18
877	vtbx`.8` d11, {q2}, d19
878	mov pc, ip // branch back to VST sequence
879
880	`4`: sub r0, r0, r4
881	vshr.s8 q9, q9, #`7` // create mask for VBIF
882	vld1`.8` {q8}, [r0] // reload
883	vbif q5, q8, q9
884	vst1`.8` {q5}, [r0]
885	pop {r4-r10, pc}
886	ENDPROC(aesbs_ctr_encrypt)
887
/*
 * Permutation table for the partial-block path of aesbs_ctr_encrypt:
 * 16 identity indices (0x00..0x0f) with 16 bytes of 0xff on either
 * side.  The code reads 32 bytes starting somewhere in the first 16
 * bytes, which yields index vectors that shift a block by the chosen
 * amount; 0xff is an out-of-range index for the VTBL/VTBX lookups.
 */
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
896
/*
 * next_tweak out, in, const, tmp
 *
 * Derive the next XTS tweak: out = in shifted left by one bit as a
 * 128-bit value, with the bit shifted out of the top folded back in.
 *
 * Each 64-bit lane is doubled separately; the sign bit of each lane
 * (replicated by the arithmetic shift) selects that lane of 'const',
 * and the halves are swapped before the final XOR so that
 *   - the carry out of the low lane lands in the high lane, and
 *   - the carry out of the high lane is reduced into the low lane.
 * With const = { 1, 0x87 } as composed by __xts_prepare8, this is a
 * multiplication by x in GF(2^128).
 *
 *   clobbers: tmp
 */
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
904
905	/*
906	* aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
907	* int blocks, u8 iv[], int reorder_last_tweak)
908	* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
909	* int blocks, u8 iv[], int reorder_last_tweak)
910	*/
911	.align `6`
912	__xts_prepare8:
913	vld1`.8` {q14}, [r7] // load iv
914	vmov.i32 d30, #`0x87` // compose tweak mask vector
915	vmovl.u32 q15, d30
916	vshr.u64 d30, d31, #`7`
917	vmov q12, q14
918
919	adr ip, `0f`
920	and r4, r6, #`7`
921	cmp r6, #`8`
922	sub ip, ip, r4, lsl #`5`
923	mov r4, sp
924	movlt pc, ip // computed goto if blocks < 8
925
926	vld1`.8` {q0}, [r1]!
927	next_tweak q12, q14, q15, q13
928	veor q0, q0, q14
929	vst1`.8` {q14}, [r4, :`128`]!
930
931	vld1`.8` {q1}, [r1]!
932	next_tweak q14, q12, q15, q13
933	veor q1, q1, q12
934	vst1`.8` {q12}, [r4, :`128`]!
935
936	vld1`.8` {q2}, [r1]!
937	next_tweak q12, q14, q15, q13
938	veor q2, q2, q14
939	vst1`.8` {q14}, [r4, :`128`]!
940
941	vld1`.8` {q3}, [r1]!
942	next_tweak q14, q12, q15, q13
943	veor q3, q3, q12
944	vst1`.8` {q12}, [r4, :`128`]!
945
946	vld1`.8` {q4}, [r1]!
947	next_tweak q12, q14, q15, q13
948	veor q4, q4, q14
949	vst1`.8` {q14}, [r4, :`128`]!
950
951	vld1`.8` {q5}, [r1]!
952	next_tweak q14, q12, q15, q13
953	veor q5, q5, q12
954	vst1`.8` {q12}, [r4, :`128`]!
955
956	vld1`.8` {q6}, [r1]!
957	next_tweak q12, q14, q15, q13
958	veor q6, q6, q14
959	vst1`.8` {q14}, [r4, :`128`]!
960
961	vld1`.8` {q7}, [r1]!
962	next_tweak q14, q12, q15, q13
963	THUMB( itt le )
964	W(cmple) r8, #`0`
965	ble `1f`
966	`0`: veor q7, q7, q12
967	vst1`.8` {q12}, [r4, :`128`]
968
969	vst1`.8` {q14}, [r7] // store next iv
970	bx lr
971
972	`1`: vswp q12, q14
973	b `0b`
974	ENDPROC(__xts_prepare8)
975
/*
 * __xts_crypt do8, o0..o7
 *
 * Expands to the body of an XTS entry point.  'do8' is the 8-way core
 * to call and o0..o7 name the registers in which it returns blocks
 * 0..7.  The entry point must set ip to the 'reorder final tweak' flag
 * before invoking the macro.
 *
 * In:  r0 = out, r1 = in, r2 = rk, r3 = rounds,
 *      [sp] = blocks, [sp, #4] = iv, ip = reorder flag
 *
 * Register use:
 *   r5  original sp, restored on exit
 *   r6  blocks remaining
 *   r7  iv pointer
 *   r8  1 - reorder flag, consumed by __xts_prepare8
 *   sp  lowered to a 16-byte aligned 128-byte buffer holding the eight
 *       tweaks of the current batch
 *
 * Per batch: __xts_prepare8 whitens the input and saves the tweaks,
 * do8 runs the cipher, then the tweaks are reloaded and XORed into the
 * output.  As elsewhere in this file, a final batch of fewer than
 * eight blocks enters the reload and xor/store sequences part-way via
 * computed gotos (4 bytes per reload, 8 bytes per veor/vst1 pair), so
 * nothing may be inserted or removed in those sequences.
 */
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	push		{r4-r8, lr}
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	rsb		r8, ip, #1
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes
	mov		sp, ip

99:	bl		__xts_prepare8

	mov		bskey, r2
	mov		rounds, r3
	bl		\do8

	adr		ip, 0f
	and		lr, r6, #7
	cmp		r6, #8
	sub		ip, ip, lr, lsl #2
	mov		r4, sp
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	// flags and lr are still those of the 'cmp'/'and' above
0:	adr		ip, 1f
	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8

	veor		\o0, \o0, q8
	vst1.8		{\o0}, [r0]!
	veor		\o1, \o1, q9
	vst1.8		{\o1}, [r0]!
	veor		\o2, \o2, q10
	vst1.8		{\o2}, [r0]!
	veor		\o3, \o3, q11
	vst1.8		{\o3}, [r0]!
	veor		\o4, \o4, q12
	vst1.8		{\o4}, [r0]!
	veor		\o5, \o5, q13
	vst1.8		{\o5}, [r0]!
	veor		\o6, \o6, q14
	vst1.8		{\o6}, [r0]!
	veor		\o7, \o7, q15
	vst1.8		{\o7}, [r0]!

1:	subs		r6, r6, #8
	bgt		99b

	mov		sp, r5			// drop the tweak buffer
	pop		{r4-r8, pc}
	.endm
1034
/*
 * aesbs_xts_encrypt - XTS encryption entry point.  The
 * reorder_last_tweak argument is not read here: the flag handed to
 * __xts_crypt is always 0.
 */
ENTRY(aesbs_xts_encrypt)
	mov		ip, #0			// never reorder final tweak
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)
1039
/*
 * aesbs_xts_decrypt - XTS decryption entry point.  The flag handed to
 * __xts_crypt is the reorder_last_tweak argument, read from
 * [sp, #8] (third stack argument) before anything is pushed.
 */
ENTRY(aesbs_xts_decrypt)
	ldr		ip, [sp, #8]		// reorder final tweak?
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)
1044

source code of linux/arch/arm/crypto/aes-neonbs-core.S