1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions |
4 | * |
5 | * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> |
6 | */ |
7 | |
8 | #include <linux/linkage.h> |
9 | #include <asm/assembler.h> |
10 | |
11 | .text |
12 | .arch armv8-a |
13 | .fpu crypto-neon-fp-armv8 |
14 | .align 3 |
15 | |
	/*
	 * One full AES encryption round on \state using round key \key:
	 * aese = AddRoundKey + SubBytes + ShiftRows, aesmc = MixColumns.
	 */
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm
20 | |
	/*
	 * One full AES decryption round on \state using round key \key:
	 * aesd = AddRoundKey + InvSubBytes + InvShiftRows, aesimc = InvMixColumns.
	 */
	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm
25 | |
	/* Two consecutive encryption rounds on q0 with keys \key1 and \key2 */
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm
30 | |
	/* Two consecutive decryption rounds on q0 with keys \key1 and \key2 */
	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm
35 | |
	/*
	 * Final two encryption rounds on q0: a full round with \key1, the
	 * last round (aese without MixColumns) with \key2, then xor in the
	 * final round key \key3.
	 */
	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm
41 | |
	/*
	 * Final two decryption rounds on q0: a full round with \key1, the
	 * last round (aesd without InvMixColumns) with \key2, then xor in
	 * the final round key \key3.
	 */
	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm
47 | |
	/*
	 * 4-way interleaved version of enc_dround: two encryption rounds
	 * applied to four independent blocks in q0-q3, interleaved to keep
	 * the AES pipeline busy.
	 */
	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm
58 | |
	/*
	 * 4-way interleaved version of dec_dround: two decryption rounds
	 * applied to four independent blocks in q0-q3.
	 */
	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm
69 | |
	/*
	 * 4-way interleaved version of enc_fround: final two encryption
	 * rounds plus last-key xor applied to four blocks in q0-q3.
	 */
	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm
84 | |
	/*
	 * 4-way interleaved version of dec_fround: final two decryption
	 * rounds plus last-key xor applied to four blocks in q0-q3.
	 */
	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm
99 | |
	/*
	 * Run a complete AES transform on the state in q0 (q0-q3 for the
	 * _4x flavours), dispatching on the round count in r3 (10/12/14
	 * for AES-128/192/256). On entry, q8/q9 hold the first two round
	 * keys (consumed by the first \dround), ip points at the 3rd
	 * round key, and q14 holds the final round key. Round key loads
	 * are interleaved with the rounds to hide the load latency.
	 */
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!	@ round keys 3-4
	\dround		q8, q9			@ rounds 1-2
	vld1.32		{q12-q13}, [ip]!	@ round keys 5-6
	\dround		q10, q11		@ rounds 3-4
	vld1.32		{q10-q11}, [ip]!	@ round keys 7-8
	\dround		q12, q13		@ rounds 5-6
	vld1.32		{q12-q13}, [ip]!	@ round keys 9-10
	\dround		q10, q11		@ rounds 7-8
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!	@ round keys 11-12
	\dround		q12, q13		@ rounds 9-10
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]		@ round keys 13-14
	\dround		q10, q11		@ rounds 11-12
0:	\fround		q12, q13, q14		@ final two rounds + last key
	bx		lr

1:	\fround		q10, q11, q14		@ final two rounds + last key
	bx		lr
	.endm
122 | |
123 | /* |
124 | * Internal, non-AAPCS compliant functions that implement the core AES |
 * transforms. These should preserve all registers except q0 - q3 and ip
126 | * Arguments: |
127 | * q0 : first in/output block |
128 | * q1 : second in/output block (_4x version only) |
129 | * q2 : third in/output block (_4x version only) |
130 | * q3 : fourth in/output block (_4x version only) |
131 | * q8 : first round key |
 * q9 : second round key
133 | * q14 : final round key |
134 | * r2 : address of round key array |
135 | * r3 : number of rounds |
136 | */ |
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:				@ entered directly (ip preset) by
						@ ce_aes_xts_init to encrypt the tweak
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)
143 | |
	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)
149 | |
	.align		6
aes_encrypt_4x:					@ encrypt 4 blocks (q0-q3) at once
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)
155 | |
	.align		6
aes_decrypt_4x:					@ decrypt 4 blocks (q0-q3) at once
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)
161 | |
	/*
	 * Set up the fixed round key registers used by the internal aes_*
	 * routines: q8/q9 = first two round keys, q14 = last round key
	 * (at \rk + \rounds * 16). Clobbers ip; the aes_* entry points
	 * re-point ip at the 3rd round key themselves.
	 */
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm
167 | |
/*
 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks)
 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks)
 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg)
	prepare_key	r2, r3
.Lecbencloop4x:					@ 4 blocks per iteration
	subs		r4, r4, #4
	bmi		.Lecbenc1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4		@ undo bias: 0..3 blocks remain
	beq		.Lecbencout
.Lecbencloop:					@ single-block tail loop
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)
199 | |
ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg)
	prepare_key	r2, r3
.Lecbdecloop4x:					@ 4 blocks per iteration
	subs		r4, r4, #4
	bmi		.Lecbdec1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4		@ undo bias: 0..3 blocks remain
	beq		.Lecbdecout
.Lecbdecloop:					@ single-block tail loop
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)
225 | |
/*
 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 iv[])
 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 iv[])
 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q0}, [r5]		@ q0 carries the chaining value
	prepare_key	r2, r3
.Lcbcencloop:					@ CBC encryption is inherently serial
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ return final ct block as next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
247 | |
ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:					@ 4 blocks per iteration
	subs		r4, r4, #4
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0			@ stash the ciphertext: each block
	vmov		q5, q1			@ is the chaining value of the next
	vmov		q6, q2
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15		@ xor each result with the previous
	veor		q1, q1, q4		@ ct block (or the iv)
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7			@ last ct block becomes the next iv
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4		@ undo bias: 0..3 blocks remain
	beq		.Lcbcdecout
	vmov		q6, q14			@ preserve last round key
.Lcbcdecloop:
	@ Fold the CBC xor into the final round of aes_decrypt for free:
	@ the last round xors q14, so set q14 = prev_ct ^ last_round_key.
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ return next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
287 | |
288 | |
289 | /* |
290 | * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], |
291 | * int rounds, int bytes, u8 const iv[]) |
292 | * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], |
293 | * int rounds, int bytes, u8 const iv[]) |
294 | */ |
295 | |
ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = bytes, r5 = iv

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16		@ r4 = size of final partial block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]		@ q5/q6 = vtbl permute vectors
	vld1.8		{q6}, [lr]		@ ... selected by the tail size

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q3}, [ip]		@ q3 = final (partial) pt block

	vld1.8		{q1}, [r5]		@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1		@ xor with iv
	bl		aes_encrypt		@ q0 = ct of last full block

	vtbl.8		d4, {d0-d1}, d10	@ move the full ct block into the
	vtbl.8		d5, {d0-d1}, d11	@ ... stolen output position (q2)
	vtbl.8		d2, {d6-d7}, d12	@ align the partial pt block for
	vtbl.8		d3, {d6-d7}, d13	@ ... chaining (q1)

	veor		q0, q0, q1		@ CBC-chain into the final block
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)
333 | |
ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = bytes, r5 = iv

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16		@ r4 = size of final partial block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]		@ q5/q6 = vtbl permute vectors
	vld1.8		{q6}, [lr]		@ ... selected by the tail size

	add		ip, r1, r4
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q1}, [ip]		@ q1 = final (partial) ct block

	vld1.8		{q3}, [r5]		@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt		@ decrypt next-to-last ct block

	vtbl.8		d4, {d0-d1}, d10	@ q2 = bytes to steal from the
	vtbl.8		d5, {d0-d1}, d11	@ ... decrypted block
	vtbx.8		d0, {d2-d3}, d12	@ merge the partial ct block
	vtbx.8		d1, {d2-d3}, d13	@ ... over q0's stolen bytes

	veor		q1, q1, q2		@ recover the final pt bytes
	bl		aes_decrypt		@ decrypt reassembled last block
	veor		q0, q0, q3		@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]		@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)
371 | |
372 | |
/*
 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 ctr[])
 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop		@ yes: use 1x path that handles carry
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x

	/*
	 * NOTE: the sequence below has been carefully tweaked to avoid
	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
	 * may produce an incorrect result if they take their input from a
	 * register of which a single 32-bit lane has been updated the last
	 * time it was modified. To work around this, the lanes of registers
	 * q0-q3 below are not manipulated individually, and the different
	 * counter values are prepared by successive manipulations of q7.
	 */
	add		ip, r6, #1
	vmov		q0, q7
	rev		ip, ip
	add		lr, r6, #2
	vmov		s31, ip			@ set lane 3 of q1 via q7
	add		ip, r6, #3
	rev		lr, lr
	vmov		q1, q7
	vmov		s31, lr			@ set lane 3 of q2 via q7
	rev		ip, ip
	vmov		q2, q7
	vmov		s31, ip			@ set lane 3 of q3 via q7
	add		r6, r6, #4
	vmov		q3, q7

	vld1.8		{q4-q5}, [r1]!		@ load 4 pt blocks into q4-q6,q15
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!
	bl		aes_encrypt_4x		@ encrypt the 4 counter blocks
	veor		q0, q0, q4		@ xor keystream with plaintext
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip			@ update ctr word in q7 (whole-reg
	b		.Lctrloop4x		@ copies above avoid the erratum)
.Lctr1x:
	adds		r4, r4, #4		@ undo bias: 0..3 blocks remain
	beq		.Lctrout
.Lctrloop:					@ single-block path, carry-safe
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip			@ vmov/rev leave flags intact, so
	bcs		.Lctrcarry		@ C here is still from the adds

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:					@ ripple carry into upper ctr words
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone		@ stop once no further carry
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)
467 | |
/*
 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		      int bytes, u8 iv[], u32 const rk2[], int first)
 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		      int bytes, u8 iv[], u32 const rk2[], int first)
 */
474 | |
	/*
	 * Compute the next XTS tweak: \out = \in * x in GF(2^128) with the
	 * reduction polynomial x^128 + x^7 + x^2 + x + 1. \const holds the
	 * mask vector composed by ce_aes_xts_init; \tmp is clobbered.
	 */
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63		@ broadcast MSB of each half
	vand		\tmp, \tmp, \const	@ select carry / 0x87 reduction
	vadd.u64	\out, \in, \in		@ shift both halves left by one
	vext.8		\tmp, \tmp, \tmp, #8	@ swap halves: carry crosses over
	veor		\out, \out, \tmp
	.endm
482 | |
	/*
	 * Shared prologue for the XTS routines: compose the tweak mask
	 * vector in q15, load r4 = bytes, r5 = iv and r6 = 'first' from
	 * the stack, and load the iv into q0. When first == 1 (start of a
	 * new message), the iv is encrypted with the second AES key (rk2)
	 * to produce the initial tweak, via a tail call into aes_encrypt.
	 */
ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7		@ q15 = { 0x1, 0x87 } 64-bit lanes

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
501 | |
ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0			@ q4 = current tweak

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x		@ if so, use the tweak as is

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x		@ fewer than 4 full blocks left
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10	@ derive tweaks for blocks 2-4
	veor		q0, q0, q4		@ pre-whiten with the tweaks
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4		@ post-whiten with the same tweaks
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64		@ undo bias: 0..63 bytes remain
	beq		.Lxtsencout
	subs		r4, r4, #16
	bmi		.LxtsencctsNx		@ partial tail right after 4x batch
.Lxtsencloop:					@ single-block loop
	vld1.8		{q0}, [r1]!
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts		@ partial final block: do CTS
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]		@ return next tweak
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3			@ reprocess last ct block of the
	sub		r0, r0, #16		@ 4x batch for ciphertext stealing
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]		@ q2/q3 = vtbl permute vectors
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4		@ steal ct bytes for the tail
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6		@ merge partial pt over q0
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsencctsout		@ encrypt reassembled final block
ENDPROC(ce_aes_xts_encrypt)
588 | |
589 | |
ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0			@ q4 = current tweak

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf		@ CTS needs the last full block
	subne		r4, r4, #0x10		@ handled out of line (below)

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x		@ if so, use the tweak as is

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x		@ fewer than 4 full blocks left
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10	@ derive tweaks for blocks 2-4
	veor		q0, q0, q4		@ pre-whiten with the tweaks
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4		@ post-whiten with the same tweaks
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64		@ undo bias: 0..63 bytes remain
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:					@ single-block loop
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts		@ partial final block: do CTS
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]		@ return next tweak
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6		@ q5 = tweak of the stolen block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]		@ q2/q3 = vtbl permute vectors
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5		@ decrypt last full ct block with
	bl		aes_decrypt		@ ... the next tweak (CTS order)
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4		@ steal pt bytes for the tail
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6		@ merge partial ct over q0
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsdecctsout		@ decrypt reassembled final block
ENDPROC(ce_aes_xts_decrypt)
680 | |
681 | /* |
682 | * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the |
683 | * AES sbox substitution on each byte in |
684 | * 'input' |
685 | */ |
ENTRY(ce_aes_sub)
	vdup.32		q1, r0			@ replicate input across all lanes
	veor		q0, q0, q0		@ zero state: aese xors key first,
	aese.8		q0, q1			@ so this yields SubBytes(input);
						@ ShiftRows is harmless here since
						@ all four words are identical
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)
693 | |
694 | /* |
695 | * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns |
696 | * operation on round key *src |
697 | */ |
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]		@ load round key from *src
	aesimc.8	q0, q0			@ apply InvMixColumns
	vst1.32		{q0}, [r0]		@ store result to *dst
	bx		lr
ENDPROC(ce_aes_invert)
704 | |
	/*
	 * Permute table for ciphertext stealing: 16 bytes of 0xff, the
	 * identity permutation 0..15, then 16 more bytes of 0xff. Indexing
	 * into it by the tail size yields vtbl/vtbx index vectors that
	 * shift a partial block into place (out-of-range 0xff indices
	 * produce zero / leave the destination untouched).
	 */
	.section	".rodata" , "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
714 | |