/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_FUNC_START(func)    SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func)      SYM_FUNC_END(neon_ ## func)

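        /*
         * Register aliases expected by the generic mode routines in
         * aes-modes.S: the XTS tweak mask, the CBC IV and the CTR
         * counter each live in a fixed NEON register here.
         */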
        xtsmask         .req    v7
        cbciv           .req    v7
        vctr            .req    v4

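        /*
         * Reloading the XTS tweak mask simply repeats the initial
         * load: this implementation keeps no spare copy of the mask.
         */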
        .macro          xts_reload_mask, tmp
        xts_load_mask   \tmp
        .endm

        /* special case for the neon-bs driver calling into this one for CTS */
        .macro          xts_cts_skip_tw, reg, lbl
        tbnz            \reg, #1, \lbl
        .endm

        /* multiply by polynomial 'x' in GF(2^8) */
        .macro          mul_by_x, out, in, temp, const
        sshr            \temp, \in, #7
        shl             \out, \in, #1
        and             \temp, \temp, \const
        eor             \out, \out, \temp
        .endm
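
        /*
         * Scalar equivalent of mul_by_x, as a sketch (not part of the
         * build), with the AES reduction polynomial
         * x^8 + x^4 + x^3 + x + 1, i.e. 0x11b:
         *
         *      u8 mul_by_x(u8 b)
         *      {
         *              return (b << 1) ^ ((b & 0x80) ? 0x1b : 0);
         *      }
         *
         * sshr #7 replicates each byte's top bit into a 0x00/0xff
         * mask, so ANDing with \const (0x1b, kept in v12) selects the
         * reduction term per byte.
         */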

        /* multiply by polynomial 'x^2' in GF(2^8) */
        .macro          mul_by_x2, out, in, temp, const
        ushr            \temp, \in, #6
        shl             \out, \in, #2
        pmul            \temp, \temp, \const
        eor             \out, \out, \temp
        .endm
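
        /*
         * Scalar sketch of mul_by_x2: the two bits shifted out at the
         * top are folded back in via x^8 = 0x1b and x^9 = 0x36
         * (mod 0x11b), which the pmul by 0x1b computes as a carryless
         * multiply of the two high bits:
         *
         *      u8 mul_by_x2(u8 b)
         *      {
         *              return (b << 2) ^ ((b & 0x80) ? 0x36 : 0)
         *                              ^ ((b & 0x40) ? 0x1b : 0);
         *      }
         */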

        /* preload the entire Sbox */
        .macro          prepare, sbox, shiftrows, temp
        movi            v12.16b, #0x1b
        ldr_l           q13, \shiftrows, \temp
        ldr_l           q14, .Lror32by8, \temp
        adr_l           \temp, \sbox
        ld1             {v16.16b-v19.16b}, [\temp], #64
        ld1             {v20.16b-v23.16b}, [\temp], #64
        ld1             {v24.16b-v27.16b}, [\temp], #64
        ld1             {v28.16b-v31.16b}, [\temp]
        .endm
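
        /*
         * Fixed register map set up above and relied on throughout:
         * v12 holds the 0x1b reduction constant, v13 the ShiftRows
         * permutation, v14 the ror-32-by-8 permutation, and v16-v31
         * the full 256-byte (inverse) S-box for tbl/tbx lookups.
         */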

        /* do preload for encryption */
        .macro          enc_prepare, ignore0, ignore1, temp
        prepare         crypto_aes_sbox, .LForward_ShiftRows, \temp
        .endm

        .macro          enc_switch_key, ignore0, ignore1, temp
        /* do nothing */
        .endm

        /* do preload for decryption */
        .macro          dec_prepare, ignore0, ignore1, temp
        prepare         crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
        .endm

        /* apply SubBytes transformation using the preloaded Sbox */
        .macro          sub_bytes, in
        sub             v9.16b, \in\().16b, v15.16b
        tbl             \in\().16b, {v16.16b-v19.16b}, \in\().16b
        sub             v10.16b, v9.16b, v15.16b
        tbx             \in\().16b, {v20.16b-v23.16b}, v9.16b
        sub             v11.16b, v10.16b, v15.16b
        tbx             \in\().16b, {v24.16b-v27.16b}, v10.16b
        tbx             \in\().16b, {v28.16b-v31.16b}, v11.16b
        .endm
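
        /*
         * How the 256-byte lookup works: a 4-register tbl only covers
         * indices 0-63 and writes zero for anything larger, while tbx
         * leaves out-of-range destination bytes untouched.  Repeatedly
         * subtracting 0x40 (v15, preset by do_block) shifts the next
         * 64 entries into range, so the tbl plus three tbx steps
         * together compute out[i] = sbox[in[i]] for every byte.
         */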

        /* apply MixColumns transformation */
        .macro          mix_columns, in, enc
        .if             \enc == 0
        /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
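        /*
         * This works because the circulant matrices factor as
         * circ(0xe,0xb,0xd,0x9) = circ(2,3,1,1) x circ(5,0,4,0):
         * multiplying each column by {5,0,4,0} first, computed below
         * as in ^ 4.in ^ rot16(4.in), turns the forward MixColumns
         * that follows into the inverse transformation.
         */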
        mul_by_x2       v8.16b, \in\().16b, v9.16b, v12.16b
        eor             \in\().16b, \in\().16b, v8.16b
        rev32           v8.8h, v8.8h
        eor             \in\().16b, \in\().16b, v8.16b
        .endif

        mul_by_x        v9.16b, \in\().16b, v8.16b, v12.16b
        rev32           v8.8h, \in\().8h
        eor             v8.16b, v8.16b, v9.16b
        eor             \in\().16b, \in\().16b, v8.16b
        tbl             \in\().16b, {\in\().16b}, v14.16b
        eor             \in\().16b, \in\().16b, v8.16b
        .endm
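
        /*
         * With column bytes b0..b3, MixColumns is
         * out_i = 2.b_i ^ 3.b_(i+1) ^ b_(i+2) ^ b_(i+3); the sequence
         * above evaluates it as
         * ror8(in ^ rot16(in) ^ 2.in) ^ rot16(in) ^ 2.in, using rev32
         * on .8h lanes for the rotate by 16 and the .Lror32by8 table
         * in v14 for the byte rotate within each 32-bit word.
         */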

        .macro          do_block, enc, in, rounds, rk, rkp, i
        ld1             {v15.4s}, [\rk]
        add             \rkp, \rk, #16
        mov             \i, \rounds
1111:   eor             \in\().16b, \in\().16b, v15.16b /* ^round key */
        movi            v15.16b, #0x40
        tbl             \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
        sub_bytes       \in
        subs            \i, \i, #1
        ld1             {v15.4s}, [\rkp], #16
        beq             2222f
        mix_columns     \in, \enc
        b               1111b
2222:   eor             \in\().16b, \in\().16b, v15.16b /* ^round key */
        .endm
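
        /*
         * v15 does double duty per iteration: it enters the loop top
         * holding the round key for the eor, is recycled as the 0x40
         * index offset that sub_bytes needs, and is then reloaded
         * with the next round key.  The beq skips MixColumns in the
         * final round, as the AES specification requires.
         */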

        .macro          encrypt_block, in, rounds, rk, rkp, i
        do_block        1, \in, \rounds, \rk, \rkp, \i
        .endm

        .macro          decrypt_block, in, rounds, rk, rkp, i
        do_block        0, \in, \rounds, \rk, \rkp, \i
        .endm

        /*
         * Interleaved versions: functionally equivalent to the ones
         * above, but applied to four AES states in parallel, so the
         * independent instruction streams can hide the latency of the
         * long tbl/tbx dependency chains.
         */

        .macro          sub_bytes_4x, in0, in1, in2, in3
        sub             v8.16b, \in0\().16b, v15.16b
        tbl             \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
        sub             v9.16b, \in1\().16b, v15.16b
        tbl             \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
        sub             v10.16b, \in2\().16b, v15.16b
        tbl             \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
        sub             v11.16b, \in3\().16b, v15.16b
        tbl             \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
        tbx             \in0\().16b, {v20.16b-v23.16b}, v8.16b
        tbx             \in1\().16b, {v20.16b-v23.16b}, v9.16b
        sub             v8.16b, v8.16b, v15.16b
        tbx             \in2\().16b, {v20.16b-v23.16b}, v10.16b
        sub             v9.16b, v9.16b, v15.16b
        tbx             \in3\().16b, {v20.16b-v23.16b}, v11.16b
        sub             v10.16b, v10.16b, v15.16b
        tbx             \in0\().16b, {v24.16b-v27.16b}, v8.16b
        sub             v11.16b, v11.16b, v15.16b
        tbx             \in1\().16b, {v24.16b-v27.16b}, v9.16b
        sub             v8.16b, v8.16b, v15.16b
        tbx             \in2\().16b, {v24.16b-v27.16b}, v10.16b
        sub             v9.16b, v9.16b, v15.16b
        tbx             \in3\().16b, {v24.16b-v27.16b}, v11.16b
        sub             v10.16b, v10.16b, v15.16b
        tbx             \in0\().16b, {v28.16b-v31.16b}, v8.16b
        sub             v11.16b, v11.16b, v15.16b
        tbx             \in1\().16b, {v28.16b-v31.16b}, v9.16b
        tbx             \in2\().16b, {v28.16b-v31.16b}, v10.16b
        tbx             \in3\().16b, {v28.16b-v31.16b}, v11.16b
        .endm
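
        /*
         * Same staged lookup as sub_bytes, with the index adjustments
         * (sub) interleaved between the tbx steps of the other three
         * states so that no tbx immediately depends on the
         * instruction right before it.
         */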

        .macro          mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
        sshr            \tmp0\().16b, \in0\().16b, #7
        shl             \out0\().16b, \in0\().16b, #1
        sshr            \tmp1\().16b, \in1\().16b, #7
        and             \tmp0\().16b, \tmp0\().16b, \const\().16b
        shl             \out1\().16b, \in1\().16b, #1
        and             \tmp1\().16b, \tmp1\().16b, \const\().16b
        eor             \out0\().16b, \out0\().16b, \tmp0\().16b
        eor             \out1\().16b, \out1\().16b, \tmp1\().16b
        .endm

        .macro          mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
        ushr            \tmp0\().16b, \in0\().16b, #6
        shl             \out0\().16b, \in0\().16b, #2
        ushr            \tmp1\().16b, \in1\().16b, #6
        pmul            \tmp0\().16b, \tmp0\().16b, \const\().16b
        shl             \out1\().16b, \in1\().16b, #2
        pmul            \tmp1\().16b, \tmp1\().16b, \const\().16b
        eor             \out0\().16b, \out0\().16b, \tmp0\().16b
        eor             \out1\().16b, \out1\().16b, \tmp1\().16b
        .endm

        .macro          mix_columns_2x, in0, in1, enc
        .if             \enc == 0
        /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
        mul_by_x2_2x    v8, v9, \in0, \in1, v10, v11, v12
        eor             \in0\().16b, \in0\().16b, v8.16b
        rev32           v8.8h, v8.8h
        eor             \in1\().16b, \in1\().16b, v9.16b
        rev32           v9.8h, v9.8h
        eor             \in0\().16b, \in0\().16b, v8.16b
        eor             \in1\().16b, \in1\().16b, v9.16b
        .endif

        mul_by_x_2x     v8, v9, \in0, \in1, v10, v11, v12
        rev32           v10.8h, \in0\().8h
        rev32           v11.8h, \in1\().8h
        eor             v10.16b, v10.16b, v8.16b
        eor             v11.16b, v11.16b, v9.16b
        eor             \in0\().16b, \in0\().16b, v10.16b
        eor             \in1\().16b, \in1\().16b, v11.16b
        tbl             \in0\().16b, {\in0\().16b}, v14.16b
        tbl             \in1\().16b, {\in1\().16b}, v14.16b
        eor             \in0\().16b, \in0\().16b, v10.16b
        eor             \in1\().16b, \in1\().16b, v11.16b
        .endm

        .macro          do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
        ld1             {v15.4s}, [\rk]
        add             \rkp, \rk, #16
        mov             \i, \rounds
1111:   eor             \in0\().16b, \in0\().16b, v15.16b /* ^round key */
        eor             \in1\().16b, \in1\().16b, v15.16b /* ^round key */
        eor             \in2\().16b, \in2\().16b, v15.16b /* ^round key */
        eor             \in3\().16b, \in3\().16b, v15.16b /* ^round key */
        movi            v15.16b, #0x40
        tbl             \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
        tbl             \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
        tbl             \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
        tbl             \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
        sub_bytes_4x    \in0, \in1, \in2, \in3
        subs            \i, \i, #1
        ld1             {v15.4s}, [\rkp], #16
        beq             2222f
        mix_columns_2x  \in0, \in1, \enc
        mix_columns_2x  \in2, \in3, \enc
        b               1111b
2222:   eor             \in0\().16b, \in0\().16b, v15.16b /* ^round key */
        eor             \in1\().16b, \in1\().16b, v15.16b /* ^round key */
        eor             \in2\().16b, \in2\().16b, v15.16b /* ^round key */
        eor             \in3\().16b, \in3\().16b, v15.16b /* ^round key */
        .endm

        .macro          encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
        do_block_4x     1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
        .endm

        .macro          decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
        do_block_4x     0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
        .endm

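/*
 * aes-modes.S expands the macros above into the actual mode routines
 * (ECB/CBC/CTR/XTS and friends); the AES_FUNC_START define at the top
 * gives the resulting functions their neon_ prefix.
 */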
#include "aes-modes.S"

        .section        ".rodata", "a"
        .align          4
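        /*
         * tbl permutation vectors: the ShiftRows / InvShiftRows byte
         * order for the column-major AES state, and the rotate of
         * each 32-bit word right by 8 bits used by mix_columns.
         */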
.LForward_ShiftRows:
        .octa           0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
        .octa           0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
        .octa           0x0c0f0e0d080b0a090407060500030201
