/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_FUNC_START(func)    SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func)      SYM_FUNC_END(neon_ ## func)

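        /*
         * Register aliases expected by the generic mode routines in
         * aes-modes.S: the XTS tweak mask, the CBC IV and the CTR
         * counter each live in a fixed NEON register here.
         */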
        xtsmask         .req    v7
        cbciv           .req    v7
        vctr            .req    v4

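        /*
         * Reloading the XTS tweak mask simply repeats the initial
         * load: this implementation keeps no spare copy of the mask.
         */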
        .macro          xts_reload_mask, tmp
        xts_load_mask   \tmp
        .endm

        /* special case for the neon-bs driver calling into this one for CTS */
        .macro          xts_cts_skip_tw, reg, lbl
        tbnz            \reg, #1, \lbl
        .endm

        /* multiply by polynomial 'x' in GF(2^8) */
        .macro          mul_by_x, out, in, temp, const
        sshr            \temp, \in, #7
        shl             \out, \in, #1
        and             \temp, \temp, \const
        eor             \out, \out, \temp
        .endm
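
        /*
         * Scalar equivalent of mul_by_x, as a sketch (not part of the
         * build), with the AES reduction polynomial
         * x^8 + x^4 + x^3 + x + 1, i.e. 0x11b:
         *
         *      u8 mul_by_x(u8 b)
         *      {
         *              return (b << 1) ^ ((b & 0x80) ? 0x1b : 0);
         *      }
         *
         * sshr #7 replicates each byte's top bit into a 0x00/0xff
         * mask, so ANDing with \const (0x1b, kept in v12) selects the
         * reduction term per byte.
         */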

        /* multiply by polynomial 'x^2' in GF(2^8) */
        .macro          mul_by_x2, out, in, temp, const
        ushr            \temp, \in, #6
        shl             \out, \in, #2
        pmul            \temp, \temp, \const
        eor             \out, \out, \temp
        .endm
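
        /*
         * Scalar sketch of mul_by_x2: the two bits shifted out at the
         * top are folded back in via x^8 = 0x1b and x^9 = 0x36
         * (mod 0x11b), which the pmul by 0x1b computes as a carryless
         * multiply of the two high bits:
         *
         *      u8 mul_by_x2(u8 b)
         *      {
         *              return (b << 2) ^ ((b & 0x80) ? 0x36 : 0)
         *                              ^ ((b & 0x40) ? 0x1b : 0);
         *      }
         */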

        /* preload the entire Sbox */
        .macro          prepare, sbox, shiftrows, temp
        movi            v12.16b, #0x1b
        ldr_l           q13, \shiftrows, \temp
        ldr_l           q14, .Lror32by8, \temp
        adr_l           \temp, \sbox
        ld1             {v16.16b-v19.16b}, [\temp], #64
        ld1             {v20.16b-v23.16b}, [\temp], #64
        ld1             {v24.16b-v27.16b}, [\temp], #64
        ld1             {v28.16b-v31.16b}, [\temp]
        .endm
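
        /*
         * Fixed register map set up above and relied on throughout:
         * v12 holds the 0x1b reduction constant, v13 the ShiftRows
         * permutation, v14 the ror-32-by-8 permutation, and v16-v31
         * the full 256-byte (inverse) S-box for tbl/tbx lookups.
         */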

        /* do preload for encryption */
        .macro          enc_prepare, ignore0, ignore1, temp
        prepare         crypto_aes_sbox, .LForward_ShiftRows, \temp
        .endm

        .macro          enc_switch_key, ignore0, ignore1, temp
        /* do nothing */
        .endm

        /* do preload for decryption */
        .macro          dec_prepare, ignore0, ignore1, temp
        prepare         crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
        .endm

        /* apply SubBytes transformation using the preloaded Sbox */
        .macro          sub_bytes, in
        sub             v9.16b, \in\().16b, v15.16b
        tbl             \in\().16b, {v16.16b-v19.16b}, \in\().16b
        sub             v10.16b, v9.16b, v15.16b
        tbx             \in\().16b, {v20.16b-v23.16b}, v9.16b
        sub             v11.16b, v10.16b, v15.16b
        tbx             \in\().16b, {v24.16b-v27.16b}, v10.16b
        tbx             \in\().16b, {v28.16b-v31.16b}, v11.16b
        .endm
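
        /*
         * How the 256-byte lookup works: a 4-register tbl only covers
         * indices 0-63 and writes zero for anything larger, while tbx
         * leaves out-of-range destination bytes untouched.  Repeatedly
         * subtracting 0x40 (v15, preset by do_block) shifts the next
         * 64 entries into range, so the tbl plus three tbx steps
         * together compute out[i] = sbox[in[i]] for every byte.
         */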

        /* apply MixColumns transformation */
        .macro          mix_columns, in, enc
        .if             \enc == 0
        /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
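        /*
         * This works because the circulant matrices factor as
         * circ(0xe,0xb,0xd,0x9) = circ(2,3,1,1) x circ(5,0,4,0):
         * multiplying each column by {5,0,4,0} first, computed below
         * as in ^ 4.in ^ rot16(4.in), turns the forward MixColumns
         * that follows into the inverse transformation.
         */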
        mul_by_x2       v8.16b, \in\().16b, v9.16b, v12.16b
        eor             \in\().16b, \in\().16b, v8.16b
        rev32           v8.8h, v8.8h
        eor             \in\().16b, \in\().16b, v8.16b
        .endif

        mul_by_x        v9.16b, \in\().16b, v8.16b, v12.16b
        rev32           v8.8h, \in\().8h
        eor             v8.16b, v8.16b, v9.16b
        eor             \in\().16b, \in\().16b, v8.16b
        tbl             \in\().16b, {\in\().16b}, v14.16b
        eor             \in\().16b, \in\().16b, v8.16b
        .endm
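
        /*
         * With column bytes b0..b3, MixColumns is
         * out_i = 2.b_i ^ 3.b_(i+1) ^ b_(i+2) ^ b_(i+3); the sequence
         * above evaluates it as
         * ror8(in ^ rot16(in) ^ 2.in) ^ rot16(in) ^ 2.in, using rev32
         * on .8h lanes for the rotate by 16 and the .Lror32by8 table
         * in v14 for the byte rotate within each 32-bit word.
         */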

        .macro          do_block, enc, in, rounds, rk, rkp, i
        ld1             {v15.4s}, [\rk]
        add             \rkp, \rk, #16
        mov             \i, \rounds
1111:   eor             \in\().16b, \in\().16b, v15.16b /* ^round key */
        movi            v15.16b, #0x40
        tbl             \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
        sub_bytes       \in
        subs            \i, \i, #1
        ld1             {v15.4s}, [\rkp], #16
        beq             2222f
        mix_columns     \in, \enc
        b               1111b
2222:   eor             \in\().16b, \in\().16b, v15.16b /* ^round key */
        .endm
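
        /*
         * v15 does double duty per iteration: it enters the loop top
         * holding the round key for the eor, is recycled as the 0x40
         * index offset that sub_bytes needs, and is then reloaded
         * with the next round key.  The beq skips MixColumns in the
         * final round, as the AES specification requires.
         */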

        .macro          encrypt_block, in, rounds, rk, rkp, i
        do_block        1, \in, \rounds, \rk, \rkp, \i
        .endm

        .macro          decrypt_block, in, rounds, rk, rkp, i
        do_block        0, \in, \rounds, \rk, \rkp, \i
        .endm

        /*
         * Interleaved versions: functionally equivalent to the ones
         * above, but applied to four AES states in parallel, so the
         * independent instruction streams can hide the latency of the
         * long tbl/tbx dependency chains.
         */

        .macro          sub_bytes_4x, in0, in1, in2, in3
        sub             v8.16b, \in0\().16b, v15.16b
        tbl             \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
        sub             v9.16b, \in1\().16b, v15.16b
        tbl             \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
        sub             v10.16b, \in2\().16b, v15.16b
        tbl             \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
        sub             v11.16b, \in3\().16b, v15.16b
        tbl             \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
        tbx             \in0\().16b, {v20.16b-v23.16b}, v8.16b
        tbx             \in1\().16b, {v20.16b-v23.16b}, v9.16b
        sub             v8.16b, v8.16b, v15.16b
        tbx             \in2\().16b, {v20.16b-v23.16b}, v10.16b
        sub             v9.16b, v9.16b, v15.16b
        tbx             \in3\().16b, {v20.16b-v23.16b}, v11.16b
        sub             v10.16b, v10.16b, v15.16b
        tbx             \in0\().16b, {v24.16b-v27.16b}, v8.16b
        sub             v11.16b, v11.16b, v15.16b
        tbx             \in1\().16b, {v24.16b-v27.16b}, v9.16b
        sub             v8.16b, v8.16b, v15.16b
        tbx             \in2\().16b, {v24.16b-v27.16b}, v10.16b
        sub             v9.16b, v9.16b, v15.16b
        tbx             \in3\().16b, {v24.16b-v27.16b}, v11.16b
        sub             v10.16b, v10.16b, v15.16b
        tbx             \in0\().16b, {v28.16b-v31.16b}, v8.16b
        sub             v11.16b, v11.16b, v15.16b
        tbx             \in1\().16b, {v28.16b-v31.16b}, v9.16b
        tbx             \in2\().16b, {v28.16b-v31.16b}, v10.16b
        tbx             \in3\().16b, {v28.16b-v31.16b}, v11.16b
        .endm
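
        /*
         * Same staged lookup as sub_bytes, with the index adjustments
         * (sub) interleaved between the tbx steps of the other three
         * states so that no tbx immediately depends on the
         * instruction right before it.
         */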

        .macro          mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
        sshr            \tmp0\().16b, \in0\().16b, #7
        shl             \out0\().16b, \in0\().16b, #1
        sshr            \tmp1\().16b, \in1\().16b, #7
        and             \tmp0\().16b, \tmp0\().16b, \const\().16b
        shl             \out1\().16b, \in1\().16b, #1
        and             \tmp1\().16b, \tmp1\().16b, \const\().16b
        eor             \out0\().16b, \out0\().16b, \tmp0\().16b
        eor             \out1\().16b, \out1\().16b, \tmp1\().16b
        .endm

        .macro          mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
        ushr            \tmp0\().16b, \in0\().16b, #6
        shl             \out0\().16b, \in0\().16b, #2
        ushr            \tmp1\().16b, \in1\().16b, #6
        pmul            \tmp0\().16b, \tmp0\().16b, \const\().16b
        shl             \out1\().16b, \in1\().16b, #2
        pmul            \tmp1\().16b, \tmp1\().16b, \const\().16b
        eor             \out0\().16b, \out0\().16b, \tmp0\().16b
        eor             \out1\().16b, \out1\().16b, \tmp1\().16b
        .endm

        .macro          mix_columns_2x, in0, in1, enc
        .if             \enc == 0
        /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
        mul_by_x2_2x    v8, v9, \in0, \in1, v10, v11, v12
        eor             \in0\().16b, \in0\().16b, v8.16b
        rev32           v8.8h, v8.8h
        eor             \in1\().16b, \in1\().16b, v9.16b
        rev32           v9.8h, v9.8h
        eor             \in0\().16b, \in0\().16b, v8.16b
        eor             \in1\().16b, \in1\().16b, v9.16b
        .endif

        mul_by_x_2x     v8, v9, \in0, \in1, v10, v11, v12
        rev32           v10.8h, \in0\().8h
        rev32           v11.8h, \in1\().8h
        eor             v10.16b, v10.16b, v8.16b
        eor             v11.16b, v11.16b, v9.16b
        eor             \in0\().16b, \in0\().16b, v10.16b
        eor             \in1\().16b, \in1\().16b, v11.16b
        tbl             \in0\().16b, {\in0\().16b}, v14.16b
        tbl             \in1\().16b, {\in1\().16b}, v14.16b
        eor             \in0\().16b, \in0\().16b, v10.16b
        eor             \in1\().16b, \in1\().16b, v11.16b
        .endm

        .macro          do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
        ld1             {v15.4s}, [\rk]
        add             \rkp, \rk, #16
        mov             \i, \rounds
1111:   eor             \in0\().16b, \in0\().16b, v15.16b /* ^round key */
        eor             \in1\().16b, \in1\().16b, v15.16b /* ^round key */
        eor             \in2\().16b, \in2\().16b, v15.16b /* ^round key */
        eor             \in3\().16b, \in3\().16b, v15.16b /* ^round key */
        movi            v15.16b, #0x40
        tbl             \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
        tbl             \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
        tbl             \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
        tbl             \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
        sub_bytes_4x    \in0, \in1, \in2, \in3
        subs            \i, \i, #1
        ld1             {v15.4s}, [\rkp], #16
        beq             2222f
        mix_columns_2x  \in0, \in1, \enc
        mix_columns_2x  \in2, \in3, \enc
        b               1111b
2222:   eor             \in0\().16b, \in0\().16b, v15.16b /* ^round key */
        eor             \in1\().16b, \in1\().16b, v15.16b /* ^round key */
        eor             \in2\().16b, \in2\().16b, v15.16b /* ^round key */
        eor             \in3\().16b, \in3\().16b, v15.16b /* ^round key */
        .endm

        .macro          encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
        do_block_4x     1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
        .endm

        .macro          decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
        do_block_4x     0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
        .endm

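/*
 * aes-modes.S expands the macros above into the actual mode routines
 * (ECB/CBC/CTR/XTS and friends); the AES_FUNC_START define at the top
 * gives the resulting functions their neon_ prefix.
 */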
#include "aes-modes.S"

        .section        ".rodata", "a"
        .align          4
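        /*
         * tbl permutation vectors: the ShiftRows / InvShiftRows byte
         * order for the column-major AES state, and the rotate of
         * each 32-bit word right by 8 bits used by mix_columns.
         */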
.LForward_ShiftRows:
        .octa           0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
        .octa           0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
        .octa           0x0c0f0e0d080b0a090407060500030201
