/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
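
/*
 * A note on the data layout (see the paper above for the full story):
 *
 * The core routines operate on eight AES blocks at a time, kept bit-sliced
 * in v0-v7.  After the 'bitslice' transform, register vN holds bit N of
 * every state byte: byte j of vN packs bit N of byte j of each of the eight
 * blocks.  The round keys are pre-expanded into the same orientation by
 * aesbs_convert_key (each key bit replicated into a full byte, since one
 * key serves all eight blocks), so AddRoundKey is a plain XOR of v16-v23
 * into the state.  v8-v15 are scratch, and v24 holds the current ShiftRows
 * permutation vector.
 */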

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	.text

	rounds		.req	x11	// number of AES rounds (input argument)
	bskey		.req	x12	// pointer into the bit-sliced key schedule

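	/*
	 * in_bs_ch/out_bs_ch are the input and output linear layers (basis
	 * changes) of the bit-sliced S-box circuit described in the paper
	 * above; the nonlinear core in between is the GF(2^8) inversion
	 * implemented by inv_gf256 below.
	 */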
	.macro	in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor	\b2, \b2, \b1
	eor	\b5, \b5, \b6
	eor	\b3, \b3, \b0
	eor	\b6, \b6, \b2
	eor	\b5, \b5, \b0
	eor	\b6, \b6, \b3
	eor	\b3, \b3, \b7
	eor	\b7, \b7, \b5
	eor	\b3, \b3, \b4
	eor	\b4, \b4, \b5
	eor	\b2, \b2, \b7
	eor	\b3, \b3, \b1
	eor	\b1, \b1, \b5
	.endm

	.macro	out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor	\b0, \b0, \b6
	eor	\b1, \b1, \b4
	eor	\b4, \b4, \b6
	eor	\b2, \b2, \b0
	eor	\b6, \b6, \b1
	eor	\b1, \b1, \b5
	eor	\b5, \b5, \b3
	eor	\b3, \b3, \b7
	eor	\b7, \b7, \b5
	eor	\b2, \b2, \b5
	eor	\b4, \b4, \b7
	.endm

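	/*
	 * Counterparts of the above for the inverse S-box, i.e. the linear
	 * layers applied before and after inv_gf256 on the decryption path.
	 */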
	.macro	inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor	\b1, \b1, \b7
	eor	\b4, \b4, \b7
	eor	\b7, \b7, \b5
	eor	\b1, \b1, \b3
	eor	\b2, \b2, \b5
	eor	\b3, \b3, \b7
	eor	\b6, \b6, \b1
	eor	\b2, \b2, \b0
	eor	\b5, \b5, \b3
	eor	\b4, \b4, \b6
	eor	\b0, \b0, \b6
	eor	\b1, \b1, \b4
	.endm

	.macro	inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor	\b1, \b1, \b5
	eor	\b2, \b2, \b7
	eor	\b3, \b3, \b1
	eor	\b4, \b4, \b5
	eor	\b7, \b7, \b5
	eor	\b3, \b3, \b4
	eor	\b5, \b5, \b0
	eor	\b3, \b3, \b7
	eor	\b6, \b6, \b2
	eor	\b2, \b2, \b1
	eor	\b6, \b6, \b3
	eor	\b3, \b3, \b0
	eor	\b5, \b5, \b6
	.endm

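	/*
	 * Multiply two elements of GF(2^2) (two bit planes each), as used by
	 * the tower-field decomposition of the GF(2^8) inversion below.
	 */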
	.macro	mul_gf4, x0, x1, y0, y1, t0, t1
	eor	\t0, \y0, \y1
	and	\t0, \t0, \x0
	eor	\x0, \x0, \x1
	and	\t1, \x1, \y0
	and	\x0, \x0, \y1
	eor	\x1, \t1, \t0
	eor	\x0, \x0, \t1
	.endm

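	/*
	 * Like mul_gf4, but handles two products at once: the x2/x3 pair gets
	 * a plain GF(2^2) multiplication, while the x0/x1 pair uses a slight
	 * variant of it (the 'N' in the name).  Only used from mul_gf16_2.
	 */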
	.macro	mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor	\t0, \y0, \y1
	eor	\t1, \y2, \y3
	and	\t0, \t0, \x0
	and	\t1, \t1, \x2
	eor	\x0, \x0, \x1
	eor	\x2, \x2, \x3
	and	\x1, \x1, \y0
	and	\x3, \x3, \y2
	and	\x0, \x0, \y1
	and	\x2, \x2, \y3
	eor	\x1, \x1, \x0
	eor	\x2, \x2, \x3
	eor	\x0, \x0, \t0
	eor	\x3, \x3, \t1
	.endm

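	/*
	 * Multiply the two GF(2^4) elements held in x0..x3 and x4..x7 by the
	 * common GF(2^4) element y0..y3 (which is preserved), using the
	 * GF(2^2) helpers above.
	 */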
	.macro	mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			    y0, y1, y2, y3, t0, t1, t2, t3
	eor	\t0, \x0, \x2
	eor	\t1, \x1, \x3
	mul_gf4	\x0, \x1, \y0, \y1, \t2, \t3
	eor	\y0, \y0, \y2
	eor	\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor	\x0, \x0, \t0
	eor	\x2, \x2, \t0
	eor	\x1, \x1, \t1
	eor	\x3, \x3, \t1
	eor	\t0, \x4, \x6
	eor	\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor	\y0, \y0, \y2
	eor	\y1, \y1, \y3
	mul_gf4	\x4, \x5, \y0, \y1, \t2, \t3
	eor	\x4, \x4, \t0
	eor	\x6, \x6, \t0
	eor	\x5, \x5, \t1
	eor	\x7, \x7, \t1
	.endm

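	/*
	 * Invert, in parallel, the GF(2^8) elements whose bits are spread
	 * across x0..x7, using the tower-field construction GF(((2^2)^2)^2).
	 * This is the nonlinear heart of both the S-box and the inverse
	 * S-box, which differ only in their linear layers.
	 */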
	.macro	inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			   t0, t1, t2, t3, s0, s1, s2, s3
	eor	\t3, \x4, \x6
	eor	\t0, \x5, \x7
	eor	\t1, \x1, \x3
	eor	\s1, \x7, \x6
	eor	\s0, \x0, \x2
	eor	\s3, \t3, \t0
	orr	\t2, \t0, \t1
	and	\s2, \t3, \s0
	orr	\t3, \t3, \s0
	eor	\s0, \s0, \t1
	and	\t0, \t0, \t1
	eor	\t1, \x3, \x2
	and	\s3, \s3, \s0
	and	\s1, \s1, \t1
	eor	\t1, \x4, \x5
	eor	\s0, \x1, \x0
	eor	\t3, \t3, \s1
	eor	\t2, \t2, \s1
	and	\s1, \t1, \s0
	orr	\t1, \t1, \s0
	eor	\t3, \t3, \s3
	eor	\t0, \t0, \s1
	eor	\t2, \t2, \s2
	eor	\t1, \t1, \s3
	eor	\t0, \t0, \s2
	and	\s0, \x7, \x3
	eor	\t1, \t1, \s2
	and	\s1, \x6, \x2
	and	\s2, \x5, \x1
	orr	\s3, \x4, \x0
	eor	\t3, \t3, \s0
	eor	\t1, \t1, \s2
	eor	\s0, \t0, \s3
	eor	\t2, \t2, \s1
	and	\s2, \t3, \t1
	eor	\s1, \t2, \s2
	eor	\s3, \s0, \s2
	bsl	\s1, \t1, \s0
	not	\t0, \s0
	bsl	\s0, \s1, \s3
	bsl	\t0, \s1, \s3
	bsl	\s3, \t3, \t2
	eor	\t3, \t3, \t2
	and	\s2, \s0, \s3
	eor	\t1, \t1, \t0
	eor	\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

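	/*
	 * Full bit-sliced SubBytes: input basis change, shared GF(2^8)
	 * inversion, output basis change.  The shuffled register order in the
	 * inv_gf256 and out_bs_ch invocations simply re-labels the bit planes
	 * so that no extra register moves are needed.
	 */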
	.macro	sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
		      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

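	/*
	 * Bit-sliced InvSubBytes: same structure as sbox above, with the
	 * inverse linear layers around the shared GF(2^8) inversion.
	 */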
	.macro	inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

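	/*
	 * Load the next 128-byte bit-sliced round key into v16-v23.
	 * Encryption walks the key schedule forwards, decryption backwards.
	 */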
	.macro	enc_next_rk
	ldp	q16, q17, [bskey], #128
	ldp	q18, q19, [bskey, #-96]
	ldp	q20, q21, [bskey, #-64]
	ldp	q22, q23, [bskey, #-32]
	.endm

	.macro	dec_next_rk
	ldp	q16, q17, [bskey, #-128]!
	ldp	q18, q19, [bskey, #32]
	ldp	q20, q21, [bskey, #64]
	ldp	q22, q23, [bskey, #96]
	.endm

	.macro	add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor	\x0\().16b, \x0\().16b, v16.16b
	eor	\x1\().16b, \x1\().16b, v17.16b
	eor	\x2\().16b, \x2\().16b, v18.16b
	eor	\x3\().16b, \x3\().16b, v19.16b
	eor	\x4\().16b, \x4\().16b, v20.16b
	eor	\x5\().16b, \x5\().16b, v21.16b
	eor	\x6\().16b, \x6\().16b, v22.16b
	eor	\x7\().16b, \x7\().16b, v23.16b
	.endm

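	/*
	 * ShiftRows/InvShiftRows, applied as a byte permutation of each bit
	 * plane via tbl; \mask is one of the SR/ISR/SRM0/ISRM0 vectors below.
	 */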
	.macro	shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl	\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl	\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl	\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl	\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl	\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl	\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl	\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl	\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

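	/*
	 * Bit-sliced MixColumns, built from byte rotations of the bit planes
	 * (ext #12 / ext #8) and XORs.  The \inv argument only selects how
	 * the results are placed in the output registers when this is
	 * invoked from inv_mix_cols.
	 */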
	.macro	mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor	\x2\().16b, \x2\().16b, \t2\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor	\x3\().16b, \x3\().16b, \t3\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor	\x4\().16b, \x4\().16b, \t4\().16b
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor	\x5\().16b, \x5\().16b, \t5\().16b
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor	\x6\().16b, \x6\().16b, \t6\().16b
	eor	\t1\().16b, \t1\().16b, \x0\().16b
	eor	\x7\().16b, \x7\().16b, \t7\().16b
	ext	\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x1\().16b
	eor	\t0\().16b, \t0\().16b, \x7\().16b
	eor	\t1\().16b, \t1\().16b, \x7\().16b
	ext	\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t5\().16b, \t5\().16b, \x4\().16b
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	eor	\t6\().16b, \t6\().16b, \x5\().16b
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x3\().16b
	ext	\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x6\().16b
	ext	\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x2\().16b
	ext	\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x7\().16b
	ext	\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x7\().16b
	ext	\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor	\x7\().16b, \t1\().16b, \t5\().16b
	.ifb	\inv
	eor	\x2\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t3\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor	\t3\().16b, \t3\().16b, \x4\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x2\().16b, \x3\().16b, \t6\().16b
	eor	\x3\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x6\().16b, \t2\().16b
	mov	\x6\().16b, \t3\().16b
	.endif
	.endm

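	/*
	 * Bit-sliced InvMixColumns: a short preprocessing step (the inverse
	 * matrix factored through the forward one) followed by mix_cols.
	 */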
	.macro	inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			      t0, t1, t2, t3, t4, t5, t6, t7
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t0\().16b, \t0\().16b, \x0\().16b
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t6\().16b, \t6\().16b, \x6\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x7\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t1\().16b, \t1\().16b, \x1\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x2\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x3\().16b
	eor	\t4\().16b, \t4\().16b, \x4\().16b
	eor	\t5\().16b, \t5\().16b, \x5\().16b
	eor	\x0\().16b, \x0\().16b, \t6\().16b
	eor	\x1\().16b, \x1\().16b, \t6\().16b
	eor	\x2\().16b, \x2\().16b, \t0\().16b
	eor	\x4\().16b, \x4\().16b, \t2\().16b
	eor	\x3\().16b, \x3\().16b, \t1\().16b
	eor	\x1\().16b, \x1\().16b, \t7\().16b
	eor	\x2\().16b, \x2\().16b, \t7\().16b
	eor	\x4\().16b, \x4\().16b, \t6\().16b
	eor	\x5\().16b, \x5\().16b, \t3\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t7\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

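	/*
	 * Two interleaved SWAPMOVE steps: exchange the bit groups selected by
	 * \mask between each a/b pair at distance \n, i.e.
	 *	t = ((b >> n) ^ a) & mask; a ^= t; b ^= t << n;
	 * This is the classic building block for (un)bitslicing.
	 */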
	.macro	swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr	\t0\().2d, \b0\().2d, #\n
	ushr	\t1\().2d, \b1\().2d, #\n
	eor	\t0\().16b, \t0\().16b, \a0\().16b
	eor	\t1\().16b, \t1\().16b, \a1\().16b
	and	\t0\().16b, \t0\().16b, \mask\().16b
	and	\t1\().16b, \t1\().16b, \mask\().16b
	eor	\a0\().16b, \a0\().16b, \t0\().16b
	shl	\t0\().2d, \t0\().2d, #\n
	eor	\a1\().16b, \a1\().16b, \t1\().16b
	shl	\t1\().2d, \t1\().2d, #\n
	eor	\b0\().16b, \b0\().16b, \t0\().16b
	eor	\b1\().16b, \b1\().16b, \t1\().16b
	.endm

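	/*
	 * Convert the eight blocks passed in to bit-sliced form (one bit
	 * plane per register) using swapmove at distances 1, 2 and 4; the
	 * same macro converts back at the end of aesbs_encrypt8/decrypt8.
	 */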
	.macro	bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi	\t0\().16b, #0x55
	movi	\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi	\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm

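	/*
	 * tbl permutation vectors: M0 is the byte interleave applied before
	 * bitslicing, SR/ISR are ShiftRows/InvShiftRows, and the combined
	 * vectors (M0SR, SRM0, M0ISR, ISRM0) fold both permutations into a
	 * single tbl, as used by aesbs_convert_key and the cores below.
	 */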
	.align	6
M0:	.octa	0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa	0x0004080c05090d010a0e02060f03070b
SR:	.octa	0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa	0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa	0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa	0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa	0x0306090c00070a0d01040b0e0205080f

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
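	/*
	 * Convert an expanded AES key schedule (as produced by the generic
	 * aes_expandkey()) into the layout consumed by the code below: the
	 * round 0 key is stored as-is, each intermediate round key is stored
	 * as eight 16-byte bit planes (128 bytes per round), and the last
	 * round key is stored unsliced with the S-box constant 0x63 XORed
	 * into each byte.  Roughly, a caller would do something like this
	 * (error handling omitted; the real code lives in the C glue, with
	 * 'ctx' being the driver's own context struct):
	 *
	 *	struct crypto_aes_ctx rk;
	 *
	 *	aes_expandkey(&rk, in_key, key_len);
	 *	ctx->rounds = 6 + key_len / 4;
	 *	aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
	 */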
SYM_FUNC_START(aesbs_convert_key)
	ld1	{v7.4s}, [x1], #16		// load round 0 key
	ld1	{v17.4s}, [x1], #16		// load round 1 key

	movi	v8.16b, #0x01			// bit masks
	movi	v9.16b, #0x02
	movi	v10.16b, #0x04
	movi	v11.16b, #0x08
	movi	v12.16b, #0x10
	movi	v13.16b, #0x20
	movi	v14.16b, #0x40
	movi	v15.16b, #0x80
	ldr	q16, M0

	sub	x2, x2, #1
	str	q7, [x0], #16			// save round 0 key

.Lkey_loop:
	tbl	v7.16b, {v17.16b}, v16.16b
	ld1	{v17.4s}, [x1], #16		// load next round key

	cmtst	v0.16b, v7.16b, v8.16b
	cmtst	v1.16b, v7.16b, v9.16b
	cmtst	v2.16b, v7.16b, v10.16b
	cmtst	v3.16b, v7.16b, v11.16b
	cmtst	v4.16b, v7.16b, v12.16b
	cmtst	v5.16b, v7.16b, v13.16b
	cmtst	v6.16b, v7.16b, v14.16b
	cmtst	v7.16b, v7.16b, v15.16b
	not	v0.16b, v0.16b			// pre-invert bit planes 0, 1, 5, 6:
	not	v1.16b, v1.16b			// this absorbs the S-box constant
	not	v5.16b, v5.16b			// 0x63 (0b01100011) into the
	not	v6.16b, v6.16b			// round keys

	subs	x2, x2, #1
	stp	q0, q1, [x0], #128
	stp	q2, q3, [x0, #-96]
	stp	q4, q5, [x0, #-64]
	stp	q6, q7, [x0, #-32]
	b.ne	.Lkey_loop

	movi	v7.16b, #0x63			// compose .L63
	eor	v17.16b, v17.16b, v7.16b
	str	q17, [x0]
	ret
SYM_FUNC_END(aesbs_convert_key)

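	/*
	 * Encrypt the eight blocks in v0-v7 using the bit-sliced key schedule
	 * pointed to by bskey.  The results are returned in the shuffled
	 * register order v0, v1, v4, v6, v3, v7, v2, v5, which the callers
	 * below account for when storing the output.
	 */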
	.align	4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
	ldr	q9, [bskey], #16		// round 0 key
	ldr	q8, M0SR
	ldr	q24, SR

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub	rounds, rounds, #1
	b	.Lenc_sbox

.Lenc_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
		v13, v14, v15
	subs	rounds, rounds, #1
	b.cc	.Lenc_done

	enc_next_rk

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
			v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	b.ne	.Lenc_loop
	ldr	q24, SRM0			// last round: combined ShiftRows+M0 mask
	b	.Lenc_loop

.Lenc_done:
	ldr	q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_encrypt8)

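	/*
	 * Counterpart of aesbs_encrypt8: decrypts the eight blocks in v0-v7,
	 * returning the results in the register order v0, v1, v6, v4, v2,
	 * v7, v3, v5.  bskey still points to the start of the schedule; the
	 * prologue seeks to the far end first, since decryption consumes the
	 * schedule backwards.
	 */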
	.align	4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
	lsl	x9, rounds, #7
	add	bskey, bskey, x9

	ldr	q9, [bskey, #-112]!		// round 0 key
	ldr	q8, M0ISR
	ldr	q24, ISR

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub	rounds, rounds, #1
	b	.Ldec_sbox

.Ldec_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
			v13, v14, v15
	subs	rounds, rounds, #1
	b.cc	.Ldec_done

	dec_next_rk

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
			v13, v14, v15

	b.ne	.Ldec_loop
	ldr	q24, ISRM0			// last round: combined InvShiftRows+M0 mask
	b	.Ldec_loop
.Ldec_done:
	ldr	q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_decrypt8)

	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
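	/*
	 * The macro below handles up to eight blocks per iteration.  When
	 * fewer than eight remain, x5 gets bit #blocks set, and the tbnz
	 * ladders skip the loads and stores for the missing blocks (the
	 * bit-sliced core always runs on all eight registers regardless).
	 */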
	.macro	__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	5

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	mov	x23, x4

99:	mov	x5, #1
	lsl	x5, x5, x23
	subs	w23, w23, #8
	csel	x23, x23, xzr, pl
	csel	x5, x5, xzr, mi

	ld1	{v0.16b}, [x20], #16
	tbnz	x5, #1, 0f
	ld1	{v1.16b}, [x20], #16
	tbnz	x5, #2, 0f
	ld1	{v2.16b}, [x20], #16
	tbnz	x5, #3, 0f
	ld1	{v3.16b}, [x20], #16
	tbnz	x5, #4, 0f
	ld1	{v4.16b}, [x20], #16
	tbnz	x5, #5, 0f
	ld1	{v5.16b}, [x20], #16
	tbnz	x5, #6, 0f
	ld1	{v6.16b}, [x20], #16
	tbnz	x5, #7, 0f
	ld1	{v7.16b}, [x20], #16

0:	mov	bskey, x21
	mov	rounds, x22
	bl	\do8

	st1	{\o0\().16b}, [x19], #16
	tbnz	x5, #1, 1f
	st1	{\o1\().16b}, [x19], #16
	tbnz	x5, #2, 1f
	st1	{\o2\().16b}, [x19], #16
	tbnz	x5, #3, 1f
	st1	{\o3\().16b}, [x19], #16
	tbnz	x5, #4, 1f
	st1	{\o4\().16b}, [x19], #16
	tbnz	x5, #5, 1f
	st1	{\o5\().16b}, [x19], #16
	tbnz	x5, #6, 1f
	st1	{\o6\().16b}, [x19], #16
	tbnz	x5, #7, 1f
	st1	{\o7\().16b}, [x19], #16

	cbz	x23, 1f
	b	99b

1:	frame_pop
	ret
	.endm

	.align	4
SYM_TYPED_FUNC_START(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

	.align	4
SYM_TYPED_FUNC_START(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)

	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
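	/*
	 * CBC decryption is parallelisable, so up to eight blocks are
	 * decrypted at once; the original ciphertext blocks are stashed in
	 * v25-v31 so they can be XORed into the outputs afterwards, and the
	 * last ciphertext processed becomes the new IV.
	 */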
	.align	4
SYM_FUNC_START(aesbs_cbc_decrypt)
	frame_push	6

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	mov	x23, x4
	mov	x24, x5

99:	mov	x6, #1
	lsl	x6, x6, x23
	subs	w23, w23, #8
	csel	x23, x23, xzr, pl
	csel	x6, x6, xzr, mi

	ld1	{v0.16b}, [x20], #16
	mov	v25.16b, v0.16b
	tbnz	x6, #1, 0f
	ld1	{v1.16b}, [x20], #16
	mov	v26.16b, v1.16b
	tbnz	x6, #2, 0f
	ld1	{v2.16b}, [x20], #16
	mov	v27.16b, v2.16b
	tbnz	x6, #3, 0f
	ld1	{v3.16b}, [x20], #16
	mov	v28.16b, v3.16b
	tbnz	x6, #4, 0f
	ld1	{v4.16b}, [x20], #16
	mov	v29.16b, v4.16b
	tbnz	x6, #5, 0f
	ld1	{v5.16b}, [x20], #16
	mov	v30.16b, v5.16b
	tbnz	x6, #6, 0f
	ld1	{v6.16b}, [x20], #16
	mov	v31.16b, v6.16b
	tbnz	x6, #7, 0f
	ld1	{v7.16b}, [x20]

0:	mov	bskey, x21
	mov	rounds, x22
	bl	aesbs_decrypt8

	ld1	{v24.16b}, [x24]		// load IV

	eor	v1.16b, v1.16b, v25.16b
	eor	v6.16b, v6.16b, v26.16b
	eor	v4.16b, v4.16b, v27.16b
	eor	v2.16b, v2.16b, v28.16b
	eor	v7.16b, v7.16b, v29.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v3.16b, v3.16b, v30.16b
	eor	v5.16b, v5.16b, v31.16b

	st1	{v0.16b}, [x19], #16
	mov	v24.16b, v25.16b
	tbnz	x6, #1, 1f
	st1	{v1.16b}, [x19], #16
	mov	v24.16b, v26.16b
	tbnz	x6, #2, 1f
	st1	{v6.16b}, [x19], #16
	mov	v24.16b, v27.16b
	tbnz	x6, #3, 1f
	st1	{v4.16b}, [x19], #16
	mov	v24.16b, v28.16b
	tbnz	x6, #4, 1f
	st1	{v2.16b}, [x19], #16
	mov	v24.16b, v29.16b
	tbnz	x6, #5, 1f
	st1	{v7.16b}, [x19], #16
	mov	v24.16b, v30.16b
	tbnz	x6, #6, 1f
	st1	{v3.16b}, [x19], #16
	mov	v24.16b, v31.16b
	tbnz	x6, #7, 1f
	ld1	{v24.16b}, [x20], #16
	st1	{v5.16b}, [x19], #16
1:	st1	{v24.16b}, [x24]		// store IV

	cbz	x23, 2f
	b	99b

2:	frame_pop
	ret
SYM_FUNC_END(aesbs_cbc_decrypt)

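	/*
	 * Advance the XTS tweak: multiply \in by x in GF(2^128), using \const
	 * (the {0x1, 0x87} vector set up in __xts_crypt8) to fold the carry
	 * out of each 64-bit half back in.
	 */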
	.macro	next_tweak, out, in, const, tmp
	sshr	\tmp\().2d, \in\().2d, #63
	and	\tmp\().16b, \tmp\().16b, \const\().16b
	add	\out\().2d, \in\().2d, \in\().2d
	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor	\out\().16b, \out\().16b, \tmp\().16b
	.endm

	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
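	/*
	 * Load eight blocks of input, derive the eight tweaks from the one
	 * in v25, XOR them in, and tail-call the bit-sliced core whose
	 * address the caller placed in x16 (the same helper serves both the
	 * encrypt and decrypt cores).  The tweak for block 7 and the starting
	 * tweak for the next batch are spilled to [x6], since v16/v17 get
	 * clobbered by the cipher core.
	 */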
SYM_FUNC_START_LOCAL(__xts_crypt8)
	movi	v18.2s, #0x1
	movi	v19.2s, #0x87
	uzp1	v18.4s, v18.4s, v19.4s

	ld1	{v0.16b-v3.16b}, [x1], #64
	ld1	{v4.16b-v7.16b}, [x1], #64

	next_tweak	v26, v25, v18, v19
	next_tweak	v27, v26, v18, v19
	next_tweak	v28, v27, v18, v19
	next_tweak	v29, v28, v18, v19
	next_tweak	v30, v29, v18, v19
	next_tweak	v31, v30, v18, v19
	next_tweak	v16, v31, v18, v19
	next_tweak	v17, v16, v18, v19

	eor	v0.16b, v0.16b, v25.16b
	eor	v1.16b, v1.16b, v26.16b
	eor	v2.16b, v2.16b, v27.16b
	eor	v3.16b, v3.16b, v28.16b
	eor	v4.16b, v4.16b, v29.16b
	eor	v5.16b, v5.16b, v30.16b
	eor	v6.16b, v6.16b, v31.16b
	eor	v7.16b, v7.16b, v16.16b

	stp	q16, q17, [x6]

	mov	bskey, x2
	mov	rounds, x3
	br	x16
SYM_FUNC_END(__xts_crypt8)

	.macro	__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	0, 32
	add	x6, sp, #.Lframe_local_offset

	ld1	{v25.16b}, [x5]

0:	adr	x16, \do8
	bl	__xts_crypt8

	eor	v16.16b, \o0\().16b, v25.16b
	eor	v17.16b, \o1\().16b, v26.16b
	eor	v18.16b, \o2\().16b, v27.16b
	eor	v19.16b, \o3\().16b, v28.16b

	ldp	q24, q25, [x6]

	eor	v20.16b, \o4\().16b, v29.16b
	eor	v21.16b, \o5\().16b, v30.16b
	eor	v22.16b, \o6\().16b, v31.16b
	eor	v23.16b, \o7\().16b, v24.16b

	st1	{v16.16b-v19.16b}, [x0], #64
	st1	{v20.16b-v23.16b}, [x0], #64

	subs	x4, x4, #8
	b.gt	0b

	st1	{v25.16b}, [x5]
	frame_pop
	ret
	.endm

SYM_TYPED_FUNC_START(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_TYPED_FUNC_START(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

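	/*
	 * Write the current 128-bit counter (kept as native integers in
	 * x7:x8) into \v in big-endian form, and increment it for the next
	 * block.
	 */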
	.macro	next_ctr, v
	mov	\v\().d[1], x8
	adds	x8, x8, #1
	mov	\v\().d[0], x7
	adc	x7, x7, xzr
	rev64	\v\().16b, \v\().16b
	.endm

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
SYM_FUNC_START(aesbs_ctr_encrypt)
	frame_push	0
	ldp	x7, x8, [x5]
	ld1	{v0.16b}, [x5]
CPU_LE(	rev	x7, x7		)
CPU_LE(	rev	x8, x8		)
	adds	x8, x8, #1
	adc	x7, x7, xzr

0:	next_ctr	v1
	next_ctr	v2
	next_ctr	v3
	next_ctr	v4
	next_ctr	v5
	next_ctr	v6
	next_ctr	v7

	mov	bskey, x2
	mov	rounds, x3
	bl	aesbs_encrypt8

	ld1	{ v8.16b-v11.16b}, [x1], #64
	ld1	{v12.16b-v15.16b}, [x1], #64

	eor	v8.16b, v0.16b, v8.16b
	eor	v9.16b, v1.16b, v9.16b
	eor	v10.16b, v4.16b, v10.16b
	eor	v11.16b, v6.16b, v11.16b
	eor	v12.16b, v3.16b, v12.16b
	eor	v13.16b, v7.16b, v13.16b
	eor	v14.16b, v2.16b, v14.16b
	eor	v15.16b, v5.16b, v15.16b

	st1	{ v8.16b-v11.16b}, [x0], #64
	st1	{v12.16b-v15.16b}, [x0], #64

	next_ctr	v0
	subs	x4, x4, #8
	b.gt	0b

	st1	{v0.16b}, [x5]
	frame_pop
	ret
SYM_FUNC_END(aesbs_ctr_encrypt)
