/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Käsper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
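
/*
 * A rough sketch of the data layout (inferred from the structure below;
 * see the paper for the precise ordering): eight 16-byte AES blocks are
 * processed at once in v0-v7. After the 'bitslice' transform, register vN
 * no longer holds a block but bit N of every byte of all eight blocks, so
 * each AES step becomes pure bitwise logic on 128-bit vectors, e.g.
 * adding a round key is one EOR per bit plane:
 *
 *	for (n = 0; n < 8; n++)
 *		state[n] ^= round_key[n];
 *
 * This is what makes the implementation constant-time: no data-dependent
 * table lookups or branches occur anywhere.
 */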

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	.text

rounds		.req	x11
bskey		.req	x12

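/*
 * Basis-change macros for the bitsliced S-box: in_bs_ch maps the input
 * bit planes into the basis used by the GF(2^8) inversion below, and
 * out_bs_ch maps the result back (the S-box affine constant is folded
 * into the key schedule by aesbs_convert_key). The inv_* variants do the
 * same for the inverse S-box. Each is a fixed linear transform, hence a
 * plain sequence of EORs.
 */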
	.macro	in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor	\b2, \b2, \b1
	eor	\b5, \b5, \b6
	eor	\b3, \b3, \b0
	eor	\b6, \b6, \b2
	eor	\b5, \b5, \b0
	eor	\b6, \b6, \b3
	eor	\b3, \b3, \b7
	eor	\b7, \b7, \b5
	eor	\b3, \b3, \b4
	eor	\b4, \b4, \b5
	eor	\b2, \b2, \b7
	eor	\b3, \b3, \b1
	eor	\b1, \b1, \b5
	.endm

	.macro	out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor	\b0, \b0, \b6
	eor	\b1, \b1, \b4
	eor	\b4, \b4, \b6
	eor	\b2, \b2, \b0
	eor	\b6, \b6, \b1
	eor	\b1, \b1, \b5
	eor	\b5, \b5, \b3
	eor	\b3, \b3, \b7
	eor	\b7, \b7, \b5
	eor	\b2, \b2, \b5
	eor	\b4, \b4, \b7
	.endm

	.macro	inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor	\b1, \b1, \b7
	eor	\b4, \b4, \b7
	eor	\b7, \b7, \b5
	eor	\b1, \b1, \b3
	eor	\b2, \b2, \b5
	eor	\b3, \b3, \b7
	eor	\b6, \b6, \b1
	eor	\b2, \b2, \b0
	eor	\b5, \b5, \b3
	eor	\b4, \b4, \b6
	eor	\b0, \b0, \b6
	eor	\b1, \b1, \b4
	.endm

	.macro	inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor	\b1, \b1, \b5
	eor	\b2, \b2, \b7
	eor	\b3, \b3, \b1
	eor	\b4, \b4, \b5
	eor	\b7, \b7, \b5
	eor	\b3, \b3, \b4
	eor	\b5, \b5, \b0
	eor	\b3, \b3, \b7
	eor	\b6, \b6, \b2
	eor	\b2, \b2, \b1
	eor	\b6, \b6, \b3
	eor	\b3, \b3, \b0
	eor	\b5, \b5, \b6
	.endm

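/*
 * Multipliers for the tower-field inversion. mul_gf4 multiplies two
 * GF(2^2) elements given as bit-plane pairs (x0, x1) and (y0, y1),
 * Karatsuba style so that only three ANDs are needed, roughly:
 *
 *	t = (y0 ^ y1) & x0;  x0 = (x0 ^ x1) & y1;  x1 &= y0;
 *	result = { x1 ^ t, x0 ^ x1 }
 *
 * (a sketch of the dataflow, not the exact term order). mul_gf4_n_gf4
 * interleaves two such multiplications to hide instruction latency, and
 * mul_gf16_2 builds two GF(2^4) multiplications on top of them.
 */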
	.macro	mul_gf4, x0, x1, y0, y1, t0, t1
	eor	\t0, \y0, \y1
	and	\t0, \t0, \x0
	eor	\x0, \x0, \x1
	and	\t1, \x1, \y0
	and	\x0, \x0, \y1
	eor	\x1, \t1, \t0
	eor	\x0, \x0, \t1
	.endm

	.macro	mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor	\t0, \y0, \y1
	eor	\t1, \y2, \y3
	and	\t0, \t0, \x0
	and	\t1, \t1, \x2
	eor	\x0, \x0, \x1
	eor	\x2, \x2, \x3
	and	\x1, \x1, \y0
	and	\x3, \x3, \y2
	and	\x0, \x0, \y1
	and	\x2, \x2, \y3
	eor	\x1, \x1, \x0
	eor	\x2, \x2, \x3
	eor	\x0, \x0, \t0
	eor	\x3, \x3, \t1
	.endm

	.macro	mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			    y0, y1, y2, y3, t0, t1, t2, t3
	eor	\t0, \x0, \x2
	eor	\t1, \x1, \x3
	mul_gf4	\x0, \x1, \y0, \y1, \t2, \t3
	eor	\y0, \y0, \y2
	eor	\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor	\x0, \x0, \t0
	eor	\x2, \x2, \t0
	eor	\x1, \x1, \t1
	eor	\x3, \x3, \t1
	eor	\t0, \x4, \x6
	eor	\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor	\y0, \y0, \y2
	eor	\y1, \y1, \y3
	mul_gf4	\x4, \x5, \y0, \y1, \t2, \t3
	eor	\x4, \x4, \t0
	eor	\x6, \x6, \t0
	eor	\x5, \x5, \t1
	eor	\x7, \x7, \t1
	.endm

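/*
 * inv_gf256 computes the multiplicative inverse in GF(2^8) via the tower
 * GF(2^8)/GF(2^4)/GF(2^2): roughly, the and/orr/eor network derives the
 * inverse of the GF(2^4) norm, and the final mul_gf16_2 multiplies it
 * back into both halves of the input. This is the only nonlinear part of
 * the S-box; the bsl/not steps implement the small inner inversions as
 * branchless selects.
 */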
	.macro	inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			   t0, t1, t2, t3, s0, s1, s2, s3
	eor	\t3, \x4, \x6
	eor	\t0, \x5, \x7
	eor	\t1, \x1, \x3
	eor	\s1, \x7, \x6
	eor	\s0, \x0, \x2
	eor	\s3, \t3, \t0
	orr	\t2, \t0, \t1
	and	\s2, \t3, \s0
	orr	\t3, \t3, \s0
	eor	\s0, \s0, \t1
	and	\t0, \t0, \t1
	eor	\t1, \x3, \x2
	and	\s3, \s3, \s0
	and	\s1, \s1, \t1
	eor	\t1, \x4, \x5
	eor	\s0, \x1, \x0
	eor	\t3, \t3, \s1
	eor	\t2, \t2, \s1
	and	\s1, \t1, \s0
	orr	\t1, \t1, \s0
	eor	\t3, \t3, \s3
	eor	\t0, \t0, \s1
	eor	\t2, \t2, \s2
	eor	\t1, \t1, \s3
	eor	\t0, \t0, \s2
	and	\s0, \x7, \x3
	eor	\t1, \t1, \s2
	and	\s1, \x6, \x2
	and	\s2, \x5, \x1
	orr	\s3, \x4, \x0
	eor	\t3, \t3, \s0
	eor	\t1, \t1, \s2
	eor	\s0, \t0, \s3
	eor	\t2, \t2, \s1
	and	\s2, \t3, \t1
	eor	\s1, \t2, \s2
	eor	\s3, \s0, \s2
	bsl	\s1, \t1, \s0
	not	\t0, \s0
	bsl	\s0, \s1, \s3
	bsl	\t0, \s1, \s3
	bsl	\s3, \t3, \t2
	eor	\t3, \t3, \t2
	and	\s2, \s0, \s3
	eor	\t1, \t1, \t0
	eor	\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

	.macro	sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
		      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	.macro	inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

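/*
 * The bitsliced key schedule (see aesbs_convert_key) stores eight 16-byte
 * bit planes per round, i.e. 128 bytes per round: enc_next_rk walks it
 * forwards and dec_next_rk backwards (note the pre-indexed writeback),
 * leaving the current round's planes in v16-v23 for add_round_key.
 */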
	.macro	enc_next_rk
	ldp	q16, q17, [bskey], #128
	ldp	q18, q19, [bskey, #-96]
	ldp	q20, q21, [bskey, #-64]
	ldp	q22, q23, [bskey, #-32]
	.endm

	.macro	dec_next_rk
	ldp	q16, q17, [bskey, #-128]!
	ldp	q18, q19, [bskey, #32]
	ldp	q20, q21, [bskey, #64]
	ldp	q22, q23, [bskey, #96]
	.endm

	.macro	add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor	\x0\().16b, \x0\().16b, v16.16b
	eor	\x1\().16b, \x1\().16b, v17.16b
	eor	\x2\().16b, \x2\().16b, v18.16b
	eor	\x3\().16b, \x3\().16b, v19.16b
	eor	\x4\().16b, \x4\().16b, v20.16b
	eor	\x5\().16b, \x5\().16b, v21.16b
	eor	\x6\().16b, \x6\().16b, v22.16b
	eor	\x7\().16b, \x7\().16b, v23.16b
	.endm

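/*
 * In the bitsliced domain, ShiftRows (and InvShiftRows) is the same byte
 * permutation applied to every bit plane, so it reduces to one tbl per
 * register, with the permutation mask loaded from SR/ISR below.
 */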
	.macro	shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl	\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl	\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl	\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl	\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl	\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl	\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl	\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl	\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

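/*
 * MixColumns over bit planes: given the byte ordering set up by the M0*
 * masks, rotating a vector by a multiple of 4 bytes (ext #12 / ext #8)
 * corresponds to rotating the bytes within each AES column, so the whole
 * column mix becomes a fixed network of rotations and EORs. The
 * multiplication by {02} shows up as terms moving into the neighbouring
 * bit plane, with extra EORs where the reduction polynomial feeds back
 * (a qualitative sketch; the exact term schedule follows the paper).
 */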
	.macro	mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor	\x2\().16b, \x2\().16b, \t2\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor	\x3\().16b, \x3\().16b, \t3\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor	\x4\().16b, \x4\().16b, \t4\().16b
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor	\x5\().16b, \x5\().16b, \t5\().16b
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor	\x6\().16b, \x6\().16b, \t6\().16b
	eor	\t1\().16b, \t1\().16b, \x0\().16b
	eor	\x7\().16b, \x7\().16b, \t7\().16b
	ext	\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x1\().16b
	eor	\t0\().16b, \t0\().16b, \x7\().16b
	eor	\t1\().16b, \t1\().16b, \x7\().16b
	ext	\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t5\().16b, \t5\().16b, \x4\().16b
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	eor	\t6\().16b, \t6\().16b, \x5\().16b
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x3\().16b
	ext	\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x6\().16b
	ext	\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x2\().16b
	ext	\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x7\().16b
	ext	\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x7\().16b
	ext	\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor	\x7\().16b, \t1\().16b, \t5\().16b
	.ifb	\inv
	eor	\x2\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t3\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor	\t3\().16b, \t3\().16b, \x4\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x2\().16b, \x3\().16b, \t6\().16b
	eor	\x3\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x6\().16b, \t2\().16b
	mov	\x6\().16b, \t3\().16b
	.endif
	.endm

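/*
 * InvMixColumns is implemented as a pre-processing step followed by the
 * forward mix_cols (called with its \inv argument set): the {0e,0b,0d,09}
 * coefficient matrix factors into {02,03,01,01} times a simpler matrix,
 * and the ext/eor block below applies that simpler factor first.
 */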
	.macro	inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			      t0, t1, t2, t3, t4, t5, t6, t7
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t0\().16b, \t0\().16b, \x0\().16b
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t6\().16b, \t6\().16b, \x6\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x7\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t1\().16b, \t1\().16b, \x1\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x2\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x3\().16b
	eor	\t4\().16b, \t4\().16b, \x4\().16b
	eor	\t5\().16b, \t5\().16b, \x5\().16b
	eor	\x0\().16b, \x0\().16b, \t6\().16b
	eor	\x1\().16b, \x1\().16b, \t6\().16b
	eor	\x2\().16b, \x2\().16b, \t0\().16b
	eor	\x4\().16b, \x4\().16b, \t2\().16b
	eor	\x3\().16b, \x3\().16b, \t1\().16b
	eor	\x1\().16b, \x1\().16b, \t7\().16b
	eor	\x2\().16b, \x2\().16b, \t7\().16b
	eor	\x4\().16b, \x4\().16b, \t6\().16b
	eor	\x5\().16b, \x5\().16b, \t3\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t7\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

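/*
 * swapmove_2x is the classic constant-time bit-matrix transpose step: it
 * exchanges, between two register pairs, the bit groups selected by \mask
 * at distance \n. Sketched for one register pair:
 *
 *	t = ((b >> n) ^ a) & mask;  a ^= t;  b ^= t << n;
 *
 * Three passes (n = 1, 2, 4 with masks 0x55/0x33/0x0f) over eight
 * registers convert bytes into bit planes; the sequence is an involution,
 * so the same 'bitslice' macro also converts back.
 */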
	.macro	swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr	\t0\().2d, \b0\().2d, #\n
	ushr	\t1\().2d, \b1\().2d, #\n
	eor	\t0\().16b, \t0\().16b, \a0\().16b
	eor	\t1\().16b, \t1\().16b, \a1\().16b
	and	\t0\().16b, \t0\().16b, \mask\().16b
	and	\t1\().16b, \t1\().16b, \mask\().16b
	eor	\a0\().16b, \a0\().16b, \t0\().16b
	shl	\t0\().2d, \t0\().2d, #\n
	eor	\a1\().16b, \a1\().16b, \t1\().16b
	shl	\t1\().2d, \t1\().2d, #\n
	eor	\b0\().16b, \b0\().16b, \t0\().16b
	eor	\b1\().16b, \b1\().16b, \t1\().16b
	.endm

	.macro	bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi	\t0\().16b, #0x55
	movi	\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi	\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm

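/*
 * tbl permutation masks: M0 reorders bytes into the bitsliced layout,
 * M0SR/M0ISR fold the first (Inv)ShiftRows into that reordering, SR/ISR
 * apply (Inv)ShiftRows between rounds, and SRM0/ISRM0 combine the final
 * (Inv)ShiftRows with the inverse reordering.
 */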
	.align	6
M0:	.octa	0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa	0x0004080c05090d010a0e02060f03070b
SR:	.octa	0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa	0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa	0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa	0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa	0x0306090c00070a0d01040b0e0205080f

/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 */
SYM_FUNC_START(aesbs_convert_key)
	ld1	{v7.4s}, [x1], #16		// load round 0 key
	ld1	{v17.4s}, [x1], #16		// load round 1 key

	movi	v8.16b, #0x01			// bit masks
	movi	v9.16b, #0x02
	movi	v10.16b, #0x04
	movi	v11.16b, #0x08
	movi	v12.16b, #0x10
	movi	v13.16b, #0x20
	movi	v14.16b, #0x40
	movi	v15.16b, #0x80
	ldr	q16, M0

	sub	x2, x2, #1
	str	q7, [x0], #16			// save round 0 key

.Lkey_loop:
	tbl	v7.16b, {v17.16b}, v16.16b
	ld1	{v17.4s}, [x1], #16		// load next round key

	cmtst	v0.16b, v7.16b, v8.16b
	cmtst	v1.16b, v7.16b, v9.16b
	cmtst	v2.16b, v7.16b, v10.16b
	cmtst	v3.16b, v7.16b, v11.16b
	cmtst	v4.16b, v7.16b, v12.16b
	cmtst	v5.16b, v7.16b, v13.16b
	cmtst	v6.16b, v7.16b, v14.16b
	cmtst	v7.16b, v7.16b, v15.16b
	not	v0.16b, v0.16b
	not	v1.16b, v1.16b
	not	v5.16b, v5.16b
	not	v6.16b, v6.16b

	subs	x2, x2, #1
	stp	q0, q1, [x0], #128
	stp	q2, q3, [x0, #-96]
	stp	q4, q5, [x0, #-64]
	stp	q6, q7, [x0, #-32]
	b.ne	.Lkey_loop

	movi	v7.16b, #0x63			// compose .L63
	eor	v17.16b, v17.16b, v7.16b
	str	q17, [x0]
	ret
SYM_FUNC_END(aesbs_convert_key)

	.align	4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
	ldr	q9, [bskey], #16		// round 0 key
	ldr	q8, M0SR
	ldr	q24, SR

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub	rounds, rounds, #1
	b	.Lenc_sbox

.Lenc_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
		v13, v14, v15
	subs	rounds, rounds, #1
	b.cc	.Lenc_done

	enc_next_rk

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
			v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	b.ne	.Lenc_loop
	ldr	q24, SRM0
	b	.Lenc_loop

.Lenc_done:
	ldr	q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_encrypt8)

	.align	4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
	lsl	x9, rounds, #7
	add	bskey, bskey, x9

	ldr	q9, [bskey, #-112]!		// round 0 key
	ldr	q8, M0ISR
	ldr	q24, ISR

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub	rounds, rounds, #1
	b	.Ldec_sbox

.Ldec_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
			v13, v14, v15
	subs	rounds, rounds, #1
	b.cc	.Ldec_done

	dec_next_rk

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
			v13, v14, v15

	b.ne	.Ldec_loop
	ldr	q24, ISRM0
	b	.Ldec_loop
.Ldec_done:
	ldr	q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b
	ret
SYM_FUNC_END(aesbs_decrypt8)

/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 */
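/*
 * Each iteration handles up to 8 blocks: x5 gets 1 << blocks, so that
 * 'tbnz x5, #n' falls out of the load/store sequences after exactly
 * 'blocks' transfers when fewer than 8 remain; for a full batch, the csel
 * clears x5 and no tbnz fires.
 */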
	.macro	__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	5

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	mov	x23, x4

99:	mov	x5, #1
	lsl	x5, x5, x23
	subs	w23, w23, #8
	csel	x23, x23, xzr, pl
	csel	x5, x5, xzr, mi

	ld1	{v0.16b}, [x20], #16
	tbnz	x5, #1, 0f
	ld1	{v1.16b}, [x20], #16
	tbnz	x5, #2, 0f
	ld1	{v2.16b}, [x20], #16
	tbnz	x5, #3, 0f
	ld1	{v3.16b}, [x20], #16
	tbnz	x5, #4, 0f
	ld1	{v4.16b}, [x20], #16
	tbnz	x5, #5, 0f
	ld1	{v5.16b}, [x20], #16
	tbnz	x5, #6, 0f
	ld1	{v6.16b}, [x20], #16
	tbnz	x5, #7, 0f
	ld1	{v7.16b}, [x20], #16

0:	mov	bskey, x21
	mov	rounds, x22
	bl	\do8

	st1	{\o0\().16b}, [x19], #16
	tbnz	x5, #1, 1f
	st1	{\o1\().16b}, [x19], #16
	tbnz	x5, #2, 1f
	st1	{\o2\().16b}, [x19], #16
	tbnz	x5, #3, 1f
	st1	{\o3\().16b}, [x19], #16
	tbnz	x5, #4, 1f
	st1	{\o4\().16b}, [x19], #16
	tbnz	x5, #5, 1f
	st1	{\o5\().16b}, [x19], #16
	tbnz	x5, #6, 1f
	st1	{\o6\().16b}, [x19], #16
	tbnz	x5, #7, 1f
	st1	{\o7\().16b}, [x19], #16

	cbz	x23, 1f
	b	99b

1:	frame_pop
	ret
	.endm

	.align	4
SYM_TYPED_FUNC_START(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

	.align	4
SYM_TYPED_FUNC_START(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)

/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 */
	.align	4
SYM_FUNC_START(aesbs_cbc_decrypt)
	frame_push	6

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	mov	x23, x4
	mov	x24, x5

99:	mov	x6, #1
	lsl	x6, x6, x23
	subs	w23, w23, #8
	csel	x23, x23, xzr, pl
	csel	x6, x6, xzr, mi

	ld1	{v0.16b}, [x20], #16
	mov	v25.16b, v0.16b
	tbnz	x6, #1, 0f
	ld1	{v1.16b}, [x20], #16
	mov	v26.16b, v1.16b
	tbnz	x6, #2, 0f
	ld1	{v2.16b}, [x20], #16
	mov	v27.16b, v2.16b
	tbnz	x6, #3, 0f
	ld1	{v3.16b}, [x20], #16
	mov	v28.16b, v3.16b
	tbnz	x6, #4, 0f
	ld1	{v4.16b}, [x20], #16
	mov	v29.16b, v4.16b
	tbnz	x6, #5, 0f
	ld1	{v5.16b}, [x20], #16
	mov	v30.16b, v5.16b
	tbnz	x6, #6, 0f
	ld1	{v6.16b}, [x20], #16
	mov	v31.16b, v6.16b
	tbnz	x6, #7, 0f
	ld1	{v7.16b}, [x20]

0:	mov	bskey, x21
	mov	rounds, x22
	bl	aesbs_decrypt8

	ld1	{v24.16b}, [x24]		// load IV

	eor	v1.16b, v1.16b, v25.16b
	eor	v6.16b, v6.16b, v26.16b
	eor	v4.16b, v4.16b, v27.16b
	eor	v2.16b, v2.16b, v28.16b
	eor	v7.16b, v7.16b, v29.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v3.16b, v3.16b, v30.16b
	eor	v5.16b, v5.16b, v31.16b

	st1	{v0.16b}, [x19], #16
	mov	v24.16b, v25.16b
	tbnz	x6, #1, 1f
	st1	{v1.16b}, [x19], #16
	mov	v24.16b, v26.16b
	tbnz	x6, #2, 1f
	st1	{v6.16b}, [x19], #16
	mov	v24.16b, v27.16b
	tbnz	x6, #3, 1f
	st1	{v4.16b}, [x19], #16
	mov	v24.16b, v28.16b
	tbnz	x6, #4, 1f
	st1	{v2.16b}, [x19], #16
	mov	v24.16b, v29.16b
	tbnz	x6, #5, 1f
	st1	{v7.16b}, [x19], #16
	mov	v24.16b, v30.16b
	tbnz	x6, #6, 1f
	st1	{v3.16b}, [x19], #16
	mov	v24.16b, v31.16b
	tbnz	x6, #7, 1f
	ld1	{v24.16b}, [x20], #16
	st1	{v5.16b}, [x19], #16
1:	st1	{v24.16b}, [x24]		// store IV

	cbz	x23, 2f
	b	99b

2:	frame_pop
	ret
SYM_FUNC_END(aesbs_cbc_decrypt)

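/*
 * next_tweak advances the XTS tweak, i.e. multiplies it by x in GF(2^128)
 * with reduction polynomial 0x87, without branching. Per 64-bit lane,
 * roughly:
 *
 *	out.lo = (in.lo << 1) ^ (MSB(in.hi) ? 0x87 : 0);
 *	out.hi = (in.hi << 1) ^ (MSB(in.lo) ? 0x01 : 0);
 *
 * sshr replicates each lane's sign bit, the and picks the per-lane
 * constant (\const holds { 0x1, 0x87 }), add doubles both lanes, and ext
 * swaps the lanes so each carry lands in the other half before the eor.
 */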
	.macro	next_tweak, out, in, const, tmp
	sshr	\tmp\().2d, \in\().2d, #63
	and	\tmp\().16b, \tmp\().16b, \const\().16b
	add	\out\().2d, \in\().2d, \in\().2d
	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor	\out\().16b, \out\().16b, \tmp\().16b
	.endm

/*
 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 */
SYM_FUNC_START_LOCAL(__xts_crypt8)
	movi	v18.2s, #0x1
	movi	v19.2s, #0x87
	uzp1	v18.4s, v18.4s, v19.4s

	ld1	{v0.16b-v3.16b}, [x1], #64
	ld1	{v4.16b-v7.16b}, [x1], #64

	next_tweak	v26, v25, v18, v19
	next_tweak	v27, v26, v18, v19
	next_tweak	v28, v27, v18, v19
	next_tweak	v29, v28, v18, v19
	next_tweak	v30, v29, v18, v19
	next_tweak	v31, v30, v18, v19
	next_tweak	v16, v31, v18, v19
	next_tweak	v17, v16, v18, v19

	eor	v0.16b, v0.16b, v25.16b
	eor	v1.16b, v1.16b, v26.16b
	eor	v2.16b, v2.16b, v27.16b
	eor	v3.16b, v3.16b, v28.16b
	eor	v4.16b, v4.16b, v29.16b
	eor	v5.16b, v5.16b, v30.16b
	eor	v6.16b, v6.16b, v31.16b
	eor	v7.16b, v7.16b, v16.16b

	stp	q16, q17, [x6]

	mov	bskey, x2
	mov	rounds, x3
	br	x16
SYM_FUNC_END(__xts_crypt8)

	.macro	__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	0, 32
	add	x6, sp, #.Lframe_local_offset

	ld1	{v25.16b}, [x5]

0:	adr	x16, \do8
	bl	__xts_crypt8

	eor	v16.16b, \o0\().16b, v25.16b
	eor	v17.16b, \o1\().16b, v26.16b
	eor	v18.16b, \o2\().16b, v27.16b
	eor	v19.16b, \o3\().16b, v28.16b

	ldp	q24, q25, [x6]

	eor	v20.16b, \o4\().16b, v29.16b
	eor	v21.16b, \o5\().16b, v30.16b
	eor	v22.16b, \o6\().16b, v31.16b
	eor	v23.16b, \o7\().16b, v24.16b

	st1	{v16.16b-v19.16b}, [x0], #64
	st1	{v20.16b-v23.16b}, [x0], #64

	subs	x4, x4, #8
	b.gt	0b

	st1	{v25.16b}, [x5]
	frame_pop
	ret
	.endm

SYM_TYPED_FUNC_START(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_TYPED_FUNC_START(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

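/*
 * next_ctr materialises the next counter block: x7:x8 hold the IV as a
 * 128-bit big-endian integer in host byte order, the adds/adc pair
 * propagates the carry across the full 128 bits, and rev64 restores the
 * byte order the cipher expects in the vector register.
 */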
	.macro	next_ctr, v
	mov	\v\().d[1], x8
	adds	x8, x8, #1
	mov	\v\().d[0], x7
	adc	x7, x7, xzr
	rev64	\v\().16b, \v\().16b
	.endm

/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 iv[])
 */
SYM_FUNC_START(aesbs_ctr_encrypt)
	frame_push	0
	ldp	x7, x8, [x5]
	ld1	{v0.16b}, [x5]
CPU_LE(	rev	x7, x7		)
CPU_LE(	rev	x8, x8		)
	adds	x8, x8, #1
	adc	x7, x7, xzr

0:	next_ctr	v1
	next_ctr	v2
	next_ctr	v3
	next_ctr	v4
	next_ctr	v5
	next_ctr	v6
	next_ctr	v7

	mov	bskey, x2
	mov	rounds, x3
	bl	aesbs_encrypt8

	ld1	{ v8.16b-v11.16b}, [x1], #64
	ld1	{v12.16b-v15.16b}, [x1], #64

	eor	v8.16b, v0.16b, v8.16b
	eor	v9.16b, v1.16b, v9.16b
	eor	v10.16b, v4.16b, v10.16b
	eor	v11.16b, v6.16b, v11.16b
	eor	v12.16b, v3.16b, v12.16b
	eor	v13.16b, v7.16b, v13.16b
	eor	v14.16b, v2.16b, v14.16b
	eor	v15.16b, v5.16b, v15.16b

	st1	{ v8.16b-v11.16b}, [x0], #64
	st1	{v12.16b-v15.16b}, [x0], #64

	next_ctr	v0
	subs	x4, x4, #8
	b.gt	0b

	st1	{v0.16b}, [x5]
	frame_pop
	ret
SYM_FUNC_END(aesbs_ctr_encrypt)