| 1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
| 2 | /* |
| 3 | * Bit sliced AES using NEON instructions |
| 4 | * |
| 5 | * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> |
| 6 | */ |
| 7 | |
| 8 | /* |
| 9 | * The algorithm implemented here is described in detail by the paper |
| 10 | * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and |
| 11 | * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf) |
| 12 | * |
| 13 | * This implementation is based primarily on the OpenSSL implementation |
| 14 | * for 32-bit ARM written by Andy Polyakov <appro@openssl.org> |
| 15 | */ |
| 16 | |
| 17 | #include <linux/linkage.h> |
| 18 | #include <linux/cfi_types.h> |
| 19 | #include <asm/assembler.h> |
| 20 | |
| 21 | .text |
| 22 | |
| 23 | rounds .req x11 /* x11: AES round count, set by each caller before it branches to aesbs_encrypt8 or aesbs_decrypt8 */ |
| 24 | bskey .req x12 /* x12: pointer into the converted key schedule, set by each caller alongside rounds */ |
| 25 | /* in_bs_ch: XOR-only linear transform that sbox applies to its eight operands ahead of inv_gf256. b1-b7 are rewritten in place; b0 is only read. The statement order matters because later lines consume earlier results. */ |
| 26 | .macro in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 |
| 27 | eor \b2, \b2, \b1 |
| 28 | eor \b5, \b5, \b6 |
| 29 | eor \b3, \b3, \b0 |
| 30 | eor \b6, \b6, \b2 |
| 31 | eor \b5, \b5, \b0 |
| 32 | eor \b6, \b6, \b3 |
| 33 | eor \b3, \b3, \b7 |
| 34 | eor \b7, \b7, \b5 |
| 35 | eor \b3, \b3, \b4 |
| 36 | eor \b4, \b4, \b5 |
| 37 | eor \b2, \b2, \b7 |
| 38 | eor \b3, \b3, \b1 |
| 39 | eor \b1, \b1, \b5 |
| 40 | .endm |
| 41 | /* out_bs_ch: XOR-only linear transform that sbox applies after inv_gf256. Every one of the eight operands is rewritten in place, and later lines consume earlier results. */ |
| 42 | .macro out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7 |
| 43 | eor \b0, \b0, \b6 |
| 44 | eor \b1, \b1, \b4 |
| 45 | eor \b4, \b4, \b6 |
| 46 | eor \b2, \b2, \b0 |
| 47 | eor \b6, \b6, \b1 |
| 48 | eor \b1, \b1, \b5 |
| 49 | eor \b5, \b5, \b3 |
| 50 | eor \b3, \b3, \b7 |
| 51 | eor \b7, \b7, \b5 |
| 52 | eor \b2, \b2, \b5 |
| 53 | eor \b4, \b4, \b7 |
| 54 | .endm |
| 55 | /* inv_in_bs_ch: XOR-only linear transform that inv_sbox applies ahead of inv_gf256. The parameter list is declared in the permuted order b6, b1, b2, b4, b7, b0, b3, b5. All eight operands are rewritten in place. */ |
| 56 | .macro inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5 |
| 57 | eor \b1, \b1, \b7 |
| 58 | eor \b4, \b4, \b7 |
| 59 | eor \b7, \b7, \b5 |
| 60 | eor \b1, \b1, \b3 |
| 61 | eor \b2, \b2, \b5 |
| 62 | eor \b3, \b3, \b7 |
| 63 | eor \b6, \b6, \b1 |
| 64 | eor \b2, \b2, \b0 |
| 65 | eor \b5, \b5, \b3 |
| 66 | eor \b4, \b4, \b6 |
| 67 | eor \b0, \b0, \b6 |
| 68 | eor \b1, \b1, \b4 |
| 69 | .endm |
| 70 | /* inv_out_bs_ch: XOR-only linear transform that inv_sbox applies after inv_gf256. The parameter list is declared in the permuted order b6, b5, b0, b3, b7, b1, b4, b2. b1-b7 are rewritten in place; b0 is only read. */ |
| 71 | .macro inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2 |
| 72 | eor \b1, \b1, \b5 |
| 73 | eor \b2, \b2, \b7 |
| 74 | eor \b3, \b3, \b1 |
| 75 | eor \b4, \b4, \b5 |
| 76 | eor \b7, \b7, \b5 |
| 77 | eor \b3, \b3, \b4 |
| 78 | eor \b5, \b5, \b0 |
| 79 | eor \b3, \b3, \b7 |
| 80 | eor \b6, \b6, \b2 |
| 81 | eor \b2, \b2, \b1 |
| 82 | eor \b6, \b6, \b3 |
| 83 | eor \b3, \b3, \b0 |
| 84 | eor \b5, \b5, \b6 |
| 85 | .endm |
| 86 | /* mul_gf4: two-bit multiply on bit planes, named as a GF(2^2) product. In terms of the entry values, x1 becomes (x1 AND y0) XOR (x0 AND (y0 XOR y1)) and x0 becomes ((x0 XOR x1) AND y1) XOR (x1 AND y0). y0 and y1 are only read; t0 and t1 are scratch. */ |
| 87 | .macro mul_gf4, x0, x1, y0, y1, t0, t1 |
| 88 | eor \t0, \y0, \y1 /* t0 = y0 ^ y1 */ |
| 89 | and \t0, \t0, \x0 /* t0 = x0 & (y0 ^ y1) */ |
| 90 | eor \x0, \x0, \x1 /* x0 now holds x0 ^ x1 */ |
| 91 | and \t1, \x1, \y0 /* t1 = x1 & y0 */ |
| 92 | and \x0, \x0, \y1 /* x0 = (x0 ^ x1) & y1 */ |
| 93 | eor \x1, \t1, \t0 /* final x1 */ |
| 94 | eor \x0, \x0, \t1 /* final x0 */ |
| 95 | .endm |
| 96 | /* mul_gf4_n_gf4: two multiplies interleaved line by line. The pair (x2, x3) times (y2, y3) ends with the same values mul_gf4 would give. The pair (x0, x1) times (y0, y1) is combined differently: x1 becomes (x1 AND y0) XOR ((x0 XOR x1) AND y1) and x0 becomes ((x0 XOR x1) AND y1) XOR (x0 AND (y0 XOR y1)), in terms of entry values. All y operands are only read; t0 and t1 are scratch. */ |
| 97 | .macro mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1 |
| 98 | eor \t0, \y0, \y1 |
| 99 | eor \t1, \y2, \y3 |
| 100 | and \t0, \t0, \x0 |
| 101 | and \t1, \t1, \x2 |
| 102 | eor \x0, \x0, \x1 |
| 103 | eor \x2, \x2, \x3 |
| 104 | and \x1, \x1, \y0 |
| 105 | and \x3, \x3, \y2 |
| 106 | and \x0, \x0, \y1 |
| 107 | and \x2, \x2, \y3 |
| 108 | eor \x1, \x1, \x0 /* first pair: x1 takes the product term that mul_gf4 would leave in x0 */ |
| 109 | eor \x2, \x2, \x3 |
| 110 | eor \x0, \x0, \t0 |
| 111 | eor \x3, \x3, \t1 |
| 112 | .endm |
| 113 | /* mul_gf16_2: multiplies two four-operand groups, x0-x3 and x4-x7, by the same y0-y3 using mul_gf4 and mul_gf4_n_gf4; the results replace x0-x7. y0 and y1 are modified midway, but the XOR with y2 and y3 is applied twice, so all of y0-y3 hold their entry values again on exit. t0-t3 are scratch. */ |
| 114 | .macro mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \ |
| 115 | y0, y1, y2, y3, t0, t1, t2, t3 |
| 116 | eor \t0, \x0, \x2 |
| 117 | eor \t1, \x1, \x3 |
| 118 | mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3 |
| 119 | eor \y0, \y0, \y2 /* y0 and y1 temporarily hold y0 ^ y2 and y1 ^ y3 */ |
| 120 | eor \y1, \y1, \y3 |
| 121 | mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2 |
| 122 | eor \x0, \x0, \t0 |
| 123 | eor \x2, \x2, \t0 |
| 124 | eor \x1, \x1, \t1 |
| 125 | eor \x3, \x3, \t1 |
| 126 | eor \t0, \x4, \x6 /* second group starts here, same pattern with the two multiplies in the opposite order */ |
| 127 | eor \t1, \x5, \x7 |
| 128 | mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2 |
| 129 | eor \y0, \y0, \y2 /* second XOR restores the entry values of y0 and y1 */ |
| 130 | eor \y1, \y1, \y3 |
| 131 | mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3 |
| 132 | eor \x4, \x4, \t0 |
| 133 | eor \x6, \x6, \t0 |
| 134 | eor \x5, \x5, \t1 |
| 135 | eor \x7, \x7, \t1 |
| 136 | .endm |
| 137 | /* inv_gf256: nonlinear core shared by sbox and inv_sbox, invoked between their input and output transforms. Takes eight operands x0-x7 and overwrites them with the result; t0-t3 and s0-s3 are scratch. Built from AND, OR, XOR, NOT and BSL and finished by mul_gf16_2. The name indicates inversion in GF(2^8). Registers are reused heavily, so the statement order must be kept. */ |
| 138 | .macro inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \ |
| 139 | t0, t1, t2, t3, s0, s1, s2, s3 |
| 140 | eor \t3, \x4, \x6 |
| 141 | eor \t0, \x5, \x7 |
| 142 | eor \t1, \x1, \x3 |
| 143 | eor \s1, \x7, \x6 |
| 144 | eor \s0, \x0, \x2 |
| 145 | eor \s3, \t3, \t0 |
| 146 | orr \t2, \t0, \t1 |
| 147 | and \s2, \t3, \s0 |
| 148 | orr \t3, \t3, \s0 |
| 149 | eor \s0, \s0, \t1 |
| 150 | and \t0, \t0, \t1 |
| 151 | eor \t1, \x3, \x2 |
| 152 | and \s3, \s3, \s0 |
| 153 | and \s1, \s1, \t1 |
| 154 | eor \t1, \x4, \x5 |
| 155 | eor \s0, \x1, \x0 |
| 156 | eor \t3, \t3, \s1 |
| 157 | eor \t2, \t2, \s1 |
| 158 | and \s1, \t1, \s0 |
| 159 | orr \t1, \t1, \s0 |
| 160 | eor \t3, \t3, \s3 |
| 161 | eor \t0, \t0, \s1 |
| 162 | eor \t2, \t2, \s2 |
| 163 | eor \t1, \t1, \s3 |
| 164 | eor \t0, \t0, \s2 |
| 165 | and \s0, \x7, \x3 |
| 166 | eor \t1, \t1, \s2 |
| 167 | and \s1, \x6, \x2 |
| 168 | and \s2, \x5, \x1 |
| 169 | orr \s3, \x4, \x0 |
| 170 | eor \t3, \t3, \s0 |
| 171 | eor \t1, \t1, \s2 |
| 172 | eor \s0, \t0, \s3 |
| 173 | eor \t2, \t2, \s1 |
| 174 | and \s2, \t3, \t1 |
| 175 | eor \s1, \t2, \s2 |
| 176 | eor \s3, \s0, \s2 |
| 177 | bsl \s1, \t1, \s0 /* bsl is a bitwise select: where the destination bit is 1 take the first source, otherwise the second */ |
| 178 | not \t0, \s0 |
| 179 | bsl \s0, \s1, \s3 |
| 180 | bsl \t0, \s1, \s3 |
| 181 | bsl \s3, \t3, \t2 |
| 182 | eor \t3, \t3, \t2 |
| 183 | and \s2, \s0, \s3 |
| 184 | eor \t1, \t1, \t0 |
| 185 | eor \s2, \s2, \t3 |
| 186 | mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ |
| 187 | \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 |
| 188 | .endm |
| 189 | /* sbox: forward S-box on eight registers. b0-b7, t0-t3 and s0-s3 are bare register names; the .16b arrangement is appended here. Runs in_bs_ch, inv_gf256 and out_bs_ch with the operand order permuted between stages. The caller in this file, aesbs_encrypt8, reads the results back in the order b0, b1, b4, b6, b3, b7, b2, b5. t0-t3 and s0-s3 are scratch. */ |
| 190 | .macro sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ |
| 191 | t0, t1, t2, t3, s0, s1, s2, s3 |
| 192 | in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ |
| 193 | \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b |
| 194 | inv_gf256 \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \ |
| 195 | \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ |
| 196 | \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ |
| 197 | \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b |
| 198 | out_bs_ch \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \ |
| 199 | \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b |
| 200 | .endm |
| 201 | /* inv_sbox: inverse S-box on eight registers. b0-b7, t0-t3 and s0-s3 are bare register names; the .16b arrangement is appended here. Runs inv_in_bs_ch, inv_gf256 and inv_out_bs_ch with the operand order permuted between stages. The caller in this file, aesbs_decrypt8, reads the results back in the order b0, b1, b6, b4, b2, b7, b3, b5. t0-t3 and s0-s3 are scratch. */ |
| 202 | .macro inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \ |
| 203 | t0, t1, t2, t3, s0, s1, s2, s3 |
| 204 | inv_in_bs_ch \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \ |
| 205 | \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b |
| 206 | inv_gf256 \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \ |
| 207 | \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ |
| 208 | \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \ |
| 209 | \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b |
| 210 | inv_out_bs_ch \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \ |
| 211 | \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b |
| 212 | .endm |
| 213 | /* enc_next_rk: loads the 128-byte round key at bskey into v16-v23 and leaves bskey pointing 128 bytes further on. The three upper pairs are fetched with plain offsets first; the load of the lowest pair performs the post-increment. */ |
| 214 | .macro enc_next_rk |
| 215 | ldp q18, q19, [bskey, #32] |
| 216 | ldp q20, q21, [bskey, #64] |
| 217 | ldp q22, q23, [bskey, #96] |
| 218 | ldp q16, q17, [bskey], #128 |
| 219 | .endm |
| 220 | /* dec_next_rk: loads the 128-byte round key that ends at bskey into v16-v23 and leaves bskey pointing at its start. The three upper pairs are fetched with negative offsets first; the load of the lowest pair performs the pre-decrement. */ |
| 221 | .macro dec_next_rk |
| 222 | ldp q22, q23, [bskey, #-32] |
| 223 | ldp q20, q21, [bskey, #-64] |
| 224 | ldp q18, q19, [bskey, #-96] |
| 225 | ldp q16, q17, [bskey, #-128]! |
| 226 | .endm |
| 227 | /* add_round_key: XORs the key vectors v16-v23, as loaded by enc_next_rk or dec_next_rk, into x0-x7 respectively (x0 pairs with v16, x7 with v23). The eight XORs do not depend on one another; they are emitted from x7 down to x0. */ |
| 228 | .macro add_round_key, x0, x1, x2, x3, x4, x5, x6, x7 |
| 229 | eor \x7\().16b, \x7\().16b, v23.16b |
| 230 | eor \x6\().16b, \x6\().16b, v22.16b |
| 231 | eor \x5\().16b, \x5\().16b, v21.16b |
| 232 | eor \x4\().16b, \x4\().16b, v20.16b |
| 233 | eor \x3\().16b, \x3\().16b, v19.16b |
| 234 | eor \x2\().16b, \x2\().16b, v18.16b |
| 235 | eor \x1\().16b, \x1\().16b, v17.16b |
| 236 | eor \x0\().16b, \x0\().16b, v16.16b |
| 237 | .endm |
| 238 | /* shift_rows: applies one tbl byte permutation, selected by mask, to each of x0-x7 in place. The eight lookups do not depend on one another; they are emitted from x7 down to x0. */ |
| 239 | .macro shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask |
| 240 | tbl \x7\().16b, {\x7\().16b}, \mask\().16b |
| 241 | tbl \x6\().16b, {\x6\().16b}, \mask\().16b |
| 242 | tbl \x5\().16b, {\x5\().16b}, \mask\().16b |
| 243 | tbl \x4\().16b, {\x4\().16b}, \mask\().16b |
| 244 | tbl \x3\().16b, {\x3\().16b}, \mask\().16b |
| 245 | tbl \x2\().16b, {\x2\().16b}, \mask\().16b |
| 246 | tbl \x1\().16b, {\x1\().16b}, \mask\().16b |
| 247 | tbl \x0\().16b, {\x0\().16b}, \mask\().16b |
| 248 | .endm |
| 249 | /* mix_cols: MixColumns step on eight registers x0-x7, in place; t0-t7 are scratch. An ext with both sources equal rotates a vector by the given number of bytes (12 and 8 are used here). When the optional inv argument is non-blank, the last stage writes its results to a different register assignment; inv_mix_cols relies on that. Independent operations are interleaved, so the statement order must be kept. */ |
| 250 | .macro mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ |
| 251 | t0, t1, t2, t3, t4, t5, t6, t7, inv |
| 252 | ext \t0\().16b, \x0\().16b, \x0\().16b, #12 |
| 253 | ext \t1\().16b, \x1\().16b, \x1\().16b, #12 |
| 254 | eor \x0\().16b, \x0\().16b, \t0\().16b |
| 255 | ext \t2\().16b, \x2\().16b, \x2\().16b, #12 |
| 256 | eor \x1\().16b, \x1\().16b, \t1\().16b |
| 257 | ext \t3\().16b, \x3\().16b, \x3\().16b, #12 |
| 258 | eor \x2\().16b, \x2\().16b, \t2\().16b |
| 259 | ext \t4\().16b, \x4\().16b, \x4\().16b, #12 |
| 260 | eor \x3\().16b, \x3\().16b, \t3\().16b |
| 261 | ext \t5\().16b, \x5\().16b, \x5\().16b, #12 |
| 262 | eor \x4\().16b, \x4\().16b, \t4\().16b |
| 263 | ext \t6\().16b, \x6\().16b, \x6\().16b, #12 |
| 264 | eor \x5\().16b, \x5\().16b, \t5\().16b |
| 265 | ext \t7\().16b, \x7\().16b, \x7\().16b, #12 |
| 266 | eor \x6\().16b, \x6\().16b, \t6\().16b |
| 267 | eor \t1\().16b, \t1\().16b, \x0\().16b |
| 268 | eor \x7\().16b, \x7\().16b, \t7\().16b |
| 269 | ext \x0\().16b, \x0\().16b, \x0\().16b, #8 |
| 270 | eor \t2\().16b, \t2\().16b, \x1\().16b |
| 271 | eor \t0\().16b, \t0\().16b, \x7\().16b |
| 272 | eor \t1\().16b, \t1\().16b, \x7\().16b |
| 273 | ext \x1\().16b, \x1\().16b, \x1\().16b, #8 |
| 274 | eor \t5\().16b, \t5\().16b, \x4\().16b |
| 275 | eor \x0\().16b, \x0\().16b, \t0\().16b |
| 276 | eor \t6\().16b, \t6\().16b, \x5\().16b |
| 277 | eor \x1\().16b, \x1\().16b, \t1\().16b |
| 278 | ext \t0\().16b, \x4\().16b, \x4\().16b, #8 |
| 279 | eor \t4\().16b, \t4\().16b, \x3\().16b |
| 280 | ext \t1\().16b, \x5\().16b, \x5\().16b, #8 |
| 281 | eor \t7\().16b, \t7\().16b, \x6\().16b |
| 282 | ext \x4\().16b, \x3\().16b, \x3\().16b, #8 |
| 283 | eor \t3\().16b, \t3\().16b, \x2\().16b |
| 284 | ext \x5\().16b, \x7\().16b, \x7\().16b, #8 |
| 285 | eor \t4\().16b, \t4\().16b, \x7\().16b |
| 286 | ext \x3\().16b, \x6\().16b, \x6\().16b, #8 |
| 287 | eor \t3\().16b, \t3\().16b, \x7\().16b |
| 288 | ext \x6\().16b, \x2\().16b, \x2\().16b, #8 |
| 289 | eor \x7\().16b, \t1\().16b, \t5\().16b |
| 290 | .ifb \inv /* inv omitted: register assignment used by aesbs_encrypt8 */ |
| 291 | eor \x2\().16b, \t0\().16b, \t4\().16b |
| 292 | eor \x4\().16b, \x4\().16b, \t3\().16b |
| 293 | eor \x5\().16b, \x5\().16b, \t7\().16b |
| 294 | eor \x3\().16b, \x3\().16b, \t6\().16b |
| 295 | eor \x6\().16b, \x6\().16b, \t2\().16b |
| 296 | .else /* inv given: register assignment used by inv_mix_cols */ |
| 297 | eor \t3\().16b, \t3\().16b, \x4\().16b |
| 298 | eor \x5\().16b, \x5\().16b, \t7\().16b |
| 299 | eor \x2\().16b, \x3\().16b, \t6\().16b |
| 300 | eor \x3\().16b, \t0\().16b, \t4\().16b |
| 301 | eor \x4\().16b, \x6\().16b, \t2\().16b |
| 302 | mov \x6\().16b, \t3\().16b |
| 303 | .endif |
| 304 | .endm |
| 305 | /* inv_mix_cols: inverse MixColumns on x0-x7, in place; t0-t7 are scratch. It first forms each register XOR its own 8-byte rotation in t0-t7, folds selected combinations of those into x0-x7, and then finishes with mix_cols invoked with inv set to 1. */ |
| 306 | .macro inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \ |
| 307 | t0, t1, t2, t3, t4, t5, t6, t7 |
| 308 | ext \t0\().16b, \x0\().16b, \x0\().16b, #8 |
| 309 | ext \t6\().16b, \x6\().16b, \x6\().16b, #8 |
| 310 | ext \t7\().16b, \x7\().16b, \x7\().16b, #8 |
| 311 | eor \t0\().16b, \t0\().16b, \x0\().16b |
| 312 | ext \t1\().16b, \x1\().16b, \x1\().16b, #8 |
| 313 | eor \t6\().16b, \t6\().16b, \x6\().16b |
| 314 | ext \t2\().16b, \x2\().16b, \x2\().16b, #8 |
| 315 | eor \t7\().16b, \t7\().16b, \x7\().16b |
| 316 | ext \t3\().16b, \x3\().16b, \x3\().16b, #8 |
| 317 | eor \t1\().16b, \t1\().16b, \x1\().16b |
| 318 | ext \t4\().16b, \x4\().16b, \x4\().16b, #8 |
| 319 | eor \t2\().16b, \t2\().16b, \x2\().16b |
| 320 | ext \t5\().16b, \x5\().16b, \x5\().16b, #8 |
| 321 | eor \t3\().16b, \t3\().16b, \x3\().16b |
| 322 | eor \t4\().16b, \t4\().16b, \x4\().16b |
| 323 | eor \t5\().16b, \t5\().16b, \x5\().16b |
| 324 | eor \x0\().16b, \x0\().16b, \t6\().16b |
| 325 | eor \x1\().16b, \x1\().16b, \t6\().16b |
| 326 | eor \x2\().16b, \x2\().16b, \t0\().16b |
| 327 | eor \x4\().16b, \x4\().16b, \t2\().16b |
| 328 | eor \x3\().16b, \x3\().16b, \t1\().16b |
| 329 | eor \x1\().16b, \x1\().16b, \t7\().16b |
| 330 | eor \x2\().16b, \x2\().16b, \t7\().16b |
| 331 | eor \x4\().16b, \x4\().16b, \t6\().16b |
| 332 | eor \x5\().16b, \x5\().16b, \t3\().16b |
| 333 | eor \x3\().16b, \x3\().16b, \t6\().16b |
| 334 | eor \x6\().16b, \x6\().16b, \t4\().16b |
| 335 | eor \x4\().16b, \x4\().16b, \t7\().16b |
| 336 | eor \x5\().16b, \x5\().16b, \t7\().16b |
| 337 | eor \x7\().16b, \x7\().16b, \t5\().16b |
| 338 | mix_cols \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \ |
| 339 | \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1 |
| 340 | .endm |
| 341 | /* swapmove_2x: two swap-moves interleaved instruction by instruction, one on the pair (a0, b0) with scratch t0 and one on (a1, b1) with scratch t1. Within each 64-bit lane, the bits of a selected by mask are exchanged with the bits of b that sit n positions higher. n must be an immediate. */ |
| 342 | .macro swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1 |
| 343 | ushr \t0\().2d, \b0\().2d, #\n /* bring the bits of b down by n */ |
| 344 | ushr \t1\().2d, \b1\().2d, #\n |
| 345 | eor \t0\().16b, \t0\().16b, \a0\().16b /* difference between the two bit groups */ |
| 346 | eor \t1\().16b, \t1\().16b, \a1\().16b |
| 347 | and \t0\().16b, \t0\().16b, \mask\().16b /* keep only the positions being swapped */ |
| 348 | and \t1\().16b, \t1\().16b, \mask\().16b |
| 349 | eor \a0\().16b, \a0\().16b, \t0\().16b /* a receives the bits that came from b */ |
| 350 | shl \t0\().2d, \t0\().2d, #\n |
| 351 | eor \a1\().16b, \a1\().16b, \t1\().16b |
| 352 | shl \t1\().2d, \t1\().2d, #\n |
| 353 | eor \b0\().16b, \b0\().16b, \t0\().16b /* b receives the bits that came from a */ |
| 354 | eor \b1\().16b, \b1\().16b, \t1\().16b |
| 355 | .endm |
| 356 | /* bitslice: rearranges bits across the eight registers with three stages of swapmove_2x, using shift and mask pairs 1 with 0x55, 2 with 0x33 and 4 with 0x0f. Note that the parameter list is declared in reverse, x7 first. t0 and t1 hold masks, t2 and t3 are scratch for swapmove_2x. aesbs_encrypt8 and aesbs_decrypt8 each invoke it once before the rounds and once after them. */ |
| 357 | .macro bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3 |
| 358 | movi \t0\().16b, #0x55 |
| 359 | movi \t1\().16b, #0x33 |
| 360 | swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3 |
| 361 | swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3 |
| 362 | movi \t0\().16b, #0x0f /* the 0x55 mask is no longer needed, so t0 is reused for the last stage */ |
| 363 | swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3 |
| 364 | swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3 |
| 365 | swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3 |
| 366 | swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3 |
| 367 | .endm |
| 368 | |
| 369 | /* Byte-permutation index vectors for tbl, placed in the text section and fetched with PC-relative ldr. M0 is used by aesbs_convert_key; M0SR, SR and SRM0 by aesbs_encrypt8; M0ISR, ISR and ISRM0 by aesbs_decrypt8. */ |
| 370 | .align 6 |
| 371 | M0: .octa 0x0004080c0105090d02060a0e03070b0f /* applied to every round key in the conversion loop */ |
| 372 | |
| 373 | M0SR: .octa 0x0004080c05090d010a0e02060f03070b /* applied to each input block before bitslicing when encrypting */ |
| 374 | SR: .octa 0x0f0e0d0c0a09080b0504070600030201 /* shift_rows mask for every encryption pass but the last */ |
| 375 | SRM0: .octa 0x01060b0c0207080d0304090e00050a0f /* shift_rows mask for the last encryption pass */ |
| 376 | |
| 377 | M0ISR: .octa 0x0004080c0d0105090a0e0206070b0f03 /* applied to each input block before bitslicing when decrypting */ |
| 378 | ISR: .octa 0x0f0e0d0c080b0a090504070602010003 /* shift_rows mask for every decryption pass but the last */ |
| 379 | ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f /* shift_rows mask for the last decryption pass */ |
| 380 | /* aesbs_convert_key: rewrites a standard AES key schedule into the layout read by aesbs_encrypt8 and aesbs_decrypt8. Output layout: 16 bytes holding round key 0 unchanged, then 128 bytes (eight mask vectors, one per bit position) for each of round keys 1 to rounds-1, then 16 bytes holding the last round key XORed with 0x63. In: x0 = out, x1 = rk, x2 = rounds. Overwrites v0-v17, x0-x2 and the flags. NOTE(review): v8-v15 are overwritten without being saved; presumably callers only run this inside a kernel-mode NEON section, confirm against the callers. */ |
| 381 | /* |
| 382 | * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) |
| 383 | */ |
| 384 | SYM_FUNC_START(aesbs_convert_key) |
| 385 | ld1 {v7.4s}, [x1], #16 // load round 0 key |
| 386 | ld1 {v17.4s}, [x1], #16 // load round 1 key |
| 387 | |
| 388 | movi v8.16b, #0x01 // bit masks |
| 389 | movi v9.16b, #0x02 |
| 390 | movi v10.16b, #0x04 |
| 391 | movi v11.16b, #0x08 |
| 392 | movi v12.16b, #0x10 |
| 393 | movi v13.16b, #0x20 |
| 394 | movi v14.16b, #0x40 |
| 395 | movi v15.16b, #0x80 |
| 396 | ldr q16, M0 /* tbl index vector applied to every round key in the loop */ |
| 397 | |
| 398 | sub x2, x2, #1 /* the loop below runs rounds-1 times */ |
| 399 | str q7, [x0], #16 // save round 0 key |
| 400 | |
| 401 | .Lkey_loop: |
| 402 | tbl v7.16b ,{v17.16b}, v16.16b /* permute the bytes of the current round key with M0 */ |
| 403 | ld1 {v17.4s}, [x1], #16 // load next round key |
| 404 | |
| 405 | cmtst v0.16b, v7.16b, v8.16b /* each byte becomes 0xff when the tested bit is set and 0x00 otherwise */ |
| 406 | cmtst v1.16b, v7.16b, v9.16b |
| 407 | cmtst v2.16b, v7.16b, v10.16b |
| 408 | cmtst v3.16b, v7.16b, v11.16b |
| 409 | cmtst v4.16b, v7.16b, v12.16b |
| 410 | cmtst v5.16b, v7.16b, v13.16b |
| 411 | cmtst v6.16b, v7.16b, v14.16b |
| 412 | cmtst v7.16b, v7.16b, v15.16b /* v7 is consumed last because it is also the source */ |
| 413 | not v0.16b, v0.16b /* invert the vectors for bits 0, 1, 5 and 6, which are the set bits of 0x63 */ |
| 414 | not v1.16b, v1.16b |
| 415 | not v5.16b, v5.16b |
| 416 | not v6.16b, v6.16b |
| 417 | |
| 418 | subs x2, x2, #1 |
| 419 | stp q0, q1, [x0], #128 /* 128 bytes per round key: x0 advances once and the next three stores use negative offsets */ |
| 420 | stp q2, q3, [x0, #-96] |
| 421 | stp q4, q5, [x0, #-64] |
| 422 | stp q6, q7, [x0, #-32] |
| 423 | b.ne .Lkey_loop |
| 424 | |
| 425 | movi v7.16b, #0x63 // XOR the constant 0x63 into the last round key (this file has no .L63 symbol) |
| 426 | eor v17.16b, v17.16b, v7.16b |
| 427 | str q17, [x0] /* v17 still holds the key loaded by the final loop iteration */ |
| 428 | ret |
| 429 | SYM_FUNC_END(aesbs_convert_key) |
| 430 | /* aesbs_encrypt8: encrypts eight blocks in parallel. In: v0-v7 = blocks 0-7, bskey = start of the converted key schedule, rounds = round count. Out: blocks 0-7 in v0, v1, v4, v6, v3, v7, v2, v5, in that order. Overwrites v8-v24, bskey, rounds and the flags. */ |
| 431 | .align 4 |
| 432 | SYM_FUNC_START_LOCAL(aesbs_encrypt8) |
| 433 | ldr q9, [bskey], #16 // round 0 key |
| 434 | ldr q8, M0SR /* byte permutation applied to every block ahead of bitslicing */ |
| 435 | ldr q24, SR /* shift_rows mask for every pass except the last */ |
| 436 | |
| 437 | eor v10.16b, v0.16b, v9.16b // xor with round0 key |
| 438 | eor v11.16b, v1.16b, v9.16b |
| 439 | tbl v0.16b, {v10.16b}, v8.16b |
| 440 | eor v12.16b, v2.16b, v9.16b |
| 441 | tbl v1.16b, {v11.16b}, v8.16b |
| 442 | eor v13.16b, v3.16b, v9.16b |
| 443 | tbl v2.16b, {v12.16b}, v8.16b |
| 444 | eor v14.16b, v4.16b, v9.16b |
| 445 | tbl v3.16b, {v13.16b}, v8.16b |
| 446 | eor v15.16b, v5.16b, v9.16b |
| 447 | tbl v4.16b, {v14.16b}, v8.16b |
| 448 | eor v10.16b, v6.16b, v9.16b |
| 449 | tbl v5.16b, {v15.16b}, v8.16b |
| 450 | eor v11.16b, v7.16b, v9.16b |
| 451 | tbl v6.16b, {v10.16b}, v8.16b |
| 452 | tbl v7.16b, {v11.16b}, v8.16b |
| 453 | |
| 454 | bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 |
| 455 | |
| 456 | sub rounds, rounds, #1 |
| 457 | b .Lenc_sbox /* the first pass skips shift_rows */ |
| 458 | |
| 459 | .Lenc_loop: |
| 460 | shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 |
| 461 | .Lenc_sbox: |
| 462 | sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ |
| 463 | v13, v14, v15 |
| 464 | subs rounds, rounds, #1 |
| 465 | b.cc .Lenc_done /* a borrow means the final sbox pass has just run */ |
| 466 | |
| 467 | enc_next_rk |
| 468 | |
| 469 | mix_cols v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \ |
| 470 | v13, v14, v15 |
| 471 | |
| 472 | add_round_key v0, v1, v2, v3, v4, v5, v6, v7 |
| 473 | |
| 474 | b.ne .Lenc_loop /* flags are still those of the subs above; nothing in between changes them */ |
| 475 | ldr q24, SRM0 /* rounds just reached 0: switch to the mask for the final shift_rows */ |
| 476 | b .Lenc_loop |
| 477 | |
| 478 | .Lenc_done: |
| 479 | ldr q12, [bskey] // last round key |
| 480 | |
| 481 | bitslice v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11 |
| 482 | |
| 483 | eor v0.16b, v0.16b, v12.16b |
| 484 | eor v1.16b, v1.16b, v12.16b |
| 485 | eor v4.16b, v4.16b, v12.16b |
| 486 | eor v6.16b, v6.16b, v12.16b |
| 487 | eor v3.16b, v3.16b, v12.16b |
| 488 | eor v7.16b, v7.16b, v12.16b |
| 489 | eor v2.16b, v2.16b, v12.16b |
| 490 | eor v5.16b, v5.16b, v12.16b |
| 491 | ret |
| 492 | SYM_FUNC_END(aesbs_encrypt8) |
| 493 | /* aesbs_decrypt8: decrypts eight blocks in parallel, walking the converted key schedule from its end to its start. In: v0-v7 = blocks 0-7, bskey = start of the converted key schedule, rounds = round count. Out: blocks 0-7 in v0, v1, v6, v4, v2, v7, v3, v5, in that order. Overwrites v8-v24, x9, bskey, rounds and the flags. */ |
| 494 | .align 4 |
| 495 | SYM_FUNC_START_LOCAL(aesbs_decrypt8) |
| 496 | lsl x9, rounds, #7 /* x9 = rounds * 128 */ |
| 497 | add bskey, bskey, x9 |
| 498 | |
| 499 | ldr q9, [bskey, #-112]! // first key applied when decrypting: the 16-byte entry at the end of the schedule |
| 500 | ldr q8, M0ISR /* byte permutation applied to every block ahead of bitslicing */ |
| 501 | ldr q24, ISR /* shift_rows mask for every pass except the last */ |
| 502 | |
| 503 | eor v10.16b, v0.16b, v9.16b // xor with the key loaded above |
| 504 | eor v11.16b, v1.16b, v9.16b |
| 505 | tbl v0.16b, {v10.16b}, v8.16b |
| 506 | eor v12.16b, v2.16b, v9.16b |
| 507 | tbl v1.16b, {v11.16b}, v8.16b |
| 508 | eor v13.16b, v3.16b, v9.16b |
| 509 | tbl v2.16b, {v12.16b}, v8.16b |
| 510 | eor v14.16b, v4.16b, v9.16b |
| 511 | tbl v3.16b, {v13.16b}, v8.16b |
| 512 | eor v15.16b, v5.16b, v9.16b |
| 513 | tbl v4.16b, {v14.16b}, v8.16b |
| 514 | eor v10.16b, v6.16b, v9.16b |
| 515 | tbl v5.16b, {v15.16b}, v8.16b |
| 516 | eor v11.16b, v7.16b, v9.16b |
| 517 | tbl v6.16b, {v10.16b}, v8.16b |
| 518 | tbl v7.16b, {v11.16b}, v8.16b |
| 519 | |
| 520 | bitslice v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 |
| 521 | |
| 522 | sub rounds, rounds, #1 |
| 523 | b .Ldec_sbox /* the first pass skips shift_rows */ |
| 524 | |
| 525 | .Ldec_loop: |
| 526 | shift_rows v0, v1, v2, v3, v4, v5, v6, v7, v24 |
| 527 | .Ldec_sbox: |
| 528 | inv_sbox v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \ |
| 529 | v13, v14, v15 |
| 530 | subs rounds, rounds, #1 |
| 531 | b.cc .Ldec_done /* a borrow means the final inv_sbox pass has just run */ |
| 532 | |
| 533 | dec_next_rk |
| 534 | |
| 535 | add_round_key v0, v1, v6, v4, v2, v7, v3, v5 |
| 536 | |
| 537 | inv_mix_cols v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \ |
| 538 | v13, v14, v15 |
| 539 | |
| 540 | b.ne .Ldec_loop /* flags are still those of the subs above; nothing in between changes them */ |
| 541 | ldr q24, ISRM0 /* rounds just reached 0: switch to the mask for the final shift_rows */ |
| 542 | b .Ldec_loop |
| 543 | .Ldec_done: |
| 544 | ldr q12, [bskey, #-16] // last key applied when decrypting: the 16-byte entry at the start of the schedule |
| 545 | |
| 546 | bitslice v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11 |
| 547 | |
| 548 | eor v0.16b, v0.16b, v12.16b |
| 549 | eor v1.16b, v1.16b, v12.16b |
| 550 | eor v6.16b, v6.16b, v12.16b |
| 551 | eor v4.16b, v4.16b, v12.16b |
| 552 | eor v2.16b, v2.16b, v12.16b |
| 553 | eor v7.16b, v7.16b, v12.16b |
| 554 | eor v3.16b, v3.16b, v12.16b |
| 555 | eor v5.16b, v5.16b, v12.16b |
| 556 | ret |
| 557 | SYM_FUNC_END(aesbs_decrypt8) |
| 558 | /* __ecb_crypt: shared body of the two ECB entry points. do8 names the 8-block routine to call and o0-o7 name the registers in which that routine leaves blocks 0-7. In: x0 = out, x1 = in, x2 = rk, x3 = rounds, x4 = blocks. Works in batches of up to eight blocks; a short final batch loads and stores only the blocks that exist. frame_push and frame_pop come from asm/assembler.h; presumably they save and restore x19-x23 together with the frame record, confirm there. */ |
| 559 | /* |
| 560 | * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
| 561 | * int blocks) |
| 562 | * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
| 563 | * int blocks) |
| 564 | */ |
| 565 | .macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 |
| 566 | frame_push 5 |
| 567 | |
| 568 | mov x19, x0 /* keep the arguments in x19-x23 for the whole loop */ |
| 569 | mov x20, x1 |
| 570 | mov x21, x2 |
| 571 | mov x22, x3 |
| 572 | mov x23, x4 |
| 573 | |
| 574 | 99: mov x5, #1 |
| 575 | lsl x5, x5, x23 /* x5 = 1 shifted left by the number of blocks still to do */ |
| 576 | subs w23, w23, #8 /* take one batch off the count */ |
| 577 | csel x23, x23, xzr, pl /* clamp the remaining count at 0 */ |
| 578 | csel x5, x5, xzr, mi /* keep the stop bit only when fewer than 8 blocks were left, else x5 = 0 */ |
| 579 | |
| 580 | ld1 {v0.16b}, [x20], #16 |
| 581 | tbnz x5, #1, 0f /* bit k set means only k blocks are left, so stop loading after k */ |
| 582 | ld1 {v1.16b}, [x20], #16 |
| 583 | tbnz x5, #2, 0f |
| 584 | ld1 {v2.16b}, [x20], #16 |
| 585 | tbnz x5, #3, 0f |
| 586 | ld1 {v3.16b}, [x20], #16 |
| 587 | tbnz x5, #4, 0f |
| 588 | ld1 {v4.16b}, [x20], #16 |
| 589 | tbnz x5, #5, 0f |
| 590 | ld1 {v5.16b}, [x20], #16 |
| 591 | tbnz x5, #6, 0f |
| 592 | ld1 {v6.16b}, [x20], #16 |
| 593 | tbnz x5, #7, 0f |
| 594 | ld1 {v7.16b}, [x20], #16 |
| 595 | |
| 596 | 0: mov bskey, x21 |
| 597 | mov rounds, x22 |
| 598 | bl \do8 /* always processes all eight registers; the 8-block routines leave x5 untouched */ |
| 599 | |
| 600 | st1 {\o0\().16b}, [x19], #16 |
| 601 | tbnz x5, #1, 1f /* same stop bit: store only the blocks that were loaded */ |
| 602 | st1 {\o1\().16b}, [x19], #16 |
| 603 | tbnz x5, #2, 1f |
| 604 | st1 {\o2\().16b}, [x19], #16 |
| 605 | tbnz x5, #3, 1f |
| 606 | st1 {\o3\().16b}, [x19], #16 |
| 607 | tbnz x5, #4, 1f |
| 608 | st1 {\o4\().16b}, [x19], #16 |
| 609 | tbnz x5, #5, 1f |
| 610 | st1 {\o5\().16b}, [x19], #16 |
| 611 | tbnz x5, #6, 1f |
| 612 | st1 {\o6\().16b}, [x19], #16 |
| 613 | tbnz x5, #7, 1f |
| 614 | st1 {\o7\().16b}, [x19], #16 |
| 615 | |
| 616 | cbz x23, 1f /* nothing left after a full batch */ |
| 617 | b 99b |
| 618 | |
| 619 | 1: frame_pop |
| 620 | ret |
| 621 | .endm |
| 622 | /* aesbs_ecb_encrypt: ECB encryption entry point. Expands __ecb_crypt with aesbs_encrypt8 and the register order in which that routine returns blocks 0-7. */ |
| 623 | .align 4 |
| 624 | SYM_TYPED_FUNC_START(aesbs_ecb_encrypt) |
| 625 | __ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 |
| 626 | SYM_FUNC_END(aesbs_ecb_encrypt) |
| 627 | /* aesbs_ecb_decrypt: ECB decryption entry point. Expands __ecb_crypt with aesbs_decrypt8 and the register order in which that routine returns blocks 0-7. */ |
| 628 | .align 4 |
| 629 | SYM_TYPED_FUNC_START(aesbs_ecb_decrypt) |
| 630 | __ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 |
| 631 | SYM_FUNC_END(aesbs_ecb_decrypt) |
| 632 | /* aesbs_cbc_decrypt: CBC decryption in batches of up to eight blocks. In: x0 = out, x1 = in, x2 = rk, x3 = rounds, x4 = blocks, x5 = iv. Ciphertext blocks 0-6 of a batch are copied to v25-v31 as they are loaded so each can be XORed into the following plaintext. The last ciphertext block handled in a batch is written back to iv. frame_push and frame_pop come from asm/assembler.h; presumably they save and restore x19-x24 together with the frame record, confirm there. */ |
| 633 | /* |
| 634 | * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
| 635 | * int blocks, u8 iv[]) |
| 636 | */ |
| 637 | .align 4 |
| 638 | SYM_FUNC_START(aesbs_cbc_decrypt) |
| 639 | frame_push 6 |
| 640 | |
| 641 | mov x19, x0 |
| 642 | mov x20, x1 |
| 643 | mov x21, x2 |
| 644 | mov x22, x3 |
| 645 | mov x23, x4 |
| 646 | mov x24, x5 |
| 647 | |
| 648 | 99: mov x6, #1 |
| 649 | lsl x6, x6, x23 /* x6 = 1 shifted left by the number of blocks still to do */ |
| 650 | subs w23, w23, #8 |
| 651 | csel x23, x23, xzr, pl /* clamp the remaining count at 0 */ |
| 652 | csel x6, x6, xzr, mi /* keep the stop bit only when fewer than 8 blocks were left, else x6 = 0 */ |
| 653 | |
| 654 | ld1 {v0.16b}, [x20], #16 |
| 655 | mov v25.16b, v0.16b /* keep ciphertext 0 for chaining into plaintext 1 */ |
| 656 | tbnz x6, #1, 0f |
| 657 | ld1 {v1.16b}, [x20], #16 |
| 658 | mov v26.16b, v1.16b |
| 659 | tbnz x6, #2, 0f |
| 660 | ld1 {v2.16b}, [x20], #16 |
| 661 | mov v27.16b, v2.16b |
| 662 | tbnz x6, #3, 0f |
| 663 | ld1 {v3.16b}, [x20], #16 |
| 664 | mov v28.16b, v3.16b |
| 665 | tbnz x6, #4, 0f |
| 666 | ld1 {v4.16b}, [x20], #16 |
| 667 | mov v29.16b, v4.16b |
| 668 | tbnz x6, #5, 0f |
| 669 | ld1 {v5.16b}, [x20], #16 |
| 670 | mov v30.16b, v5.16b |
| 671 | tbnz x6, #6, 0f |
| 672 | ld1 {v6.16b}, [x20], #16 |
| 673 | mov v31.16b, v6.16b |
| 674 | tbnz x6, #7, 0f |
| 675 | ld1 {v7.16b}, [x20] /* no post-increment: ciphertext 7 is read again below to become the next IV */ |
| 676 | |
| 677 | 0: mov bskey, x21 |
| 678 | mov rounds, x22 |
| 679 | bl aesbs_decrypt8 |
| 680 | |
| 681 | ld1 {v24.16b}, [x24] // load IV |
| 682 | |
| 683 | eor v1.16b, v1.16b, v25.16b /* plaintext k = decrypted block k XOR ciphertext k-1; the left-hand registers follow the output order of aesbs_decrypt8 */ |
| 684 | eor v6.16b, v6.16b, v26.16b |
| 685 | eor v4.16b, v4.16b, v27.16b |
| 686 | eor v2.16b, v2.16b, v28.16b |
| 687 | eor v7.16b, v7.16b, v29.16b |
| 688 | eor v0.16b, v0.16b, v24.16b /* block 0 chains from the IV */ |
| 689 | eor v3.16b, v3.16b, v30.16b |
| 690 | eor v5.16b, v5.16b, v31.16b |
| 691 | |
| 692 | st1 {v0.16b}, [x19], #16 |
| 693 | mov v24.16b, v25.16b /* v24 tracks the ciphertext of the block just stored, which is the IV for whatever follows */ |
| 694 | tbnz x6, #1, 1f |
| 695 | st1 {v1.16b}, [x19], #16 |
| 696 | mov v24.16b, v26.16b |
| 697 | tbnz x6, #2, 1f |
| 698 | st1 {v6.16b}, [x19], #16 |
| 699 | mov v24.16b, v27.16b |
| 700 | tbnz x6, #3, 1f |
| 701 | st1 {v4.16b}, [x19], #16 |
| 702 | mov v24.16b, v28.16b |
| 703 | tbnz x6, #4, 1f |
| 704 | st1 {v2.16b}, [x19], #16 |
| 705 | mov v24.16b, v29.16b |
| 706 | tbnz x6, #5, 1f |
| 707 | st1 {v7.16b}, [x19], #16 |
| 708 | mov v24.16b, v30.16b |
| 709 | tbnz x6, #6, 1f |
| 710 | st1 {v3.16b}, [x19], #16 |
| 711 | mov v24.16b, v31.16b |
| 712 | tbnz x6, #7, 1f |
| 713 | ld1 {v24.16b}, [x20], #16 /* re-read ciphertext 7 and step the input pointer past it */ |
| 714 | st1 {v5.16b}, [x19], #16 |
| 715 | 1: st1 {v24.16b}, [x24] // store IV |
| 716 | |
| 717 | cbz x23, 2f |
| 718 | b 99b |
| 719 | |
| 720 | 2: frame_pop |
| 721 | ret |
| 722 | SYM_FUNC_END(aesbs_cbc_decrypt) |
| 723 | /* next_tweak: out = in shifted left by one bit as a 128-bit value, with the top bit of each 64-bit lane selecting a constant from const that is XORed into the other lane. With const holding 1 in the low lane and 0x87 in the high lane, as __xts_crypt8 builds it, the carry out of the low lane enters the high lane and an overflow out of the high lane is folded back into the low lane as 0x87. tmp is scratch. in is read before out is written, so the order must be kept if the two could ever alias. */ |
| 724 | .macro next_tweak, out, in, const, tmp |
| 725 | sshr \tmp\().2d, \in\().2d, #63 /* all ones in each 64-bit lane whose top bit is set, else zero */ |
| 726 | and \tmp\().16b, \tmp\().16b, \const\().16b |
| 727 | add \out\().2d, \in\().2d, \in\().2d /* per-lane shift left by one */ |
| 728 | ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 /* swap the two 64-bit halves so each correction lands in the other lane */ |
| 729 | eor \out\().16b, \out\().16b, \tmp\().16b |
| 730 | .endm |
| 731 | /* __xts_crypt8: common front half of one XTS batch. In: x1 = in, x2 = rk, x3 = rounds, x6 = 32-byte scratch buffer, x16 = address of the 8-block routine, v25 = tweak for block 0. Loads eight blocks, derives tweaks v26-v31, v16 and v17 from v25, XORs the first eight tweaks into v0-v7, saves v16 and v17 at x6 and then jumps to x16, so the 8-block routine returns directly to the caller of this function. */ |
| 732 | /* |
| 733 | * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
| 734 | * int blocks, u8 iv[]) |
| 735 | * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
| 736 | * int blocks, u8 iv[]) |
| 737 | */ |
| 738 | SYM_FUNC_START_LOCAL(__xts_crypt8) |
| 739 | movi v18.2s, #0x1 |
| 740 | movi v19.2s, #0x87 |
| 741 | uzp1 v18.4s, v18.4s, v19.4s /* v18 = 1 in the low 64-bit lane and 0x87 in the high one */ |
| 742 | |
| 743 | ld1 {v0.16b-v3.16b}, [x1], #64 |
| 744 | ld1 {v4.16b-v7.16b}, [x1], #64 |
| 745 | |
| 746 | next_tweak v26, v25, v18, v19 |
| 747 | next_tweak v27, v26, v18, v19 |
| 748 | next_tweak v28, v27, v18, v19 |
| 749 | next_tweak v29, v28, v18, v19 |
| 750 | next_tweak v30, v29, v18, v19 |
| 751 | next_tweak v31, v30, v18, v19 |
| 752 | next_tweak v16, v31, v18, v19 /* v16 = tweak for block 7 */ |
| 753 | next_tweak v17, v16, v18, v19 /* v17 = tweak for block 0 of the following batch */ |
| 754 | |
| 755 | eor v0.16b, v0.16b, v25.16b |
| 756 | eor v1.16b, v1.16b, v26.16b |
| 757 | eor v2.16b, v2.16b, v27.16b |
| 758 | eor v3.16b, v3.16b, v28.16b |
| 759 | eor v4.16b, v4.16b, v29.16b |
| 760 | eor v5.16b, v5.16b, v30.16b |
| 761 | eor v6.16b, v6.16b, v31.16b |
| 762 | eor v7.16b, v7.16b, v16.16b |
| 763 | |
| 764 | stp q16, q17, [x6] /* spilled because the 8-block routines load round keys into v16-v23 */ |
| 765 | |
| 766 | mov bskey, x2 |
| 767 | mov rounds, x3 |
| 768 | br x16 /* tail call: no return to this function */ |
| 769 | SYM_FUNC_END(__xts_crypt8) |
| 770 | /* __xts_crypt: shared body of the two XTS entry points. do8 names the 8-block routine and o0-o7 name the registers in which it leaves blocks 0-7. In: x0 = out, x1 = in, x2 = rk, x3 = rounds, x4 = blocks, x5 = iv. Every iteration processes exactly eight blocks; there is no path for a shorter batch. The tweak for the next call is written back to iv. */ |
| 771 | .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 |
| 772 | frame_push 0, 32 /* presumably the second argument reserves 32 bytes of local storage, see asm/assembler.h */ |
| 773 | add x6, sp, #.Lframe_local_offset /* x6 = address of that local storage, passed to __xts_crypt8 */ |
| 774 | |
| 775 | ld1 {v25.16b}, [x5] /* initial tweak */ |
| 776 | |
| 777 | 0: adr x16, \do8 |
| 778 | bl __xts_crypt8 /* comes back here only after do8 has run */ |
| 779 | |
| 780 | eor v16.16b, \o0\().16b, v25.16b |
| 781 | eor v17.16b, \o1\().16b, v26.16b |
| 782 | eor v18.16b, \o2\().16b, v27.16b |
| 783 | eor v19.16b, \o3\().16b, v28.16b |
| 784 | |
| 785 | ldp q24, q25, [x6] /* v24 = tweak for block 7, v25 = first tweak of the following batch */ |
| 786 | |
| 787 | eor v20.16b, \o4\().16b, v29.16b |
| 788 | eor v21.16b, \o5\().16b, v30.16b |
| 789 | eor v22.16b, \o6\().16b, v31.16b |
| 790 | eor v23.16b, \o7\().16b, v24.16b |
| 791 | |
| 792 | st1 {v16.16b-v19.16b}, [x0], #64 |
| 793 | st1 {v20.16b-v23.16b}, [x0], #64 |
| 794 | |
| 795 | subs x4, x4, #8 |
| 796 | b.gt 0b |
| 797 | |
| 798 | st1 {v25.16b}, [x5] /* hand the next tweak back through iv */ |
| 799 | frame_pop |
| 800 | ret |
| 801 | .endm |
| 802 | /* aesbs_xts_encrypt: XTS encryption entry point. Expands __xts_crypt with aesbs_encrypt8 and the register order in which that routine returns blocks 0-7. */ |
| 803 | SYM_TYPED_FUNC_START(aesbs_xts_encrypt) |
| 804 | __xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5 |
| 805 | SYM_FUNC_END(aesbs_xts_encrypt) |
| 806 | /* aesbs_xts_decrypt: XTS decryption entry point. Expands __xts_crypt with aesbs_decrypt8 and the register order in which that routine returns blocks 0-7. */ |
| 807 | SYM_TYPED_FUNC_START(aesbs_xts_decrypt) |
| 808 | __xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5 |
| 809 | SYM_FUNC_END(aesbs_xts_decrypt) |
| 810 | /* next_ctr: writes the 128-bit counter held in x7 (upper 64 bits) and x8 (lower 64 bits) into register v with the bytes of each 64-bit half reversed, then increments the counter in x7 and x8 by one. v is a bare register name. Changes the flags. */ |
| 811 | .macro next_ctr, v |
| 812 | mov \v\().d[1], x8 |
| 813 | adds x8, x8, #1 /* the carry out of the low half is consumed by the adc below */ |
| 814 | mov \v\().d[0], x7 /* reads x7 before the carry is added; this move leaves the flags alone */ |
| 815 | adc x7, x7, xzr |
| 816 | rev64 \v\().16b, \v\().16b /* byte-reverse each 64-bit lane */ |
| 817 | .endm |
| 818 | /* aesbs_ctr_encrypt: CTR mode in batches of exactly eight blocks; there is no path for a shorter batch. In: x0 = out, x1 = in, x2 = rk, x3 = rounds, x4 = blocks, x5 = iv. The counter block at iv is used unchanged for block 0, the following ones are produced by next_ctr, and the counter for the next call is stored back to iv. Overwrites v8-v15 among others. NOTE(review): presumably acceptable because callers run this inside a kernel-mode NEON section, confirm against the callers. */ |
| 819 | /* |
| 820 | * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], |
| 821 | * int rounds, int blocks, u8 iv[]) |
| 822 | */ |
| 823 | SYM_FUNC_START(aesbs_ctr_encrypt) |
| 824 | frame_push 0 |
| 825 | ldp x7, x8, [x5] /* x7 = first 8 bytes of iv, x8 = last 8 bytes */ |
| 826 | ld1 {v0.16b}, [x5] /* block 0 uses the counter exactly as stored */ |
| 827 | CPU_LE( rev x7, x7 ) /* CPU_LE comes from asm/assembler.h; presumably it emits its argument only on little-endian builds */ |
| 828 | CPU_LE( rev x8, x8 ) |
| 829 | adds x8, x8, #1 /* x7 and x8 together now hold counter + 1, ready for block 1 */ |
| 830 | adc x7, x7, xzr |
| 831 | |
| 832 | 0: next_ctr v1 |
| 833 | next_ctr v2 |
| 834 | next_ctr v3 |
| 835 | next_ctr v4 |
| 836 | next_ctr v5 |
| 837 | next_ctr v6 |
| 838 | next_ctr v7 |
| 839 | |
| 840 | mov bskey, x2 |
| 841 | mov rounds, x3 |
| 842 | bl aesbs_encrypt8 |
| 843 | |
| 844 | ld1 { v8.16b-v11.16b}, [x1], #64 |
| 845 | ld1 {v12.16b-v15.16b}, [x1], #64 |
| 846 | |
| 847 | eor v8.16b, v0.16b, v8.16b /* the left-hand sources follow the output order of aesbs_encrypt8 */ |
| 848 | eor v9.16b, v1.16b, v9.16b |
| 849 | eor v10.16b, v4.16b, v10.16b |
| 850 | eor v11.16b, v6.16b, v11.16b |
| 851 | eor v12.16b, v3.16b, v12.16b |
| 852 | eor v13.16b, v7.16b, v13.16b |
| 853 | eor v14.16b, v2.16b, v14.16b |
| 854 | eor v15.16b, v5.16b, v15.16b |
| 855 | |
| 856 | st1 { v8.16b-v11.16b}, [x0], #64 |
| 857 | st1 {v12.16b-v15.16b}, [x0], #64 |
| 858 | |
| 859 | next_ctr v0 /* counter for block 0 of the next batch, and the value written back to iv on exit */ |
| 860 | subs x4, x4, #8 |
| 861 | b.gt 0b |
| 862 | |
| 863 | st1 {v0.16b}, [x5] |
| 864 | frame_pop |
| 865 | ret |
| 866 | SYM_FUNC_END(aesbs_ctr_encrypt) |
| 867 | |