| 1 | /* |
| 2 | * ChaCha/HChaCha NEON helper functions |
| 3 | * |
| 4 | * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org> |
| 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License version 2 as |
| 8 | * published by the Free Software Foundation. |
| 9 | * |
| 10 | * Originally based on: |
| 11 | * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions |
| 12 | * |
| 13 | * Copyright (C) 2015 Martin Willi |
| 14 | * |
| 15 | * This program is free software; you can redistribute it and/or modify |
| 16 | * it under the terms of the GNU General Public License as published by |
| 17 | * the Free Software Foundation; either version 2 of the License, or |
| 18 | * (at your option) any later version. |
| 19 | */ |
| 20 | |
| 21 | #include <linux/linkage.h> |
| 22 | #include <asm/assembler.h> |
| 23 | #include <asm/cache.h> |
| 24 | |
| 25 | .text |
| 26 | .align 6 |
| 27 | |
| 28 | /* |
| 29 | * chacha_permute - run the ChaCha rounds over one block |
| 30 | * |
| 31 | * v0-v3 each hold one row of the state matrix of a single 64-byte block, so |
| 32 | * every vector instruction works on four words in parallel. The rows are |
| 33 | * rotated against each other between the two halves of a double round. |
| 34 | * |
| 35 | * w3 holds the round count; it drops by two per pass until it reaches zero. |
| 36 | * |
| 37 | * Clobbers: w3, x10, v4, v12, condition flags |
| 38 | */ |
| 39 | SYM_FUNC_START_LOCAL(chacha_permute) |
| 40 | |
| 41 | adr_l x10, ROT8 |
| 42 | ld1 {v12.4s}, [x10] |
| 43 | |
| 44 | .Ldoubleround: |
| 45 | // x0 += x1, x3 = rotl32(x3 ^ x0, 16): swapping halfwords rotates by 16 |
| 46 | add v0.4s, v0.4s, v1.4s |
| 47 | eor v3.16b, v3.16b, v0.16b |
| 48 | rev32 v3.8h, v3.8h |
| 49 | |
| 50 | // x2 += x3, x1 = rotl32(x1 ^ x2, 12): shift left, then SRI fills the low bits |
| 51 | add v2.4s, v2.4s, v3.4s |
| 52 | eor v4.16b, v1.16b, v2.16b |
| 53 | shl v1.4s, v4.4s, #12 |
| 54 | sri v1.4s, v4.4s, #20 |
| 55 | |
| 56 | // x0 += x1, x3 = rotl32(x3 ^ x0, 8): byte shuffle through ROT8 held in v12 |
| 57 | add v0.4s, v0.4s, v1.4s |
| 58 | eor v3.16b, v3.16b, v0.16b |
| 59 | tbl v3.16b, {v3.16b}, v12.16b |
| 60 | |
| 61 | // x2 += x3, x1 = rotl32(x1 ^ x2, 7): v4 is the scratch register once more |
| 62 | add v2.4s, v2.4s, v3.4s |
| 63 | eor v4.16b, v1.16b, v2.16b |
| 64 | shl v1.4s, v4.4s, #7 |
| 65 | sri v1.4s, v4.4s, #25 |
| 66 | |
| 67 | // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) |
| 68 | ext v3.16b, v3.16b, v3.16b, #12 |
| 69 | // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
| 70 | ext v2.16b, v2.16b, v2.16b, #8 |
| 71 | // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) |
| 72 | ext v1.16b, v1.16b, v1.16b, #4 |
| 73 | |
| 74 | // second half, on the rotated rows: x0 += x1, x3 = rotl32(x3 ^ x0, 16) |
| 75 | add v0.4s, v0.4s, v1.4s |
| 76 | eor v3.16b, v3.16b, v0.16b |
| 77 | rev32 v3.8h, v3.8h |
| 78 | |
| 79 | // x2 += x3, x1 = rotl32(x1 ^ x2, 12) |
| 80 | add v2.4s, v2.4s, v3.4s |
| 81 | eor v4.16b, v1.16b, v2.16b |
| 82 | shl v1.4s, v4.4s, #12 |
| 83 | sri v1.4s, v4.4s, #20 |
| 84 | |
| 85 | // x0 += x1, x3 = rotl32(x3 ^ x0, 8) |
| 86 | add v0.4s, v0.4s, v1.4s |
| 87 | eor v3.16b, v3.16b, v0.16b |
| 88 | tbl v3.16b, {v3.16b}, v12.16b |
| 89 | |
| 90 | // x2 += x3, x1 = rotl32(x1 ^ x2, 7) |
| 91 | add v2.4s, v2.4s, v3.4s |
| 92 | eor v4.16b, v1.16b, v2.16b |
| 93 | shl v1.4s, v4.4s, #7 |
| 94 | sri v1.4s, v4.4s, #25 |
| 95 | |
| 96 | // count down now; the vector instructions below leave the flags alone |
| 97 | subs w3, w3, #2 |
| 98 | |
| 99 | // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) |
| 100 | ext v3.16b, v3.16b, v3.16b, #4 |
| 101 | // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) |
| 102 | ext v2.16b, v2.16b, v2.16b, #8 |
| 103 | // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) |
| 104 | ext v1.16b, v1.16b, v1.16b, #12 |
| 105 | b.ne .Ldoubleround |
| 106 | ret |
| 107 | SYM_FUNC_END(chacha_permute) |
| 108 | |
| 109 | SYM_FUNC_START(chacha_block_xor_neon) |
| 110 | // In: x0 = input state matrix s, x2 = 1 data block input i, |
| 111 | // w3 = nrounds (left in place for chacha_permute, which reads w3) |
| 112 | // Out: [x1] = 1 data block output, o = i ^ (permute(s) + s) |
| 113 | // |
| 114 | |
| 115 | stp x29, x30, [sp, #-16]! |
| 116 | mov x29, sp |
| 117 | |
| 118 | // s0..3 is loaded twice: v8-v11 stay untouched for the final addition, |
| 119 | // v0-v3 are the working copy x0..3 that gets permuted |
| 120 | ld1 {v8.4s-v11.4s}, [x0] |
| 121 | ld1 {v0.4s-v3.4s}, [x0] |
| 122 | |
| 123 | bl chacha_permute |
| 124 | |
| 125 | // x0..3 += s0..3 |
| 126 | add v0.4s, v0.4s, v8.4s |
| 127 | add v1.4s, v1.4s, v9.4s |
| 128 | add v2.4s, v2.4s, v10.4s |
| 129 | add v3.4s, v3.4s, v11.4s |
| 130 | |
| 131 | // i0..3 go into v4-v7 only after the call, since chacha_permute clobbers v4 |
| 132 | ld1 {v4.16b-v7.16b}, [x2] |
| 133 | |
| 134 | // o0..3 = i0..3 ^ (x0..3 + s0..3) |
| 135 | eor v0.16b, v0.16b, v4.16b |
| 136 | eor v1.16b, v1.16b, v5.16b |
| 137 | eor v2.16b, v2.16b, v6.16b |
| 138 | eor v3.16b, v3.16b, v7.16b |
| 139 | |
| 140 | // write out all 64 bytes of o |
| 141 | st1 {v0.16b-v3.16b}, [x1] |
| 142 | |
| 143 | // drop the frame record and return |
| 144 | ldp x29, x30, [sp], #16 |
| 145 | ret |
| 146 | SYM_FUNC_END(chacha_block_xor_neon) |
| 147 | |
| 148 | SYM_FUNC_START(hchacha_block_neon) |
| 149 | // In: x0 = input state matrix s, w2 = nrounds |
| 150 | // Out: [x1] = 8 32-bit words, taken from rows 0 and 3 of the permuted state |
| 151 | // (the input state is not added back in here) |
| 152 | |
| 153 | stp x29, x30, [sp, #-16]! |
| 154 | mov x29, sp |
| 155 | mov w3, w2 // chacha_permute reads its round count from w3 |
| 156 | |
| 157 | ld1 {v0.4s-v3.4s}, [x0] |
| 158 | bl chacha_permute |
| 159 | |
| 160 | // store x0 followed by x3; x1 and x2 are not part of the output |
| 161 | st1 {v0.4s}, [x1], #16 |
| 162 | st1 {v3.4s}, [x1] |
| 163 | |
| 164 | ldp x29, x30, [sp], #16 |
| 165 | ret |
| 166 | SYM_FUNC_END(hchacha_block_neon) |
| 167 | |
| 168 | a0 .req w12 |
| 169 | a1 .req w13 |
| 170 | a2 .req w14 |
| 171 | a3 .req w15 |
| 172 | a4 .req w16 |
| 173 | a5 .req w17 |
| 174 | a6 .req w19 |
| 175 | a7 .req w20 |
| 176 | a8 .req w21 |
| 177 | a9 .req w22 |
| 178 | a10 .req w23 |
| 179 | a11 .req w24 |
| 180 | a12 .req w25 |
| 181 | a13 .req w26 |
| 182 | a14 .req w27 |
| 183 | a15 .req w28 |
| 184 | |
| 185 | .align 6 |
| 186 | SYM_FUNC_START(chacha_4block_xor_neon) |
| 187 | frame_push 10 |
| 188 | |
| 189 | // x0: Input state matrix, s (16 32-bit words; word 12 is the block counter) |
| 190 | // x1: data blocks output, o (the first 64 bytes are always written) |
| 191 | // x2: data blocks input, i (the first 64 bytes are always read) |
| 192 | // w3: nrounds (counted down by 2 per double round until it reaches zero) |
| 193 | // x4: byte count; below 320 one of the .Lt* tail paths runs. NOTE(review): tails address "end - 64", so presumably x4 >= 64 - confirm with the caller |
| 194 | |
| 195 | adr_l x10, .Lpermute |
| 196 | and x5, x4, #63 |
| 197 | add x10, x10, x5 |
| 198 | |
| 199 | // x10 = .Lpermute + (byte count % 64): TBL index vectors for the tail paths |
| 200 | // Four consecutive ChaCha blocks are computed in NEON registers: ld4r |
| 201 | // broadcasts state word n to every lane of vn, and lane k of v0-v15 belongs |
| 202 | // to the k-th of those blocks. Each operation acts on the same word of all |
| 203 | // four blocks, hence no word shuffling is required. For the final XOR step |
| 204 | // the matrix is transposed by interleaving 32- and then 64-bit words, which |
| 205 | // leaves whole keystream blocks in consecutive NEON registers. |
| 206 | // |
| 207 | // At the same time, a fifth block is encrypted in parallel using the scalar |
| 208 | // registers a0-a15 (the .req aliases of w12-w17 and w19-w28 declared above); |
| 209 | // it uses the unmodified counter and covers the first 64 bytes of in/output. |
| 210 | adr_l x9, CTRINC // ... and ROT8 |
| 211 | ld1 {v30.4s-v31.4s}, [x9] |
| 212 | |
| 213 | // x0..15[0-3] = s0..3[0..3] |
| 214 | add x8, x0, #16 |
| 215 | ld4r { v0.4s- v3.4s}, [x0] |
| 216 | ld4r { v4.4s- v7.4s}, [x8], #16 |
| 217 | ld4r { v8.4s-v11.4s}, [x8], #16 |
| 218 | ld4r {v12.4s-v15.4s}, [x8] |
| 219 | |
| 220 | mov a0, v0.s[0] |
| 221 | mov a1, v1.s[0] |
| 222 | mov a2, v2.s[0] |
| 223 | mov a3, v3.s[0] |
| 224 | mov a4, v4.s[0] |
| 225 | mov a5, v5.s[0] |
| 226 | mov a6, v6.s[0] |
| 227 | mov a7, v7.s[0] |
| 228 | mov a8, v8.s[0] |
| 229 | mov a9, v9.s[0] |
| 230 | mov a10, v10.s[0] |
| 231 | mov a11, v11.s[0] |
| 232 | mov a12, v12.s[0] |
| 233 | mov a13, v13.s[0] |
| 234 | mov a14, v14.s[0] |
| 235 | mov a15, v15.s[0] |
| 236 | |
| 237 | // x12 += counter values 1-4 (a12 was copied out above, so the scalar block keeps +0) |
| 238 | add v12.4s, v12.4s, v30.4s |
| 239 | |
| 240 | .Ldoubleround4: |
| 241 | // x0 += x4, x12 = rotl32(x12 ^ x0, 16) |
| 242 | // x1 += x5, x13 = rotl32(x13 ^ x1, 16) |
| 243 | // x2 += x6, x14 = rotl32(x14 ^ x2, 16) |
| 244 | // x3 += x7, x15 = rotl32(x15 ^ x3, 16) |
| 245 | add v0.4s, v0.4s, v4.4s |
| 246 | add a0, a0, a4 |
| 247 | add v1.4s, v1.4s, v5.4s |
| 248 | add a1, a1, a5 |
| 249 | add v2.4s, v2.4s, v6.4s |
| 250 | add a2, a2, a6 |
| 251 | add v3.4s, v3.4s, v7.4s |
| 252 | add a3, a3, a7 |
| 253 | |
| 254 | eor v12.16b, v12.16b, v0.16b |
| 255 | eor a12, a12, a0 |
| 256 | eor v13.16b, v13.16b, v1.16b |
| 257 | eor a13, a13, a1 |
| 258 | eor v14.16b, v14.16b, v2.16b |
| 259 | eor a14, a14, a2 |
| 260 | eor v15.16b, v15.16b, v3.16b |
| 261 | eor a15, a15, a3 |
| 262 | |
| 263 | rev32 v12.8h, v12.8h |
| 264 | ror a12, a12, #16 |
| 265 | rev32 v13.8h, v13.8h |
| 266 | ror a13, a13, #16 |
| 267 | rev32 v14.8h, v14.8h |
| 268 | ror a14, a14, #16 |
| 269 | rev32 v15.8h, v15.8h |
| 270 | ror a15, a15, #16 |
| 271 | |
| 272 | // x8 += x12, x4 = rotl32(x4 ^ x8, 12) |
| 273 | // x9 += x13, x5 = rotl32(x5 ^ x9, 12) |
| 274 | // x10 += x14, x6 = rotl32(x6 ^ x10, 12) |
| 275 | // x11 += x15, x7 = rotl32(x7 ^ x11, 12) (scalar: ror #20 is rotl32 by 12) |
| 276 | add v8.4s, v8.4s, v12.4s |
| 277 | add a8, a8, a12 |
| 278 | add v9.4s, v9.4s, v13.4s |
| 279 | add a9, a9, a13 |
| 280 | add v10.4s, v10.4s, v14.4s |
| 281 | add a10, a10, a14 |
| 282 | add v11.4s, v11.4s, v15.4s |
| 283 | add a11, a11, a15 |
| 284 | |
| 285 | eor v16.16b, v4.16b, v8.16b |
| 286 | eor a4, a4, a8 |
| 287 | eor v17.16b, v5.16b, v9.16b |
| 288 | eor a5, a5, a9 |
| 289 | eor v18.16b, v6.16b, v10.16b |
| 290 | eor a6, a6, a10 |
| 291 | eor v19.16b, v7.16b, v11.16b |
| 292 | eor a7, a7, a11 |
| 293 | |
| 294 | shl v4.4s, v16.4s, #12 |
| 295 | shl v5.4s, v17.4s, #12 |
| 296 | shl v6.4s, v18.4s, #12 |
| 297 | shl v7.4s, v19.4s, #12 |
| 298 | |
| 299 | sri v4.4s, v16.4s, #20 |
| 300 | ror a4, a4, #20 |
| 301 | sri v5.4s, v17.4s, #20 |
| 302 | ror a5, a5, #20 |
| 303 | sri v6.4s, v18.4s, #20 |
| 304 | ror a6, a6, #20 |
| 305 | sri v7.4s, v19.4s, #20 |
| 306 | ror a7, a7, #20 |
| 307 | |
| 308 | // x0 += x4, x12 = rotl32(x12 ^ x0, 8) |
| 309 | // x1 += x5, x13 = rotl32(x13 ^ x1, 8) |
| 310 | // x2 += x6, x14 = rotl32(x14 ^ x2, 8) |
| 311 | // x3 += x7, x15 = rotl32(x15 ^ x3, 8) (NEON: TBL with ROT8 held in v31; scalar: ror #24) |
| 312 | add v0.4s, v0.4s, v4.4s |
| 313 | add a0, a0, a4 |
| 314 | add v1.4s, v1.4s, v5.4s |
| 315 | add a1, a1, a5 |
| 316 | add v2.4s, v2.4s, v6.4s |
| 317 | add a2, a2, a6 |
| 318 | add v3.4s, v3.4s, v7.4s |
| 319 | add a3, a3, a7 |
| 320 | |
| 321 | eor v12.16b, v12.16b, v0.16b |
| 322 | eor a12, a12, a0 |
| 323 | eor v13.16b, v13.16b, v1.16b |
| 324 | eor a13, a13, a1 |
| 325 | eor v14.16b, v14.16b, v2.16b |
| 326 | eor a14, a14, a2 |
| 327 | eor v15.16b, v15.16b, v3.16b |
| 328 | eor a15, a15, a3 |
| 329 | |
| 330 | tbl v12.16b, {v12.16b}, v31.16b |
| 331 | ror a12, a12, #24 |
| 332 | tbl v13.16b, {v13.16b}, v31.16b |
| 333 | ror a13, a13, #24 |
| 334 | tbl v14.16b, {v14.16b}, v31.16b |
| 335 | ror a14, a14, #24 |
| 336 | tbl v15.16b, {v15.16b}, v31.16b |
| 337 | ror a15, a15, #24 |
| 338 | |
| 339 | // x8 += x12, x4 = rotl32(x4 ^ x8, 7) |
| 340 | // x9 += x13, x5 = rotl32(x5 ^ x9, 7) |
| 341 | // x10 += x14, x6 = rotl32(x6 ^ x10, 7) |
| 342 | // x11 += x15, x7 = rotl32(x7 ^ x11, 7) (scalar: ror #25 is rotl32 by 7) |
| 343 | add v8.4s, v8.4s, v12.4s |
| 344 | add a8, a8, a12 |
| 345 | add v9.4s, v9.4s, v13.4s |
| 346 | add a9, a9, a13 |
| 347 | add v10.4s, v10.4s, v14.4s |
| 348 | add a10, a10, a14 |
| 349 | add v11.4s, v11.4s, v15.4s |
| 350 | add a11, a11, a15 |
| 351 | |
| 352 | eor v16.16b, v4.16b, v8.16b |
| 353 | eor a4, a4, a8 |
| 354 | eor v17.16b, v5.16b, v9.16b |
| 355 | eor a5, a5, a9 |
| 356 | eor v18.16b, v6.16b, v10.16b |
| 357 | eor a6, a6, a10 |
| 358 | eor v19.16b, v7.16b, v11.16b |
| 359 | eor a7, a7, a11 |
| 360 | |
| 361 | shl v4.4s, v16.4s, #7 |
| 362 | shl v5.4s, v17.4s, #7 |
| 363 | shl v6.4s, v18.4s, #7 |
| 364 | shl v7.4s, v19.4s, #7 |
| 365 | |
| 366 | sri v4.4s, v16.4s, #25 |
| 367 | ror a4, a4, #25 |
| 368 | sri v5.4s, v17.4s, #25 |
| 369 | ror a5, a5, #25 |
| 370 | sri v6.4s, v18.4s, #25 |
| 371 | ror a6, a6, #25 |
| 372 | sri v7.4s, v19.4s, #25 |
| 373 | ror a7, a7, #25 |
| 374 | |
| 375 | // x0 += x5, x15 = rotl32(x15 ^ x0, 16) |
| 376 | // x1 += x6, x12 = rotl32(x12 ^ x1, 16) |
| 377 | // x2 += x7, x13 = rotl32(x13 ^ x2, 16) |
| 378 | // x3 += x4, x14 = rotl32(x14 ^ x3, 16) (from here on: the same four steps with the operand registers paired diagonally) |
| 379 | add v0.4s, v0.4s, v5.4s |
| 380 | add a0, a0, a5 |
| 381 | add v1.4s, v1.4s, v6.4s |
| 382 | add a1, a1, a6 |
| 383 | add v2.4s, v2.4s, v7.4s |
| 384 | add a2, a2, a7 |
| 385 | add v3.4s, v3.4s, v4.4s |
| 386 | add a3, a3, a4 |
| 387 | |
| 388 | eor v15.16b, v15.16b, v0.16b |
| 389 | eor a15, a15, a0 |
| 390 | eor v12.16b, v12.16b, v1.16b |
| 391 | eor a12, a12, a1 |
| 392 | eor v13.16b, v13.16b, v2.16b |
| 393 | eor a13, a13, a2 |
| 394 | eor v14.16b, v14.16b, v3.16b |
| 395 | eor a14, a14, a3 |
| 396 | |
| 397 | rev32 v15.8h, v15.8h |
| 398 | ror a15, a15, #16 |
| 399 | rev32 v12.8h, v12.8h |
| 400 | ror a12, a12, #16 |
| 401 | rev32 v13.8h, v13.8h |
| 402 | ror a13, a13, #16 |
| 403 | rev32 v14.8h, v14.8h |
| 404 | ror a14, a14, #16 |
| 405 | |
| 406 | // x10 += x15, x5 = rotl32(x5 ^ x10, 12) |
| 407 | // x11 += x12, x6 = rotl32(x6 ^ x11, 12) |
| 408 | // x8 += x13, x7 = rotl32(x7 ^ x8, 12) |
| 409 | // x9 += x14, x4 = rotl32(x4 ^ x9, 12) |
| 410 | add v10.4s, v10.4s, v15.4s |
| 411 | add a10, a10, a15 |
| 412 | add v11.4s, v11.4s, v12.4s |
| 413 | add a11, a11, a12 |
| 414 | add v8.4s, v8.4s, v13.4s |
| 415 | add a8, a8, a13 |
| 416 | add v9.4s, v9.4s, v14.4s |
| 417 | add a9, a9, a14 |
| 418 | |
| 419 | eor v16.16b, v5.16b, v10.16b |
| 420 | eor a5, a5, a10 |
| 421 | eor v17.16b, v6.16b, v11.16b |
| 422 | eor a6, a6, a11 |
| 423 | eor v18.16b, v7.16b, v8.16b |
| 424 | eor a7, a7, a8 |
| 425 | eor v19.16b, v4.16b, v9.16b |
| 426 | eor a4, a4, a9 |
| 427 | |
| 428 | shl v5.4s, v16.4s, #12 |
| 429 | shl v6.4s, v17.4s, #12 |
| 430 | shl v7.4s, v18.4s, #12 |
| 431 | shl v4.4s, v19.4s, #12 |
| 432 | |
| 433 | sri v5.4s, v16.4s, #20 |
| 434 | ror a5, a5, #20 |
| 435 | sri v6.4s, v17.4s, #20 |
| 436 | ror a6, a6, #20 |
| 437 | sri v7.4s, v18.4s, #20 |
| 438 | ror a7, a7, #20 |
| 439 | sri v4.4s, v19.4s, #20 |
| 440 | ror a4, a4, #20 |
| 441 | |
| 442 | // x0 += x5, x15 = rotl32(x15 ^ x0, 8) |
| 443 | // x1 += x6, x12 = rotl32(x12 ^ x1, 8) |
| 444 | // x2 += x7, x13 = rotl32(x13 ^ x2, 8) |
| 445 | // x3 += x4, x14 = rotl32(x14 ^ x3, 8) |
| 446 | add v0.4s, v0.4s, v5.4s |
| 447 | add a0, a0, a5 |
| 448 | add v1.4s, v1.4s, v6.4s |
| 449 | add a1, a1, a6 |
| 450 | add v2.4s, v2.4s, v7.4s |
| 451 | add a2, a2, a7 |
| 452 | add v3.4s, v3.4s, v4.4s |
| 453 | add a3, a3, a4 |
| 454 | |
| 455 | eor v15.16b, v15.16b, v0.16b |
| 456 | eor a15, a15, a0 |
| 457 | eor v12.16b, v12.16b, v1.16b |
| 458 | eor a12, a12, a1 |
| 459 | eor v13.16b, v13.16b, v2.16b |
| 460 | eor a13, a13, a2 |
| 461 | eor v14.16b, v14.16b, v3.16b |
| 462 | eor a14, a14, a3 |
| 463 | |
| 464 | tbl v15.16b, {v15.16b}, v31.16b |
| 465 | ror a15, a15, #24 |
| 466 | tbl v12.16b, {v12.16b}, v31.16b |
| 467 | ror a12, a12, #24 |
| 468 | tbl v13.16b, {v13.16b}, v31.16b |
| 469 | ror a13, a13, #24 |
| 470 | tbl v14.16b, {v14.16b}, v31.16b |
| 471 | ror a14, a14, #24 |
| 472 | |
| 473 | // x10 += x15, x5 = rotl32(x5 ^ x10, 7) |
| 474 | // x11 += x12, x6 = rotl32(x6 ^ x11, 7) |
| 475 | // x8 += x13, x7 = rotl32(x7 ^ x8, 7) |
| 476 | // x9 += x14, x4 = rotl32(x4 ^ x9, 7) |
| 477 | add v10.4s, v10.4s, v15.4s |
| 478 | add a10, a10, a15 |
| 479 | add v11.4s, v11.4s, v12.4s |
| 480 | add a11, a11, a12 |
| 481 | add v8.4s, v8.4s, v13.4s |
| 482 | add a8, a8, a13 |
| 483 | add v9.4s, v9.4s, v14.4s |
| 484 | add a9, a9, a14 |
| 485 | |
| 486 | eor v16.16b, v5.16b, v10.16b |
| 487 | eor a5, a5, a10 |
| 488 | eor v17.16b, v6.16b, v11.16b |
| 489 | eor a6, a6, a11 |
| 490 | eor v18.16b, v7.16b, v8.16b |
| 491 | eor a7, a7, a8 |
| 492 | eor v19.16b, v4.16b, v9.16b |
| 493 | eor a4, a4, a9 |
| 494 | |
| 495 | shl v5.4s, v16.4s, #7 |
| 496 | shl v6.4s, v17.4s, #7 |
| 497 | shl v7.4s, v18.4s, #7 |
| 498 | shl v4.4s, v19.4s, #7 |
| 499 | |
| 500 | sri v5.4s, v16.4s, #25 |
| 501 | ror a5, a5, #25 |
| 502 | sri v6.4s, v17.4s, #25 |
| 503 | ror a6, a6, #25 |
| 504 | sri v7.4s, v18.4s, #25 |
| 505 | ror a7, a7, #25 |
| 506 | sri v4.4s, v19.4s, #25 |
| 507 | ror a4, a4, #25 |
| 508 | |
| 509 | subs w3, w3, #2 |
| 510 | b.ne .Ldoubleround4 |
| 511 | |
| 512 | ld4r {v16.4s-v19.4s}, [x0], #16 |
| 513 | ld4r {v20.4s-v23.4s}, [x0], #16 |
| 514 | |
| 515 | // x12 += counter values 1-4, so that adding the bare counter s3[0] below gives counter + 1..4; done before the ld4r that overwrites v30 |
| 516 | add v12.4s, v12.4s, v30.4s |
| 517 | |
| 518 | // x0[0-3] += s0[0] |
| 519 | // x1[0-3] += s0[1] |
| 520 | // x2[0-3] += s0[2] |
| 521 | // x3[0-3] += s0[3] (w6-w9 carry the same state words to the scalar block a0-a3) |
| 522 | add v0.4s, v0.4s, v16.4s |
| 523 | mov w6, v16.s[0] |
| 524 | mov w7, v17.s[0] |
| 525 | add v1.4s, v1.4s, v17.4s |
| 526 | mov w8, v18.s[0] |
| 527 | mov w9, v19.s[0] |
| 528 | add v2.4s, v2.4s, v18.4s |
| 529 | add a0, a0, w6 |
| 530 | add a1, a1, w7 |
| 531 | add v3.4s, v3.4s, v19.4s |
| 532 | add a2, a2, w8 |
| 533 | add a3, a3, w9 |
| 534 | CPU_BE( rev a0, a0 ) |
| 535 | CPU_BE( rev a1, a1 ) |
| 536 | CPU_BE( rev a2, a2 ) |
| 537 | CPU_BE( rev a3, a3 ) |
| 538 | |
| 539 | ld4r {v24.4s-v27.4s}, [x0], #16 |
| 540 | ld4r {v28.4s-v31.4s}, [x0] |
| 541 | |
| 542 | // x4[0-3] += s1[0] |
| 543 | // x5[0-3] += s1[1] |
| 544 | // x6[0-3] += s1[2] |
| 545 | // x7[0-3] += s1[3] |
| 546 | add v4.4s, v4.4s, v20.4s |
| 547 | mov w6, v20.s[0] |
| 548 | mov w7, v21.s[0] |
| 549 | add v5.4s, v5.4s, v21.4s |
| 550 | mov w8, v22.s[0] |
| 551 | mov w9, v23.s[0] |
| 552 | add v6.4s, v6.4s, v22.4s |
| 553 | add a4, a4, w6 |
| 554 | add a5, a5, w7 |
| 555 | add v7.4s, v7.4s, v23.4s |
| 556 | add a6, a6, w8 |
| 557 | add a7, a7, w9 |
| 558 | CPU_BE( rev a4, a4 ) |
| 559 | CPU_BE( rev a5, a5 ) |
| 560 | CPU_BE( rev a6, a6 ) |
| 561 | CPU_BE( rev a7, a7 ) |
| 562 | |
| 563 | // x8[0-3] += s2[0] |
| 564 | // x9[0-3] += s2[1] |
| 565 | // x10[0-3] += s2[2] |
| 566 | // x11[0-3] += s2[3] |
| 567 | add v8.4s, v8.4s, v24.4s |
| 568 | mov w6, v24.s[0] |
| 569 | mov w7, v25.s[0] |
| 570 | add v9.4s, v9.4s, v25.4s |
| 571 | mov w8, v26.s[0] |
| 572 | mov w9, v27.s[0] |
| 573 | add v10.4s, v10.4s, v26.4s |
| 574 | add a8, a8, w6 |
| 575 | add a9, a9, w7 |
| 576 | add v11.4s, v11.4s, v27.4s |
| 577 | add a10, a10, w8 |
| 578 | add a11, a11, w9 |
| 579 | CPU_BE( rev a8, a8 ) |
| 580 | CPU_BE( rev a9, a9 ) |
| 581 | CPU_BE( rev a10, a10 ) |
| 582 | CPU_BE( rev a11, a11 ) |
| 583 | |
| 584 | // x12[0-3] += s3[0] |
| 585 | // x13[0-3] += s3[1] |
| 586 | // x14[0-3] += s3[2] |
| 587 | // x15[0-3] += s3[3] |
| 588 | add v12.4s, v12.4s, v28.4s |
| 589 | mov w6, v28.s[0] |
| 590 | mov w7, v29.s[0] |
| 591 | add v13.4s, v13.4s, v29.4s |
| 592 | mov w8, v30.s[0] |
| 593 | mov w9, v31.s[0] |
| 594 | add v14.4s, v14.4s, v30.4s |
| 595 | add a12, a12, w6 |
| 596 | add a13, a13, w7 |
| 597 | add v15.4s, v15.4s, v31.4s |
| 598 | add a14, a14, w8 |
| 599 | add a15, a15, w9 |
| 600 | CPU_BE( rev a12, a12 ) |
| 601 | CPU_BE( rev a13, a13 ) |
| 602 | CPU_BE( rev a14, a14 ) |
| 603 | CPU_BE( rev a15, a15 ) |
| 604 | |
| 605 | // interleave 32-bit words in state n, n+1; the ldp/eor pairs XOR the scalar block with input bytes 0-63 and leave x2 at input + 64 |
| 606 | ldp w6, w7, [x2], #64 |
| 607 | zip1 v16.4s, v0.4s, v1.4s |
| 608 | ldp w8, w9, [x2, #-56] |
| 609 | eor a0, a0, w6 |
| 610 | zip2 v17.4s, v0.4s, v1.4s |
| 611 | eor a1, a1, w7 |
| 612 | zip1 v18.4s, v2.4s, v3.4s |
| 613 | eor a2, a2, w8 |
| 614 | zip2 v19.4s, v2.4s, v3.4s |
| 615 | eor a3, a3, w9 |
| 616 | ldp w6, w7, [x2, #-48] |
| 617 | zip1 v20.4s, v4.4s, v5.4s |
| 618 | ldp w8, w9, [x2, #-40] |
| 619 | eor a4, a4, w6 |
| 620 | zip2 v21.4s, v4.4s, v5.4s |
| 621 | eor a5, a5, w7 |
| 622 | zip1 v22.4s, v6.4s, v7.4s |
| 623 | eor a6, a6, w8 |
| 624 | zip2 v23.4s, v6.4s, v7.4s |
| 625 | eor a7, a7, w9 |
| 626 | ldp w6, w7, [x2, #-32] |
| 627 | zip1 v24.4s, v8.4s, v9.4s |
| 628 | ldp w8, w9, [x2, #-24] |
| 629 | eor a8, a8, w6 |
| 630 | zip2 v25.4s, v8.4s, v9.4s |
| 631 | eor a9, a9, w7 |
| 632 | zip1 v26.4s, v10.4s, v11.4s |
| 633 | eor a10, a10, w8 |
| 634 | zip2 v27.4s, v10.4s, v11.4s |
| 635 | eor a11, a11, w9 |
| 636 | ldp w6, w7, [x2, #-16] |
| 637 | zip1 v28.4s, v12.4s, v13.4s |
| 638 | ldp w8, w9, [x2, #-8] |
| 639 | eor a12, a12, w6 |
| 640 | zip2 v29.4s, v12.4s, v13.4s |
| 641 | eor a13, a13, w7 |
| 642 | zip1 v30.4s, v14.4s, v15.4s |
| 643 | eor a14, a14, w8 |
| 644 | zip2 v31.4s, v14.4s, v15.4s |
| 645 | eor a15, a15, w9 |
| 646 | |
| 647 | add x3, x2, x4 |
| 648 | sub x3, x3, #128 // start of last block: x3 = end of input - 64; each csel below switches x2 to it once the next 64-byte load would run past the end |
| 649 | |
| 650 | subs x5, x4, #128 |
| 651 | csel x2, x2, x3, ge |
| 652 | |
| 653 | // interleave 64-bit words in state n, n+2; once all zips are done v0-v3, v4-v7, v8-v11 and v12-v15 each hold one whole keystream block, while the stp's store the scalar block; x5-x8 = byte count - 128/192/256/320 |
| 654 | zip1 v0.2d, v16.2d, v18.2d |
| 655 | zip2 v4.2d, v16.2d, v18.2d |
| 656 | stp a0, a1, [x1], #64 |
| 657 | zip1 v8.2d, v17.2d, v19.2d |
| 658 | zip2 v12.2d, v17.2d, v19.2d |
| 659 | stp a2, a3, [x1, #-56] |
| 660 | |
| 661 | subs x6, x4, #192 |
| 662 | ld1 {v16.16b-v19.16b}, [x2], #64 |
| 663 | csel x2, x2, x3, ge |
| 664 | |
| 665 | zip1 v1.2d, v20.2d, v22.2d |
| 666 | zip2 v5.2d, v20.2d, v22.2d |
| 667 | stp a4, a5, [x1, #-48] |
| 668 | zip1 v9.2d, v21.2d, v23.2d |
| 669 | zip2 v13.2d, v21.2d, v23.2d |
| 670 | stp a6, a7, [x1, #-40] |
| 671 | |
| 672 | subs x7, x4, #256 |
| 673 | ld1 {v20.16b-v23.16b}, [x2], #64 |
| 674 | csel x2, x2, x3, ge |
| 675 | |
| 676 | zip1 v2.2d, v24.2d, v26.2d |
| 677 | zip2 v6.2d, v24.2d, v26.2d |
| 678 | stp a8, a9, [x1, #-32] |
| 679 | zip1 v10.2d, v25.2d, v27.2d |
| 680 | zip2 v14.2d, v25.2d, v27.2d |
| 681 | stp a10, a11, [x1, #-24] |
| 682 | |
| 683 | subs x8, x4, #320 |
| 684 | ld1 {v24.16b-v27.16b}, [x2], #64 |
| 685 | csel x2, x2, x3, ge |
| 686 | |
| 687 | zip1 v3.2d, v28.2d, v30.2d |
| 688 | zip2 v7.2d, v28.2d, v30.2d |
| 689 | stp a12, a13, [x1, #-16] |
| 690 | zip1 v11.2d, v29.2d, v31.2d |
| 691 | zip2 v15.2d, v29.2d, v31.2d |
| 692 | stp a14, a15, [x1, #-8] |
| 693 | |
| 694 | tbnz x5, #63, .Lt128 |
| 695 | ld1 {v28.16b-v31.16b}, [x2] |
| 696 | |
| 697 | // xor with corresponding input, write to output; each tbnz on bit 63 of x6-x8 leaves for a tail path when the byte count is below that threshold |
| 698 | eor v16.16b, v16.16b, v0.16b |
| 699 | eor v17.16b, v17.16b, v1.16b |
| 700 | eor v18.16b, v18.16b, v2.16b |
| 701 | eor v19.16b, v19.16b, v3.16b |
| 702 | |
| 703 | tbnz x6, #63, .Lt192 |
| 704 | |
| 705 | eor v20.16b, v20.16b, v4.16b |
| 706 | eor v21.16b, v21.16b, v5.16b |
| 707 | eor v22.16b, v22.16b, v6.16b |
| 708 | eor v23.16b, v23.16b, v7.16b |
| 709 | |
| 710 | st1 {v16.16b-v19.16b}, [x1], #64 |
| 711 | tbnz x7, #63, .Lt256 |
| 712 | |
| 713 | eor v24.16b, v24.16b, v8.16b |
| 714 | eor v25.16b, v25.16b, v9.16b |
| 715 | eor v26.16b, v26.16b, v10.16b |
| 716 | eor v27.16b, v27.16b, v11.16b |
| 717 | |
| 718 | st1 {v20.16b-v23.16b}, [x1], #64 |
| 719 | tbnz x8, #63, .Lt320 |
| 720 | |
| 721 | eor v28.16b, v28.16b, v12.16b |
| 722 | eor v29.16b, v29.16b, v13.16b |
| 723 | eor v30.16b, v30.16b, v14.16b |
| 724 | eor v31.16b, v31.16b, v15.16b |
| 725 | |
| 726 | st1 {v24.16b-v27.16b}, [x1], #64 |
| 727 | st1 {v28.16b-v31.16b}, [x1] |
| 728 | |
| 729 | .Lout: frame_pop |
| 730 | ret |
| 731 | |
| 732 | // fewer than 192 bytes of in/output: v16-v19 = finished output for bytes 64-127, v20-v23 = the last 64 input bytes; the keystream in v4-v7 is shifted with TBL (out-of-range indices give 0) to line up with them |
| 733 | .Lt192: cbz x5, 1f // exactly 128 bytes? |
| 734 | ld1 {v28.16b-v31.16b}, [x10] |
| 735 | add x5, x5, x1 |
| 736 | tbl v28.16b, {v4.16b-v7.16b}, v28.16b |
| 737 | tbl v29.16b, {v4.16b-v7.16b}, v29.16b |
| 738 | tbl v30.16b, {v4.16b-v7.16b}, v30.16b |
| 739 | tbl v31.16b, {v4.16b-v7.16b}, v31.16b |
| 740 | |
| 741 | 0: eor v20.16b, v20.16b, v28.16b |
| 742 | eor v21.16b, v21.16b, v29.16b |
| 743 | eor v22.16b, v22.16b, v30.16b |
| 744 | eor v23.16b, v23.16b, v31.16b |
| 745 | st1 {v20.16b-v23.16b}, [x5] // overlapping stores |
| 746 | 1: st1 {v16.16b-v19.16b}, [x1] |
| 747 | b .Lout |
| 748 | |
| 749 | // fewer than 128 bytes of in/output: the keystream in v0-v3 is shifted onto the last 64 input bytes (v20-v23), then the scalar block's output is stored again on top of the overlap |
| 750 | .Lt128: ld1 {v28.16b-v31.16b}, [x10] |
| 751 | add x5, x5, x1 |
| 752 | sub x1, x1, #64 |
| 753 | tbl v28.16b, {v0.16b-v3.16b}, v28.16b |
| 754 | tbl v29.16b, {v0.16b-v3.16b}, v29.16b |
| 755 | tbl v30.16b, {v0.16b-v3.16b}, v30.16b |
| 756 | tbl v31.16b, {v0.16b-v3.16b}, v31.16b |
| 757 | ld1 {v16.16b-v19.16b}, [x1] // reload first output block |
| 758 | b 0b |
| 759 | |
| 760 | // fewer than 256 bytes of in/output: v28-v31 = the last 64 input bytes, v20-v23 = finished output for bytes 128-191, keystream taken from v8-v11 |
| 761 | .Lt256: cbz x6, 2f // exactly 192 bytes? |
| 762 | ld1 {v4.16b-v7.16b}, [x10] |
| 763 | add x6, x6, x1 |
| 764 | tbl v0.16b, {v8.16b-v11.16b}, v4.16b |
| 765 | tbl v1.16b, {v8.16b-v11.16b}, v5.16b |
| 766 | tbl v2.16b, {v8.16b-v11.16b}, v6.16b |
| 767 | tbl v3.16b, {v8.16b-v11.16b}, v7.16b |
| 768 | |
| 769 | eor v28.16b, v28.16b, v0.16b |
| 770 | eor v29.16b, v29.16b, v1.16b |
| 771 | eor v30.16b, v30.16b, v2.16b |
| 772 | eor v31.16b, v31.16b, v3.16b |
| 773 | st1 {v28.16b-v31.16b}, [x6] // overlapping stores |
| 774 | 2: st1 {v20.16b-v23.16b}, [x1] |
| 775 | b .Lout |
| 776 | |
| 777 | // fewer than 320 bytes of in/output: v28-v31 = the last 64 input bytes, v24-v27 = finished output for bytes 192-255, keystream taken from v12-v15 |
| 778 | .Lt320: cbz x7, 3f // exactly 256 bytes? |
| 779 | ld1 {v4.16b-v7.16b}, [x10] |
| 780 | add x7, x7, x1 |
| 781 | tbl v0.16b, {v12.16b-v15.16b}, v4.16b |
| 782 | tbl v1.16b, {v12.16b-v15.16b}, v5.16b |
| 783 | tbl v2.16b, {v12.16b-v15.16b}, v6.16b |
| 784 | tbl v3.16b, {v12.16b-v15.16b}, v7.16b |
| 785 | |
| 786 | eor v28.16b, v28.16b, v0.16b |
| 787 | eor v29.16b, v29.16b, v1.16b |
| 788 | eor v30.16b, v30.16b, v2.16b |
| 789 | eor v31.16b, v31.16b, v3.16b |
| 790 | st1 {v28.16b-v31.16b}, [x7] // overlapping stores |
| 791 | 3: st1 {v24.16b-v27.16b}, [x1] |
| 792 | b .Lout |
| 793 | SYM_FUNC_END(chacha_4block_xor_neon) |
| 794 | |
| 795 | .section ".rodata" , "a" , %progbits |
| 796 | .align L1_CACHE_SHIFT |
| 797 | .Lpermute: // 128 TBL index bytes: -64, -63, ..., 62, 63 |
| 798 | .set .Li, -64 |
| 799 | .rept 128 |
| 800 | .byte .Li // the negative entries are out of range for TBL and so select zero |
| 801 | .set .Li, .Li + 1 |
| 802 | .endr |
| 803 | |
| 804 | CTRINC: .word 1, 2, 3, 4 // per-lane block counter offsets; loaded together with ROT8 as one pair |
| 805 | ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f // TBL byte indices for rotl32 by 8 |
| 806 | |