/* Optimized memcpy for Fujitsu A64FX processor.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#undef BTI_C
#define BTI_C

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, SVE
 *
 */

#define dstin	x0
#define src	x1
#define n	x2
#define dst	x3
#define dstend	x4
#define srcend	x5
#define tmp	x6
#define vlen	x7
#define vlen8	x8

#if HAVE_AARCH64_SVE_ASM

	.arch armv8.2-a+sve

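/* The copy loops below work on 8 vectors (8 x VL bytes) per iteration.
   The stld macros store the data loaded on the previous iteration and
   immediately reload the same registers for the next one, giving a
   software-pipelined store/load schedule.  */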
	.macro ld1b_unroll8
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

	.macro stld1b_unroll4a
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	.endm

	.macro stld1b_unroll4b
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

	.macro stld1b_unroll8
	stld1b_unroll4a
	stld1b_unroll4b
	.endm

	.macro st1b_unroll8
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	.endm

ENTRY (__memcpy_a64fx)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

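	/* Copies of at most 2 vectors (2 x VL bytes) are handled here with
	   two predicated load/store pairs; anything larger branches to
	   L(copy_small).  */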
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	L(copy_small)
	whilelo	p1.b, vlen, n
	whilelo	p0.b, xzr, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	.p2align 4

L(copy_small):
	cmp	n, vlen, lsl 3
	b.hi	L(copy_large)
	add	dstend, dstin, n
	add	srcend, src, n
	cmp	n, vlen, lsl 2
	b.hi	1f

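	/* 2-8 vectors: copy the leading vectors from the start of the buffer
	   and the trailing vectors from the end.  The two halves may overlap
	   in the middle, so no further branching on the exact size is needed.  */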
	/* Copy 2-4 vectors.  */
	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* Copy 4-8 vectors.  */
1:	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstin, 2, mul vl]
	st1b	z3.b, p0, [dstin, 3, mul vl]
	st1b	z4.b, p0, [dstend, -4, mul vl]
	st1b	z5.b, p0, [dstend, -3, mul vl]
	st1b	z6.b, p0, [dstend, -2, mul vl]
	st1b	z7.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* More than 8 vectors - always align the destination to the vector
	   length for higher and more consistent write performance.  */
L(copy_large):
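	/* Copy the first vlen - (dstin % vlen) bytes (a whole vector if dstin
	   is already aligned) so that dst becomes vector aligned, then advance
	   src/dst and reduce n by the same amount.  */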
	sub	tmp, vlen, 1
	and	tmp, dstin, tmp
	sub	tmp, vlen, tmp
	whilelo	p1.b, xzr, tmp
	ld1b	z1.b, p1/z, [src]
	st1b	z1.b, p1, [dstin]
	add	dst, dstin, tmp
	add	src, src, tmp
	sub	n, n, tmp
	ptrue	p0.b

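	/* Software pipeline the 8x unrolled copy: the first 8 vectors are
	   loaded before the loop and the last 8 are stored after it.  Skip
	   to the tail code (3f) when at most 8 vectors remain, or store the
	   preloaded vectors (2f) when at most 16 remain.  */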
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	ld1b_unroll8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined loop.  */
1:	stld1b_unroll8
	add	dst, dst, vlen8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.hi	1b
2:	st1b_unroll8
	add	dst, dst, vlen8
3:	add	n, n, vlen8	/* Restore n to the number of remaining bytes.  */

	/* Move last 0-8 vectors.  */
L(last_bytes):
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p1, [dst, 1, mul vl]
	ret

	.p2align 4

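	/* 2-8 vectors remain: load the leading and trailing vectors of the
	   range before storing anything, then store both halves.  The stores
	   may overlap in the middle, and loading everything first keeps this
	   path safe for overlapping memmove inputs.  */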
1:	add	srcend, src, n
	add	dstend, dst, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	cmp	n, vlen, lsl 2
	b.hi	1f

	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
	ld1b	z5.b, p0/z, [src, 3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z4.b, p0, [dst, 2, mul vl]
	st1b	z5.b, p0, [dst, 3, mul vl]
	st1b	z6.b, p0, [dstend, -4, mul vl]
	st1b	z7.b, p0, [dstend, -3, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

END (__memcpy_a64fx)


ENTRY_ALIGN (__memmove_a64fx, 4)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	/* Fast case for up to 2 vectors.  */
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
L(full_overlap):
	ret

	.p2align 4
	/* Check for overlapping moves.  Return if there is a full overlap.
	   Small moves up to 8 vectors use the overlap-safe copy_small code.
	   Non-overlapping or overlapping moves with dst < src use memcpy.
	   Overlapping moves with dst > src use a backward copy loop.  */
1:	sub	tmp, dstin, src
	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
	b.eq	L(full_overlap)
	cmp	n, vlen, lsl 3
	b.ls	L(copy_small)
	cmp	tmp, n
	b.hs	L(copy_large)

	/* Align the end of the destination to the vector length: copy the
	   last (dstin + n) % vlen bytes (a whole vector if the end is already
	   aligned), then copy backwards from the aligned end.  */
	add	dst, dstin, n
	sub	tmp, vlen, 1
	ands	tmp, dst, tmp
	csel	tmp, tmp, vlen, ne
	whilelo	p1.b, xzr, tmp
	sub	n, n, tmp
	ld1b	z1.b, p1/z, [src, n]
	st1b	z1.b, p1, [dstin, n]
	add	src, src, n
	add	dst, dstin, n

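	/* As in memcpy, software pipeline the 8x unrolled loop, but copy
	   backwards: src and dst are decremented by 8 vectors before each
	   load/store group, and the remaining head of the buffer is handled
	   by L(last_bytes).  */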
	ptrue	p0.b
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	sub	src, src, vlen8
	ld1b_unroll8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined backward copy loop.  */
1:	sub	src, src, vlen8
	sub	dst, dst, vlen8
	stld1b_unroll8
	subs	n, n, vlen8
	b.hi	1b
2:	sub	dst, dst, vlen8
	st1b_unroll8
3:	add	n, n, vlen8	/* Restore n to the number of remaining bytes.  */

	/* Adjust src/dst for last 0-8 vectors.  */
	sub	src, src, n
	mov	dst, dstin
	b	L(last_bytes)

END (__memmove_a64fx)
#endif /* HAVE_AARCH64_SVE_ASM */