//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memset and memchr functions.
///
/// These implementations depend on unaligned access and floating-point support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

//
// __arm_sc_memset
//

#define dstin    x0
#define val      x1
#define valw     w1
#define count    x2
#define dst      x3
#define dstend2  x4
#define zva_val  x5

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
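  /* Splat the fill byte across a 128-bit register: with SVE, dup it into
     z0 (whose low 128 bits alias v0); otherwise replicate the byte into a
     64-bit pattern with bfi and populate both halves of v0 via fmov. */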
#ifdef __ARM_FEATURE_SVE
  mov z0.b, valw
#else
  bfi valw, valw, #8, #8
  bfi valw, valw, #16, #16
  bfi val, val, #32, #32
  fmov d0, val
  fmov v0.d[1], val
#endif
  add dstend2, dstin, count
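  /* dstend2 points one past the last byte to set; stores relative to it
     handle the tail with possible overlap. */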

  cmp count, 96
  b.hi 7f // set_long
  cmp count, 16
  b.hs 4f // set_medium
  mov val, v0.D[0]

  /* Set 0..15 bytes. */
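  /* Sizes 8..15 and 4..7 are each covered by two possibly overlapping stores,
     one from the start and one from the end; 1..3 bytes use a byte store plus
     an optional halfword store, so no per-byte loop is needed. */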
  tbz count, 3, 1f
  str val, [dstin]
  str val, [dstend2, -8]
  ret
  nop
1: tbz count, 2, 2f
  str valw, [dstin]
  str valw, [dstend2, -4]
  ret
2: cbz count, 3f
  strb valw, [dstin]
  tbz count, 1, 3f
  strh valw, [dstend2, -2]
3: ret

  /* Set 16..96 bytes. */
4: // set_medium
  str q0, [dstin]
  tbnz count, 6, 6f // set96
  str q0, [dstend2, -16]
  tbz count, 5, 5f
  str q0, [dstin, 16]
  str q0, [dstend2, -32]
5: ret

  .p2align 4
  /* Set 64..96 bytes. Write 64 bytes from the start and
     32 bytes from the end. */
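  /* (For counts below 96 the two regions overlap; the overlap is harmless
     because every store writes the same data.) */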
6: // set96
  str q0, [dstin, 16]
  stp q0, q0, [dstin, 32]
  stp q0, q0, [dstend2, -32]
  ret

  .p2align 4
7: // set_long
  and valw, valw, 255
  bic dst, dstin, 15
  str q0, [dstin]
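  /* Take the DC ZVA path only when the fill value is zero and count >= 160;
     the ccmp forces a NE result when count is below 160. */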
  cmp count, 160
  ccmp valw, 0, 0, hs
  b.ne 9f // no_zva

#ifndef SKIP_ZVA_CHECK
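  /* DCZID_EL0[3:0] encodes log2 of the DC ZVA block size in words, so the
     value 4 means 16 words = 64 bytes. Bit 4 (DZP) is kept by the mask below,
     so a prohibited DC ZVA also fails the compare and falls back to no_zva. */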
  mrs zva_val, dczid_el0
  and zva_val, zva_val, 31
  cmp zva_val, 4 /* ZVA size is 64 bytes. */
  b.ne 9f // no_zva
#endif
  str q0, [dst, 16]
  stp q0, q0, [dst, 32]
  bic dst, dst, 63
  sub count, dstend2, dst /* Count is now 64 too large. */
  sub count, count, 128 /* Adjust count and bias for loop. */
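  /* The loop below zeroes aligned 64-byte blocks with DC ZVA; the final,
     possibly unaligned, 64 bytes are always written by the stp pairs that
     follow the loop. */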

  .p2align 4
8: // zva_loop
  add dst, dst, 64
  dc zva, dst
  subs count, count, 64
  b.hi 8b // zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret

9: // no_zva
  sub count, dstend2, dst /* Count is 16 too large. */
  sub dst, dst, 16 /* Dst is biased by -32. */
  sub count, count, 64 + 16 /* Adjust count and bias for loop. */
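  /* Each iteration stores 64 bytes at dst+32..dst+96 and advances dst by 64;
     the trailing stores relative to dstend2 cover the final bytes. */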
10: // no_zva_loop
  stp q0, q0, [dst, 32]
  stp q0, q0, [dst, 64]!
  subs count, count, 64
  b.hi 10b // no_zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)

//
// __arm_sc_memchr
//

#define srcin     x0
#define chrin     w1
#define cntin     x2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (this is faster than using a
 * 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
 * in which things occur in the original string, counting trailing zeros
 * identifies exactly which byte matched.
 */
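/*
 * Rough C-level sketch of the syndrome for one 32-byte chunk (illustrative
 * only; the scalar loop and the names chunk/c are not part of this file):
 *
 *   uint64_t syndrome = 0;
 *   for (int i = 0; i < 32; i++)
 *     if (chunk[i] == c)
 *       syndrome |= 1ULL << (2 * i);   // bit 0 of each 2-bit slot
 *   // index of the first match, if any: __builtin_ctzll(syndrome) / 2
 */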

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memchr)
  /* Do not dereference srcin if no bytes to compare. */
  cbz cntin, 4f
  /*
   * Magic constant 0x40100401 allows us to identify which lane matches
   * the requested byte.
   */
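  /*
   * Viewed as little-endian bytes the constant is 0x01, 0x04, 0x10, 0x40
   * repeated: each byte within a group of four gets a distinct bit, so the
   * pairwise additions below cannot carry between lanes.
   */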
  mov wtmp2, #0x0401
  movk wtmp2, #0x4010, lsl #16
  dup vrepchr.16b, chrin
  /* Work with aligned 32-byte chunks. */
  bic src, srcin, #31
  dup vrepmask.4s, wtmp2
  ands soff, srcin, #31
  and cntrem, cntin, #31
  b.eq 0f

  /*
   * The input string is not 32-byte aligned. We calculate the syndrome
   * value for the aligned 32-byte block containing the first bytes and
   * mask off the irrelevant part.
   */

  ld1 {vdata1.16b, vdata2.16b}, [src], #32
  sub tmp, soff, #32
  adds cntin, cntin, tmp
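  /* cntin now holds the bytes remaining beyond this aligned block; if it
     dropped to zero or below, the first block is also the last (b.ls below). */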
  cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
  cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
  and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
  and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
  addp vend.16b, vend.16b, vend.16b /* 128->64 */
  mov synd, vend.d[0]
  /* Clear the soff*2 lower bits. */
  lsl tmp, soff, #1
  lsr synd, synd, tmp
  lsl synd, synd, tmp
  /* The first block can also be the last. */
  b.ls 2f
  /* Have we found something already? */
  cbnz synd, 3f

0: // loop
  ld1 {vdata1.16b, vdata2.16b}, [src], #32
  subs cntin, cntin, #32
  cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
  cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
  /* If we're out of data, we finish regardless of the result. */
  b.ls 1f
  /* Use a fast check for the termination condition. */
  orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
  addp vend.2d, vend.2d, vend.2d
  mov synd, vend.d[0]
  /* We're not out of data; loop if we haven't found the character. */
  cbz synd, 0b

1: // end
  /* Termination condition found; calculate the syndrome value. */
  and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
  and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
  addp vend.16b, vend.16b, vend.16b /* 128->64 */
  mov synd, vend.d[0]
  /* Only do the clear for the last possible block. */
  b.hi 3f

2: // masklast
  /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits. */
  add tmp, cntrem, soff
  and tmp, tmp, #31
  sub tmp, tmp, #32
  neg tmp, tmp, lsl #1
  lsl synd, synd, tmp
  lsr synd, synd, tmp

3: // tail
  /* Count the trailing zeros using bit reversal. */
  rbit synd, synd
  /* Compensate for the last post-increment. */
  sub src, src, #32
  /* Check that we have found a character. */
  cmp synd, #0
  /* And count the leading zeros. */
  clz synd, synd
  /* Compute the potential result. */
  add result, src, synd, lsr #1
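  /* (The clz of the bit-reversed syndrome is twice the byte index of the
     match, hence the lsr #1.) */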
  /* Select result or NULL. */
  csel result, xzr, result, eq
  ret

4: // zero_length
  mov result, #0
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memchr)