/* Optimized strlen implementation for POWER10 LE.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* To reuse the code for rawmemchr, we have some extra steps compared to the
   strlen implementation:
   - Sum the initial value of r3 with the position at which the char was
     found, to guarantee we return a pointer and not the length.
   - In the main loop, subtract the char we are looking for from each loaded
     byte, so we can keep using vminub to quickly check 64B at once.  */
#ifdef USE_AS_RAWMEMCHR
# ifndef RAWMEMCHR
#  define FUNCNAME __rawmemchr
# else
#  define FUNCNAME RAWMEMCHR
# endif
# define MCOUNT_NARGS 2
# define VREG_ZERO v20
# define OFF_START_LOOP 256
# define RAWMEMCHR_SUBTRACT_VECTORS \
        vsububm v4,v4,v18; \
        vsububm v5,v5,v18; \
        vsububm v6,v6,v18; \
        vsububm v7,v7,v18;
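/* A match was found in vreg by CHECK16: vctzlsbb yields the byte index of the
   first matching byte within that 16B chunk; adding the chunk offset and the
   16B-aligned base kept in r5 produces the pointer to return.  */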
# define TAIL(vreg,increment) \
        vctzlsbb r4,vreg; \
        addi r4,r4,increment; \
        add r3,r5,r4; \
        blr

#else /* strlen */

# ifndef STRLEN
#  define FUNCNAME __strlen
#  define DEFINE_STRLEN_HIDDEN_DEF 1
# else
#  define FUNCNAME STRLEN
# endif
# define MCOUNT_NARGS 1
# define VREG_ZERO v18
# define OFF_START_LOOP 192
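/* Same idea as the rawmemchr TAIL, but return a length: (r5 - r3) is the
   distance from s to the 16B-aligned base used by CHECK16, to which the chunk
   offset and the index of the null found by vctzlsbb are added.  */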
# define TAIL(vreg,increment) \
        vctzlsbb r4,vreg; \
        subf r3,r3,r5; \
        addi r4,r4,increment; \
        add r3,r3,r4; \
        blr
#endif /* USE_AS_RAWMEMCHR */

/* TODO: Replace these macros with the actual instructions once the minimum
   binutils version is >= 2.35.  They are used to keep compatibility with
   older versions.  */
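/* VEXTRACTBM(rt,vrb) encodes vextractbm rt,vrb and LXVP(xtp,dq,ra) encodes
   lxvp xtp,dq(ra); both instructions are new in ISA 3.1 (POWER10).  */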
#define VEXTRACTBM(rt,vrb) \
        .long(((4)<<(32-6)) \
              | ((rt)<<(32-11)) \
              | ((8)<<(32-16)) \
              | ((vrb)<<(32-21)) \
              | 1602)

#define LXVP(xtp,dq,ra) \
        .long(((6)<<(32-6)) \
              | ((((xtp)-32)>>1)<<(32-10)) \
              | ((1)<<(32-11)) \
              | ((ra)<<(32-16)) \
              | dq)

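/* Check the 16B at offset(addr) for a match against v18 (the terminating null
   for strlen, the char c for rawmemchr) and branch to label if one is found.
   The compare result is left in vreg so TAIL can locate the match.  */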
#define CHECK16(vreg,offset,addr,label) \
        lxv vreg+32,offset(addr); \
        vcmpequb. vreg,vreg,v18; \
        bne cr6,L(label);

/* Load 4 quadwords, merge into one VR for speed, and check for NULLs.  Since
   vminub gives a zero byte only where at least one input byte is zero, two
   rounds of vminub reduce the 64B to a single VR that is zero wherever any of
   the loaded bytes was zero.  r6 has # of bytes already checked.  */
#define CHECK64(offset,addr,label) \
        li r6,offset; \
        LXVP(v4+32,offset,addr); \
        LXVP(v6+32,offset+32,addr); \
        RAWMEMCHR_SUBTRACT_VECTORS; \
        vminub v14,v4,v5; \
        vminub v15,v6,v7; \
        vminub v16,v14,v15; \
        vcmpequb. v0,v16,VREG_ZERO; \
        bne cr6,L(label)

/* Implements the function

   size_t [r3] strlen (const char *s [r3])

   but when USE_AS_RAWMEMCHR is set, implements the function

   void * [r3] rawmemchr (const void *s [r3], int c [r4])

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.  */

        .machine power9

ENTRY_TOCLESS (FUNCNAME, 4)
        CALL_MCOUNT MCOUNT_NARGS

#ifdef USE_AS_RAWMEMCHR
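        /* c ^ 0xff differs from c in its low byte, so once splatted it gives
           a filler byte that can never match c; it is used below to mask the
           bytes loaded from before the start of s.  */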
        xori r5,r4,0xff

        mtvsrd v18+32,r4 /* matching char in v18  */
        mtvsrd v19+32,r5 /* non matching char in v19  */

        vspltb v18,v18,7 /* replicate  */
        vspltb v19,v19,7 /* replicate  */
#else
        vspltisb v19,-1
#endif
        vspltisb VREG_ZERO,0

        /* Next 16B-aligned address.  Prepare address for L(aligned).  */
        addi r5,r3,16
        clrrdi r5,r5,4

        /* Align data and fill bytes not loaded with non matching char.  */
        lvx v0,0,r3
        lvsr v1,0,r3
        vperm v0,v19,v0,v1
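        /* After the permute, the bytes of the aligned block that lie before s
           have been replaced with the non-matching byte, so the compare below
           can only hit inside s and vctzlsbb returns an offset relative to
           s.  */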

        vcmpequb. v6,v0,v18
        beq cr6,L(aligned)

#ifdef USE_AS_RAWMEMCHR
        vctzlsbb r6,v6
        add r3,r3,r6
#else
        vctzlsbb r3,v6
#endif
        blr

        /* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
           optimized for longer strings, so checking the first bytes in 16B
           chunks benefits small strings a lot.  */
        .p2align 5
L(aligned):
#ifdef USE_AS_RAWMEMCHR
        cmpdi cr5,r4,0 /* Check if c == 0.  This will be useful to
                          choose how we will perform the main loop.  */
#endif
        /* Prepare address for the loop.  */
        addi r4,r3,OFF_START_LOOP
        clrrdi r4,r4,6
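        /* r4 is now the 64B-aligned address at which the main loop will
           start.  */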

        CHECK16(v0,0,r5,tail1)
        CHECK16(v1,16,r5,tail2)
        CHECK16(v2,32,r5,tail3)
        CHECK16(v3,48,r5,tail4)
        CHECK16(v4,64,r5,tail5)
        CHECK16(v5,80,r5,tail6)
        CHECK16(v6,96,r5,tail7)
        CHECK16(v7,112,r5,tail8)
        CHECK16(v8,128,r5,tail9)
        CHECK16(v9,144,r5,tail10)
        CHECK16(v10,160,r5,tail11)
#ifdef USE_AS_RAWMEMCHR
        CHECK16(v0,176,r5,tail12)
        CHECK16(v1,192,r5,tail13)
        CHECK16(v2,208,r5,tail14)
        CHECK16(v3,224,r5,tail15)
#endif

        addi r5,r4,128
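        /* r5 now runs 128B ahead of r4; together the two pointers let each
           loop iteration below check 256B with independent address
           calculations.  */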

#ifdef USE_AS_RAWMEMCHR
        /* If c == 0, use the same loop as strlen, without the vsububm.  */
        beq cr5,L(loop)

        /* This is very similar to the block after L(loop); the difference is
           that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, so the char we
           are looking for is subtracted from each loaded byte.  This way we
           can keep using vminub to merge the results and check for nulls.  */
        .p2align 5
L(rawmemchr_loop):
        CHECK64(0,r4,pre_tail_64b)
        CHECK64(64,r4,pre_tail_64b)
        addi r4,r4,256

        CHECK64(0,r5,tail_64b)
        CHECK64(64,r5,tail_64b)
        addi r5,r5,256

        b L(rawmemchr_loop)
#endif
        /* Switch to a more aggressive approach checking 64B each time.  Use 2
           pointers 128B apart and unroll the loop once to make the pointer
           updates and usages separated enough to avoid stalls waiting for
           address calculation.  */
        .p2align 5
L(loop):
#undef RAWMEMCHR_SUBTRACT_VECTORS
#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
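/* From here on CHECK64 expands without the subtraction: either this is the
   strlen build, or rawmemchr was called with c == 0, which is equivalent to
   searching for the terminating null.  */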
        CHECK64(0,r4,pre_tail_64b)
        CHECK64(64,r4,pre_tail_64b)
        addi r4,r4,256

        CHECK64(0,r5,tail_64b)
        CHECK64(64,r5,tail_64b)
        addi r5,r5,256

        b L(loop)

        .p2align 5
L(pre_tail_64b):
        mr r5,r4
L(tail_64b):
        /* OK, we found a null byte.  Let's look for it in the current 64-byte
           block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
           low 16 bytes into vx+1 and the high 16 bytes into vx, so the order
           here is v5, v4, v7, v6.  */
        vcmpequb v1,v5,VREG_ZERO
        vcmpequb v2,v4,VREG_ZERO
        vcmpequb v3,v7,VREG_ZERO
        vcmpequb v4,v6,VREG_ZERO

        /* Take into account the other 64B blocks we had already checked.  */
        add r5,r5,r6

        /* Extract first bit of each byte.  */
        VEXTRACTBM(r7,v1)
        VEXTRACTBM(r8,v2)
        VEXTRACTBM(r9,v3)
        VEXTRACTBM(r10,v4)
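        /* Each extracted mask is 16 bits wide (one bit per byte of the VR),
           so the shifts below pack the four masks into a single 64-bit
           word.  */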

        /* Shift each value into their corresponding position.  */
        sldi r8,r8,16
        sldi r9,r9,32
        sldi r10,r10,48

        /* Merge the results.  */
        or r7,r7,r8
        or r8,r9,r10
        or r10,r8,r7

        cnttzd r0,r10 /* Count trailing zeros before the match.  */
#ifndef USE_AS_RAWMEMCHR
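        /* For strlen, convert the address of the matching 64B block into a
           byte count from the start of the string.  */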
        subf r5,r3,r5
#endif
        add r3,r5,r0 /* Compute the final length or match address.  */
        blr

        .p2align 5
L(tail1):
        TAIL(v0,0)

        .p2align 5
L(tail2):
        TAIL(v1,16)

        .p2align 5
L(tail3):
        TAIL(v2,32)

        .p2align 5
L(tail4):
        TAIL(v3,48)

        .p2align 5
L(tail5):
        TAIL(v4,64)

        .p2align 5
L(tail6):
        TAIL(v5,80)

        .p2align 5
L(tail7):
        TAIL(v6,96)

        .p2align 5
L(tail8):
        TAIL(v7,112)

        .p2align 5
L(tail9):
        TAIL(v8,128)

        .p2align 5
L(tail10):
        TAIL(v9,144)

        .p2align 5
L(tail11):
        TAIL(v10,160)

#ifdef USE_AS_RAWMEMCHR
        .p2align 5
L(tail12):
        TAIL(v0,176)

        .p2align 5
L(tail13):
        TAIL(v1,192)

        .p2align 5
L(tail14):
        TAIL(v2,208)

        .p2align 5
L(tail15):
        TAIL(v3,224)
#endif

END (FUNCNAME)

#ifdef USE_AS_RAWMEMCHR
weak_alias (__rawmemchr,rawmemchr)
libc_hidden_builtin_def (__rawmemchr)
#else
# ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
# endif
#endif