/* Optimized strlen implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef STRLEN
# define STRLEN __strlen
# define DEFINE_STRLEN_HIDDEN_DEF 1
#endif

/* Implements the function

   size_t [r3] strlen (const char *s [r3])

   The implementation can load bytes past the null terminator, but only
   up to the next 64B boundary, so it never crosses a page.  */
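
/* Strategy: handle the unaligned head with a single shifted 16B load,
   then check 16B at a time until the address is 64B aligned, and finally
   scan 64B per iteration, merging the four quadwords with vminub so one
   vcmpequb. detects a null byte in any of them.  The position of the
   first null byte is then recovered with vbpermq, a move to a GPR and
   cnttzd.  */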

	.machine power9
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 2

	vspltisb	v18,0	/* v18 = 0x00 in every byte: the value searched for.  */
	vspltisb	v19,-1	/* v19 = 0xff in every byte: filler that never matches.  */

	neg	r5,r3
	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */

	/* Align data and fill the bytes not loaded with a non-matching char,
	   so a stale zero cannot be mistaken for the null terminator.  */
	lvx	v0,0,r3		/* Load the aligned 16B containing the string start.  */
	lvsr	v1,0,r3
	vperm	v0,v19,v0,v1	/* Shift so the string's first byte comes first;
				   bytes not loaded are filled with 0xff from v19.  */

	vcmpequb.	v6,v0,v18
	beq	cr6,L(aligned)	/* No null byte in the bytes checked so far.  */

	vctzlsbb	r3,v6	/* Offset of the first null byte is the length.  */
	blr

	/* Test the next 64B, 16B at a time.  The 64B vector loop below is
	   optimized for longer strings, and a whole multiple of 64B is
	   checked here so the alignment calculation below stays valid once
	   r4 is advanced.  */
L(aligned):
	add	r4,r3,r9
	rldicl.	r5,r4,60,62	/* Number of 48B loops needed to reach 64B
				   alignment, and test it for zero.  */
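	/* r5 = (r4 >> 4) & 3, i.e. the number of 16B blocks past the last
	   64B boundary.  The unconditional 64B check below leaves this
	   residue unchanged, and each 48B iteration advances it by 3
	   (mod 4), so exactly r5 iterations are needed to make r4 64B
	   aligned.  */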

	lxv	v0+32,0(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail1)

	lxv	v0+32,16(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail2)

	lxv	v0+32,32(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail3)

	lxv	v0+32,48(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail4)
	addi	r4,r4,64

	/* Speculatively preset r0 to zero: during the reduction, lvsl on
	   this fake 16B-aligned address cheaply generates the vector byte
	   constant 0,1,..,15.  */
	li	r0,0

	/* Skip the 48B loop if r4 is already 64B aligned.  */
	beq	L(loop_64b)
	mtctr	r5

	/* Test 48B per iteration until 64B aligned.  */
	.p2align 5
L(loop):
	lxv	v0+32,0(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail1)

	lxv	v0+32,16(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail2)

	lxv	v0+32,32(r4)
	vcmpequb.	v6,v0,v18
	bne	cr6,L(tail3)

	addi	r4,r4,48
	bdnz	L(loop)

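	/* Main loop: scan 64B per iteration, unrolled three times.  vminub
	   merges the four quadwords: a null byte in any of them yields a
	   null byte in the minimum, so a single vcmpequb. against zero
	   detects it.  */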
	.p2align 5
L(loop_64b):
	lxv	v1+32,0(r4)	/* Load 4 quadwords.  */
	lxv	v2+32,16(r4)
	lxv	v3+32,32(r4)
	lxv	v4+32,48(r4)
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v18	/* Check for null bytes.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lxv	v1+32,0(r4)	/* Load 4 quadwords.  */
	lxv	v2+32,16(r4)
	lxv	v3+32,32(r4)
	lxv	v4+32,48(r4)
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v18	/* Check for null bytes.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lxv	v1+32,0(r4)	/* Load 4 quadwords.  */
	lxv	v2+32,16(r4)
	lxv	v3+32,32(r4)
	lxv	v4+32,48(r4)
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v18	/* Check for null bytes.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	beq	cr6,L(loop_64b)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb	v1,v1,v18
	vcmpequb	v2,v2,v18
	vcmpequb	v3,v3,v18
	vcmpequb	v4,v4,v18

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits
	   into the first halfword.  */
	vspltisb	v10,3
	lvsl	v11,0,r0
	vslb	v10,v11,v10
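	/* v10 is now 0,8,16,..,120: the bit index of the first bit of each
	   byte, the form vbpermq expects.  Since vcmpequb produced 0x00 or
	   0xff per byte, that first bit carries the match result.  */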

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10

	/* Shift each component into its correct position for merging.  */
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4
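	/* r10 now has one bit per byte of the 64B block, with the bit for
	   the lowest-addressed byte in the least significant position, so
	   counting trailing zeros yields the offset of the first null byte
	   within the block.  */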

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

	cnttzd	r0,r10		/* Count trailing zeros before the match.  */
	subf	r5,r3,r4	/* Bytes scanned before this 64-byte block.  */
	add	r3,r5,r0	/* Compute final length.  */
	blr

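	/* Tails: a null byte was found in one of the 16B checks done from
	   r4.  The length is its offset within that 16B block, plus the
	   block's displacement from r4, minus the original string start
	   still held in r3.  */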
L(tail1):
	vctzlsbb	r0,v6
	add	r4,r4,r0
	subf	r3,r3,r4
	blr

L(tail2):
	vctzlsbb	r0,v6
	add	r4,r4,r0
	addi	r4,r4,16
	subf	r3,r3,r4
	blr

L(tail3):
	vctzlsbb	r0,v6
	add	r4,r4,r0
	addi	r4,r4,32
	subf	r3,r3,r4
	blr

L(tail4):
	vctzlsbb	r0,v6
	add	r4,r4,r0
	addi	r4,r4,48
	subf	r3,r3,r4
	blr

END (STRLEN)

#ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
#endif