/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] strlen (char *s [r3])  */

#ifndef STRLEN
# define STRLEN strlen
#endif
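
/* A rough, simplified C sketch of the overall strategy used below
   (illustrative only, assuming <stddef.h>/<stdint.h>; it mirrors the
   control flow but ignores the doubleword and vector tricks):

     size_t strlen (const char *s)
     {
       const char *p = s;
       // 1. Check the first (possibly misaligned) doubleword, then scan
       //    doublewords with GPRs until p reaches a 64-byte boundary.
       while (((uintptr_t) p & 63) != 0)
         {
           if (*p == '\0')
             return p - s;
           p++;
         }
       // 2. Scan 64-byte blocks with VMX until one contains a zero byte.
       // 3. Locate the zero byte within that block and return the length.
       while (*p != '\0')
         p++;
       return p - s;
     }  */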
	.machine power8
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 1
	dcbt	0,r3
	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */
	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
	ld	r12,0(r4)     /* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r5,r5,r6
#else
	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
#endif
	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
	bne	cr7,L(done)
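
	/* Illustrative C equivalent of the check above, assuming <stdint.h>
	   and showing the little-endian variant (names are only for the
	   sketch):

	     uint64_t dword = *(const uint64_t *) ((uintptr_t) s & ~7UL);
	     uint64_t mask  = (uint64_t) -1 << (8 * ((uintptr_t) s & 7));
	     dword |= ~mask;   // bytes before the string start become 0xff
	     // cmpb yields 0xff in every byte position where dword holds
	     // 0x00, so a nonzero result means the first doubleword already
	     // contains the terminating null.  */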

	/* For shorter strings (< 64 bytes), we will not use vector registers,
	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
	   will be done the same way as we do in the POWER7 implementation.
	   Let's see if we are aligned to a quadword boundary.  If so, we can
	   jump to the first (non-vectorized) loop.  Otherwise, we have to
	   handle the next DWORD first.  */
	mtcrf	0x01,r4
	mr	r9,r4
	addi	r9,r9,8
	bt	28,L(align64)

	/* Handle the next 8 bytes so we are aligned to a quadword
	   boundary.  */
	ldu	r5,8(r4)
	cmpb	r10,r5,r0
	cmpdi	cr7,r10,0
	addi	r9,r9,8
	bne	cr7,L(done)

L(align64):
	/* Proceed to the old (POWER7) implementation, checking two doublewords
	   per iteration.  For the first 56 bytes, we will just check for null
	   characters.  After that, we will also check if we are 64-byte aligned
	   so we can jump to the vectorized implementation.  We will unroll
	   these loops to avoid excessive branching.  */
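	/* Roughly, each unrolled step below behaves like this C sketch
	   (illustrative; it assumes <stdint.h>, a doubleword-aligned pointer
	   p, and a hypothetical cmpb() helper that returns 0xff in every
	   byte position holding a zero byte):

	     uint64_t d1 = p[0], d2 = p[1];      // two 8-byte loads
	     if ((cmpb (d1, 0) | cmpb (d2, 0)) != 0)
	       goto dword_zero;                  // work out which one it is
	     p += 2;                             // advance 16 bytes  */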
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length due to the extra
	   comparisons required to check alignment for the vectorized loop.
	   This is a necessary tradeoff we are willing to take in order to
	   speed up the calculation for larger strings.  */
	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16

	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
	   found, jump to the vectorized loop.  */
	beq	cr7,L(preloop)

L(dword_zero):
	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a null byte.  */

	cmpdi	cr6,r10,0
	addi	r4,r4,-8
	bne	cr6,L(done)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r10 so we can calculate the
	   length.  */

	mr	r10,r11
	addi	r4,r4,8

	/* If the null byte was found in the non-vectorized code, compute the
	   final length.  r10 has the output of the cmpb instruction, that is,
	   it contains 0xff in the same position as the null byte in the
	   original doubleword from the string.  Use that to calculate the
	   length.  */
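	/* Illustrative sketch of the computation below, assuming <stdint.h>
	   and using ctz/clz for count-trailing/leading-zeros (e.g. the GCC
	   builtins __builtin_ctzll/__builtin_clzll):

	     // r4 points to the doubleword holding the null byte and r10 is
	     // the cmpb result, with 0xff marking that byte.
	     bits = little_endian ? ctz (r10) : clz (r10);
	     return (r4 - r3) + bits / 8;   // bytes before + offset in dword

	   For example, on little-endian a null at byte offset 3 gives
	   r10 = 0x00000000ff000000, ctz = 24, and 24 / 8 = 3.  */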
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9		/* Count the bits in the mask.  */
#else
	cntlzd	r0,r10		/* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	srdi	r0,r0,3		/* Convert leading/trailing zeros to bytes.  */
	add	r3,r5,r0	/* Compute final length.  */
	blr

	/* Vectorized implementation starts here.  */
	.p2align 4
L(preloop):
	/* Set up for the loop.  */
	mr	r4,r9
	li	r7,16		/* Load required offsets.  */
	li	r8,32
	li	r9,48
	li	r12,8
	vxor	v0,v0,v0	/* VR with null chars to use with
				   vcmpequb.  */

	/* Main loop to look for the end of the string.  We will read in
	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
	   leverage the icache performance.  */
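	/* Each unrolled iteration below is roughly equivalent to this sketch
	   (illustrative; any_zero_byte() stands in for the vminub merges of
	   the four quadwords followed by the vcmpequb. test against a zero
	   vector):

	     if (any_zero_byte (p, 64))   // one aligned 64-byte block
	       goto vmx_zero;             // then locate the null inside it
	     p += 64;

	   vminub keeps the per-byte minimum, so a zero byte in any of the
	   four quadwords survives into the merged vector.  */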
	.p2align 5
L(loop):
	lvx	v1,r4,r0	/* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0	/* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0	/* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	beq	cr6,L(loop)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb	v1,v1,v0
	vcmpequb	v2,v2,v0
	vcmpequb	v3,v3,v0
	vcmpequb	v4,v4,v0

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  */
	vspltisb	v10,3
	lvsl	v11,r0,r0
	vslb	v10,v11,v10

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10
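
	/* Each VR now holds a 16-bit mask in which bit i is set when byte i
	   of the corresponding quadword was null.  The code below rotates
	   the four masks into distinct 16-bit slots, ORs them together and
	   moves the result to a GPR; conceptually, for little-endian
	   (m1..m4 stand for the four masks):

	     uint64_t combined = m1 | ((uint64_t) m2 << 16)
	                         | ((uint64_t) m3 << 32)
	                         | ((uint64_t) m4 << 48);

	   The combined value has one bit per byte of the 64-byte block, so a
	   trailing-zero count (leading-zero on big-endian) gives the byte
	   offset of the first null directly, with no division by 8.  */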

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9		/* Count the bits in the mask.  */
#else
	cntlzd	r0,r10		/* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	add	r3,r5,r0	/* Compute final length.  */
	blr

END (STRLEN)
libc_hidden_builtin_def (strlen)