| 1 | /* Optimized strcmp implementation for PowerPC64/POWER8. |
| 2 | Copyright (C) 2015-2024 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library; if not, see |
| 17 | <https://www.gnu.org/licenses/>. */ |
| 18 | |
| 19 | #include <sysdep.h> |
| 20 | /* Default entry-point name; a definition of STRCMP made before this point takes precedence. */ |
| 21 | #ifndef STRCMP |
| 22 | # define STRCMP strcmp |
| 23 | #endif |
| 24 | |
| 25 | /* Implements the function |
| 26 | |
| 27 | int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) |
| 28 | |
| 29 | The implementation uses unaligned doubleword access to avoid specialized |
| 30 | code paths depending on data alignment. Although recent powerpc64 uses |
| 31 | 64K pages by default, the page cross handling assumes a minimum page size |
| 32 | of 4K. */ |
| 33 | |
| 34 | .machine power8 |
| 35 | ENTRY_TOCLESS (STRCMP, 4) |
| 36 | li r0,0 /* r0 = 0, the byte pattern cmpb matches to find a NUL. */ |
| 37 | |
| 38 | /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using |
| 39 | the code: |
| 40 | |
| 41 | (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE)) |
| 42 | |
| 43 | with PAGE_SIZE being 4096 and ITER_SIZE being 16. */ |
| 44 | |
| 45 | rldicl r7,r3,0,52 /* r7 = s1 % 4096. */ |
| 46 | rldicl r9,r4,0,52 /* r9 = s2 % 4096. */ |
| 47 | cmpldi cr7,r7,4096-16 |
| 48 | bgt cr7,L(pagecross_check) |
| 49 | cmpldi cr5,r9,4096-16 |
| 50 | bgt cr5,L(pagecross_check) |
| 51 | |
| 52 | /* For short string up to 16 bytes, load both s1 and s2 using |
| 53 | unaligned dwords and compare. */ |
| 54 | ld r8,0(r3) |
| 55 | ld r10,0(r4) |
| 56 | cmpb r12,r8,r0 /* 0xff in each byte where s1 has a NUL. */ |
| 57 | cmpb r11,r8,r10 /* 0xff in each byte where s1 equals s2. */ |
| 58 | orc. r9,r12,r11 /* r9 = r12 OR NOT r11: NUL or mismatch. */ |
| 59 | bne cr0,L(different_nocmpb) |
| 60 | |
| 61 | ld r8,8(r3) |
| 62 | ld r10,8(r4) |
| 63 | cmpb r12,r8,r0 |
| 64 | cmpb r11,r8,r10 |
| 65 | orc. r9,r12,r11 |
| 66 | bne cr0,L(different_nocmpb) |
| 67 | |
| 68 | addi r7,r3,16 |
| 69 | addi r4,r4,16 |
| 70 | |
| 71 | L(align_8b): |
| 72 | /* Now it has checked for first 16 bytes, align source1 to doubleword |
| 73 | and adjust source2 address. */ |
| 74 | rldicl r9,r7,0,61 /* source1 alignment to doubleword */ |
| 75 | subf r4,r9,r4 /* Adjust source2 address based on source1 |
| 76 | alignment. */ |
| 77 | rldicr r7,r7,0,60 /* Align source1 to doubleword. */ |
| 78 | |
| 79 | /* At this point, source1 alignment is 0 and source2 alignment is |
| 80 | between 0 and 7. Check if source2 alignment is 0, meaning both |
| 81 | sources have the same alignment. */ |
| 82 | andi. r9,r4,0x7 |
| 83 | bne cr0,L(loop_diff_align) |
| 84 | |
| 85 | /* If both source1 and source2 are doubleword aligned, there is no |
| 86 | need for page boundary cross checks. */ |
| 87 | |
| 88 | ld r8,0(r7) |
| 89 | ld r10,0(r4) |
| 90 | cmpb r12,r8,r0 |
| 91 | cmpb r11,r8,r10 |
| 92 | orc. r9,r12,r11 |
| 93 | bne cr0,L(different_nocmpb) |
| 94 | |
| 95 | .align 4 |
| 96 | L(loop_equal_align): |
| 97 | ld r8,8(r7) |
| 98 | ld r10,8(r4) |
| 99 | cmpb r12,r8,r0 |
| 100 | cmpb r11,r8,r10 |
| 101 | orc. r9,r12,r11 |
| 102 | bne cr0,L(different_nocmpb) |
| 103 | |
| 104 | ld r8,16(r7) |
| 105 | ld r10,16(r4) |
| 106 | cmpb r12,r8,r0 |
| 107 | cmpb r11,r8,r10 |
| 108 | orc. r9,r12,r11 |
| 109 | bne cr0,L(different_nocmpb) |
| 110 | |
| 111 | ldu r8,24(r7) |
| 112 | ldu r10,24(r4) |
| 113 | cmpb r12,r8,r0 |
| 114 | cmpb r11,r8,r10 |
| 115 | orc. r9,r12,r11 |
| 116 | bne cr0,L(different_nocmpb) |
| 117 | |
| 118 | b L(loop_equal_align) |
| 119 | |
| 120 | /* On entry to L(different_nocmpb), r8 holds the s1 doubleword, r10 the |
| 121 | s2 doubleword and r9 the orc result: 0xff in every byte position |
| 122 | where s1 has a NUL or where s1 and s2 differ. Locate the first such |
| 123 | byte in memory order, shift it down to the low byte of both |
| 124 | doublewords and return the difference of the two bytes: |
| 125 | |
| 126 | #if __LITTLE_ENDIAN__ |
| 127 | shift = index of the least significant set bit of r9 |
| 128 | #else |
| 129 | shift = 56 - __builtin_clzl (r9) |
| 130 | #endif |
| 131 | r3 = ((r8 >> shift) & 0xff) - ((r10 >> shift) & 0xff) |
| 132 | |
| 133 | The instruction just before the label is never executed: it follows |
| 134 | an unconditional branch and no label refers to it. */ |
| 135 | |
| 136 | #ifdef __LITTLE_ENDIAN__ |
| 137 | nor r9,r9,r9 |
| 138 | L(different_nocmpb): |
| 139 | neg r3,r9 |
| 140 | and r9,r9,r3 /* Keep only the least significant set bit. */ |
| 141 | cntlzd r9,r9 |
| 142 | subfic r9,r9,63 /* r9 = index of that bit. */ |
| 143 | #else |
| 144 | not r9,r9 |
| 145 | L(different_nocmpb): |
| 146 | cntlzd r9,r9 |
| 147 | subfic r9,r9,56 /* r9 = 56 - leading zero count. */ |
| 148 | #endif |
| 149 | srd r3,r8,r9 |
| 150 | srd r10,r10,r9 |
| 151 | rldicl r10,r10,0,56 /* Keep only the low byte of the s2 value. */ |
| 152 | rldicl r3,r3,0,56 /* Keep only the low byte of the s1 value. */ |
| 153 | subf r3,r10,r3 /* r3 = s1 byte - s2 byte. */ |
| 154 | extsw r3,r3 |
| 155 | blr |
| 156 | |
| 157 | .align 4 |
| 158 | L(pagecross_check): |
| 159 | subfic r9,r9,4096 /* r9 = 4096 - (s2 % 4096). */ |
| 160 | subfic r7,r7,4096 /* r7 = 4096 - (s1 % 4096). */ |
| 161 | cmpld cr7,r7,r9 |
| 162 | bge cr7,L(pagecross) |
| 163 | mr r7,r9 /* r7 = the larger of the two values. */ |
| 164 | |
| 165 | /* An unaligned 16-byte read would cross a 4K page boundary, so compare |
| 166 | byte by byte instead. The count in r7 is the larger of the two |
| 167 | distances from s1 and s2 to the end of their 4K pages. */ |
| 168 | L(pagecross): |
| 169 | add r7,r3,r7 /* r7 = s1 position once the byte loop is done. */ |
| 170 | subf r9,r3,r7 /* r9 = number of bytes to compare. */ |
| 171 | mtctr r9 |
| 172 | |
| 173 | .align 4 |
| 174 | L(pagecross_loop): |
| 175 | /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2 |
| 176 | and if *s1 is '\0'. */ |
| 177 | lbz r9,0(r3) |
| 178 | lbz r10,0(r4) |
| 179 | addi r3,r3,1 |
| 180 | addi r4,r4,1 |
| 181 | cmplw cr7,r9,r10 |
| 182 | cmpdi cr5,r9,0 |
| 183 | bne cr7,L(pagecross_ne) |
| 184 | beq cr5,L(pagecross_nullfound) |
| 185 | bdnz L(pagecross_loop) |
| 186 | b L(align_8b) |
| 187 | |
| 188 | .align 4 |
| 189 | /* The unaligned read of source2 will cross a 4K page boundary, |
| 190 | and the different byte or NUL may be in the remaining page |
| 191 | bytes. Since it can not use the unaligned load, the algorithm |
| 192 | reads and compares 8 bytes to keep source1 doubleword aligned. */ |
| 193 | L(check_source2_byte): |
| 194 | li r9,8 |
| 195 | mtctr r9 |
| 196 | |
| 197 | .align 4 |
| 198 | L(check_source2_byte_loop): |
| 199 | lbz r9,0(r7) |
| 200 | lbz r10,0(r4) |
| 201 | addi r7,r7,1 |
| 202 | addi r4,r4,1 |
| 203 | cmplw cr7,r9,r10 |
| 204 | cmpdi cr5,r9,0 |
| 205 | bne cr7,L(pagecross_ne) |
| 206 | beq cr5,L(pagecross_nullfound) |
| 207 | bdnz L(check_source2_byte_loop) |
| 208 | |
| 209 | /* If source2 is unaligned to doubleword, the code needs to check |
| 210 | on each iteration if the unaligned doubleword access will cross |
| 211 | a 4k page boundary. */ |
| 212 | .align 5 |
| 213 | L(loop_unaligned): |
| 214 | ld r8,0(r7) |
| 215 | ld r10,0(r4) |
| 216 | cmpb r12,r8,r0 |
| 217 | cmpb r11,r8,r10 |
| 218 | orc. r9,r12,r11 |
| 219 | bne cr0,L(different_nocmpb) |
| 220 | addi r7,r7,8 |
| 221 | addi r4,r4,8 |
| 222 | |
| 223 | L(loop_diff_align): |
| 224 | /* Check if [src2]+8 cross a 4k page boundary: |
| 225 | |
| 226 | srcin2 % PAGE_SIZE > (PAGE_SIZE - 8) |
| 227 | |
| 228 | with PAGE_SIZE being 4096. */ |
| 229 | rldicl r9,r4,0,52 |
| 230 | cmpldi cr7,r9,4088 |
| 231 | ble cr7,L(loop_unaligned) |
| 232 | b L(check_source2_byte) |
| 233 | |
| 234 | .align 4 |
| 235 | L(pagecross_ne): |
| 236 | extsw r3,r9 /* r3 = s1 byte. */ |
| 237 | mr r9,r10 /* r9 = s2 byte. */ |
| 238 | L(pagecross_retdiff): |
| 239 | subf r9,r9,r3 /* r9 = r3 - r9. */ |
| 240 | extsw r3,r9 |
| 241 | blr |
| 242 | |
| 243 | .align 4 |
| 244 | L(pagecross_nullfound): |
| 245 | li r3,0 /* r9 is 0 here as well, so the result is 0. */ |
| 246 | b L(pagecross_retdiff) |
| 247 | END (STRCMP) |
| 248 | libc_hidden_builtin_def (strcmp) |
| 249 | |