| 1 | /* Optimized 64-bit memset implementation for POWER6. |
| 2 | Copyright (C) 1997-2024 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library; if not, see |
| 17 | <https://www.gnu.org/licenses/>. */ |
| 18 | |
| 19 | #include <sysdep.h> |
| 20 | |
| 21 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]); |
| 22 | Returns 's'. |
| 23 | |
| 24 | The memset is done in several sizes: byte (8 bits), halfword (16 bits), |
| 25 | word (32 bits), doubleword (64 bits), sector (32 bytes) and cache line |
| 26 | (128 bytes). There is a special case for setting cache lines to 0, to take advantage of the dcbz instruction. */ |
| 27 | |
| 28 | #ifndef MEMSET |
| 29 | # define MEMSET memset /* Default symbol name when none was predefined. */ |
| 30 | #endif |
| 31 | .machine power6 /* Assemble for the POWER6 instruction set. */ |
| 32 | ENTRY_TOCLESS (MEMSET, 7) /* NOTE(review): macro from sysdep.h; 7 is presumably log2 of the entry alignment -- confirm. */ |
| 33 | CALL_MCOUNT 3 /* NOTE(review): presumably the profiling hook from sysdep.h; 3 looks like the argument count -- confirm. */ |
| 34 | |
| 35 | #define rTMP r0 /* Scratch; only receives the andi. alignment-test results. */ |
| 36 | #define rRTN r3 /* Initial value of 1st argument. */ |
| 37 | #define rMEMP0 r3 /* Original value of 1st arg. */ |
| 38 | #define rCHR r4 /* Char to set in each byte. */ |
| 39 | #define rLEN r5 /* Length of region to set. */ |
| 40 | #define rMEMP r6 /* Address at which we are storing. */ |
| 41 | #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */ |
| 42 | #define rMEMP2 r8 /* Backward store pointer while aligning; later holds the dcbz offset 128. */ |
| 43 | #define rMEMP3 r9 /* Alt mem pointer. */ |
| 44 | L(_memset): |
| 45 | /* Take care of case for size <= 8. */ |
| 46 | cmpldi cr1, rLEN, 8 |
| 47 | andi. rALIGN, rMEMP0, 7 /* cr0.eq is set when s is already doubleword aligned. */ |
| 48 | mr rMEMP, rMEMP0 /* Work on a copy so r3 still holds the return value. */ |
| 49 | ble cr1, L(small) |
| 50 | |
| 51 | /* Align to doubleword boundary. */ |
| 52 | cmpldi cr5, rLEN, 31 /* cr5 keeps this original-length test until L(aligned). */ |
| 53 | insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */ |
| 54 | beq+ L(aligned2) /* Uses cr0 from the andi. above: nothing to align. */ |
| 55 | mtcrf 0x01, rMEMP0 /* cr7 (bits 28-31) = low four bits of s. */ |
| 56 | subfic rALIGN, rALIGN, 8 /* Bytes needed to reach the next doubleword. */ |
| 57 | cror 28,30,31 /* Detect odd word aligned: bit 28 = (s & 2) | (s & 1). */ |
| 58 | add rMEMP, rMEMP, rALIGN /* rMEMP now sits on the doubleword boundary. */ |
| 59 | sub rLEN, rLEN, rALIGN |
| 60 | insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */ |
| 61 | bt 29, L(g4) /* s & 4 set: the start lies in the odd word. */ |
| 62 | /* Process the even word of doubleword. */ |
| 63 | bf+ 31, L(g2) /* Even address: no leading byte to store. */ |
| 64 | stb rCHR, 0(rMEMP0) |
| 65 | bt 30, L(g4x) /* (s & 3) == 3: the byte reached the word boundary. */ |
| 66 | L(g2): |
| 67 | sth rCHR, -6(rMEMP) /* Bytes 2-3 of the doubleword. */ |
| 68 | L(g4x): |
| 69 | stw rCHR, -4(rMEMP) /* Bytes 4-7 of the doubleword. */ |
| 70 | b L(aligned) |
| 71 | /* Process the odd word of doubleword. */ |
| 72 | L(g4): |
| 73 | bf 28, L(g4x) /* If false, word aligned on odd word. */ |
| 74 | bf+ 31, L(g0) /* Even address: no leading byte to store. */ |
| 75 | stb rCHR, 0(rMEMP0) |
| 76 | bt 30, L(aligned) /* (s & 7) == 7: that byte was the only one needed. */ |
| 77 | L(g0): |
| 78 | sth rCHR, -2(rMEMP) /* Bytes 6-7 of the doubleword; falls through to L(aligned2). */ |
| 79 | |
| 80 | /* Handle the case of size <= 31. */ |
| 81 | L(aligned2): |
| 82 | insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */ |
| 83 | L(aligned): |
| 84 | mtcrf 0x01, rLEN /* cr7 = low four bits of the remaining length, read by L(medium). */ |
| 85 | ble cr5, L(medium) /* cr5 was set from the length before any alignment stores. */ |
| 86 | /* Align to 32-byte boundary. */ |
| 87 | andi. rALIGN, rMEMP, 0x18 /* cr0.eq is set when already 32-byte aligned. */ |
| 88 | subfic rALIGN, rALIGN, 0x20 /* Bytes up to the next 32-byte boundary. */ |
| 89 | insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ |
| 90 | beq L(caligned) |
| 91 | mtcrf 0x01, rALIGN /* cr7 bit 28 = rALIGN & 8. */ |
| 92 | add rMEMP, rMEMP, rALIGN |
| 93 | sub rLEN, rLEN, rALIGN |
| 94 | cmplwi cr1, rALIGN, 0x10 |
| 95 | mr rMEMP2, rMEMP /* Fill backward from the new boundary. */ |
| 96 | bf 28, L(a1) |
| 97 | stdu rCHR, -8(rMEMP2) /* One doubleword when rALIGN is 8 or 24. */ |
| 98 | L(a1): blt cr1, L(a2) |
| 99 | std rCHR, -8(rMEMP2) /* Two more doublewords when rALIGN >= 16. */ |
| 100 | stdu rCHR, -16(rMEMP2) |
| 101 | L(a2): |
| 102 | |
| 103 | /* Now aligned to a 32 byte boundary. */ |
| 104 | .align 4 |
| 105 | L(caligned): |
| 106 | cmpldi cr1, rCHR, 0 /* Is the replicated fill pattern all zero? */ |
| 107 | clrrdi. rALIGN, rLEN, 5 /* cr0.eq is set when fewer than 32 bytes remain. */ |
| 108 | mtcrf 0x01, rLEN /* Refresh cr7 with the low four bits of the reduced length. */ |
| 109 | beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */ |
| 110 | beq L(medium) /* We may not actually get to do a full line. */ |
| 111 | .align 4 |
| 112 | /* Storing a non-zero "c" value. We are aligned at a sector (32-byte) |
| 113 | boundary but may not be at cache line (128-byte) boundary. */ |
| 114 | L(nzloopstart): |
| 115 | /* memset in 32-byte chunks until we get to a cache line boundary. |
| 116 | If rLEN is less than the distance to the next cache-line boundary use |
| 117 | cacheAligned1 code to finish the tail. */ |
| 118 | cmpldi cr1,rLEN,128 |
| 119 | |
| 120 | andi. rTMP,rMEMP,127 /* cr0.eq is set when on a cache line boundary. */ |
| 121 | blt cr1,L(cacheAligned1) /* Under 128 bytes: skip the cache line loop. */ |
| 122 | addi rMEMP3,rMEMP,32 |
| 123 | beq L(nzCacheAligned) |
| 124 | addi rLEN,rLEN,-32 /* First 32-byte chunk. */ |
| 125 | std rCHR,0(rMEMP) |
| 126 | std rCHR,8(rMEMP) |
| 127 | std rCHR,16(rMEMP) |
| 128 | addi rMEMP,rMEMP,32 |
| 129 | andi. rTMP,rMEMP3,127 /* Boundary test for the advanced pointer. */ |
| 130 | std rCHR,-8(rMEMP3) /* Last doubleword of the first chunk. */ |
| 131 | |
| 132 | beq L(nzCacheAligned) |
| 133 | addi rLEN,rLEN,-32 /* Second 32-byte chunk. */ |
| 134 | std rCHR,0(rMEMP3) |
| 135 | addi rMEMP,rMEMP,32 |
| 136 | std rCHR,8(rMEMP3) |
| 137 | andi. rTMP,rMEMP,127 /* Boundary test after the second chunk. */ |
| 138 | std rCHR,16(rMEMP3) |
| 139 | std rCHR,24(rMEMP3) |
| 140 | |
| 141 | beq L(nzCacheAligned) |
| 142 | addi rLEN,rLEN,-32 /* Third chunk; rMEMP then sits on a cache line boundary. */ |
| 143 | std rCHR,32(rMEMP3) |
| 144 | addi rMEMP,rMEMP,32 |
| 145 | cmpldi cr1,rLEN,128 |
| 146 | std rCHR,40(rMEMP3) |
| 147 | cmpldi cr6,rLEN,256 /* NOTE(review): cr6 does not appear to be read on the non-zero path. */ |
| 148 | li rMEMP2,128 /* NOTE(review): rMEMP2 does not appear to be read on the non-zero path. */ |
| 149 | std rCHR,48(rMEMP3) |
| 150 | std rCHR,56(rMEMP3) |
| 151 | blt cr1,L(cacheAligned1) |
| 152 | b L(nzCacheAligned128) |
| 153 | |
| 154 | /* Now we are aligned to the cache line and can use dcbtst. */ |
| 155 | .align 4 |
| 156 | L(nzCacheAligned): |
| 157 | cmpldi cr1,rLEN,128 |
| 158 | blt cr1,L(cacheAligned1) /* Not even one full line left. */ |
| 159 | b L(nzCacheAligned128) |
| 160 | .align 5 |
| 161 | L(nzCacheAligned128): |
| 162 | cmpldi cr1,rLEN,256 /* Tested before the -128: >= 256 means another full line follows. */ |
| 163 | addi rMEMP3,rMEMP,64 /* Second half of the 128-byte line. */ |
| 164 | std rCHR,0(rMEMP) |
| 165 | std rCHR,8(rMEMP) |
| 166 | std rCHR,16(rMEMP) |
| 167 | std rCHR,24(rMEMP) |
| 168 | std rCHR,32(rMEMP) |
| 169 | std rCHR,40(rMEMP) |
| 170 | std rCHR,48(rMEMP) |
| 171 | std rCHR,56(rMEMP) |
| 172 | addi rMEMP,rMEMP3,64 /* Advance to the next cache line. */ |
| 173 | addi rLEN,rLEN,-128 |
| 174 | std rCHR,0(rMEMP3) |
| 175 | std rCHR,8(rMEMP3) |
| 176 | std rCHR,16(rMEMP3) |
| 177 | std rCHR,24(rMEMP3) |
| 178 | std rCHR,32(rMEMP3) |
| 179 | std rCHR,40(rMEMP3) |
| 180 | std rCHR,48(rMEMP3) |
| 181 | std rCHR,56(rMEMP3) |
| 182 | bge cr1,L(nzCacheAligned128) |
| 183 | dcbtst 0,rMEMP /* Touch-for-store hint on the line that holds the tail. */ |
| 184 | b L(cacheAligned1) |
| 185 | .align 5 |
| 186 | /* Storing a zero "c" value. We are aligned at a sector (32-byte) |
| 187 | boundary but may not be at cache line (128-byte) boundary. If the |
| 188 | remaining length spans a full cache line we can use the Data cache |
| 189 | block zero instruction. */ |
| 190 | L(zloopstart): |
| 191 | /* memset in 32-byte chunks until we get to a cache line boundary. |
| 192 | If rLEN is less than the distance to the next cache-line boundary use |
| 193 | cacheAligned1 code to finish the tail. */ |
| 194 | cmpldi cr1,rLEN,128 |
| 195 | beq L(medium) /* cr0 is still from the clrrdi. in L(caligned): under 32 bytes left. */ |
| 196 | L(getCacheAligned): |
| 197 | andi. rTMP,rMEMP,127 /* cr0.eq is set when on a cache line boundary. */ |
| 198 | nop |
| 199 | blt cr1,L(cacheAligned1) /* Under 128 bytes: dcbz cannot be used. */ |
| 200 | addi rMEMP3,rMEMP,32 |
| 201 | beq L(cacheAligned) |
| 202 | addi rLEN,rLEN,-32 /* First 32-byte chunk. */ |
| 203 | std rCHR,0(rMEMP) |
| 204 | std rCHR,8(rMEMP) |
| 205 | std rCHR,16(rMEMP) |
| 206 | addi rMEMP,rMEMP,32 |
| 207 | andi. rTMP,rMEMP3,127 /* Boundary test for the advanced pointer. */ |
| 208 | std rCHR,-8(rMEMP3) /* Last doubleword of the first chunk. */ |
| 209 | L(getCacheAligned2): |
| 210 | beq L(cacheAligned) |
| 211 | addi rLEN,rLEN,-32 /* Second 32-byte chunk. */ |
| 212 | std rCHR,0(rMEMP3) |
| 213 | std rCHR,8(rMEMP3) |
| 214 | addi rMEMP,rMEMP,32 |
| 215 | andi. rTMP,rMEMP,127 /* Boundary test after the second chunk. */ |
| 216 | std rCHR,16(rMEMP3) |
| 217 | std rCHR,24(rMEMP3) |
| 218 | L(getCacheAligned3): |
| 219 | beq L(cacheAligned) |
| 220 | addi rLEN,rLEN,-32 /* Third chunk; rMEMP then sits on a cache line boundary. */ |
| 221 | std rCHR,32(rMEMP3) |
| 222 | addi rMEMP,rMEMP,32 |
| 223 | cmpldi cr1,rLEN,128 |
| 224 | std rCHR,40(rMEMP3) |
| 225 | cmpldi cr6,rLEN,256 |
| 226 | li rMEMP2,128 /* Offset of the second line for the dcbz pairs. */ |
| 227 | std rCHR,48(rMEMP3) |
| 228 | std rCHR,56(rMEMP3) |
| 229 | blt cr1,L(cacheAligned1) /* Fewer than 128 bytes remain. */ |
| 230 | blt cr6,L(cacheAligned128) /* 128..255 bytes remain. */ |
| 231 | b L(cacheAlignedx) |
| 232 | |
| 233 | /* Now we are aligned to the cache line and can use dcbz. */ |
| 234 | .align 5 |
| 235 | L(cacheAligned): |
| 236 | cmpldi cr1,rLEN,128 |
| 237 | cmpldi cr6,rLEN,256 |
| 238 | blt cr1,L(cacheAligned1) /* Not even one full line left. */ |
| 239 | li rMEMP2,128 /* Offset of the second line for the dcbz pairs. */ |
| 240 | L(cacheAlignedx): |
| 241 | cmpldi cr5,rLEN,640 |
| 242 | blt cr6,L(cacheAligned128) /* 128..255 bytes: a single dcbz. */ |
| 243 | bgt cr5,L(cacheAligned512) /* More than 640 bytes: use the simple loop. */ |
| 244 | cmpldi cr6,rLEN,512 /* cr6 and cr1 test the length before the -256 below. */ |
| 245 | dcbz 0,rMEMP |
| 246 | cmpldi cr1,rLEN,384 |
| 247 | dcbz rMEMP2,rMEMP /* Zero the line at rMEMP + 128. */ |
| 248 | addi rMEMP,rMEMP,256 |
| 249 | addi rLEN,rLEN,-256 |
| 250 | blt cr1,L(cacheAligned1) /* Fewer than 128 bytes remain. */ |
| 251 | blt cr6,L(cacheAligned128) /* 128..255 bytes remain. */ |
| 252 | b L(cacheAligned256) /* 256..384 bytes remain. */ |
| 253 | .align 5 |
| 254 | /* A simple loop for the longer (>640 bytes) lengths. This form limits |
| 255 | branch mispredictions to exactly 1, at loop exit. */ |
| 256 | L(cacheAligned512): |
| 257 | cmpldi cr1,rLEN,128 |
| 258 | blt cr1,L(cacheAligned1) /* Loop exit: less than a full line left. */ |
| 259 | dcbz 0,rMEMP /* Zero one line; the code advances 128 bytes per dcbz. */ |
| 260 | addi rLEN,rLEN,-128 |
| 261 | addi rMEMP,rMEMP,128 |
| 262 | b L(cacheAligned512) |
| 263 | .align 5 |
| 264 | L(cacheAligned256): |
| 265 | /* Zero two lines per iteration while at least 256 bytes remain. */ |
| 266 | cmpldi cr6,rLEN,512 /* Tested before the -256: >= 512 means another pair follows. */ |
| 267 | |
| 268 | dcbz 0,rMEMP |
| 269 | cmpldi cr1,rLEN,384 /* Tested before the -256: < 384 leaves under 128 bytes. */ |
| 270 | dcbz rMEMP2,rMEMP /* Zero the line at rMEMP + 128. */ |
| 271 | addi rMEMP,rMEMP,256 |
| 272 | addi rLEN,rLEN,-256 |
| 273 | |
| 274 | bge cr6,L(cacheAligned256) |
| 275 | |
| 276 | blt cr1,L(cacheAligned1) |
| 277 | .align 4 |
| 278 | L(cacheAligned128): |
| 279 | dcbz 0,rMEMP /* Zero a single line, then fall into the 32-byte tail code. */ |
| 280 | addi rMEMP,rMEMP,128 |
| 281 | addi rLEN,rLEN,-128 |
| 282 | nop |
| 283 | L(cacheAligned1): /* Store up to three 32-byte chunks; every path here leaves fewer than 128 bytes. */ |
| 284 | cmpldi cr1,rLEN,32 |
| 285 | blt cr1,L(handletail32) |
| 286 | addi rMEMP3,rMEMP,32 |
| 287 | addi rLEN,rLEN,-32 /* First 32-byte chunk. */ |
| 288 | std rCHR,0(rMEMP) |
| 289 | std rCHR,8(rMEMP) |
| 290 | std rCHR,16(rMEMP) |
| 291 | addi rMEMP,rMEMP,32 |
| 292 | cmpldi cr1,rLEN,32 /* Is there room for a second chunk? */ |
| 293 | std rCHR,-8(rMEMP3) /* Last doubleword of the first chunk. */ |
| 294 | L(cacheAligned2): |
| 295 | blt cr1,L(handletail32) |
| 296 | addi rLEN,rLEN,-32 /* Second 32-byte chunk. */ |
| 297 | std rCHR,0(rMEMP3) |
| 298 | std rCHR,8(rMEMP3) |
| 299 | addi rMEMP,rMEMP,32 |
| 300 | cmpldi cr1,rLEN,32 /* Is there room for a third chunk? */ |
| 301 | std rCHR,16(rMEMP3) |
| 302 | std rCHR,24(rMEMP3) |
| 303 | nop |
| 304 | L(cacheAligned3): |
| 305 | blt cr1,L(handletail32) |
| 306 | addi rMEMP,rMEMP,32 |
| 307 | addi rLEN,rLEN,-32 /* Third 32-byte chunk; falls through to L(handletail32). */ |
| 308 | std rCHR,32(rMEMP3) |
| 309 | std rCHR,40(rMEMP3) |
| 310 | std rCHR,48(rMEMP3) |
| 311 | std rCHR,56(rMEMP3) |
| 312 | |
| 313 | /* We are here because the length or remainder (rLEN) is less than the |
| 314 | cache line/sector size and does not justify aggressive loop unrolling. |
| 315 | So set up the preconditions for L(medium) and go there. */ |
| 316 | .align 3 |
| 317 | L(handletail32): |
| 318 | cmpldi cr1,rLEN,0 |
| 319 | beqlr cr1 /* Nothing left: return with r3 unchanged. */ |
| 320 | b L(medium) /* cr7 is still valid: only multiples of 32 were subtracted since the mtcrf in L(caligned). */ |
| 321 | |
| 322 | .align 5 |
| 323 | L(small): |
| 324 | /* Memset of 8 bytes or less, one byte store at a time. */ |
| 325 | cmpldi cr6, rLEN, 4 |
| 326 | cmpldi cr5, rLEN, 1 |
| 327 | ble cr6,L(le4) |
| 328 | subi rLEN, rLEN, 4 /* 5..8 bytes: store four now, 1..4 remain. */ |
| 329 | stb rCHR,0(rMEMP) |
| 330 | stb rCHR,1(rMEMP) |
| 331 | stb rCHR,2(rMEMP) |
| 332 | stb rCHR,3(rMEMP) |
| 333 | addi rMEMP,rMEMP, 4 |
| 334 | cmpldi cr5, rLEN, 1 /* Redo the test for the reduced length. */ |
| 335 | L(le4): |
| 336 | cmpldi cr1, rLEN, 3 |
| 337 | bltlr cr5 /* Length 0: nothing to store. */ |
| 338 | stb rCHR, 0(rMEMP) |
| 339 | beqlr cr5 /* Length 1. */ |
| 340 | stb rCHR, 1(rMEMP) |
| 341 | bltlr cr1 /* Length 2. */ |
| 342 | stb rCHR, 2(rMEMP) |
| 343 | beqlr cr1 /* Length 3. */ |
| 344 | stb rCHR, 3(rMEMP) |
| 345 | blr |
| 346 | |
| 347 | /* Memset of 0-31 bytes. Expects cr7 to hold the low four bits of rLEN (set by mtcrf before branching here). */ |
| 348 | .align 5 |
| 349 | L(medium): |
| 350 | insrdi rCHR, rCHR, 32, 0 /* Replicate word to double word. */ |
| 351 | cmpldi cr1, rLEN, 16 |
| 352 | L(medium_tail2): |
| 353 | add rMEMP, rMEMP, rLEN /* Point just past the end; the stores run backward. */ |
| 354 | L(medium_tail): |
| 355 | bt- 31, L(medium_31t) /* cr7 bit 31: rLEN & 1. */ |
| 356 | bt- 30, L(medium_30t) /* cr7 bit 30: rLEN & 2. */ |
| 357 | L(medium_30f): |
| 358 | bt 29, L(medium_29t) /* cr7 bit 29: rLEN & 4. */ |
| 359 | L(medium_29f): |
| 360 | bge cr1, L(medium_27t) /* rLEN >= 16: store 16 bytes. */ |
| 361 | bflr 28 /* Return unless rLEN & 8. */ |
| 362 | std rCHR, -8(rMEMP) |
| 363 | blr |
| 364 | |
| 365 | L(medium_31t): |
| 366 | stbu rCHR, -1(rMEMP) /* Trailing byte; update forms move the pointer down. */ |
| 367 | bf- 30, L(medium_30f) |
| 368 | L(medium_30t): |
| 369 | sthu rCHR, -2(rMEMP) /* Trailing halfword. */ |
| 370 | bf- 29, L(medium_29f) |
| 371 | L(medium_29t): |
| 372 | stwu rCHR, -4(rMEMP) /* Trailing word. */ |
| 373 | blt cr1, L(medium_27f) |
| 374 | L(medium_27t): |
| 375 | std rCHR, -8(rMEMP) /* Sixteen bytes as two doublewords. */ |
| 376 | stdu rCHR, -16(rMEMP) |
| 377 | L(medium_27f): |
| 378 | bflr 28 /* Return unless rLEN & 8. */ |
| 379 | L(medium_28t): |
| 380 | std rCHR, -8(rMEMP) /* Final doubleword. */ |
| 381 | blr |
| 382 | END_GEN_TB (MEMSET,TB_TOCLESS) |
| 383 | libc_hidden_builtin_def (memset) |
| 384 | |