/* Optimized memcpy implementation for PowerPC32 on PowerPC64.
   Copyright (C) 2003-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination word (4-byte) aligned.  Further optimization is
   possible when both source and destination are word aligned.
   Each case has an optimized unrolled loop.  */
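
/* For reference, the overall flow described above can be sketched in C
   roughly as follows.  This is only an illustrative outline (the
   function name is invented and the word loops are simplified); it is
   not the code this assembly implements instruction for instruction.

     #include <stddef.h>
     #include <stdint.h>

     void *
     memcpy_outline (void *dst, const void *src, size_t len)
     {
       unsigned char *d = dst;
       const unsigned char *s = src;

       if (len < 32)
         {
           // Short copy: an unrolled sequence of word/halfword/byte
           // moves selected by the bits of len; no loop.
           while (len-- != 0)
             *d++ = *s++;
           return dst;
         }

       // Long copy: move 0-3 bytes so the destination is word aligned.
       while (((uintptr_t) d & 3) != 0)
         {
           *d++ = *s++;
           len--;
         }

       // Copy whole words (a shift-and-merge variant handles a source
       // that is still unaligned), then the 0-3 tail bytes.
       for (; len >= 4; len -= 4, d += 4, s += 4)
         __builtin_memcpy (d, s, 4);
       while (len-- != 0)
         *d++ = *s++;
       return dst;
     }  */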

        .machine power4
EALIGN (memcpy, 5, 0)
        CALL_MCOUNT

        stwu    1,-32(1)
        cfi_adjust_cfa_offset(32)
        stw     30,20(1)
        cfi_offset(30,(20-32))
        mr      30,3
        cmplwi  cr1,5,31
        stw     31,24(1)
        cfi_offset(31,(24-32))
        neg     0,3
        andi.   11,3,3   /* check alignment of dst.  */
        clrlwi  0,0,30   /* Number of bytes until the 1st word of dst.  */
        clrlwi  10,4,30  /* check alignment of src.  */
        cmplwi  cr6,5,8
        ble-    cr1,.L2  /* If move < 32 bytes use short move code.  */
        cmplw   cr6,10,11
        mr      12,4
        srwi    9,5,2    /* Number of full words remaining.  */
        mtcrf   0x01,0
        mr      31,5
        beq     .L0

        subf    31,0,5
        /* Move 0-3 bytes as needed to get the destination word aligned.  */
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,0f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
0:
        clrlwi  10,12,30 /* check alignment of src again.  */
        srwi    9,31,2   /* Number of full words remaining.  */

/* Copy words from source to destination, assuming the destination is
   aligned on a word boundary.

   At this point we know there are at least 29 bytes left (32-3) to copy.
   The next step is to determine if the source is also word aligned.
   If not, branch to the unaligned move code at .L6, which uses
   a load, shift, store strategy.

   Otherwise source and destination are word aligned, and we can use
   the optimized word copy loop.  */
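
/* In C terms, the cr6 test used at .L0 below was computed from the
   original pointers (before the 0-3 byte prologue above), and is
   roughly the following illustrative sketch; since the same 0-3 bytes
   were removed from both pointers, equal residues mean both are now
   word aligned:

     if (((uintptr_t) src & 3) == ((uintptr_t) dst & 3))
       ;                            // aligned word-copy loop below
     else
       goto unaligned_copy;         // the shift-and-merge code at .L6
 */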
.L0:
        clrlwi  11,31,30 /* calculate the number of tail bytes */
        mtcrf   0x01,9
        bne-    cr6,.L6  /* If source is not word aligned.  */

/* Move words where destination and source are word aligned.
   Use an unrolled loop to copy 4 words (16-bytes) per iteration.
   If the copy is not an exact multiple of 16 bytes, 1-3
   words are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-3 bytes.  These bytes are
   copied a halfword/byte at a time as needed to preserve alignment.  */
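
/* An illustrative C sketch of this aligned case (the helper name is
   invented and the prologue/tail handling is simplified relative to
   the code below):

     #include <stddef.h>
     #include <stdint.h>

     // Both pointers are 4-byte aligned on entry.
     static void
     copy_aligned (uint32_t *d, const uint32_t *s, size_t len)
     {
       size_t words = len / 4;

       // Peel off 1-3 words so the main loop can move exactly
       // 4 words (16 bytes) per iteration.
       for (; (words & 3) != 0; words--)
         *d++ = *s++;

       for (; words != 0; words -= 4)
         {
           d[0] = s[0];
           d[1] = s[1];
           d[2] = s[2];
           d[3] = s[3];
           d += 4;
           s += 4;
         }
       // The remaining 0-3 tail bytes are copied with halfword/byte
       // stores, as the code below does.
     }  */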

        srwi    8,31,4   /* calculate the 16 byte loop count */
        cmplwi  cr1,9,4
        cmplwi  cr6,11,0
        mr      11,12

        bf      30,1f
        lwz     6,0(12)
        lwz     7,4(12)
        addi    11,12,8
        mtctr   8
        stw     6,0(3)
        stw     7,4(3)
        addi    10,3,8
        bf      31,4f
        lwz     0,8(12)
        stw     0,8(3)
        blt     cr1,3f
        addi    11,12,12
        addi    10,3,12
        b       4f
        .align 4
1:
        mr      10,3
        mtctr   8
        bf      31,4f
        lwz     6,0(12)
        addi    11,12,4
        stw     6,0(3)
        addi    10,3,4

        .align 4
4:
        lwz     6,0(11)
        lwz     7,4(11)
        lwz     8,8(11)
        lwz     0,12(11)
        stw     6,0(10)
        stw     7,4(10)
        stw     8,8(10)
        stw     0,12(10)
        addi    11,11,16
        addi    10,10,16
        bdnz    4b
3:
        clrrwi  0,31,2
        mtcrf   0x01,31
        beq     cr6,0f
.L9:
        add     3,3,0
        add     12,12,0

/* At this point we have a tail of 0-3 bytes and we know that the
   destination is word aligned.  */
2:      bf      30,1f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,0f
        lbz     6,0(12)
        stb     6,0(3)
0:
        /* Return original dst pointer.  */
        mr      3,30
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries, it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  While the destination and stores may
   still be unaligned, this is only an issue for page (4096-byte
   boundary) crossing, which should be rare for these short moves.
   The hardware handles this case automatically with a small delay.  */
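
/* An illustrative C sketch of the "binary test" decomposition used for
   these short copies (the helper name is invented; fixed-size memcpy
   calls stand in for the unrolled lwz/stw, lhz/sth and lbz/stb pairs,
   which may be unaligned here):

     #include <stddef.h>
     #include <string.h>

     // 0 <= len <= 31: test each bit of len, largest first, so no
     // loop is needed.
     static void
     copy_short (unsigned char *d, const unsigned char *s, size_t len)
     {
       if (len & 16) { memcpy (d, s, 16); d += 16; s += 16; }
       if (len & 8)  { memcpy (d, s, 8);  d += 8;  s += 8;  }
       if (len & 4)  { memcpy (d, s, 4);  d += 4;  s += 4;  }
       if (len & 2)  { memcpy (d, s, 2);  d += 2;  s += 2;  }
       if (len & 1)  *d = *s;
     }  */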

        .align 4
.L2:
        mtcrf   0x01,5
        neg     8,4
        clrrwi  11,4,2
        andi.   0,8,3
        ble     cr6,.LE8 /* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
        cmplwi  cr1,5,16
        mr      10,5
        mr      12,4
        cmplwi  cr6,0,2
        beq     .L3      /* If the source is already word aligned, skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
        lwz     6,0(11)
        subf    10,0,5
        add     12,4,0
        blt     cr6,5f
        srwi    7,6,16
        bgt     cr6,3f
#ifdef __LITTLE_ENDIAN__
        sth     7,0(3)
#else
        sth     6,0(3)
#endif
        b       7f
        .align 4
3:
#ifdef __LITTLE_ENDIAN__
        rotlwi  6,6,24
        stb     6,0(3)
        sth     7,1(3)
#else
        stb     7,0(3)
        sth     6,1(3)
#endif
        b       7f
        .align 4
5:
#ifdef __LITTLE_ENDIAN__
        rotlwi  6,6,8
#endif
        stb     6,0(3)
7:
        cmplwi  cr1,10,16
        add     3,3,0
        mtcrf   0x01,10
        .align 4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
        blt     cr1,8f
16:     /* Move 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     6,8(12)
        stw     7,4(3)
        lwz     7,12(12)
        addi    12,12,16
        stw     6,8(3)
        stw     7,12(3)
        addi    3,3,16
8:      /* Move 8 bytes.  */
        bf      28,4f
        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Move 4 bytes.  */
        bf      29,2f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Move 2-3 bytes.  */
        bf      30,1f
        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr
1:      /* Move 1 byte.  */
        bf      31,0f
        lbz     6,0(12)
        stb     6,0(3)
0:
        /* Return original dst pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

/* Special case to copy 0-8 bytes.  */
        .align 4
.LE8:
        mr      12,4
        bne     cr6,4f
        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)
        /* Return original dst pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr
        .align 4
4:      bf      29,2b
        lwz     6,0(4)
        stw     6,0(3)
6:
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr
        .align 4
5:
        bf      31,0f
        lbz     6,4(4)
        stb     6,4(3)
        .align 4
0:
        /* Return original dst pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align 4
.L6:

/* Copy words where the destination is aligned but the source is
   not.  Use aligned word loads from the source, shifted to realign
   the data, to allow aligned destination stores.
   Use an unrolled loop to copy 4 words (16-bytes) per iteration.
   A single word is retained for storing at loop exit to avoid walking
   off the end of a page within the loop.
   If the copy is not an exact multiple of 16 bytes, 1-3
   words are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-3 bytes.  These bytes are
   copied a halfword/byte at a time as needed to preserve alignment.  */
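
/* An illustrative C sketch of the load/shift/store technique (the
   big-endian merge is shown; the __LITTLE_ENDIAN__ branches below
   simply swap the two shift directions).  The helper name is invented
   and the setup/tail handling is simplified:

     #include <stddef.h>
     #include <stdint.h>

     // 'dst' is word aligned; 'src' is not (off is 1-3).  Only
     // aligned words are loaded, and the last word loaded is the one
     // holding the final source byte needed, so the loop does not
     // read past the source data it has to copy.
     static void
     copy_shifted (uint32_t *dst, const unsigned char *src,
                   size_t nwords)
     {
       unsigned int off = (uintptr_t) src & 3;
       const uint32_t *s = (const uint32_t *) (src - off);
       unsigned int lsh = off * 8;       // bits dropped from 1st word
       unsigned int rsh = 32 - lsh;      // bits taken from next word

       uint32_t cur = s[0];
       for (size_t i = 0; i < nwords; i++)
         {
           uint32_t next = s[i + 1];
           dst[i] = (cur << lsh) | (next >> rsh);   // big-endian merge
           cur = next;
         }
     }  */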


        cmplwi  cr6,11,0 /* are there tail bytes left? */
        subf    5,10,12  /* back up src pointer to prev word alignment */
        slwi    10,10,3  /* calculate number of bits to shift 1st word left */
        addi    11,9,-1  /* we move one word after the loop */
        srwi    8,11,2   /* calculate the 16 byte loop count */
        lwz     6,0(5)   /* load 1st src word into R6 */
        mr      4,3
        lwz     7,4(5)   /* load 2nd src word into R7 */
        mtcrf   0x01,11
        subfic  9,10,32  /* number of bits to shift 2nd word right */
        mtctr   8
        bf      30,1f

        /* there are at least two words to copy, so copy them */
#ifdef __LITTLE_ENDIAN__
        srw     0,6,10
        slw     8,7,9
#else
        slw     0,6,10   /* shift 1st src word to left align it in R0 */
        srw     8,7,9    /* shift 2nd src word to right align it in R8 */
#endif
        or      0,0,8    /* or them to get word to store */
        lwz     6,8(5)   /* load the 3rd src word */
        stw     0,0(4)   /* store the 1st dst word */
#ifdef __LITTLE_ENDIAN__
        srw     0,7,10
        slw     8,6,9
#else
        slw     0,7,10   /* now left align 2nd src word into R0 */
        srw     8,6,9    /* shift 3rd src word to right align it in R8 */
#endif
        or      0,0,8    /* or them to get word to store */
        lwz     7,12(5)
        stw     0,4(4)   /* store the 2nd dst word */
        addi    4,4,8
        addi    5,5,16
        bf      31,4f
        /* there is a third word to copy, so copy it */
#ifdef __LITTLE_ENDIAN__
        srw     0,6,10
        slw     8,7,9
#else
        slw     0,6,10   /* shift 3rd src word to left align it in R0 */
        srw     8,7,9    /* shift 4th src word to right align it in R8 */
#endif
        or      0,0,8    /* or them to get word to store */
        stw     0,0(4)   /* store 3rd dst word */
        mr      6,7
        lwz     7,0(5)
        addi    5,5,4
        addi    4,4,4
        b       4f
        .align 4
1:
#ifdef __LITTLE_ENDIAN__
        srw     0,6,10
        slw     8,7,9
#else
        slw     0,6,10   /* shift 1st src word to left align it in R0 */
        srw     8,7,9    /* shift 2nd src word to right align it in R8 */
#endif
        addi    5,5,8
        or      0,0,8    /* or them to get word to store */
        bf      31,4f
        mr      6,7
        lwz     7,0(5)
        addi    5,5,4
        stw     0,0(4)   /* store the 1st dst word */
        addi    4,4,4

        .align 4
4:
        /* copy 16 bytes at a time */
#ifdef __LITTLE_ENDIAN__
        srw     0,6,10
        slw     8,7,9
#else
        slw     0,6,10
        srw     8,7,9
#endif
        or      0,0,8
        lwz     6,0(5)
        stw     0,0(4)
#ifdef __LITTLE_ENDIAN__
        srw     0,7,10
        slw     8,6,9
#else
        slw     0,7,10
        srw     8,6,9
#endif
        or      0,0,8
        lwz     7,4(5)
        stw     0,4(4)
#ifdef __LITTLE_ENDIAN__
        srw     0,6,10
        slw     8,7,9
#else
        slw     0,6,10
        srw     8,7,9
#endif
        or      0,0,8
        lwz     6,8(5)
        stw     0,8(4)
#ifdef __LITTLE_ENDIAN__
        srw     0,7,10
        slw     8,6,9
#else
        slw     0,7,10
        srw     8,6,9
#endif
        or      0,0,8
        lwz     7,12(5)
        stw     0,12(4)
        addi    5,5,16
        addi    4,4,16
        bdnz+   4b
8:
        /* calculate and store the final word */
#ifdef __LITTLE_ENDIAN__
        srw     0,6,10
        slw     8,7,9
#else
        slw     0,6,10
        srw     8,7,9
#endif
        or      0,0,8
        stw     0,0(4)
3:
        clrrwi  0,31,2
        mtcrf   0x01,31
        bne     cr6,.L9  /* Copy the tail bytes if any; a zero tail means we are done.  */

        /* Return original dst pointer.  */
        mr      3,30
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr
END (memcpy)

libc_hidden_builtin_def (memcpy)