| 1 | /* Optimized memset for PowerPC476 (128-byte cacheline). |
| 2 | Copyright (C) 2010-2024 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library. If not, see |
| 17 | <https://www.gnu.org/licenses/>. */ |
| 18 | |
| 19 | #include <sysdep.h> |
| 20 | |
| 21 | /* memset |
| 22 | |
| 23 | r3:destination address and return address |
| 24 | r4:source integer to copy |
| 25 | r5:byte count |
| 26 | r11:fill byte replicated into all 4 bytes (32 bits) of reg |
| 27 | r12:temp return address |
| 28 | |
| 29 | Save return address in r12 |
| 30 | If destination is unaligned and count is greater than 255 bytes |
| 31 | set 0-3 bytes to make destination aligned |
| 32 | If count is greater than 255 bytes and setting zero to memory |
| 33 | use dcbz to set memory when we can |
| 34 | otherwise do the following |
| 35 | If 16 or more bytes to set we use the 4-word (16-byte) store loop. |
| 36 | Finally we set 0-15 extra bytes with string store. */ |
| 37 | |
| 38 | EALIGN (memset, 5, 0) /* Entry point of memset. EALIGN is not defined in this file (presumably supplied by <sysdep.h>). */ |
| 39 | rlwinm r11,r4,0,24,31 /* r11 = r4 & 0xff: keep only the fill byte. */ |
| 40 | rlwimi r11,r4,8,16,23 /* Insert the fill byte into bits 16-23: low halfword of r11 is now the byte twice. */ |
| 41 | rlwimi r11,r11,16,0,15 /* Copy the low halfword into the high halfword: r11 = fill byte x 4. */ |
| 42 | addi r12,r3,0 /* r12 = original destination, restored into r3 at L(end_memset). */ |
| 43 | cmpwi r5,0x00FF /* Signed compare of the byte count against 255. */ |
| 44 | ble L(preword8_count_loop) /* Count <= 255: no alignment step and no dcbz, go straight to the 16-byte store loop. */ |
| 45 | cmpwi r4,0x00 /* Tests the whole of r4 (not only its low byte) against zero. */ |
| 46 | beq L(use_dcbz) /* Large zero fill: take the dcbz path. */ |
| 47 | neg r6,r3 |
| 48 | clrlwi. r6,r6,30 /* r6 = (-dest) & 3 = bytes needed to reach a 4-byte boundary; sets CR0. */ |
| 49 | beq L(preword8_count_loop) /* Already word aligned. */ |
| 50 | addi r8,0,1 /* r8 = 1, the per-byte decrement used by subf. below. */ |
| 51 | mtctr r6 /* CTR = number of leading bytes to store (1-3). */ |
| 52 | subi r3,r3,1 /* Bias the pointer by -1 because stbu stores at r3+1 and then updates r3. */ |
| 53 | /* Store 1-3 single bytes until the destination is word aligned. */ |
| 54 | L(unaligned_bytecopy_loop): |
| 55 | stbu r11,0x1(r3) /* Store one fill byte at r3+1 and advance r3. */ |
| 56 | subf. r5,r8,r5 /* r5 = r5 - 1; sets CR0. */ |
| 57 | beq L(end_memset) /* Finish if the count reached zero. */ |
| 58 | bdnz L(unaligned_bytecopy_loop) |
| 59 | addi r3,r3,1 /* Undo the -1 bias: r3 points at the next byte to write. */ |
| 60 | /* Main store path: 16 bytes per iteration using four update-form word stores. */ |
| 61 | L(preword8_count_loop): |
| 62 | srwi. r6,r5,4 /* r6 = count / 16 = number of 16-byte chunks; sets CR0. */ |
| 63 | beq L(preword2_count_loop) /* Fewer than 16 bytes: only the tail remains. */ |
| 64 | mtctr r6 |
| 65 | addi r3,r3,-4 /* Bias the pointer by -4 because stwu stores at r3+4 and then updates r3. */ |
| 66 | mr r8,r11 /* r8-r10 get copies of the fill word; the loop stores from r8, r9, r10 and r11. */ |
| 67 | mr r9,r11 |
| 68 | mr r10,r11 |
| 69 | /* Each iteration writes 16 bytes and subtracts 16 from the remaining count. */ |
| 70 | L(word8_count_loop_no_dcbt): |
| 71 | stwu r8,4(r3) |
| 72 | stwu r9,4(r3) |
| 73 | subi r5,r5,0x10 /* Remaining count -= 16 (placed between the stores). */ |
| 74 | stwu r10,4(r3) |
| 75 | stwu r11,4(r3) |
| 76 | bdnz L(word8_count_loop_no_dcbt) |
| 77 | addi r3,r3,4 /* Undo the -4 bias: r3 points at the next byte to write. */ |
| 78 | /* Tail: store the last 0-15 bytes with one string store. */ |
| 79 | L(preword2_count_loop): |
| 80 | clrlwi. r7,r5,28 /* r7 = count & 15 = leftover bytes; sets CR0. */ |
| 81 | beq L(end_memset) /* Nothing left to store. */ |
| 82 | mr r8,r11 /* Reload r8-r10: this label is also reached without the setup at the loop above (r8 may hold 1 from the byte loop). */ |
| 83 | mr r9,r11 |
| 84 | mr r10,r11 |
| 85 | mtxer r7 /* stswx takes its byte count from XER. */ |
| 86 | stswx r8,0,r3 /* Store r7 bytes from r8,r9,r10,r11 in sequence at address r3. */ |
| 87 | /* Common exit: return the original destination pointer. */ |
| 88 | L(end_memset): |
| 89 | addi r3,r12,0 /* r3 = saved destination (return value). */ |
| 90 | blr |
| 91 | /* Zero-fill path for count > 255: align to 16 bytes, then to 128 bytes, then clear 128-byte blocks with dcbz. r11 is zero here because r4 is zero. */ |
| 92 | L(use_dcbz): |
| 93 | neg r6,r3 /* r6 = -dest; its low bits give the distance to the next aligned boundary. */ |
| 94 | clrlwi. r7,r6,28 /* r7 = (-dest) & 15 = bytes needed to reach a 16-byte boundary; sets CR0. */ |
| 95 | beq L(skip_string_loop) /* Already 16-byte aligned. */ |
| 96 | mr r8,r11 |
| 97 | mr r9,r11 |
| 98 | mr r10,r11 |
| 99 | subf r5,r7,r5 /* Remaining count -= r7. */ |
| 100 | mtxer r7 /* Byte count for the string store. */ |
| 101 | stswx r8,0,r3 /* Store the 1-15 leading bytes at r3. */ |
| 102 | add r3,r3,r7 /* Advance to the 16-byte boundary. */ |
| 103 | /* Store whole 16-byte chunks until the destination is 128-byte aligned. */ |
| 104 | L(skip_string_loop): |
| 105 | clrlwi r8,r6,25 /* r8 = (-dest) & 127 = bytes from the original destination to a 128-byte boundary. */ |
| 106 | srwi. r8,r8,4 /* r8 = number of whole 16-byte chunks in that distance (the sub-16 part was stored above); sets CR0. */ |
| 107 | beq L(dcbz_pre_loop) /* Already 128-byte aligned. */ |
| 108 | mtctr r8 |
| 109 | /* 16 bytes per iteration using four word stores at fixed offsets. */ |
| 110 | L(word_loop): |
| 111 | stw r11,0(r3) |
| 112 | subi r5,r5,0x10 /* Remaining count -= 16. */ |
| 113 | stw r11,4(r3) |
| 114 | stw r11,8(r3) |
| 115 | stw r11,12(r3) |
| 116 | addi r3,r3,0x10 /* Advance the destination by 16. */ |
| 117 | bdnz L(word_loop) |
| 118 | /* Count was > 255 on entry and the alignment steps above consume at most 15 + 112 bytes, so at least one 128-byte block remains and CTR is non-zero. */ |
| 119 | L(dcbz_pre_loop): |
| 120 | srwi r6,r5,7 /* r6 = remaining count / 128 = number of 128-byte blocks. */ |
| 121 | mtctr r6 |
| 122 | addi r7,0,0 /* r7 = 0, used as the index register of dcbz. */ |
| 123 | /* Clear one block per iteration; the code steps by 0x80, i.e. it relies on dcbz clearing 128 bytes. */ |
| 124 | L(dcbz_loop): |
| 125 | dcbz r3,r7 /* Zero the cache block at address r3 + r7 (r7 is 0). */ |
| 126 | addi r3,r3,0x80 /* Advance the destination by 128. */ |
| 127 | subi r5,r5,0x80 /* Remaining count -= 128. */ |
| 128 | bdnz L(dcbz_loop) |
| 129 | srwi. r6,r5,4 /* r6 = number of 16-byte chunks left after the dcbz blocks; sets CR0. */ |
| 130 | beq L(postword2_count_loop) /* Fewer than 16 bytes left. */ |
| 131 | mtctr r6 |
| 132 | /* Store the remaining whole 16-byte chunks (same shape as L(word_loop)). */ |
| 133 | L(postword8_count_loop): |
| 134 | stw r11,0(r3) |
| 135 | subi r5,r5,0x10 /* Remaining count -= 16. */ |
| 136 | stw r11,4(r3) |
| 137 | stw r11,8(r3) |
| 138 | stw r11,12(r3) |
| 139 | addi r3,r3,0x10 /* Advance the destination by 16. */ |
| 140 | bdnz L(postword8_count_loop) |
| 141 | /* Tail of the dcbz path: store the last 0-15 bytes with one string store. */ |
| 142 | L(postword2_count_loop): |
| 143 | clrlwi. r7,r5,28 /* r7 = count & 15 = leftover bytes; sets CR0. */ |
| 144 | beq L(end_memset) /* Nothing left to store. */ |
| 145 | mr r8,r11 /* Load r8-r10 with the fill word for the string store. */ |
| 146 | mr r9,r11 |
| 147 | mr r10,r11 |
| 148 | mtxer r7 /* Byte count for the string store. */ |
| 149 | stswx r8,0,r3 /* Store r7 bytes from r8,r9,r10,r11 in sequence at address r3. */ |
| 150 | b L(end_memset) /* Return through the common exit. */ |
| 151 | END (memset) /* Closes the function opened by EALIGN. */ |
| 152 | libc_hidden_builtin_def (memset) /* Macro defined outside this file; presumably provides the hidden alias for calls from inside libc - confirm. */ |
| 153 | |