| 1 | /* Optimized memset implementation for PowerPC64. |
| 2 | Copyright (C) 1997-2024 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library; if not, see |
| 17 | <https://www.gnu.org/licenses/>. */ |
| 18 | |
| 19 | #include <sysdep.h> |
| 20 | |
| 21 | /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]); |
| 22 | Returns 's'. |
| 23 | |
| 24 | The memset is done in three sizes: byte (8 bits), word (32 bits), |
| 25 | cache line (1024 bits). There is a special case for setting cache lines |
| 26 | to 0, to take advantage of the dcbz instruction. */ |
| 27 | |
| 28 | .machine power4 /* Assemble for the POWER4 instruction set. */ |
| 29 | EALIGN (memset, 5, 0) |
| 30 | CALL_MCOUNT |
| 31 | /* memset entry: fills n bytes at s with the low byte of c. The register aliases below are used through END (memset); no instruction there writes r3, so it still holds 's' at every return. */ |
| 32 | #define rTMP r0 /* Scratch: 32-byte block count moved to CTR, and alignment test result. */ |
| 33 | #define rRTN r3 /* Initial value of 1st argument. */ |
| 34 | #define rMEMP0 r3 /* Original value of 1st arg. */ |
| 35 | #define rCHR r4 /* Char to set in each byte. */ |
| 36 | #define rLEN r5 /* Length of region to set. */ |
| 37 | #define rMEMP r6 /* Address at which we are storing. */ |
| 38 | #define rALIGN r7 /* Number of bytes we are setting now (when aligning). */ |
| 39 | #define rMEMP2 r8 /* Downward store cursor while aligning to 32 bytes. */ |
| 40 | /* rMEMP2, rNEG64 and rCLS all name r8; each is used in a different phase. */ |
| 41 | #define rNEG64 r8 /* Constant -64, the dcbtst offset in the 32-byte store loop. */ |
| 42 | #define rCLS r8 /* Cache line size (known to be 128). */ |
| 43 | #define rCLM r9 /* Cache line size mask to check for cache alignment. Not referenced by the code shown here. */ |
| 44 | L(_memset): |
| 45 | /* Take care of case for size <= 4. */ |
| 46 | cmplwi cr1, rLEN, 4 |
| 47 | andi. rALIGN, rMEMP0, 3 /* cr0.eq if s is already word aligned. */ |
| 48 | mr rMEMP, rMEMP0 /* Store through a copy; r3 stays the return value. */ |
| 49 | ble- cr1, L(small) |
| 50 | |
| 51 | /* Align to word boundary. */ |
| 52 | cmplwi cr5, rLEN, 31 /* Tested at L(aligned); compares the length before the alignment adjustment. */ |
| 53 | insrwi rCHR, rCHR, 8, 16 /* Replicate byte to halfword. */ |
| 54 | beq+ L(aligned) /* cr0 from the andi. above. */ |
| 55 | mtcrf 0x01, rMEMP0 /* cr7 = low 4 bits of the address. */ |
| 56 | subfic rALIGN, rALIGN, 4 /* Bytes needed to reach the word boundary. */ |
| 57 | add rMEMP, rMEMP, rALIGN /* rMEMP = first word-aligned address. */ |
| 58 | sub rLEN, rLEN, rALIGN |
| 59 | bf+ 31, L(g0) /* Even address: no leading single byte. */ |
| 60 | stb rCHR, 0(rMEMP0) |
| 61 | bt 30, L(aligned) /* Address was 3 mod 4: that one byte was enough. */ |
| 62 | L(g0): |
| 63 | sth rCHR, -2(rMEMP) /* Halfword ending at the word boundary. */ |
| 64 | |
| 65 | /* Handle the case of size <= 31 (cr5 was set from the length before word alignment). */ |
| 66 | L(aligned): |
| 67 | mtcrf 0x01, rLEN /* cr7 = low 4 bits of rLEN, used by L(medium) if the branch below is taken. */ |
| 68 | insrwi rCHR, rCHR, 16, 0 /* Replicate halfword to word. */ |
| 69 | ble cr5, L(medium) |
| 70 | /* Align to 32-byte boundary. */ |
| 71 | andi. rALIGN, rMEMP, 0x1C /* Byte offset of rMEMP within its 32-byte block. */ |
| 72 | subfic rALIGN, rALIGN, 0x20 /* Bytes up to the next 32-byte boundary. */ |
| 73 | beq L(caligned) /* Offset was 0 (cr0 from andi.): already aligned. */ |
| 74 | mtcrf 0x01, rALIGN /* cr7 bits 28 and 29 = (rALIGN & 8) and (rALIGN & 4). */ |
| 75 | add rMEMP, rMEMP, rALIGN /* rMEMP = 32-byte aligned address. */ |
| 76 | sub rLEN, rLEN, rALIGN |
| 77 | cmplwi cr1, rALIGN, 0x10 |
| 78 | mr rMEMP2, rMEMP /* Fill downward from the boundary through rMEMP2. */ |
| 79 | bf 28, L(a1) /* Skip the 8-byte piece if (rALIGN & 8) is clear. */ |
| 80 | stw rCHR, -4(rMEMP2) |
| 81 | stwu rCHR, -8(rMEMP2) /* 8 bytes stored; rMEMP2 -= 8. */ |
| 82 | L(a1): blt cr1, L(a2) |
| 83 | stw rCHR, -4(rMEMP2) |
| 84 | stw rCHR, -8(rMEMP2) |
| 85 | stw rCHR, -12(rMEMP2) |
| 86 | stwu rCHR, -16(rMEMP2) /* 16 bytes stored; rMEMP2 -= 16. */ |
| 87 | L(a2): bf 29, L(caligned) /* Skip the last word if (rALIGN & 4) is clear. */ |
| 88 | stw rCHR, -4(rMEMP2) |
| 89 | |
| 90 | /* Now aligned to a 32 byte boundary. */ |
| 91 | L(caligned): |
| 92 | cmplwi cr1, rCHR, 0 /* Is the replicated fill word zero? */ |
| 93 | clrrwi. rALIGN, rLEN, 5 /* rALIGN = rLEN rounded down to a multiple of 32; cr0.eq if that is 0. */ |
| 94 | mtcrf 0x01, rLEN /* cr7 = low 4 bits of rLEN for the tail code. */ |
| 95 | beq cr1, L(zloopstart) /* Special case for clearing memory using dcbz. */ |
| 96 | L(nondcbz): |
| 97 | srwi rTMP, rALIGN, 5 /* Number of full 32-byte blocks. */ |
| 98 | mtctr rTMP |
| 99 | beq L(medium) /* We may not actually get to do a full line (cr0 from the preceding clrrwi.). */ |
| 100 | clrlwi. rLEN, rLEN, 27 /* rLEN = rLEN & 31, the tail bytes; cr0.eq if there are none. */ |
| 101 | add rMEMP, rMEMP, rALIGN /* Point past the last full block; the stores go downward. */ |
| 102 | li rNEG64, -0x40 |
| 103 | bdz L(cloopdone) /* Exactly one block: skip the loop. */ |
| 104 | |
| 105 | .align 4 |
| 106 | L(c3): dcbtst rNEG64, rMEMP /* Touch-for-store hint at rMEMP - 64. */ |
| 107 | stw rCHR, -4(rMEMP) |
| 108 | stw rCHR, -8(rMEMP) |
| 109 | stw rCHR, -12(rMEMP) |
| 110 | stw rCHR, -16(rMEMP) |
| 111 | stw rCHR, -20(rMEMP) |
| 112 | stw rCHR, -24(rMEMP) |
| 113 | stw rCHR, -28(rMEMP) |
| 114 | stwu rCHR, -32(rMEMP) /* 32 bytes stored; rMEMP -= 32. */ |
| 115 | bdnz L(c3) |
| 116 | L(cloopdone): |
| 117 | stw rCHR, -4(rMEMP) |
| 118 | stw rCHR, -8(rMEMP) |
| 119 | stw rCHR, -12(rMEMP) |
| 120 | stw rCHR, -16(rMEMP) |
| 121 | cmplwi cr1, rLEN, 16 /* Sets cr1 for L(medium_tail2). */ |
| 122 | stw rCHR, -20(rMEMP) |
| 123 | stw rCHR, -24(rMEMP) |
| 124 | stw rCHR, -28(rMEMP) |
| 125 | stwu rCHR, -32(rMEMP) |
| 126 | beqlr /* No tail bytes (cr0 from the clrlwi. above): done. */ |
| 127 | add rMEMP, rMEMP, rALIGN /* rMEMP has dropped by rALIGN in total; move back to the end of the filled blocks. */ |
| 128 | b L(medium_tail2) |
| 129 | |
| 130 | .align 5 |
| 131 | /* Clear lines of memory in 128-byte chunks. */ |
| 132 | L(zloopstart): |
| 133 | /* If the remaining length is less than 32 bytes, don't bother getting |
| 134 | the cache line size. */ |
| 135 | beq L(medium) /* cr0 from the clrrwi. at L(caligned). */ |
| 136 | li rCLS,128 /* cache line size is 128 */ |
| 137 | dcbt 0,rMEMP |
| 138 | L(getCacheAligned): |
| 139 | cmplwi cr1,rLEN,32 |
| 140 | andi. rTMP,rMEMP,127 /* cr0.eq if rMEMP is 128-byte aligned. */ |
| 141 | blt cr1,L(handletail32) /* Fewer than 32 bytes left. */ |
| 142 | beq L(cacheAligned) |
| 143 | addi rMEMP,rMEMP,32 /* Otherwise store one 32-byte block of rCHR (zero on this path) and retest. */ |
| 144 | addi rLEN,rLEN,-32 |
| 145 | stw rCHR,-32(rMEMP) |
| 146 | stw rCHR,-28(rMEMP) |
| 147 | stw rCHR,-24(rMEMP) |
| 148 | stw rCHR,-20(rMEMP) |
| 149 | stw rCHR,-16(rMEMP) |
| 150 | stw rCHR,-12(rMEMP) |
| 151 | stw rCHR,-8(rMEMP) |
| 152 | stw rCHR,-4(rMEMP) |
| 153 | b L(getCacheAligned) |
| 154 | |
| 155 | /* Now we are aligned to the cache line and can use dcbz. */ |
| 156 | .align 4 |
| 157 | L(cacheAligned): |
| 158 | cmplw cr1,rLEN,rCLS |
| 159 | blt cr1,L(handletail32) /* Less than a full cache line left. */ |
| 160 | dcbz 0,rMEMP /* Zero the cache line at rMEMP. */ |
| 161 | subf rLEN,rCLS,rLEN /* rLEN -= rCLS. */ |
| 162 | add rMEMP,rMEMP,rCLS /* rMEMP += rCLS. */ |
| 163 | b L(cacheAligned) |
| 164 | |
| 165 | /* We are here because the cache line size was set and the remainder |
| 166 | (rLEN) is less than the actual cache line size, or fewer than 32 bytes remained in L(getCacheAligned). |
| 167 | So set up the preconditions for L(nondcbz) and go there. */ |
| 168 | L(handletail32): |
| 169 | clrrwi. rALIGN, rLEN, 5 /* Recompute rALIGN and cr0 from the reduced rLEN; rLEN only dropped by multiples of 32, so cr7 is still valid. */ |
| 170 | b L(nondcbz) |
| 171 | |
| 172 | .align 5 |
| 173 | L(small): |
| 174 | /* Memset of 4 bytes or less. Stores one byte at a time and returns as soon as rLEN bytes are written. */ |
| 175 | cmplwi cr5, rLEN, 1 |
| 176 | cmplwi cr1, rLEN, 3 |
| 177 | bltlr cr5 /* rLEN == 0: nothing to store. */ |
| 178 | stb rCHR, 0(rMEMP) |
| 179 | beqlr cr5 /* rLEN == 1. */ |
| 180 | stb rCHR, 1(rMEMP) |
| 181 | bltlr cr1 /* rLEN == 2. */ |
| 182 | stb rCHR, 2(rMEMP) |
| 183 | beqlr cr1 /* rLEN == 3. */ |
| 184 | stb rCHR, 3(rMEMP) |
| 185 | blr /* rLEN == 4. */ |
| 186 | |
| 187 | /* Memset of 0-31 bytes. Expects cr7 = low 4 bits of rLEN; L(medium_tail2) also expects cr1 = rLEN compared with 16. Fills downward from rMEMP + rLEN. */ |
| 188 | .align 5 |
| 189 | L(medium): |
| 190 | cmplwi cr1, rLEN, 16 |
| 191 | L(medium_tail2): |
| 192 | add rMEMP, rMEMP, rLEN /* rMEMP = end of the region. */ |
| 193 | L(medium_tail): |
| 194 | bt- 31, L(medium_31t) /* (rLEN & 1) set: store one byte. */ |
| 195 | bt- 30, L(medium_30t) /* (rLEN & 2) set: store a halfword. */ |
| 196 | L(medium_30f): |
| 197 | bt- 29, L(medium_29t) /* (rLEN & 4) set: store a word. */ |
| 198 | L(medium_29f): |
| 199 | bge- cr1, L(medium_27t) /* rLEN >= 16: store 16 bytes. */ |
| 200 | bflr- 28 /* (rLEN & 8) clear: done. */ |
| 201 | stw rCHR, -4(rMEMP) |
| 202 | stw rCHR, -8(rMEMP) |
| 203 | blr |
| 204 | |
| 205 | L(medium_31t): |
| 206 | stbu rCHR, -1(rMEMP) /* One byte stored; rMEMP -= 1. */ |
| 207 | bf- 30, L(medium_30f) |
| 208 | L(medium_30t): |
| 209 | sthu rCHR, -2(rMEMP) /* Halfword stored; rMEMP -= 2. */ |
| 210 | bf- 29, L(medium_29f) |
| 211 | L(medium_29t): |
| 212 | stwu rCHR, -4(rMEMP) /* Word stored; rMEMP -= 4. */ |
| 213 | blt- cr1, L(medium_27f) |
| 214 | L(medium_27t): |
| 215 | stw rCHR, -4(rMEMP) |
| 216 | stw rCHR, -8(rMEMP) |
| 217 | stw rCHR, -12(rMEMP) |
| 218 | stwu rCHR, -16(rMEMP) /* 16 bytes stored; rMEMP -= 16. */ |
| 219 | L(medium_27f): |
| 220 | bflr- 28 /* (rLEN & 8) clear: done. */ |
| 221 | L(medium_28t): |
| 222 | stw rCHR, -4(rMEMP) |
| 223 | stw rCHR, -8(rMEMP) |
| 224 | blr |
| 225 | END (memset) |
| 226 | libc_hidden_builtin_def (memset) |
| 227 | |