1 | /* memchr (str, chr, len) -- Return pointer to first occurrence of CHR in STR |
2 | less than LEN. For Intel 80x86, x>=3. |
3 | Copyright (C) 1994-2024 Free Software Foundation, Inc. |
4 | This file is part of the GNU C Library. |
5 | |
6 | The GNU C Library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Lesser General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2.1 of the License, or (at your option) any later version. |
10 | |
11 | The GNU C Library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Lesser General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Lesser General Public |
17 | License along with the GNU C Library; if not, see |
18 | <https://www.gnu.org/licenses/>. */ |
19 | |
20 | #include <sysdep.h> |
21 | #include "asm-syntax.h" |
22 | |
23 | #define PARMS 4+8 /* space for 2 saved regs */ |
24 | #define RTN PARMS |
25 | #define STR RTN |
26 | #define CHR STR+4 |
27 | #define LEN CHR+4 |
28 | |
29 | .text |
30 | ENTRY (__memchr) |
31 | |
32 | /* Save callee-safe registers used in this function. */ |
33 | pushl %esi |
34 | cfi_adjust_cfa_offset (4) |
35 | pushl %edi |
36 | cfi_adjust_cfa_offset (4) |
37 | cfi_rel_offset (edi, 0) |
38 | |
39 | /* Load parameters into registers. */ |
40 | movl STR(%esp), %eax /* str: pointer to memory block. */ |
41 | movl CHR(%esp), %edx /* c: byte we are looking for. */ |
42 | movl LEN(%esp), %esi /* len: length of memory block. */ |
43 | cfi_rel_offset (esi, 4) |
44 | |
45 | /* If my must not test more than three characters test |
46 | them one by one. This is especially true for 0. */ |
47 | cmpl $4, %esi |
48 | jb L(3) |
49 | |
50 | /* At the moment %edx contains CHR. What we need for the |
51 | algorithm is CHR in all bytes of the dword. Avoid |
52 | operations on 16 bit words because these require an |
53 | prefix byte (and one more cycle). */ |
54 | movb %dl, %dh /* Now it is 0|0|c|c */ |
55 | movl %edx, %ecx |
56 | shll $16, %edx /* Now c|c|0|0 */ |
57 | movw %cx, %dx /* And finally c|c|c|c */ |
58 | |
59 | /* Better performance can be achieved if the word (32 |
60 | bit) memory access is aligned on a four-byte-boundary. |
61 | So process first bytes one by one until boundary is |
62 | reached. Don't use a loop for better performance. */ |
63 | |
64 | testb $3, %al /* correctly aligned ? */ |
65 | je L(2) /* yes => begin loop */ |
66 | cmpb %dl, (%eax) /* compare byte */ |
67 | je L(9) /* target found => return */ |
68 | incl %eax /* increment source pointer */ |
69 | decl %esi /* decrement length counter */ |
70 | je L(4) /* len==0 => return NULL */ |
71 | |
72 | testb $3, %al /* correctly aligned ? */ |
73 | je L(2) /* yes => begin loop */ |
74 | cmpb %dl, (%eax) /* compare byte */ |
75 | je L(9) /* target found => return */ |
76 | incl %eax /* increment source pointer */ |
77 | decl %esi /* decrement length counter */ |
78 | je L(4) /* len==0 => return NULL */ |
79 | |
80 | testb $3, %al /* correctly aligned ? */ |
81 | je L(2) /* yes => begin loop */ |
82 | cmpb %dl, (%eax) /* compare byte */ |
83 | je L(9) /* target found => return */ |
84 | incl %eax /* increment source pointer */ |
85 | decl %esi /* decrement length counter */ |
86 | /* no test for len==0 here, because this is done in the |
87 | loop head */ |
88 | jmp L(2) |
89 | |
90 | /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to |
91 | change any of the hole bits of LONGWORD. |
92 | |
93 | 1) Is this safe? Will it catch all the zero bytes? |
94 | Suppose there is a byte with all zeros. Any carry bits |
95 | propagating from its left will fall into the hole at its |
96 | least significant bit and stop. Since there will be no |
97 | carry from its most significant bit, the LSB of the |
98 | byte to the left will be unchanged, and the zero will be |
99 | detected. |
100 | |
101 | 2) Is this worthwhile? Will it ignore everything except |
102 | zero bytes? Suppose every byte of LONGWORD has a bit set |
103 | somewhere. There will be a carry into bit 8. If bit 8 |
104 | is set, this will carry into bit 16. If bit 8 is clear, |
105 | one of bits 9-15 must be set, so there will be a carry |
106 | into bit 16. Similarly, there will be a carry into bit |
107 | 24. If one of bits 24-31 is set, there will be a carry |
108 | into bit 32 (=carry flag), so all of the hole bits will |
109 | be changed. |
110 | |
111 | 3) But wait! Aren't we looking for CHR, not zero? |
112 | Good point. So what we do is XOR LONGWORD with a longword, |
113 | each of whose bytes is CHR. This turns each byte that is CHR |
114 | into a zero. */ |
115 | |
116 | |
117 | /* Each round the main loop processes 16 bytes. */ |
118 | |
119 | ALIGN (4) |
120 | |
121 | L(1): movl (%eax), %ecx /* get word (= 4 bytes) in question */ |
122 | movl $0xfefefeff, %edi /* magic value */ |
123 | xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c |
124 | are now 0 */ |
125 | addl %ecx, %edi /* add the magic value to the word. We get |
126 | carry bits reported for each byte which |
127 | is *not* 0 */ |
128 | |
129 | /* According to the algorithm we had to reverse the effect of the |
130 | XOR first and then test the overflow bits. But because the |
131 | following XOR would destroy the carry flag and it would (in a |
132 | representation with more than 32 bits) not alter then last |
133 | overflow, we can now test this condition. If no carry is signaled |
134 | no overflow must have occurred in the last byte => it was 0. */ |
135 | jnc L(8) |
136 | |
137 | /* We are only interested in carry bits that change due to the |
138 | previous add, so remove original bits */ |
139 | xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ |
140 | |
141 | /* Now test for the other three overflow bits. */ |
142 | orl $0xfefefeff, %edi /* set all non-carry bits */ |
143 | incl %edi /* add 1: if one carry bit was *not* set |
144 | the addition will not result in 0. */ |
145 | |
146 | /* If at least one byte of the word is CHR we don't get 0 in %edi. */ |
147 | jnz L(8) /* found it => return pointer */ |
148 | |
149 | /* This process is unfolded four times for better performance. |
150 | we don't increment the source pointer each time. Instead we |
151 | use offsets and increment by 16 in each run of the loop. But |
152 | before probing for the matching byte we need some extra code |
153 | (following LL(13) below). Even the len can be compared with |
154 | constants instead of decrementing each time. */ |
155 | |
156 | movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ |
157 | movl $0xfefefeff, %edi /* magic value */ |
158 | xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c |
159 | are now 0 */ |
160 | addl %ecx, %edi /* add the magic value to the word. We get |
161 | carry bits reported for each byte which |
162 | is *not* 0 */ |
163 | jnc L(7) /* highest byte is CHR => return pointer */ |
164 | xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ |
165 | orl $0xfefefeff, %edi /* set all non-carry bits */ |
166 | incl %edi /* add 1: if one carry bit was *not* set |
167 | the addition will not result in 0. */ |
168 | jnz L(7) /* found it => return pointer */ |
169 | |
170 | movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ |
171 | movl $0xfefefeff, %edi /* magic value */ |
172 | xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c |
173 | are now 0 */ |
174 | addl %ecx, %edi /* add the magic value to the word. We get |
175 | carry bits reported for each byte which |
176 | is *not* 0 */ |
177 | jnc L(6) /* highest byte is CHR => return pointer */ |
178 | xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ |
179 | orl $0xfefefeff, %edi /* set all non-carry bits */ |
180 | incl %edi /* add 1: if one carry bit was *not* set |
181 | the addition will not result in 0. */ |
182 | jnz L(6) /* found it => return pointer */ |
183 | |
184 | movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ |
185 | movl $0xfefefeff, %edi /* magic value */ |
186 | xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c |
187 | are now 0 */ |
188 | addl %ecx, %edi /* add the magic value to the word. We get |
189 | carry bits reported for each byte which |
190 | is *not* 0 */ |
191 | jnc L(5) /* highest byte is CHR => return pointer */ |
192 | xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ |
193 | orl $0xfefefeff, %edi /* set all non-carry bits */ |
194 | incl %edi /* add 1: if one carry bit was *not* set |
195 | the addition will not result in 0. */ |
196 | jnz L(5) /* found it => return pointer */ |
197 | |
198 | /* Adjust both counters for a full round, i.e. 16 bytes. */ |
199 | addl $16, %eax |
200 | L(2): subl $16, %esi |
201 | jae L(1) /* Still more than 16 bytes remaining */ |
202 | |
203 | /* Process remaining bytes separately. */ |
204 | cmpl $4-16, %esi /* rest < 4 bytes? */ |
205 | jb L(3) /* yes, than test byte by byte */ |
206 | |
207 | movl (%eax), %ecx /* get word (= 4 bytes) in question */ |
208 | movl $0xfefefeff, %edi /* magic value */ |
209 | xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c |
210 | are now 0 */ |
211 | addl %ecx, %edi /* add the magic value to the word. We get |
212 | carry bits reported for each byte which |
213 | is *not* 0 */ |
214 | jnc L(8) /* highest byte is CHR => return pointer */ |
215 | xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ |
216 | orl $0xfefefeff, %edi /* set all non-carry bits */ |
217 | incl %edi /* add 1: if one carry bit was *not* set |
218 | the addition will not result in 0. */ |
219 | jne L(8) /* found it => return pointer */ |
220 | addl $4, %eax /* adjust source pointer */ |
221 | |
222 | cmpl $8-16, %esi /* rest < 8 bytes? */ |
223 | jb L(3) /* yes, than test byte by byte */ |
224 | |
225 | movl (%eax), %ecx /* get word (= 4 bytes) in question */ |
226 | movl $0xfefefeff, %edi /* magic value */ |
227 | xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c |
228 | are now 0 */ |
229 | addl %ecx, %edi /* add the magic value to the word. We get |
230 | carry bits reported for each byte which |
231 | is *not* 0 */ |
232 | jnc L(8) /* highest byte is CHR => return pointer */ |
233 | xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ |
234 | orl $0xfefefeff, %edi /* set all non-carry bits */ |
235 | incl %edi /* add 1: if one carry bit was *not* set |
236 | the addition will not result in 0. */ |
237 | jne L(8) /* found it => return pointer */ |
238 | addl $4, %eax /* adjust source pointer */ |
239 | |
240 | cmpl $12-16, %esi /* rest < 12 bytes? */ |
241 | jb L(3) /* yes, than test byte by byte */ |
242 | |
243 | movl (%eax), %ecx /* get word (= 4 bytes) in question */ |
244 | movl $0xfefefeff, %edi /* magic value */ |
245 | xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c |
246 | are now 0 */ |
247 | addl %ecx, %edi /* add the magic value to the word. We get |
248 | carry bits reported for each byte which |
249 | is *not* 0 */ |
250 | jnc L(8) /* highest byte is CHR => return pointer */ |
251 | xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ |
252 | orl $0xfefefeff, %edi /* set all non-carry bits */ |
253 | incl %edi /* add 1: if one carry bit was *not* set |
254 | the addition will not result in 0. */ |
255 | jne L(8) /* found it => return pointer */ |
256 | addl $4, %eax /* adjust source pointer */ |
257 | |
258 | /* Check the remaining bytes one by one. */ |
259 | L(3): andl $3, %esi /* mask out uninteresting bytes */ |
260 | jz L(4) /* no remaining bytes => return NULL */ |
261 | |
262 | cmpb %dl, (%eax) /* compare byte with CHR */ |
263 | je L(9) /* equal, than return pointer */ |
264 | incl %eax /* increment source pointer */ |
265 | decl %esi /* decrement length */ |
266 | jz L(4) /* no remaining bytes => return NULL */ |
267 | |
268 | cmpb %dl, (%eax) /* compare byte with CHR */ |
269 | je L(9) /* equal, than return pointer */ |
270 | incl %eax /* increment source pointer */ |
271 | decl %esi /* decrement length */ |
272 | jz L(4) /* no remaining bytes => return NULL */ |
273 | |
274 | cmpb %dl, (%eax) /* compare byte with CHR */ |
275 | je L(9) /* equal, than return pointer */ |
276 | |
277 | L(4): /* no byte found => return NULL */ |
278 | xorl %eax, %eax |
279 | jmp L(9) |
280 | |
281 | /* add missing source pointer increments */ |
282 | L(5): addl $4, %eax |
283 | L(6): addl $4, %eax |
284 | L(7): addl $4, %eax |
285 | |
286 | /* Test for the matching byte in the word. %ecx contains a NUL |
287 | char in the byte which originally was the byte we are looking |
288 | at. */ |
289 | L(8): testb %cl, %cl /* test first byte in dword */ |
290 | jz L(9) /* if zero => return pointer */ |
291 | incl %eax /* increment source pointer */ |
292 | |
293 | testb %ch, %ch /* test second byte in dword */ |
294 | jz L(9) /* if zero => return pointer */ |
295 | incl %eax /* increment source pointer */ |
296 | |
297 | testl $0xff0000, %ecx /* test third byte in dword */ |
298 | jz L(9) /* if zero => return pointer */ |
299 | incl %eax /* increment source pointer */ |
300 | |
301 | /* No further test needed we we know it is one of the four bytes. */ |
302 | L(9): popl %edi /* pop saved registers */ |
303 | cfi_adjust_cfa_offset (-4) |
304 | cfi_restore (edi) |
305 | popl %esi |
306 | cfi_adjust_cfa_offset (-4) |
307 | cfi_restore (esi) |
308 | |
309 | ret |
310 | END (__memchr) |
311 | |
312 | weak_alias (__memchr, memchr) |
313 | libc_hidden_builtin_def (memchr) |
314 | |