/* memchr optimized with SSE2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# ifndef MEMCHR
#  define MEMCHR	__memchr_sse2
# endif
# ifdef USE_AS_WMEMCHR
#  define PCMPEQ	pcmpeqd
#  define CHAR_PER_VEC	4
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_PER_VEC	16
# endif

/* Fast SSE2 version using pmaxub and a 64-byte loop.  */

	.text
ENTRY(MEMCHR)
	movd	%esi, %xmm1
	mov	%edi, %ecx

# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
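	/* Broadcast the search character to every lane of xmm1.  For
	   wmemchr the 4-byte character already fills the low dword, so
	   the pshufd below is enough.  For memchr, two punpcklbw steps
	   first replicate the byte into the low dword.  The test for a
	   zero length is interleaved with the broadcast.  */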
# ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
# else
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
# endif

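	/* ECX = RDI mod 64.  If the first 16-byte load stays within the
	   current 64-byte block (offset <= 48), use an unaligned load.
	   Otherwise take the crosscache path, which uses an aligned
	   load so the read cannot spill into the next, possibly
	   unmapped, page.  */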
	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	add	%rcx, %rdx
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

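	/* The first 16-byte load would cross a 64-byte boundary.  Do an
	   aligned load from the rounded-down address instead and shift
	   the match mask right by the misalignment (in ECX) so that
	   bits for bytes before the start of the buffer are dropped.  */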
	.p2align 4
L(crosscache):
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

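	/* No match in the first, partially examined vector.  Deduct the
	   characters actually checked from the length and continue
	   toward the main loop.  */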
	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
	neg	%rcx
	add	$16, %rcx
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

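	/* Check four 16-byte vectors (one 64-byte block) one at a time
	   before entering the aligned 64-byte loop.  The remaining
	   length, measured from the aligned pointer, is known to exceed
	   CHAR_PER_VEC * 4, so the whole block may be examined.  */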
	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

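	/* If RDI is now 64-byte aligned, enter the main loop directly.
	   Otherwise check one more 64-byte block first, then round RDI
	   down to a 64-byte boundary below.  */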
	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

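	/* Round RDI down to a 64-byte boundary for the main loop and
	   add the characters stepped back over to the remaining length
	   so the bookkeeping stays consistent.  */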
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	add	%rcx, %rdx

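	/* Main loop: 64 bytes per iteration.  The four compare results
	   are merged with pmaxub; a matching lane is 0xff and a
	   non-matching lane is 0x00, so the running unsigned maximum is
	   nonzero exactly when one of the four vectors holds a match,
	   and a single pmovmskb/test decides whether to stay in the
	   loop.  */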
	.p2align 4
L(align64_loop):
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

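	/* A match lies somewhere in the previous 64-byte block.  Step
	   back and test each vector to locate it.  xmm0 and xmm2 still
	   hold their compare results; the last two vectors must be
	   recompared because xmm3 and xmm4 were overwritten by the
	   pmaxub reduction.  */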
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

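	/* Tail handling: no more than CHAR_PER_VEC * 4 characters
	   remain (RDX holds the remaining count minus
	   CHAR_PER_VEC * 4).  If no more than CHAR_PER_VEC * 2 remain,
	   handle them at L(exit_loop_32).  Otherwise check up to four
	   more vectors, validating matches in the last two against the
	   remaining length, since those reads may extend past the end
	   of the buffer.  */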
	.p2align 4
L(exit_loop):
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

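	/* At most CHAR_PER_VEC * 2 characters remain.  Check the last
	   one or two vectors, validating every match against the
	   remaining length because the reads may extend past the end
	   of the buffer.  */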
	.p2align 4
L(exit_loop_32):
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

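	/* The L(matchesN) labels convert the match mask in EAX into a
	   return pointer: bsf finds the lowest set bit (the byte offset
	   of the first match within the vector), which is added to RDI
	   plus the vector's displacement.  L(matches0) is used after
	   RDI has already been advanced past the block, hence the -16
	   displacement.  */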
	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

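	/* The L(matchesN_1) variants additionally check that the match
	   lies within the requested length: the offset of the match
	   (converted to a character count for wmemchr) is subtracted
	   from the remaining length, and NULL is returned if the match
	   falls past the end of the buffer.  */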
	.p2align 4
L(matches_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)
#endif
