/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

#if ISA_SHOULD_BUILD (3)

# ifndef MEMCHR
#  define MEMCHR __memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ vpcmpeqd
#  define VPBROADCAST vpbroadcastd
#  define CHAR_SIZE 4
# else
#  define VPCMPEQ vpcmpeqb
#  define VPBROADCAST vpbroadcastb
#  define CHAR_SIZE 1
# endif

# ifdef USE_AS_RAWMEMCHR
#  define ERAW_PTR_REG ecx
#  define RRAW_PTR_REG rcx
#  define ALGN_PTR_REG rdi
# else
#  define ERAW_PTR_REG edi
#  define RRAW_PTR_REG rdi
#  define ALGN_PTR_REG rcx
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

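/* Algorithm overview (memchr/wmemchr/rawmemchr):
   1. Broadcast CHAR to every lane of ymm0.
   2. If an unaligned VEC_SIZE load from the start would cross a page
      boundary, load the aligned VEC containing the start instead and
      shift off the compare results for the leading bytes.
   3. Otherwise check the first VEC_SIZE bytes with an unaligned load.
   4. Check up to four more VECs one at a time from the next aligned
      address.
   5. Loop checking VEC_SIZE * 4 bytes per iteration from a
      VEC_SIZE * 4 aligned address.
   For memchr/wmemchr the remaining length is tracked throughout and a
   match at or past the end of the buffer returns NULL.  */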
	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (MEMCHR, 5)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and %RDX_LP, %RDX_LP
#  else
	test %RDX_LP, %RDX_LP
#  endif
	jz L(null)
# endif
	/* Broadcast CHAR to ymm0.  */
	vmovd %esi, %xmm0
	VPBROADCAST %xmm0, %ymm0
	/* Check if we may cross page boundary with one vector load.  */
	movl %edi, %eax
	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq $CHAR_PER_VEC, %rdx
	jbe L(first_vec_x0)
# endif
	testl %eax, %eax
	jz L(aligned_more)
	bsfl %eax, %eax
	addq %rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN


# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(first_vec_x0):
	/* Check if first match was before length.  */
	tzcntl %eax, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall $2, %edx
#  endif
	COND_VZEROUPPER
	/* Use a branch instead of cmovcc so L(first_vec_x0) fits in one
	   fetch block.  A branch here, as opposed to cmovcc, is not that
	   costly: the common usage of memchr is to check whether the return
	   was NULL (if the string were known to contain CHAR the user would
	   use rawmemchr), so this branch is highly correlated with the user
	   branch and most modern branch predictors can use it to predict
	   the user branch.  */
	cmpl %eax, %edx
	jle L(null)
	addq %rdi, %rax
	ret
# endif

	.p2align 4,, 10
L(first_vec_x1):
	bsfl %eax, %eax
	incq %rdi
	addq %rdi, %rax
	VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
	/* NB: L(null) is placed here so it fits in the alignment bytes
	   before L(first_vec_x2).  */
L(null):
	xorl %eax, %eax
	ret
# endif
	.p2align 4
L(first_vec_x2):
	tzcntl %eax, %eax
	addq $(VEC_SIZE + 1), %rdi
	addq %rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl %eax, %eax
	addq $(VEC_SIZE * 2 + 1), %rdi
	addq %rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4
L(first_vec_x4):
	tzcntl %eax, %eax
	addq $(VEC_SIZE * 3 + 1), %rdi
	addq %rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
L(cross_page_continue):
	/* Align data to VEC_SIZE - 1.  */
	xorl %ecx, %ecx
	subl %edi, %ecx
	orq $(VEC_SIZE - 1), %rdi
	/* esi is for adjusting length to see if near the end.  */
	leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl $2, %esi
#  endif
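	/* esi now holds the number of CHARs from the original pointer up
	   to the end of the fourth aligned VEC checked below; it is
	   subtracted from the length to decide whether the tail code
	   should handle the rest.  */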
# else
	orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ 1(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near end handle specially.  */
	subq %rsi, %rdx
	jbe L(last_4x_vec_or_less)
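	/* rdx is now the number of CHARs remaining past the end of the
	   fourth aligned VEC (the last one checked before the main
	   loop).  */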
# endif
	testl %eax, %eax
	jnz L(first_vec_x1)

	VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x2)

	VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x3)

	VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x4)

# ifndef USE_AS_RAWMEMCHR
	/* Check if at last VEC_SIZE * 4 length.  */
	subq $(CHAR_PER_VEC * 4), %rdx
	jbe L(last_4x_vec_or_less_cmpeq)
	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
	   length.  */
	incq %rdi
	movl %edi, %ecx
	orq $(VEC_SIZE * 4 - 1), %rdi
	andl $(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl $2, %ecx
#  endif
	addq %rcx, %rdx
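	/* The loop starts at the next VEC_SIZE * 4 aligned address, which
	   can overlap by up to VEC_SIZE * 3 bytes with data already
	   checked, so credit the overlapping CHARs back to the length.  */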
# else
	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
	incq %rdi
	orq $(VEC_SIZE * 4 - 1), %rdi
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	VPCMPEQ 1(%rdi), %ymm0, %ymm1
	VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
	VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
	VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
	vpor %ymm1, %ymm2, %ymm5
	vpor %ymm3, %ymm4, %ymm6
	vpor %ymm5, %ymm6, %ymm5
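	/* ymm5 is zero iff none of the four VECs contained a match for
	   CHAR.  */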

	vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
	subq $-(VEC_SIZE * 4), %rdi
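	/* NB: subq $-(VEC_SIZE * 4) is used instead of
	   addq $(VEC_SIZE * 4) because -128 fits in a sign-extended imm8
	   while +128 does not, saving code size.  */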
	testl %ecx, %ecx
	jz L(loop_4x_vec)
# else
	testl %ecx, %ecx
	jnz L(loop_4x_vec_end)

	subq $-(VEC_SIZE * 4), %rdi

	subq $(CHAR_PER_VEC * 4), %rdx
	ja L(loop_4x_vec)

	/* Fall through into the case of fewer than 4 VECs of length
	   remaining.  */
	VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	.p2align 4
L(last_4x_vec_or_less):
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall $2, %edx
#  endif
	/* Check if first VEC contained match.  */
	testl %eax, %eax
	jnz L(first_vec_x1_check)

	/* If remaining length > VEC_SIZE * 2.  */
	addl $(VEC_SIZE * 2), %edx
	jg L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < VEC_SIZE.  */
	addl $VEC_SIZE, %edx
	jle L(zero_end)

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl %eax, %eax
	cmpl %eax, %edx
	jbe L(set_zero_end)
	addq $(VEC_SIZE + 1), %rdi
	addq %rdi, %rax
L(zero_end):
	VZEROUPPER_RETURN

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */

	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(last_vec_x1_return)

	vpmovmskb %ymm2, %eax
	testl %eax, %eax
	jnz L(last_vec_x2_return)

	vpmovmskb %ymm3, %eax
	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
	salq $32, %rcx
	orq %rcx, %rax
	tzcntq %rax, %rax
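	/* NB: ecx is the mask of the OR of all four VECs, but VEC1 and
	   VEC2 are known to have no matches here, so its bits are exactly
	   the VEC3 and VEC4 matches.  If VEC3 has a match tzcnt finds it
	   in the low 32 bits; otherwise the low 32 bits are zero and
	   tzcnt falls through to the VEC4 bits in the upper half.  */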
# ifdef USE_AS_RAWMEMCHR
	subq $(VEC_SIZE * 2 - 1), %rdi
# else
	subq $-(VEC_SIZE * 2 + 1), %rdi
# endif
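	/* NB: for rawmemchr %rdi was already advanced by VEC_SIZE * 4 in
	   the loop before the match was detected, while for memchr it was
	   not, hence the different adjustments above.  */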
	addq %rdi, %rax
	VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR

	.p2align 4
L(first_vec_x1_check):
	tzcntl %eax, %eax
	/* Adjust length.  */
	subl $-(VEC_SIZE * 4), %edx
	/* Check if match within remaining length.  */
	cmpl %eax, %edx
	jbe L(set_zero_end)
	incq %rdi
	addq %rdi, %rax
	VZEROUPPER_RETURN
	.p2align 4,, 6
L(set_zero_end):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1_return):
	tzcntl %eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq $(VEC_SIZE * 4 - 1), %rdi
# else
	incq %rdi
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2_return):
	tzcntl %eax, %eax
# ifdef USE_AS_RAWMEMCHR
	subq $(VEC_SIZE * 3 - 1), %rdi
# else
	subq $-(VEC_SIZE + 1), %rdi
# endif
	addq %rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
#  ifdef USE_AS_WMEMCHR
	/* NB: Multiply length by 4 to get byte count.  */
	sall $2, %edx
#  endif
	subq $-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl %eax, %eax
	jnz L(first_vec_x1_check)

	/* If remaining length <= VEC_SIZE * 2.  */
	addl $(VEC_SIZE * 2), %edx
	jle L(last_2x_vec)
	.p2align 4
L(last_4x_vec):
	VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(last_vec_x2_return)

	VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Create mask for possible matches within remaining length.  */
	movq $-1, %rcx
	bzhiq %rdx, %rcx, %rcx

	/* Test matches in data against length match.  */
	andl %ecx, %eax
	jnz L(last_vec_x3)

	/* If remaining length <= VEC_SIZE * 3 (note this is after
	   remaining length was found to be > VEC_SIZE * 2).  */
	subl $VEC_SIZE, %edx
	jbe L(zero_end2)

	VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Shift remaining length mask for last VEC.  */
	shrq $32, %rcx
	andl %ecx, %eax
	jz L(zero_end2)
	tzcntl %eax, %eax
	addq $(VEC_SIZE * 3 + 1), %rdi
	addq %rdi, %rax
L(zero_end2):
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl %eax, %eax
	subq $-(VEC_SIZE * 2 + 1), %rdi
	addq %rdi, %rax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(cross_page_boundary):
	/* Save pointer before aligning, as its original value is needed
	   to compute the return address if a byte is found, or to adjust
	   the length if it is not found and this is memchr.  */
	movq %rdi, %rcx
	/* Align data to VEC_SIZE - 1.  ALGN_PTR_REG is rcx for memchr
	   and rdi for rawmemchr.  */
	orq $(VEC_SIZE - 1), %ALGN_PTR_REG
	VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Calculate the number of CHARs to the end of the page (the
	   length already checked for a match).  */
	leaq 1(%ALGN_PTR_REG), %rsi
	subq %RRAW_PTR_REG, %rsi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl $2, %esi
#  endif
# endif
	/* Remove the leading bytes: shift the mask so that bit 0
	   corresponds to the byte at the original (unaligned) pointer.  */
	sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq %rsi, %rdx
	jbe L(first_vec_x0)
# endif
	testl %eax, %eax
	jz L(cross_page_continue)
	bsfl %eax, %eax
	addq %RRAW_PTR_REG, %rax
	VZEROUPPER_RETURN


END (MEMCHR)
#endif


/* Source: glibc/sysdeps/x86_64/multiarch/memchr-avx2.S.  */