/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

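/* Overview (a summary of the control flow below): the first VEC_SIZE
   bytes are checked with one unaligned load, unless that load would
   cross a page; then four single-VEC checks advance the pointer to
   VEC_SIZE * 4 alignment, after which the main loop processes
   VEC_SIZE * 4 bytes per iteration.  Every check searches for CHAR
   and the null terminator at the same time.  */
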
	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (STRCHR, 5)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	VPBROADCAST	%xmm0, %ymm0
	vpxor	%xmm1, %xmm1, %xmm1

	/* Check if we cross a page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
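
	/* The test above in C, as an illustrative sketch (the function
	   name is hypothetical): the low 12 bits of the address are the
	   offset within a 4096-byte page, and an unaligned 32-byte load
	   is safe only if it ends within the same page.

	       static int
	       load_crosses_page (const char *s)
	       {
	         return ((uintptr_t) s & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
	       }
	*/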

	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null byte.  */
	vmovdqu	(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	/* NB: Use a branch instead of cmovcc here.  The expectation is
	   that with strchr the user will branch based on the input
	   being null.  Since this branch will be 100% predictive of
	   the user branch, a branch miss here should save what would
	   otherwise be a branch miss in the user code.  Otherwise,
	   using a branch 1) saves code size and 2) is faster in highly
	   predictable environments.  */
	jne	L(zero)
# endif
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN
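
	/* The single-vector probe above, sketched with AVX2 intrinsics
	   (illustrative only; vchar and vzero stand for ymm0 and ymm1):

	       __m256i v = _mm256_loadu_si256 ((const __m256i *) s);
	       __m256i hits = _mm256_or_si256 (_mm256_cmpeq_epi8 (v, vchar),
	                                       _mm256_cmpeq_epi8 (v, vzero));
	       unsigned int mask = _mm256_movemask_epi8 (hits);
	       if (mask != 0)
	         return s + __builtin_ctz (mask);

	   Bit i of mask is set iff byte i is CHAR or the null
	   terminator, so tzcnt/bsf gives the first hit.  */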

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4
L(first_vec_x1):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	incq	%rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x2):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_vec_x3):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x4):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN
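
	/* Why the "+ 1" offsets in first_vec_x1..x4: L(aligned_more)
	   sets rdi = start | (VEC_SIZE - 1), i.e. the last byte of the
	   VEC containing the start, so 1(%rdi) is the first
	   VEC_SIZE-aligned address past it.  The match position is
	   therefore rdi + (VEC_SIZE * n + 1) + bsf(mask) for the n-th
	   vector checked after that point.  */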


	.p2align 4
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq $-VEC_SIZE but saves 4 bytes of
	   code in the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vmovdqa	1(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)
	/* Align data to VEC_SIZE * 4 - 1.  */
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
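
	/* The or-based alignment idiom in C (illustrative): p |= N - 1
	   moves p to the last byte of its N-byte block, so p + 1 is the
	   next N-aligned address.  The loop below then uses a constant
	   +1 displacement instead of realigning the pointer:

	       p = (char *) ((uintptr_t) p | (VEC_SIZE * 4 - 1));

	   after which p + 1 is VEC_SIZE * 4 aligned.  */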
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	1(%rdi), %ymm6
	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7

	/* XOR with CHAR leaves only bytes matching CHAR as 0; taking
	   the unsigned minimum with the original data then yields a 0
	   byte exactly where the data has CHAR or a null byte.  */
	vpxor	%ymm6, %ymm0, %ymm2
	vpxor	%ymm7, %ymm0, %ymm3

	VPMINU	%ymm2, %ymm6, %ymm2
	VPMINU	%ymm3, %ymm7, %ymm3

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7

	vpxor	%ymm6, %ymm0, %ymm4
	vpxor	%ymm7, %ymm0, %ymm5

	VPMINU	%ymm4, %ymm6, %ymm4
	VPMINU	%ymm5, %ymm7, %ymm5

	VPMINU	%ymm2, %ymm3, %ymm6
	VPMINU	%ymm4, %ymm5, %ymm7

	VPMINU	%ymm6, %ymm7, %ymm7

	VPCMPEQ	%ymm7, %ymm1, %ymm7
	vpmovmskb %ymm7, %ecx
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)
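
	/* Per-byte view of the xor/min trick, in C (illustrative):

	       unsigned char t = x ^ c;
	       unsigned char m = t < x ? t : x;

	   t is 0 iff x == c, and x is 0 iff x is the terminator, so
	   m == 0 iff x == c || x == 0.  One compare-with-zero over the
	   minimum of all four VEC therefore detects a hit anywhere in
	   the 4 * VEC_SIZE window at the cost of a single branch.  */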

	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)


	VPCMPEQ	%ymm3, %ymm1, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	%ymm4, %ymm1, %ymm4
	vpmovmskb %ymm4, %eax
	/* rcx holds the combined result from all 4 VEC.  Its bits are
	   only consumed when the first 3 VEC contained no match, in
	   which case any set bit must come from the 4th VEC.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN
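
	/* Mask-combining step above, spelled out (illustrative): with
	   mask3 = vpmovmskb of VEC3's zero test (eax) and combined =
	   vpmovmskb of the min over all 4 VEC (ecx),

	       uint64_t both = ((uint64_t) combined << 32) | mask3;
	       unsigned int idx = __builtin_ctzll (both);

	   idx < 32 addresses VEC3 and idx >= 32 addresses VEC4, which
	   matches the rdi - (VEC_SIZE * 2 - 1) base used above.  */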


	.p2align 4,, 10
L(last_vec_x0):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$-(VEC_SIZE * 4 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4,, 8
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align rdi to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	/* Shift out mask bits for the bytes before the original rdi.
	   sarxl only uses bits [4:0] of COUNT, so there is no need to
	   manually mod edx.  */
	sarxl	%edx, %eax, %eax
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	xorl	%ecx, %ecx
	/* Found CHAR or the null byte.  */
	cmp	(%rdx, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdx, %rax
	VZEROUPPER_RETURN
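
	/* Page-cross handling in C terms (illustrative): load the
	   aligned VEC that contains the start of the string, then
	   discard the low mask bits belonging to bytes before it.

	       const char *aligned = (const char *) ((uintptr_t) s & -VEC_SIZE);
	       mask >>= (uintptr_t) s & (VEC_SIZE - 1);

	   The aligned load cannot fault past the string because it
	   never crosses the page containing s.  */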

END (STRCHR)
#endif