/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR __strrchr_avx2
# endif

# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST vpbroadcastd
#  define VPCMPEQ vpcmpeqd
#  define VPMIN vpminud
#  define CHAR_SIZE 4
# else
#  define VPBROADCAST vpbroadcastb
#  define VPCMPEQ vpcmpeqb
#  define VPMIN vpminub
#  define CHAR_SIZE 1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

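/* strrchr (wcsrchr) returns a pointer to the last occurrence of CHAR
   in the string, or NULL if it does not occur.  Illustrative scalar
   sketch of the semantics (the SIMD code below is organized quite
   differently):

	char *
	strrchr (const char *s, int c)
	{
	  const char *ret = NULL;
	  do
	    if (*s == (char) c)
	      ret = s;
	  while (*s++);
	  return (char *) ret;
	}

   Strategy of the vector code: check the first, possibly unaligned
   VEC and the next two after aligning, then run a 2x VEC loop that
   stops at the first null CHAR or search CHAR match.  Once a match is
   known to exist, a second 2x VEC loop records the most recent
   matching block (%rsi plus ymm2/ymm3) and only stops at the
   terminator, from which the last match is recovered.  */
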
	.section SECTION(.text), "ax", @progbits
ENTRY(STRRCHR)
	vmovd %esi, %xmm7
	movl %edi, %eax
	/* Broadcast CHAR to YMM7.  */
	VPBROADCAST %xmm7, %ymm7
	vpxor %xmm0, %xmm0, %xmm0

	/* Shift here instead of `andl` to save code size (saves a fetch
	   block).  */
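	/* The low 12 bits of %edi (the page offset) end up in the high
	   bits of %eax, so the unsigned compare below is true exactly
	   when the page offset is greater than PAGE_SIZE - VEC_SIZE,
	   i.e. when an unaligned VEC_SIZE load at %rdi could cross into
	   the next page.  */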
	sall $20, %eax
	cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
	ja L(cross_page)

L(page_cross_continue):
	vmovdqu (%rdi), %ymm1
	/* Check end of string match.  */
	VPCMPEQ %ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl %ecx, %ecx
	jz L(aligned_more)

	/* Only check match with search CHAR if needed.  */
	VPCMPEQ %ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax
	/* Check if match before first zero.  */
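	/* blsmsk sets every bit up to and including the lowest set bit
	   of %ecx (the first null CHAR), so the `and` below keeps only
	   the search CHAR matches at or before the terminator.  */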
	blsmskl %ecx, %ecx
	andl %ecx, %eax
	jz L(ret0)
	bsrl %eax, %eax
	addq %rdi, %rax
	/* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
	   search CHAR is zero we are correct.  Either way `andq
	   -CHAR_SIZE, %rax` gets the correct result.  */
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	/* Returns for first vec x1/x2 have hard coded backward search
	   path for earlier matches.  */
	.p2align 4,, 10
L(first_vec_x1):
	VPCMPEQ %ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl %ecx, %ecx
	andl %ecx, %eax
	jnz L(first_vec_x1_return)

	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ %ymm1, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	testl %eax, %eax
	jz L(ret1)
	bsrl %eax, %eax
	addq %r8, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
L(ret1):
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x0_x1_test):
	VPCMPEQ %ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	/* Check ymm2 for search CHAR match.  If no match then check ymm1
	   before returning.  */
	testl %eax, %eax
	jz L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl %eax, %eax
	leaq 1(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(first_vec_x2):
	VPCMPEQ %ymm3, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl %ecx, %ecx
	/* If no in-range search CHAR match in ymm3 then need to check
	   ymm1/ymm2 for an earlier match (we delay checking search
	   CHAR matches until needed).  */
	andl %ecx, %eax
	jz L(first_vec_x0_x1_test)
	bsrl %eax, %eax
	leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4
L(aligned_more):
	/* Save original pointer if match was in VEC 0.  */
	movq %rdi, %r8

	/* Align src.  */
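	/* After `orq $(VEC_SIZE - 1)`, %rdi points at the last byte of
	   its aligned VEC_SIZE block, so the 1(%rdi) loads below read
	   the following, fully aligned vectors.  */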
	orq $(VEC_SIZE - 1), %rdi
	vmovdqu 1(%rdi), %ymm2
	VPCMPEQ %ymm2, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x1)

	vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
	VPCMPEQ %ymm3, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x2)

	/* Save pointer again before realigning.  */
	movq %rdi, %rsi
	addq $(VEC_SIZE + 1), %rdi
	andq $-(VEC_SIZE * 2), %rdi
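	/* NB: rounding down to a 2x VEC_SIZE boundary means the first
	   loop iteration may re-scan up to 2x VEC already checked
	   above.  That is harmless: those vectors contain no null CHAR
	   and any search CHAR matches they hold are still valid.  */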
	.p2align 4
L(first_aligned_loop):
	/* Do 2x VEC at a time.  Any more and the cost of finding the
	   match outweighs loop benefit.  */
	vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5

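	/* ymm6/ymm10 get the search CHAR matches for each VEC.  VPMIN
	   folds the two vectors so that a single compare against zero
	   (ymm8) detects a null CHAR in either of them; ymm9 ORs
	   everything together so one vpmovmskb decides whether this
	   2x VEC block needs any further work.  */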
	VPCMPEQ %ymm4, %ymm7, %ymm6
	VPMIN %ymm4, %ymm5, %ymm8
	VPCMPEQ %ymm5, %ymm7, %ymm10
	vpor %ymm6, %ymm10, %ymm5
	VPCMPEQ %ymm8, %ymm0, %ymm8
	vpor %ymm5, %ymm8, %ymm9

	vpmovmskb %ymm9, %eax
	addq $(VEC_SIZE * 2), %rdi
	/* No zero or search CHAR.  */
	testl %eax, %eax
	jz L(first_aligned_loop)

	/* If no zero CHAR then go to second loop (this allows us to
	   throw away all prior work).  */
	vpmovmskb %ymm8, %ecx
	testl %ecx, %ecx
	jz L(second_aligned_loop_prep)

	/* Search char could be zero so we need to get the true match.  */
	vpmovmskb %ymm5, %eax
	testl %eax, %eax
	jnz L(first_aligned_loop_return)

	.p2align 4,, 4
L(first_vec_x1_or_x2):
	VPCMPEQ %ymm3, %ymm7, %ymm3
	VPCMPEQ %ymm2, %ymm7, %ymm2
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	/* Use add for macro-fusion; the sum cannot wrap, so it is zero
	   only when both 32-bit masks are zero.  */
	addq %rax, %rdx
	jz L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save a
	   bit of code size / performance on the fall through.  The
	   branch leads to the null case which generally seems hotter
	   than char in first 3x VEC.  */
	salq $32, %rax
	addq %rdx, %rax
	bsrq %rax, %rax
	leaq 1(%rsi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

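	/* Re-derive exact masks for this 2x VEC block: null matches for
	   the first VEC go in the low 32 bits and the combined (via
	   VPMIN) null mask in the high 32 bits, so the lowest set bit of
	   %rcx is still the first null CHAR.  blsmsk then bounds the
	   in-range search CHAR matches.  */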
	.p2align 4,, 8
L(first_aligned_loop_return):
	VPCMPEQ %ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq $32, %rcx
	orq %rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq $32, %rax
	orq %rdx, %rax
	blsmskq %rcx, %rcx
	andq %rcx, %rax
	jz L(first_vec_x1_or_x2)

	bsrq %rax, %rax
	leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	/* Search char cannot be zero.  */
	.p2align 4
L(second_aligned_loop_set_furthest_match):
	/* Save VEC and pointer from most recent match.  */
L(second_aligned_loop_prep):
	movq %rdi, %rsi
	vmovdqu %ymm6, %ymm2
	vmovdqu %ymm10, %ymm3

	.p2align 4
L(second_aligned_loop):
	/* Search 2x VEC at a time.  */
	vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5

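	/* Same VPMIN trick as L(first_aligned_loop): one compare of the
	   folded minimum against zero detects a null CHAR in either
	   VEC.  */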
	VPCMPEQ %ymm4, %ymm7, %ymm6
	VPMIN %ymm4, %ymm5, %ymm1
	VPCMPEQ %ymm5, %ymm7, %ymm10
	vpor %ymm6, %ymm10, %ymm5
	VPCMPEQ %ymm1, %ymm0, %ymm1
	vpor %ymm5, %ymm1, %ymm9

	vpmovmskb %ymm9, %eax
	addq $(VEC_SIZE * 2), %rdi
	testl %eax, %eax
	jz L(second_aligned_loop)
	vpmovmskb %ymm1, %ecx
	testl %ecx, %ecx
	jz L(second_aligned_loop_set_furthest_match)
	vpmovmskb %ymm5, %eax
	testl %eax, %eax
	jnz L(return_new_match)

	/* This is the hot path.  We know CHAR is inbounds and that
	   ymm3/ymm2 have latest match.  */
	.p2align 4,, 4
L(return_old_match):
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	salq $32, %rax
	orq %rdx, %rax
	bsrq %rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
	leaq (VEC_SIZE * -2 - (CHAR_SIZE - 1))(%rsi, %rax), %rax
	VZEROUPPER_RETURN

	/* Last iteration also potentially has a match.  */
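	/* Mask construction mirrors L(first_aligned_loop_return): exact
	   null matches for the first VEC in the low 32 bits, the folded
	   null mask in the high 32 bits, and the search CHAR matches
	   combined the same way.  */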
	.p2align 4,, 8
L(return_new_match):
	VPCMPEQ %ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq $32, %rcx
	orq %rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq $32, %rax
	orq %rdx, %rax
	blsmskq %rcx, %rcx
	andq %rcx, %rax
	jz L(return_old_match)
	bsrq %rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
	leaq (VEC_SIZE * -2 - (CHAR_SIZE - 1))(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(cross_page):
	movq %rdi, %rsi
	andq $-VEC_SIZE, %rsi
	vmovdqu (%rsi), %ymm1
	VPCMPEQ %ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
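	/* shrx only uses the low 5 bits of %edi here, i.e. the offset of
	   src within the aligned VEC that was just loaded.  */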
	shrxl %edi, %ecx, %ecx
	testl %ecx, %ecx
	jz L(page_cross_continue)
	VPCMPEQ %ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax

	/* Shift out search CHAR matches that are before the beginning of
	   src (rdi).  */
	shrxl %edi, %eax, %eax
	blsmskl %ecx, %ecx
	/* Check if any search CHAR match in range.  */
	andl %ecx, %eax
	jz L(ret2)
	bsrl %eax, %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
L(ret2):
	VZEROUPPER_RETURN
END(STRRCHR)
#endif