/* strrchr optimized with SSE2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* ISA level >= 2 because there are no {wcs|str}rchr-sse4
   implementations.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR __strrchr_sse2
# endif

# ifdef USE_AS_WCSRCHR
#  define PCMPEQ pcmpeqd
#  define CHAR_SIZE 4
#  define PMINU pminud
# else
#  define PCMPEQ pcmpeqb
#  define CHAR_SIZE 1
#  define PMINU pminub
# endif

# define PAGE_SIZE 4096
# define VEC_SIZE 16

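/* For reference, a minimal portable C sketch of the semantics this
   file implements (not part of the build; with USE_AS_WCSRCHR the
   same code becomes the analogous wcsrchr over wchar_t):

     char *
     strrchr (const char *s, int c)
     {
       const char *last = NULL;
       do
         if (*s == (char) c)
           last = s;
       while (*s++ != '\0');
       return (char *) last;
     }

   The assembly below produces the same result but scans VEC_SIZE
   (and later 2 * VEC_SIZE) bytes per step, remembering only the most
   recent block that contained a match.  */
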
	.text
ENTRY(STRRCHR)
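	/* Broadcast the search CHAR (in %esi) to every lane of %xmm0.
	   %eax gets the page offset of the input pointer so we can
	   tell whether an unaligned VEC_SIZE load from %rdi would
	   cross a page boundary.  */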
	movd %esi, %xmm0
	movq %rdi, %rax
	andl $(PAGE_SIZE - 1), %eax
# ifndef USE_AS_WCSRCHR
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
# endif
	pshufd $0, %xmm0, %xmm0
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page)

L(cross_page_continue):
	movups (%rdi), %xmm1
	pxor %xmm2, %xmm2
	PCMPEQ %xmm1, %xmm2
	pmovmskb %xmm2, %ecx
	testl %ecx, %ecx
	jz L(aligned_more)

	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
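	/* %ecx ^ (%ecx - 1) is a mask of all bits up to and including
	   the lowest set bit of %ecx, i.e. all positions at or before
	   the first null CHAR.  ANDing it with %eax drops CHAR matches
	   past the null terminator.  */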
	leal -1(%rcx), %edx
	xorl %edx, %ecx
	andl %ecx, %eax
	jz L(ret0)
	bsrl %eax, %eax
	addq %rdi, %rax
	/* We are off by 3 for wcsrchr if search CHAR is non-zero.  If
	   search CHAR is zero we are correct.  Either way `andq
	   $-CHAR_SIZE, %rax` gets the correct result.  */
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
L(ret0):
	ret

	/* Returns for first vec x1/x2 have a hard-coded backward
	   search path for earlier matches.  */
	.p2align 4
L(first_vec_x0_test):
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	testl %eax, %eax
	jz L(ret0)
	bsrl %eax, %eax
	addq %r8, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1):
	PCMPEQ %xmm0, %xmm2
	pmovmskb %xmm2, %eax
	leal -1(%rcx), %edx
	xorl %edx, %ecx
	andl %ecx, %eax
	jz L(first_vec_x0_test)
	bsrl %eax, %eax
	leaq (VEC_SIZE)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1_test):
	PCMPEQ %xmm0, %xmm2
	pmovmskb %xmm2, %eax
	testl %eax, %eax
	jz L(first_vec_x0_test)
	bsrl %eax, %eax
	leaq (VEC_SIZE)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	PCMPEQ %xmm0, %xmm3
	pmovmskb %xmm3, %eax
	leal -1(%rcx), %edx
	xorl %edx, %ecx
	andl %ecx, %eax
	jz L(first_vec_x1_test)
	bsrl %eax, %eax
	leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(aligned_more):
	/* Save the original pointer; it is needed if the match turns
	   out to be in VEC 0.  */
	movq %rdi, %r8
	andq $-VEC_SIZE, %rdi

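	/* Check the next two aligned vectors for a null terminator
	   before falling into the 2x VEC_SIZE loop.  */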
	movaps VEC_SIZE(%rdi), %xmm2
	pxor %xmm3, %xmm3
	PCMPEQ %xmm2, %xmm3
	pmovmskb %xmm3, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x1)

	movaps (VEC_SIZE * 2)(%rdi), %xmm3
	pxor %xmm4, %xmm4
	PCMPEQ %xmm3, %xmm4
	pmovmskb %xmm4, %ecx
	testl %ecx, %ecx
	jnz L(first_vec_x2)

	addq $VEC_SIZE, %rdi
	/* Save pointer again before realigning.  */
	movq %rdi, %rsi
	andq $-(VEC_SIZE * 2), %rdi
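	/* The loop reads 2 * VEC_SIZE per iteration from a 2 * VEC_SIZE
	   aligned address, so its first iteration may re-read bytes
	   already checked above.  %rsi still points at the data held in
	   %xmm2, which L(first_loop_old_match) uses to recover an
	   earlier match.  */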
	.p2align 4
L(first_loop):
	/* Do 2x VEC at a time.  */
	movaps (VEC_SIZE * 2)(%rdi), %xmm4
	movaps (VEC_SIZE * 3)(%rdi), %xmm5
	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
	   detecting the zero CHAR.  Note: if this is found to be a
	   bottleneck it may be worth adding an SSE4.1 wcsrchr
	   implementation.  */
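	/* For strrchr, PMINU (pminub) of the two vectors has a zero
	   byte exactly where either input has one, so a single PCMPEQ
	   against zero detects the null terminator in either vector.  */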
# ifdef USE_AS_WCSRCHR
	movaps %xmm5, %xmm6
	pxor %xmm8, %xmm8

	PCMPEQ %xmm8, %xmm5
	PCMPEQ %xmm4, %xmm8
	por %xmm5, %xmm8
# else
	movaps %xmm5, %xmm6
	PMINU %xmm4, %xmm5
# endif

	movaps %xmm4, %xmm9
	PCMPEQ %xmm0, %xmm4
	PCMPEQ %xmm0, %xmm6
	movaps %xmm6, %xmm7
	por %xmm4, %xmm6
# ifndef USE_AS_WCSRCHR
	pxor %xmm8, %xmm8
	PCMPEQ %xmm5, %xmm8
# endif
	pmovmskb %xmm8, %ecx
	pmovmskb %xmm6, %eax
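	/* %ecx: mask of positions where either vector has a null CHAR.
	   %eax: mask of positions where either vector matches CHAR.
	   %xmm9 keeps the raw first vector; %xmm4/%xmm7 keep the
	   per-vector CHAR comparisons for L(new_match) and
	   L(second_loop_match).  */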

	addq $(VEC_SIZE * 2), %rdi
	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
	   macro-fuse with `jz`.  */
	addl %ecx, %eax
	jz L(first_loop)

	/* Check if there is a zero (null terminator) match.  */
	testl %ecx, %ecx
	jz L(second_loop_match)

	/* Check if there was a CHAR match in the last iteration.  */
	subl %ecx, %eax
	jnz L(new_match)

L(first_loop_old_match):
	PCMPEQ %xmm0, %xmm2
	PCMPEQ %xmm0, %xmm3
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	addl %eax, %ecx
	jz L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save a
	   bit of code size / performance on the fall through.  The
	   branch leads to the null case, which generally seems hotter
	   than finding CHAR in the first 3x VEC.  */
	sall $16, %eax
	orl %ecx, %eax

	bsrl %eax, %eax
	addq %rsi, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(new_match):
	pxor %xmm6, %xmm6
	PCMPEQ %xmm9, %xmm6
	pmovmskb %xmm6, %eax
	sall $16, %ecx
	orl %eax, %ecx
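	/* %ecx now has the exact nulls of the first vector in bits
	   0..15 and the combined null mask in bits 16..31.  If the
	   first vector contains a null, the low bits locate it; if it
	   does not, the high bits are exactly the second vector's
	   nulls.  Either way the lowest set bit of %ecx is the first
	   null in this 2x VEC window.  */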

	/* We can't reuse either of the old comparisons: since we mask
	   off zeros after the first zero (instead of using the full
	   comparison) we can't guarantee there is no interference
	   between a match after the end of the string and a valid
	   match.  */
	pmovmskb %xmm4, %eax
	pmovmskb %xmm7, %edx
	sall $16, %edx
	orl %edx, %eax

	leal -1(%ecx), %edx
	xorl %edx, %ecx
	andl %ecx, %eax
	jz L(first_loop_old_match)
	bsrl %eax, %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	ret

	/* Save the minimum state needed to recover the most recent
	   match; all previous work can be thrown out.  */
	.p2align 4
L(second_loop_match):
	movq %rdi, %rsi
	movaps %xmm4, %xmm2
	movaps %xmm7, %xmm3
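	/* %rsi: base address of the two vectors holding the most
	   recent CHAR match.  %xmm2/%xmm3: their CHAR comparisons.  */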

	.p2align 4
L(second_loop):
	movaps (VEC_SIZE * 2)(%rdi), %xmm4
	movaps (VEC_SIZE * 3)(%rdi), %xmm5
	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
	   detecting the zero CHAR.  Note: if this is found to be a
	   bottleneck it may be worth adding an SSE4.1 wcsrchr
	   implementation.  */
# ifdef USE_AS_WCSRCHR
	movaps %xmm5, %xmm6
	pxor %xmm8, %xmm8

	PCMPEQ %xmm8, %xmm5
	PCMPEQ %xmm4, %xmm8
	por %xmm5, %xmm8
# else
	movaps %xmm5, %xmm6
	PMINU %xmm4, %xmm5
# endif

	movaps %xmm4, %xmm9
	PCMPEQ %xmm0, %xmm4
	PCMPEQ %xmm0, %xmm6
	movaps %xmm6, %xmm7
	por %xmm4, %xmm6
# ifndef USE_AS_WCSRCHR
	pxor %xmm8, %xmm8
	PCMPEQ %xmm5, %xmm8
# endif

	pmovmskb %xmm8, %ecx
	pmovmskb %xmm6, %eax

	addq $(VEC_SIZE * 2), %rdi
	/* Either null term or new occurrence of CHAR.  */
	addl %ecx, %eax
	jz L(second_loop)

	/* No null term so it must be a new occurrence of CHAR.  */
	testl %ecx, %ecx
	jz L(second_loop_match)

	subl %ecx, %eax
	jnz L(second_loop_new_match)

L(second_loop_old_match):
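	/* Recover the last CHAR match from the saved comparisons:
	   %xmm2 covers bits 0..15 and %xmm3 bits 16..31, both relative
	   to %rsi.  */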
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	sall $16, %eax
	orl %ecx, %eax
	bsrl %eax, %eax
	addq %rsi, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(second_loop_new_match):
	pxor %xmm6, %xmm6
	PCMPEQ %xmm9, %xmm6
	pmovmskb %xmm6, %eax
	sall $16, %ecx
	orl %eax, %ecx

	/* We can't reuse either of the old comparisons: since we mask
	   off zeros after the first zero (instead of using the full
	   comparison) we can't guarantee there is no interference
	   between a match after the end of the string and a valid
	   match.  */
	pmovmskb %xmm4, %eax
	pmovmskb %xmm7, %edx
	sall $16, %edx
	orl %edx, %eax

	leal -1(%ecx), %edx
	xorl %edx, %ecx
	andl %ecx, %eax
	jz L(second_loop_old_match)
	bsrl %eax, %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4,, 4
L(cross_page):
	movq %rdi, %rsi
	andq $-VEC_SIZE, %rsi
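	/* Load the aligned vector containing the start of the string
	   and shift the result masks right by the misalignment so that
	   bytes before the start of the string are ignored.  */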
	movaps (%rsi), %xmm1
	pxor %xmm2, %xmm2
	PCMPEQ %xmm1, %xmm2
	pmovmskb %xmm2, %edx
	movl %edi, %ecx
	andl $(VEC_SIZE - 1), %ecx
	sarl %cl, %edx
	jz L(cross_page_continue)
	PCMPEQ %xmm0, %xmm1
	pmovmskb %xmm1, %eax
	sarl %cl, %eax
	leal -1(%rdx), %ecx
	xorl %edx, %ecx
	andl %ecx, %eax
	jz L(ret1)
	bsrl %eax, %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq $-CHAR_SIZE, %rax
# endif
L(ret1):
	ret
END(STRRCHR)
#endif