1/* memrchr optimized with SSE2.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
22 so we need this to build for ISA V2 builds. */
23#if ISA_SHOULD_BUILD (2)
24
25# ifndef MEMRCHR
26# define MEMRCHR __memrchr_sse2
27# endif
28
29# include <sysdep.h>
30# define VEC_SIZE 16
31# define PAGE_SIZE 4096
32
33 .text
34ENTRY_P2ALIGN(MEMRCHR, 6)
35# ifdef __ILP32__
36 /* Clear upper bits. */
37 mov %RDX_LP, %RDX_LP
38# endif
39 movd %esi, %xmm0
40
41 /* Get end pointer. */
42 leaq (%rdx, %rdi), %rcx
43
44 punpcklbw %xmm0, %xmm0
45 punpcklwd %xmm0, %xmm0
46 pshufd $0, %xmm0, %xmm0
47
48 /* Check if we can load 1x VEC without cross a page. */
49 testl $(PAGE_SIZE - VEC_SIZE), %ecx
50 jz L(page_cross)
51
52 /* NB: This load happens regardless of whether rdx (len) is zero. Since
53 it doesn't cross a page and the standard guarantees any pointer have
54 at least one-valid byte this load must be safe. For the entire
55 history of the x86 memrchr implementation this has been possible so
56 no code "should" be relying on a zero-length check before this load.
57 The zero-length check is moved to the page cross case because it is
58 1) pretty cold and including it pushes the hot case len <= VEC_SIZE
59 into 2-cache lines. */
60 movups -(VEC_SIZE)(%rcx), %xmm1
61 pcmpeqb %xmm0, %xmm1
62 pmovmskb %xmm1, %eax
63
64 subq $VEC_SIZE, %rdx
65 ja L(more_1x_vec)
66L(ret_vec_x0_test):
67 /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
68 zero. */
69 bsrl %eax, %eax
70 jz L(ret_0)
71 /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
72 if out of bounds. */
73 addl %edx, %eax
74 jl L(zero_0)
75 /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
76 ptr. */
77 addq %rdi, %rax
78L(ret_0):
79 ret
80
81 .p2align 4,, 5
82L(ret_vec_x0):
83 bsrl %eax, %eax
84 leaq -(VEC_SIZE)(%rcx, %rax), %rax
85 ret
86
87 .p2align 4,, 2
88L(zero_0):
89 xorl %eax, %eax
90 ret
91
92
93 .p2align 4,, 8
94L(more_1x_vec):
95 testl %eax, %eax
96 jnz L(ret_vec_x0)
97
98 /* Align rcx (pointer to string). */
99 decq %rcx
100 andq $-VEC_SIZE, %rcx
101
102 movq %rcx, %rdx
103 /* NB: We could consistenyl save 1-byte in this pattern with `movaps
104 %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
105 it adds more frontend uops (even if the moves can be eliminated) and
106 some percentage of the time actual backend uops. */
107 movaps -(VEC_SIZE)(%rcx), %xmm1
108 pcmpeqb %xmm0, %xmm1
109 subq %rdi, %rdx
110 pmovmskb %xmm1, %eax
111
112 cmpq $(VEC_SIZE * 2), %rdx
113 ja L(more_2x_vec)
114L(last_2x_vec):
115 subl $VEC_SIZE, %edx
116 jbe L(ret_vec_x0_test)
117
118 testl %eax, %eax
119 jnz L(ret_vec_x0)
120
121 movaps -(VEC_SIZE * 2)(%rcx), %xmm1
122 pcmpeqb %xmm0, %xmm1
123 pmovmskb %xmm1, %eax
124
125 subl $VEC_SIZE, %edx
126 bsrl %eax, %eax
127 jz L(ret_1)
128 addl %edx, %eax
129 jl L(zero_0)
130 addq %rdi, %rax
131L(ret_1):
132 ret
133
134 /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
135 causes the hot pause (length <= VEC_SIZE) to span multiple cache
136 lines. Naturally aligned % 16 to 8-bytes. */
137L(page_cross):
138 /* Zero length check. */
139 testq %rdx, %rdx
140 jz L(zero_0)
141
142 leaq -1(%rcx), %r8
143 andq $-(VEC_SIZE), %r8
144
145 movaps (%r8), %xmm1
146 pcmpeqb %xmm0, %xmm1
147 pmovmskb %xmm1, %esi
148 /* Shift out negative alignment (because we are starting from endptr and
149 working backwards). */
150 negl %ecx
151 /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
152 explicitly. */
153 andl $(VEC_SIZE - 1), %ecx
154 shl %cl, %esi
155 movzwl %si, %eax
156 leaq (%rdi, %rdx), %rcx
157 cmpq %rdi, %r8
158 ja L(more_1x_vec)
159 subl $VEC_SIZE, %edx
160 bsrl %eax, %eax
161 jz L(ret_2)
162 addl %edx, %eax
163 jl L(zero_1)
164 addq %rdi, %rax
165L(ret_2):
166 ret
167
168 /* Fits in aliging bytes. */
169L(zero_1):
170 xorl %eax, %eax
171 ret
172
173 .p2align 4,, 5
174L(ret_vec_x1):
175 bsrl %eax, %eax
176 leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
177 ret
178
179 .p2align 4,, 8
180L(more_2x_vec):
181 testl %eax, %eax
182 jnz L(ret_vec_x0)
183
184 movaps -(VEC_SIZE * 2)(%rcx), %xmm1
185 pcmpeqb %xmm0, %xmm1
186 pmovmskb %xmm1, %eax
187 testl %eax, %eax
188 jnz L(ret_vec_x1)
189
190
191 movaps -(VEC_SIZE * 3)(%rcx), %xmm1
192 pcmpeqb %xmm0, %xmm1
193 pmovmskb %xmm1, %eax
194
195 subq $(VEC_SIZE * 4), %rdx
196 ja L(more_4x_vec)
197
198 addl $(VEC_SIZE), %edx
199 jle L(ret_vec_x2_test)
200
201L(last_vec):
202 testl %eax, %eax
203 jnz L(ret_vec_x2)
204
205 movaps -(VEC_SIZE * 4)(%rcx), %xmm1
206 pcmpeqb %xmm0, %xmm1
207 pmovmskb %xmm1, %eax
208
209 subl $(VEC_SIZE), %edx
210 bsrl %eax, %eax
211 jz L(ret_3)
212 addl %edx, %eax
213 jl L(zero_2)
214 addq %rdi, %rax
215L(ret_3):
216 ret
217
218 .p2align 4,, 6
219L(ret_vec_x2_test):
220 bsrl %eax, %eax
221 jz L(zero_2)
222 addl %edx, %eax
223 jl L(zero_2)
224 addq %rdi, %rax
225 ret
226
227L(zero_2):
228 xorl %eax, %eax
229 ret
230
231
232 .p2align 4,, 5
233L(ret_vec_x2):
234 bsrl %eax, %eax
235 leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
236 ret
237
238 .p2align 4,, 5
239L(ret_vec_x3):
240 bsrl %eax, %eax
241 leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
242 ret
243
244 .p2align 4,, 8
245L(more_4x_vec):
246 testl %eax, %eax
247 jnz L(ret_vec_x2)
248
249 movaps -(VEC_SIZE * 4)(%rcx), %xmm1
250 pcmpeqb %xmm0, %xmm1
251 pmovmskb %xmm1, %eax
252
253 testl %eax, %eax
254 jnz L(ret_vec_x3)
255
256 addq $-(VEC_SIZE * 4), %rcx
257 cmpq $(VEC_SIZE * 4), %rdx
258 jbe L(last_4x_vec)
259
260 /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
261 keeping the code from spilling to the next cache line. */
262 addq $(VEC_SIZE * 4 - 1), %rcx
263 andq $-(VEC_SIZE * 4), %rcx
264 leaq (VEC_SIZE * 4)(%rdi), %rdx
265 andq $-(VEC_SIZE * 4), %rdx
266
267 .p2align 4,, 11
268L(loop_4x_vec):
269 movaps (VEC_SIZE * -1)(%rcx), %xmm1
270 movaps (VEC_SIZE * -2)(%rcx), %xmm2
271 movaps (VEC_SIZE * -3)(%rcx), %xmm3
272 movaps (VEC_SIZE * -4)(%rcx), %xmm4
273 pcmpeqb %xmm0, %xmm1
274 pcmpeqb %xmm0, %xmm2
275 pcmpeqb %xmm0, %xmm3
276 pcmpeqb %xmm0, %xmm4
277
278 por %xmm1, %xmm2
279 por %xmm3, %xmm4
280 por %xmm2, %xmm4
281
282 pmovmskb %xmm4, %esi
283 testl %esi, %esi
284 jnz L(loop_end)
285
286 addq $-(VEC_SIZE * 4), %rcx
287 cmpq %rdx, %rcx
288 jne L(loop_4x_vec)
289
290 subl %edi, %edx
291
292 /* Ends up being 1-byte nop. */
293 .p2align 4,, 2
294L(last_4x_vec):
295 movaps -(VEC_SIZE)(%rcx), %xmm1
296 pcmpeqb %xmm0, %xmm1
297 pmovmskb %xmm1, %eax
298
299 cmpl $(VEC_SIZE * 2), %edx
300 jbe L(last_2x_vec)
301
302 testl %eax, %eax
303 jnz L(ret_vec_x0)
304
305
306 movaps -(VEC_SIZE * 2)(%rcx), %xmm1
307 pcmpeqb %xmm0, %xmm1
308 pmovmskb %xmm1, %eax
309
310 testl %eax, %eax
311 jnz L(ret_vec_end)
312
313 movaps -(VEC_SIZE * 3)(%rcx), %xmm1
314 pcmpeqb %xmm0, %xmm1
315 pmovmskb %xmm1, %eax
316
317 subl $(VEC_SIZE * 3), %edx
318 ja L(last_vec)
319 bsrl %eax, %eax
320 jz L(ret_4)
321 addl %edx, %eax
322 jl L(zero_3)
323 addq %rdi, %rax
324L(ret_4):
325 ret
326
327 /* Ends up being 1-byte nop. */
328 .p2align 4,, 3
329L(loop_end):
330 pmovmskb %xmm1, %eax
331 sall $16, %eax
332 jnz L(ret_vec_end)
333
334 pmovmskb %xmm2, %eax
335 testl %eax, %eax
336 jnz L(ret_vec_end)
337
338 pmovmskb %xmm3, %eax
339 /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
340 then it won't affect the result in esi (VEC4). If ecx is non-zero
341 then CHAR in VEC3 and bsrq will use that position. */
342 sall $16, %eax
343 orl %esi, %eax
344 bsrl %eax, %eax
345 leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
346 ret
347
348L(ret_vec_end):
349 bsrl %eax, %eax
350 leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
351 ret
352 /* Use in L(last_4x_vec). In the same cache line. This is just a spare
353 aligning bytes. */
354L(zero_3):
355 xorl %eax, %eax
356 ret
357 /* 2-bytes from next cache line. */
358END(MEMRCHR)
359#endif
360

source code of glibc/sysdeps/x86_64/multiarch/memrchr-sse2.S