memrchr-sse2.S source code [glibc/sysdeps/x86_64/multiarch/memrchr-sse2.S]

1	/ memrchr optimized with SSE2.*
2	Copyright (C) 2017-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <isa-level.h>
20
21	/ MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation*
22	so we need this to build for ISA V2 builds. /*
23	#if ISA_SHOULD_BUILD (2)
24
25	# ifndef MEMRCHR
26	# define MEMRCHR __memrchr_sse2
27	# endif
28
29	# include <sysdep.h>
30	# define VEC_SIZE 16
31	# define PAGE_SIZE 4096
32
33	.text
34	ENTRY_P2ALIGN(MEMRCHR, `6`)
35	# ifdef __ILP32__
36	/ Clear upper bits. /
37	mov %RDX_LP, %RDX_LP
38	# endif
39	movd %esi, %xmm0
40
41	/ Get end pointer. /
42	leaq (%rdx, %rdi), %rcx
43
44	punpcklbw %xmm0, %xmm0
45	punpcklwd %xmm0, %xmm0
46	pshufd $`0`, %xmm0, %xmm0
47
48	/ Check if we can load 1x VEC without cross a page. /
49	testl $(PAGE_SIZE - VEC_SIZE), %ecx
50	jz L(page_cross)
51
52	/ NB: This load happens regardless of whether rdx (len) is zero. Since*
53	it doesn't cross a page and the standard guarantees any pointer have
54	at least one-valid byte this load must be safe. For the entire
55	history of the x86 memrchr implementation this has been possible so
56	no code "should" be relying on a zero-length check before this load.
57	The zero-length check is moved to the page cross case because it is
58	1) pretty cold and including it pushes the hot case len <= VEC_SIZE
59	into 2-cache lines. /*
60	movups -(VEC_SIZE)(%rcx), %xmm1
61	pcmpeqb %xmm0, %xmm1
62	pmovmskb %xmm1, %eax
63
64	subq $VEC_SIZE, %rdx
65	ja L(more_1x_vec)
66	L(ret_vec_x0_test):
67	/ Zero-flag set if eax (src) is zero. Destination unchanged if src is*
68	zero. /*
69	bsrl %eax, %eax
70	jz L(ret_0)
71	/ Check if the CHAR match is in bounds. Need to truly zero `eax` here*
72	if out of bounds. /*
73	addl %edx, %eax
74	jl L(zero_0)
75	/ Since we subtracted VEC_SIZE from rdx earlier we can just add to base*
76	ptr. /*
77	addq %rdi, %rax
78	L(ret_0):
79	ret
80
81	.p2align `4`,, `5`
82	L(ret_vec_x0):
83	bsrl %eax, %eax
84	leaq -(VEC_SIZE)(%rcx, %rax), %rax
85	ret
86
87	.p2align `4`,, `2`
88	L(zero_0):
89	xorl %eax, %eax
90	ret
91
92
93	.p2align `4`,, `8`
94	L(more_1x_vec):
95	testl %eax, %eax
96	jnz L(ret_vec_x0)
97
98	/ Align rcx (pointer to string). /
99	decq %rcx
100	andq $-VEC_SIZE, %rcx
101
102	movq %rcx, %rdx
103	/ NB: We could consistenyl save 1-byte in this pattern with `movaps*
104	%xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
105	it adds more frontend uops (even if the moves can be eliminated) and
106	some percentage of the time actual backend uops. /*
107	movaps -(VEC_SIZE)(%rcx), %xmm1
108	pcmpeqb %xmm0, %xmm1
109	subq %rdi, %rdx
110	pmovmskb %xmm1, %eax
111
112	cmpq $(VEC_SIZE * `2`), %rdx
113	ja L(more_2x_vec)
114	L(last_2x_vec):
115	subl $VEC_SIZE, %edx
116	jbe L(ret_vec_x0_test)
117
118	testl %eax, %eax
119	jnz L(ret_vec_x0)
120
121	movaps -(VEC_SIZE * `2`)(%rcx), %xmm1
122	pcmpeqb %xmm0, %xmm1
123	pmovmskb %xmm1, %eax
124
125	subl $VEC_SIZE, %edx
126	bsrl %eax, %eax
127	jz L(ret_1)
128	addl %edx, %eax
129	jl L(zero_0)
130	addq %rdi, %rax
131	L(ret_1):
132	ret
133
134	/ Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)*
135	causes the hot pause (length <= VEC_SIZE) to span multiple cache
136	lines. Naturally aligned % 16 to 8-bytes. /*
137	L(page_cross):
138	/ Zero length check. /
139	testq %rdx, %rdx
140	jz L(zero_0)
141
142	leaq -`1`(%rcx), %r8
143	andq $-(VEC_SIZE), %r8
144
145	movaps (%r8), %xmm1
146	pcmpeqb %xmm0, %xmm1
147	pmovmskb %xmm1, %esi
148	/ Shift out negative alignment (because we are starting from endptr and*
149	working backwards). /*
150	negl %ecx
151	/ 32-bit shift but VEC_SIZE=16 so need to mask the shift count*
152	explicitly. /*
153	andl $(VEC_SIZE - `1`), %ecx
154	shl %cl, %esi
155	movzwl %si, %eax
156	leaq (%rdi, %rdx), %rcx
157	cmpq %rdi, %r8
158	ja L(more_1x_vec)
159	subl $VEC_SIZE, %edx
160	bsrl %eax, %eax
161	jz L(ret_2)
162	addl %edx, %eax
163	jl L(zero_1)
164	addq %rdi, %rax
165	L(ret_2):
166	ret
167
168	/ Fits in aliging bytes. /
169	L(zero_1):
170	xorl %eax, %eax
171	ret
172
173	.p2align `4`,, `5`
174	L(ret_vec_x1):
175	bsrl %eax, %eax
176	leaq -(VEC_SIZE * `2`)(%rcx, %rax), %rax
177	ret
178
179	.p2align `4`,, `8`
180	L(more_2x_vec):
181	testl %eax, %eax
182	jnz L(ret_vec_x0)
183
184	movaps -(VEC_SIZE * `2`)(%rcx), %xmm1
185	pcmpeqb %xmm0, %xmm1
186	pmovmskb %xmm1, %eax
187	testl %eax, %eax
188	jnz L(ret_vec_x1)
189
190
191	movaps -(VEC_SIZE * `3`)(%rcx), %xmm1
192	pcmpeqb %xmm0, %xmm1
193	pmovmskb %xmm1, %eax
194
195	subq $(VEC_SIZE * `4`), %rdx
196	ja L(more_4x_vec)
197
198	addl $(VEC_SIZE), %edx
199	jle L(ret_vec_x2_test)
200
201	L(last_vec):
202	testl %eax, %eax
203	jnz L(ret_vec_x2)
204
205	movaps -(VEC_SIZE * `4`)(%rcx), %xmm1
206	pcmpeqb %xmm0, %xmm1
207	pmovmskb %xmm1, %eax
208
209	subl $(VEC_SIZE), %edx
210	bsrl %eax, %eax
211	jz L(ret_3)
212	addl %edx, %eax
213	jl L(zero_2)
214	addq %rdi, %rax
215	L(ret_3):
216	ret
217
218	.p2align `4`,, `6`
219	L(ret_vec_x2_test):
220	bsrl %eax, %eax
221	jz L(zero_2)
222	addl %edx, %eax
223	jl L(zero_2)
224	addq %rdi, %rax
225	ret
226
227	L(zero_2):
228	xorl %eax, %eax
229	ret
230
231
232	.p2align `4`,, `5`
233	L(ret_vec_x2):
234	bsrl %eax, %eax
235	leaq -(VEC_SIZE * `3`)(%rcx, %rax), %rax
236	ret
237
238	.p2align `4`,, `5`
239	L(ret_vec_x3):
240	bsrl %eax, %eax
241	leaq -(VEC_SIZE * `4`)(%rcx, %rax), %rax
242	ret
243
244	.p2align `4`,, `8`
245	L(more_4x_vec):
246	testl %eax, %eax
247	jnz L(ret_vec_x2)
248
249	movaps -(VEC_SIZE * `4`)(%rcx), %xmm1
250	pcmpeqb %xmm0, %xmm1
251	pmovmskb %xmm1, %eax
252
253	testl %eax, %eax
254	jnz L(ret_vec_x3)
255
256	addq $-(VEC_SIZE * `4`), %rcx
257	cmpq $(VEC_SIZE * `4`), %rdx
258	jbe L(last_4x_vec)
259
260	/ Offset everything by 4x VEC_SIZE here to save a few bytes at the end*
261	keeping the code from spilling to the next cache line. /*
262	addq $(VEC_SIZE * `4` - `1`), %rcx
263	andq $-(VEC_SIZE * `4`), %rcx
264	leaq (VEC_SIZE * `4`)(%rdi), %rdx
265	andq $-(VEC_SIZE * `4`), %rdx
266
267	.p2align `4`,, `11`
268	L(loop_4x_vec):
269	movaps (VEC_SIZE * -`1`)(%rcx), %xmm1
270	movaps (VEC_SIZE * -`2`)(%rcx), %xmm2
271	movaps (VEC_SIZE * -`3`)(%rcx), %xmm3
272	movaps (VEC_SIZE * -`4`)(%rcx), %xmm4
273	pcmpeqb %xmm0, %xmm1
274	pcmpeqb %xmm0, %xmm2
275	pcmpeqb %xmm0, %xmm3
276	pcmpeqb %xmm0, %xmm4
277
278	por %xmm1, %xmm2
279	por %xmm3, %xmm4
280	por %xmm2, %xmm4
281
282	pmovmskb %xmm4, %esi
283	testl %esi, %esi
284	jnz L(loop_end)
285
286	addq $-(VEC_SIZE * `4`), %rcx
287	cmpq %rdx, %rcx
288	jne L(loop_4x_vec)
289
290	subl %edi, %edx
291
292	/ Ends up being 1-byte nop. /
293	.p2align `4`,, `2`
294	L(last_4x_vec):
295	movaps -(VEC_SIZE)(%rcx), %xmm1
296	pcmpeqb %xmm0, %xmm1
297	pmovmskb %xmm1, %eax
298
299	cmpl $(VEC_SIZE * `2`), %edx
300	jbe L(last_2x_vec)
301
302	testl %eax, %eax
303	jnz L(ret_vec_x0)
304
305
306	movaps -(VEC_SIZE * `2`)(%rcx), %xmm1
307	pcmpeqb %xmm0, %xmm1
308	pmovmskb %xmm1, %eax
309
310	testl %eax, %eax
311	jnz L(ret_vec_end)
312
313	movaps -(VEC_SIZE * `3`)(%rcx), %xmm1
314	pcmpeqb %xmm0, %xmm1
315	pmovmskb %xmm1, %eax
316
317	subl $(VEC_SIZE * `3`), %edx
318	ja L(last_vec)
319	bsrl %eax, %eax
320	jz L(ret_4)
321	addl %edx, %eax
322	jl L(zero_3)
323	addq %rdi, %rax
324	L(ret_4):
325	ret
326
327	/ Ends up being 1-byte nop. /
328	.p2align `4`,, `3`
329	L(loop_end):
330	pmovmskb %xmm1, %eax
331	sall $`16`, %eax
332	jnz L(ret_vec_end)
333
334	pmovmskb %xmm2, %eax
335	testl %eax, %eax
336	jnz L(ret_vec_end)
337
338	pmovmskb %xmm3, %eax
339	/ Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)*
340	then it won't affect the result in esi (VEC4). If ecx is non-zero
341	then CHAR in VEC3 and bsrq will use that position. /*
342	sall $`16`, %eax
343	orl %esi, %eax
344	bsrl %eax, %eax
345	leaq -(VEC_SIZE * `4`)(%rcx, %rax), %rax
346	ret
347
348	L(ret_vec_end):
349	bsrl %eax, %eax
350	leaq (VEC_SIZE * -`2`)(%rax, %rcx), %rax
351	ret
352	/ Use in L(last_4x_vec). In the same cache line. This is just a spare*
353	aligning bytes. /*
354	L(zero_3):
355	xorl %eax, %eax
356	ret
357	/ 2-bytes from next cache line. /
358	END(MEMRCHR)
359	#endif
360

source code of glibc/sysdeps/x86_64/multiarch/memrchr-sse2.S