memrchr-avx2.S source code [glibc/sysdeps/x86_64/multiarch/memrchr-avx2.S]

1	/ memrchr optimized with AVX2.*
2	Copyright (C) 2017-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <isa-level.h>
20
21	#if ISA_SHOULD_BUILD (3)
22
23	# include <sysdep.h>
24
25	# ifndef MEMRCHR
26	# define MEMRCHR __memrchr_avx2
27	# endif
28
29	# ifndef VZEROUPPER
30	# define VZEROUPPER vzeroupper
31	# endif
32
33	# ifndef SECTION
34	# define SECTION(p) p##.avx
35	# endif
36
37	# define VEC_SIZE 32
38	# define PAGE_SIZE 4096
39	.section SECTION(.text), "ax", @progbits
40	ENTRY_P2ALIGN(MEMRCHR, `6`)
41	# ifdef __ILP32__
42	/ Clear upper bits. /
43	and %RDX_LP, %RDX_LP
44	# else
45	test %RDX_LP, %RDX_LP
46	# endif
47	jz L(zero_0)
48
49	vmovd %esi, %xmm0
50	/ Get end pointer. Minus one for two reasons. 1) It is necessary for a*
51	correct page cross check and 2) it correctly sets up end ptr to be
52	subtract by lzcnt aligned. /*
53	leaq -`1`(%rdx, %rdi), %rax
54
55	vpbroadcastb %xmm0, %ymm0
56
57	/ Check if we can load 1x VEC without cross a page. /
58	testl $(PAGE_SIZE - VEC_SIZE), %eax
59	jz L(page_cross)
60
61	vpcmpeqb -(VEC_SIZE - `1`)(%rax), %ymm0, %ymm1
62	vpmovmskb %ymm1, %ecx
63	cmpq $VEC_SIZE, %rdx
64	ja L(more_1x_vec)
65
66	L(ret_vec_x0_test):
67	/ If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which*
68	will guarantee edx (len) is less than it. /*
69	lzcntl %ecx, %ecx
70
71	/ Hoist vzeroupper (not great for RTM) to save code size. This allows*
72	all logic for edx (len) <= VEC_SIZE to fit in first cache line. /*
73	COND_VZEROUPPER
74	cmpl %ecx, %edx
75	jle L(zero_0)
76	subq %rcx, %rax
77	ret
78
79	/ Fits in aligning bytes of first cache line. /
80	L(zero_0):
81	xorl %eax, %eax
82	ret
83
84	.p2align `4`,, `9`
85	L(ret_vec_x0):
86	lzcntl %ecx, %ecx
87	subq %rcx, %rax
88	L(return_vzeroupper):
89	ZERO_UPPER_VEC_REGISTERS_RETURN
90
91	.p2align `4`,, `10`
92	L(more_1x_vec):
93	testl %ecx, %ecx
94	jnz L(ret_vec_x0)
95
96	/ Align rax (string pointer). /
97	andq $-VEC_SIZE, %rax
98
99	/ Recompute remaining length after aligning. /
100	movq %rax, %rdx
101	/ Need this comparison next no matter what. /
102	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
103	subq %rdi, %rdx
104	decq %rax
105	vpmovmskb %ymm1, %ecx
106	/ Fall through for short (hotter than length). /
107	cmpq $(VEC_SIZE * `2`), %rdx
108	ja L(more_2x_vec)
109	L(last_2x_vec):
110	cmpl $VEC_SIZE, %edx
111	jbe L(ret_vec_x0_test)
112
113	testl %ecx, %ecx
114	jnz L(ret_vec_x0)
115
116	vpcmpeqb -(VEC_SIZE * `2` - `1`)(%rax), %ymm0, %ymm1
117	vpmovmskb %ymm1, %ecx
118	/ 64-bit lzcnt. This will naturally add 32 to position. /
119	lzcntq %rcx, %rcx
120	COND_VZEROUPPER
121	cmpl %ecx, %edx
122	jle L(zero_0)
123	subq %rcx, %rax
124	ret
125
126
127	/ Inexpensive place to put this regarding code size / target alignments*
128	/ ICache NLP. Necessary for 2-byte encoding of jump to page cross
129	case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
130	in first cache line. /*
131	L(page_cross):
132	movq %rax, %rsi
133	andq $-VEC_SIZE, %rsi
134	vpcmpeqb (%rsi), %ymm0, %ymm1
135	vpmovmskb %ymm1, %ecx
136	/ Shift out negative alignment (because we are starting from endptr and*
137	working backwards). /*
138	movl %eax, %r8d
139	/ notl because eax already has endptr - 1. (-x = ~(x - 1)). /
140	notl %r8d
141	shlxl %r8d, %ecx, %ecx
142	cmpq %rdi, %rsi
143	ja L(more_1x_vec)
144	lzcntl %ecx, %ecx
145	COND_VZEROUPPER
146	cmpl %ecx, %edx
147	jle L(zero_0)
148	subq %rcx, %rax
149	ret
150	.p2align `4`,, `11`
151	L(ret_vec_x1):
152	/ This will naturally add 32 to position. /
153	lzcntq %rcx, %rcx
154	subq %rcx, %rax
155	VZEROUPPER_RETURN
156	.p2align `4`,, `10`
157	L(more_2x_vec):
158	testl %ecx, %ecx
159	jnz L(ret_vec_x0)
160
161	vpcmpeqb -(VEC_SIZE * `2` - `1`)(%rax), %ymm0, %ymm1
162	vpmovmskb %ymm1, %ecx
163	testl %ecx, %ecx
164	jnz L(ret_vec_x1)
165
166
167	/ Needed no matter what. /
168	vpcmpeqb -(VEC_SIZE * `3` - `1`)(%rax), %ymm0, %ymm1
169	vpmovmskb %ymm1, %ecx
170
171	subq $(VEC_SIZE * `4`), %rdx
172	ja L(more_4x_vec)
173
174	cmpl $(VEC_SIZE * -`1`), %edx
175	jle L(ret_vec_x2_test)
176
177	L(last_vec):
178	testl %ecx, %ecx
179	jnz L(ret_vec_x2)
180
181	/ Needed no matter what. /
182	vpcmpeqb -(VEC_SIZE * `4` - `1`)(%rax), %ymm0, %ymm1
183	vpmovmskb %ymm1, %ecx
184	lzcntl %ecx, %ecx
185	subq $(VEC_SIZE * `3`), %rax
186	COND_VZEROUPPER
187	subq %rcx, %rax
188	cmpq %rax, %rdi
189	ja L(zero_2)
190	ret
191
192	/ First in aligning bytes. /
193	L(zero_2):
194	xorl %eax, %eax
195	ret
196
197	.p2align `4`,, `4`
198	L(ret_vec_x2_test):
199	lzcntl %ecx, %ecx
200	subq $(VEC_SIZE * `2`), %rax
201	COND_VZEROUPPER
202	subq %rcx, %rax
203	cmpq %rax, %rdi
204	ja L(zero_2)
205	ret
206
207
208	.p2align `4`,, `11`
209	L(ret_vec_x2):
210	/ ecx must be non-zero. /
211	bsrl %ecx, %ecx
212	leaq (VEC_SIZE * -`3` + `1`)(%rcx, %rax), %rax
213	VZEROUPPER_RETURN
214
215	.p2align `4`,, `14`
216	L(ret_vec_x3):
217	/ ecx must be non-zero. /
218	bsrl %ecx, %ecx
219	leaq (VEC_SIZE * -`4` + `1`)(%rcx, %rax), %rax
220	VZEROUPPER_RETURN
221
222
223
224	.p2align `4`
225	L(more_4x_vec):
226	testl %ecx, %ecx
227	jnz L(ret_vec_x2)
228
229	vpcmpeqb -(VEC_SIZE * `4` - `1`)(%rax), %ymm0, %ymm1
230	vpmovmskb %ymm1, %ecx
231
232	testl %ecx, %ecx
233	jnz L(ret_vec_x3)
234
235	/ Check if near end before re-aligning (otherwise might do an*
236	unnecessary loop iteration). /*
237	addq $-(VEC_SIZE * `4`), %rax
238	cmpq $(VEC_SIZE * `4`), %rdx
239	jbe L(last_4x_vec)
240
241	/ Align rax to (VEC_SIZE - 1). /
242	orq $(VEC_SIZE * `4` - `1`), %rax
243	movq %rdi, %rdx
244	/ Get endptr for loop in rdx. NB: Can't just do while rax > rdi because*
245	lengths that overflow can be valid and break the comparison. /*
246	orq $(VEC_SIZE * `4` - `1`), %rdx
247
248	.p2align `4`
249	L(loop_4x_vec):
250	/ Need this comparison next no matter what. /
251	vpcmpeqb -(VEC_SIZE * `1` - `1`)(%rax), %ymm0, %ymm1
252	vpcmpeqb -(VEC_SIZE * `2` - `1`)(%rax), %ymm0, %ymm2
253	vpcmpeqb -(VEC_SIZE * `3` - `1`)(%rax), %ymm0, %ymm3
254	vpcmpeqb -(VEC_SIZE * `4` - `1`)(%rax), %ymm0, %ymm4
255
256	vpor %ymm1, %ymm2, %ymm2
257	vpor %ymm3, %ymm4, %ymm4
258	vpor %ymm2, %ymm4, %ymm4
259	vpmovmskb %ymm4, %esi
260
261	testl %esi, %esi
262	jnz L(loop_end)
263
264	addq $(VEC_SIZE * -`4`), %rax
265	cmpq %rdx, %rax
266	jne L(loop_4x_vec)
267
268	subl %edi, %edx
269	incl %edx
270
271	L(last_4x_vec):
272	/ Used no matter what. /
273	vpcmpeqb -(VEC_SIZE * `1` - `1`)(%rax), %ymm0, %ymm1
274	vpmovmskb %ymm1, %ecx
275
276	cmpl $(VEC_SIZE * `2`), %edx
277	jbe L(last_2x_vec)
278
279	testl %ecx, %ecx
280	jnz L(ret_vec_x0_end)
281
282	vpcmpeqb -(VEC_SIZE * `2` - `1`)(%rax), %ymm0, %ymm1
283	vpmovmskb %ymm1, %ecx
284	testl %ecx, %ecx
285	jnz L(ret_vec_x1_end)
286
287	/ Used no matter what. /
288	vpcmpeqb -(VEC_SIZE * `3` - `1`)(%rax), %ymm0, %ymm1
289	vpmovmskb %ymm1, %ecx
290
291	cmpl $(VEC_SIZE * `3`), %edx
292	ja L(last_vec)
293
294	lzcntl %ecx, %ecx
295	subq $(VEC_SIZE * `2`), %rax
296	COND_VZEROUPPER
297	subq %rcx, %rax
298	cmpq %rax, %rdi
299	jbe L(ret0)
300	xorl %eax, %eax
301	L(ret0):
302	ret
303
304
305	.p2align `4`
306	L(loop_end):
307	vpmovmskb %ymm1, %ecx
308	testl %ecx, %ecx
309	jnz L(ret_vec_x0_end)
310
311	vpmovmskb %ymm2, %ecx
312	testl %ecx, %ecx
313	jnz L(ret_vec_x1_end)
314
315	vpmovmskb %ymm3, %ecx
316	/ Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)*
317	then it won't affect the result in esi (VEC4). If ecx is non-zero
318	then CHAR in VEC3 and bsrq will use that position. /*
319	salq $`32`, %rcx
320	orq %rsi, %rcx
321	bsrq %rcx, %rcx
322	leaq (VEC_SIZE * -`4` + `1`)(%rcx, %rax), %rax
323	VZEROUPPER_RETURN
324
325	.p2align `4`,, `4`
326	L(ret_vec_x1_end):
327	/ 64-bit version will automatically add 32 (VEC_SIZE). /
328	lzcntq %rcx, %rcx
329	subq %rcx, %rax
330	VZEROUPPER_RETURN
331
332	.p2align `4`,, `4`
333	L(ret_vec_x0_end):
334	lzcntl %ecx, %ecx
335	subq %rcx, %rax
336	VZEROUPPER_RETURN
337
338	/ 2 bytes until next cache line. /
339	END(MEMRCHR)
340	#endif
341

source code of glibc/sysdeps/x86_64/multiarch/memrchr-avx2.S