/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * VEC_SIZE one VEC at a
      time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
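
/* Illustrative (not compiled) C-like outline of the size dispatch
   implemented below, assuming VEC_SIZE == 32 and byte elements:

     if (size < VEC_SIZE)
       handle sizes [0, VEC_SIZE) separately (L(less_vec));
     if (first VEC mismatches)            return the difference;
     if (size <= 2 * VEC_SIZE)            return compare of last VEC;
     if (second VEC mismatches)           return the difference;
     if (size <= 4 * VEC_SIZE)            return compare of last 2 VEC;
     if (third or fourth VEC mismatches)  return the difference;
     if (size <= 8 * VEC_SIZE)            return compare of last 4 VEC;
     otherwise run the aligned 4 * VEC_SIZE loop (L(loop_4x_vec)).  */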


# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE 4
#  define VPCMPEQ vpcmpeqd
# else
#  define CHAR_SIZE 1
#  define VPCMPEQ vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
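
/* For example, with the bytes 0x01 and 0xff, memcmp compares them as
   unsigned char (1 < 255) and returns a negative value, while wmemcmp
   comparing the wchar_t values 0x00000001 and 0xffffffff compares them
   as signed int (1 > -1) and returns a positive value.  This is why
   the wmemcmp paths below use cmpl/setg instead of a plain
   subtraction.  */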

        .section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
        shl $2, %RDX_LP
# elif defined __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
        cmp $VEC_SIZE, %RDX_LP
        jb L(less_vec)

        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        vmovdqu (%rsi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1
        vpmovmskb %ymm1, %eax
        /* NB: eax must be the destination register if going to
           L(return_vec_[0,2]).  For L(return_vec_3) the destination
           register must be ecx.  */
        incl %eax
        jnz L(return_vec_0)

        cmpq $(VEC_SIZE * 2), %rdx
        jbe L(last_1x_vec)

        /* Check second VEC no matter what.  */
        vmovdqu VEC_SIZE(%rsi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        /* If all bytes in the VEC were equal, eax will be all 1s so
           incl will overflow and set the zero flag.  */
        incl %eax
        jnz L(return_vec_1)

        /* Less than 4 * VEC.  */
        cmpq $(VEC_SIZE * 4), %rdx
        jbe L(last_2x_vec)

        /* Check third and fourth VEC no matter what.  */
        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
        vpmovmskb %ymm3, %eax
        incl %eax
        jnz L(return_vec_2)
        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
        vpmovmskb %ymm4, %ecx
        incl %ecx
        jnz L(return_vec_3)

        /* Go to 4x VEC loop.  */
        cmpq $(VEC_SIZE * 8), %rdx
        ja L(more_8x_vec)

        /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
           branches.  */

        /* Load first two VEC from s2 before adjusting addresses.  */
        vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
        vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
        leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
        leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi

        /* Wait to load from s1 until the addresses have been adjusted;
           with the complex address mode the load would un-laminate from
           the compare and lose micro-fusion.  */
        VPCMPEQ (%rdi), %ymm1, %ymm1
        VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2

        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

        /* Reduce VEC1 - VEC4.  */
        vpand %ymm1, %ymm2, %ymm5
        vpand %ymm3, %ymm4, %ymm6
        vpand %ymm5, %ymm6, %ymm7
        vpmovmskb %ymm7, %ecx
        incl %ecx
        jnz L(return_vec_0_1_2_3)
        /* NB: eax must be zero to reach here.  */
        VZEROUPPER_RETURN

        .p2align 4
L(return_vec_0):
        tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl (%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (%rsi, %rax), %ecx
        /* NB: no partial register stall here because of the xorl zero
           idiom above.  */
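        /* setg/lea idiom: %dl becomes 1 if the s1 element is greater
           (signed), 0 otherwise; 2 * %rdx - 1 then yields +1 or -1
           without a branch (the elements are known to differ here).  */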
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (%rsi, %rax), %ecx
        movzbl (%rdi, %rax), %eax
        subl %ecx, %eax
# endif
L(return_vzeroupper):
        ZERO_UPPER_VEC_REGISTERS_RETURN

        .p2align 4
L(return_vec_1):
        tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl VEC_SIZE(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl VEC_SIZE(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl VEC_SIZE(%rsi, %rax), %ecx
        movzbl VEC_SIZE(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(return_vec_2):
        tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
        .p2align 5
L(8x_return_vec_0_1_2_3):
        /* Returning from L(more_8x_vec) requires restoring rsi.  */
        addq %rdi, %rsi
L(return_vec_0_1_2_3):
        vpmovmskb %ymm1, %eax
        incl %eax
        jnz L(return_vec_0)

        vpmovmskb %ymm2, %eax
        incl %eax
        jnz L(return_vec_1)

        vpmovmskb %ymm3, %eax
        incl %eax
        jnz L(return_vec_2)
L(return_vec_3):
        tzcntl %ecx, %ecx
# ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
        xorl %edx, %edx
        cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(more_8x_vec):
        /* Set end of s1 in rdx.  */
        leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
        /* rsi stores s2 - s1.  This allows the loop to only update one
           pointer.  */
        subq %rdi, %rsi
        /* Align s1 pointer.  */
        andq $-VEC_SIZE, %rdi
        /* Adjust because first 4x vec were checked already.  */
        subq $-(VEC_SIZE * 4), %rdi
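
        /* Illustrative (not compiled) C sketch of the loop below, using
           hypothetical names: diff is s2 - s1 (kept in %rsi), s1 is the
           aligned pointer in %rdi, and s1_loop_end is the bound set in
           %rdx above:

             while (s1 < s1_loop_end)
               {
                 if (any of the 4 VEC at s1 differs from s1 + diff)
                   goto 8x_return_vec_0_1_2_3;
                 s1 += 4 * VEC_SIZE;
               }  */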
        .p2align 4
L(loop_4x_vec):
        /* rsi has s2 - s1 so get correct address by adding s1 (in
           rdi).  */
        vmovdqu (%rsi, %rdi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2

        vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

        vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

        vpand %ymm1, %ymm2, %ymm5
        vpand %ymm3, %ymm4, %ymm6
        vpand %ymm5, %ymm6, %ymm7
        vpmovmskb %ymm7, %ecx
        incl %ecx
        jnz L(8x_return_vec_0_1_2_3)
        subq $-(VEC_SIZE * 4), %rdi
        /* Check if s1 pointer at end.  */
        cmpq %rdx, %rdi
        jb L(loop_4x_vec)

        subq %rdx, %rdi
        /* rdi has 4 * VEC_SIZE - remaining length.  */
        cmpl $(VEC_SIZE * 3), %edi
        jae L(8x_last_1x_vec)
        /* Load regardless of branch.  */
        vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
        cmpl $(VEC_SIZE * 2), %edi
        jae L(8x_last_2x_vec)

        /* Check last 4 VEC.  */
        vmovdqu (%rsi, %rdx), %ymm1
        VPCMPEQ (%rdx), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
        VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2

        VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3

        vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4

        vpand %ymm1, %ymm2, %ymm5
        vpand %ymm3, %ymm4, %ymm6
        vpand %ymm5, %ymm6, %ymm7
        vpmovmskb %ymm7, %ecx
        /* Restore s1 pointer to rdi.  */
        movq %rdx, %rdi
        incl %ecx
        jnz L(8x_return_vec_0_1_2_3)
        /* NB: eax must be zero to reach here.  */
        VZEROUPPER_RETURN

        /* Only entry is from L(more_8x_vec).  */
        .p2align 4
L(8x_last_2x_vec):
        /* Check second to last VEC.  rdx stores the end pointer of s1
           and ymm3 has already been loaded with the second to last VEC
           from s2.  */
        VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
        vpmovmskb %ymm3, %eax
        incl %eax
        jnz L(8x_return_vec_2)
        /* Check last VEC.  */
        .p2align 4
L(8x_last_1x_vec):
        vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
        vpmovmskb %ymm4, %eax
        incl %eax
        jnz L(8x_return_vec_3)
        VZEROUPPER_RETURN

        .p2align 4
L(last_2x_vec):
        /* Check second to last VEC.  */
        vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
        VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
        vpmovmskb %ymm1, %eax
        incl %eax
        jnz L(return_vec_1_end)
        /* Check last VEC.  */
L(last_1x_vec):
        vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
        VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
        vpmovmskb %ymm1, %eax
        incl %eax
        jnz L(return_vec_0_end)
        VZEROUPPER_RETURN

        .p2align 4
L(8x_return_vec_2):
        subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
        tzcntl %eax, %eax
        addq %rdx, %rax
# ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 3)(%rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 3)(%rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(return_vec_1_end):
        tzcntl %eax, %eax
        addl %edx, %eax
# ifdef USE_AS_WMEMCMP
        movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(return_vec_0_end):
        tzcntl %eax, %eax
        addl %edx, %eax
# ifdef USE_AS_WMEMCMP
        movl -VEC_SIZE(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl -VEC_SIZE(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl -VEC_SIZE(%rsi, %rax), %ecx
        movzbl -VEC_SIZE(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(less_vec):
        /* Check if one or less CHAR.  This is necessary for size = 0
           but is also faster for size = CHAR_SIZE.  */
        cmpl $CHAR_SIZE, %edx
        jbe L(one_or_less)

        /* Check if loading one VEC from either s1 or s2 could cause a
           page cross.  This can have false positives but is by far the
           fastest method.  */
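        /* Roughly equivalent C (illustrative only):
             if ((((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1))
                 > PAGE_SIZE - VEC_SIZE)
               goto page_cross_less_vec;
           A false positive only costs taking the slower byte/word
           path below.  */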
        movl %edi, %eax
        orl %esi, %eax
        andl $(PAGE_SIZE - 1), %eax
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        jg L(page_cross_less_vec)

        /* No page cross possible.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        incl %eax
        /* Result will be zero if s1 and s2 match within the first size
           bytes; bzhi clears the mask bits at and above size so bytes
           loaded beyond the buffers are ignored.  Otherwise the first
           set bit will be the first mismatch.  */
        bzhil %edx, %eax, %edx
        jnz L(return_vec_0)
        xorl %eax, %eax
        VZEROUPPER_RETURN

        .p2align 4
L(page_cross_less_vec):
        /* If USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
           bytes.  */
        cmpl $16, %edx
        jae L(between_16_31)
# ifndef USE_AS_WMEMCMP
        cmpl $8, %edx
        jae L(between_8_15)
        /* Fall through for [4, 7].  */
        cmpl $4, %edx
        jb L(between_2_3)

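        /* For sizes 4 to 7, build one 64-bit big-endian key per input
           from the first 4 and last 4 bytes (the two loads overlap for
           sizes below 8, which is harmless since both inputs overlap
           the same way); a single subtraction then orders the inputs.  */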
        movbe (%rdi), %eax
        movbe (%rsi), %ecx
        shlq $32, %rax
        shlq $32, %rcx
        movbe -4(%rdi, %rdx), %edi
        movbe -4(%rsi, %rdx), %esi
        orq %rdi, %rax
        orq %rsi, %rcx
        subq %rcx, %rax
        /* Fast path for return zero.  */
        jnz L(ret_nonzero)
        /* No ymm register was touched.  */
        ret

        .p2align 4
L(one_or_less):
        jb L(zero)
        movzbl (%rsi), %ecx
        movzbl (%rdi), %eax
        subl %ecx, %eax
        /* No ymm register was touched.  */
        ret

        .p2align 4,, 5
L(ret_nonzero):
        sbbl %eax, %eax
        orl $1, %eax
        /* No ymm register was touched.  */
        ret

        .p2align 4,, 2
L(zero):
        xorl %eax, %eax
        /* No ymm register was touched.  */
        ret

        .p2align 4
L(between_8_15):
        movbe (%rdi), %rax
        movbe (%rsi), %rcx
        subq %rcx, %rax
        jnz L(ret_nonzero)
        movbe -8(%rdi, %rdx), %rax
        movbe -8(%rsi, %rdx), %rcx
        subq %rcx, %rax
        /* Fast path for return zero.  */
        jnz L(ret_nonzero)
        /* No ymm register was touched.  */
        ret
# else
        /* If USE_AS_WMEMCMP fall through into the 8-15 byte case.  */
        vmovq (%rdi), %xmm1
        vmovq (%rsi), %xmm2
        VPCMPEQ %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(return_vec_0)
        /* Use overlapping loads to avoid branches.  */
        leaq -8(%rdi, %rdx), %rdi
        leaq -8(%rsi, %rdx), %rsi
        vmovq (%rdi), %xmm1
        vmovq (%rsi), %xmm2
        VPCMPEQ %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        /* Fast path for return zero.  */
        jnz L(return_vec_0)
        /* No ymm register was touched.  */
        ret
# endif

        .p2align 4,, 10
L(between_16_31):
        /* From 16 to 31 bytes.  No branch when size == 16.  */
        vmovdqu (%rsi), %xmm2
        VPCMPEQ (%rdi), %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(return_vec_0)

        /* Use overlapping loads to avoid branches.  */

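        /* E.g. for size == 20 the check above covered bytes [0, 16)
           and the check below covers bytes [4, 20); re-checking the
           12-byte overlap is cheaper than branching on the exact
           size.  */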
        vmovdqu -16(%rsi, %rdx), %xmm2
        leaq -16(%rdi, %rdx), %rdi
        leaq -16(%rsi, %rdx), %rsi
        VPCMPEQ (%rdi), %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        /* Fast path for return zero.  */
        jnz L(return_vec_0)
        /* No ymm register was touched.  */
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4,, 2
L(zero):
        xorl %eax, %eax
        ret

        .p2align 4
L(one_or_less):
        jb L(zero)
        movl (%rdi), %ecx
        xorl %edx, %edx
        cmpl (%rsi), %ecx
        je L(zero)
        setg %dl
        leal -1(%rdx, %rdx), %eax
        /* No ymm register was touched.  */
        ret
# else

        .p2align 4
L(between_2_3):
        /* Load as big endian to avoid branches.  */
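        /* Each input becomes (first_two_bytes_big_endian >> 1)
           | last_byte, e.g. for size == 3 that is
           (b0 << 23) | (b1 << 15) | b2.  The shift keeps the top bit
           clear so the final subtraction cannot wrap, and the sign of
           the result is the sign of the big-endian comparison.  For
           size == 2 the "last byte" is b1 again, which overlaps
           identically on both inputs.  */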
        movzwl (%rdi), %eax
        movzwl (%rsi), %ecx
        bswap %eax
        bswap %ecx
        shrl %eax
        shrl %ecx
        movzbl -1(%rdi, %rdx), %edi
        movzbl -1(%rsi, %rdx), %esi
        orl %edi, %eax
        orl %esi, %ecx
        /* Subtraction is okay because the upper bit is zero.  */
        subl %ecx, %eax
        /* No ymm register was touched.  */
        ret
# endif

END (MEMCMP)
#endif

