/* memcmp with SSE2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

#include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ pcmpeqd
#  define CHAR_SIZE 4
#  define SIZE_OFFSET (0)
# else
#  define PCMPEQ pcmpeqb
#  define CHAR_SIZE 1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET (0)
#  define CHECK_CMP(x, y) subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET (CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y) cmpl x, y
# endif

# define VEC_SIZE 16
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP memcmp
# endif
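
/* Rough outline of the strategy below (a sketch, not a contract):
     n <= CHAR_PER_VEC:      scalar 1/2/4/8-byte loads.
     n <= 2 * CHAR_PER_VEC:  one vector from the start, one from the end.
     n <= 8 * CHAR_PER_VEC:  2x-vector blocks, then the last two vectors
                             taken from the end.
     otherwise:              align rdi down and loop 4x vectors at a
                             time, finishing via L(last_2x_vec).
   For plain memcmp, SIZE_OFFSET (CHAR_PER_VEC * 2) is subtracted from
   rdx once n > CHAR_PER_VEC; later length checks and end-of-buffer
   addressing are written against the biased value so their immediates
   and displacements stay small (e.g. the 8x bound becomes
   CHAR_PER_VEC * 8 - SIZE_OFFSET = 96, which still fits in a
   sign-extended 8-bit immediate).  For memcmpeq and wmemcmp
   SIZE_OFFSET is 0 and rdx is left unbiased.  */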

        .text
ENTRY(MEMCMP)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
        /* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
           in ecx for code size.  This is preferable to using `incw` as
           it avoids partial register stalls on older hardware (pre
           SnB).  */
        movl $0xffff, %ecx
# endif
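        /* Throughout: PCMPEQ sets a byte to 0xff where the two vectors
           match, pmovmskb collects those bytes' sign bits into a 16-bit
           mask, and `subl %ecx, %eax` (ecx = 0xffff) is zero iff all
           bytes matched.  Since x and -x share their lowest set bit,
           `bsf` of the difference still yields the index of the first
           mismatching byte.  Roughly, in C with SSE2 intrinsics (a
           sketch, not the exact code generated here):
               int mask = _mm_movemask_epi8 (
                   _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) s1),
                                   _mm_loadu_si128 ((const __m128i *) s2)));
               if (mask != 0xffff)
                 first_diff = __builtin_ctz (mask ^ 0xffff);  */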
        cmpq $CHAR_PER_VEC, %rdx
        ja L(more_1x_vec)

# ifdef USE_AS_WMEMCMP
        /* Saves a byte of code keeping the fall through path n = [2, 4]
           in the initial cache line.  */
        decl %edx
        jle L(cmp_0_1)

        movq (%rsi), %xmm0
        movq (%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_0)

        movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
        movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_end_0_adj)
# else
        cmpl $8, %edx
        ja L(cmp_9_16)

        cmpl $4, %edx
        jb L(cmp_0_3)

#  ifdef USE_AS_MEMCMPEQ
        movl (%rsi), %eax
        subl (%rdi), %eax

        movl -4(%rsi, %rdx), %esi
        subl -4(%rdi, %rdx), %esi

        orl %esi, %eax
        ret
#  else
        /* Combine comparisons for lo and hi 4-byte comparisons.  */
        movl -4(%rsi, %rdx), %ecx
        movl -4(%rdi, %rdx), %eax
        shlq $32, %rcx
        shlq $32, %rax
        movl (%rsi), %esi
        movl (%rdi), %edi
        orq %rsi, %rcx
        orq %rdi, %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)
        xorl %eax, %eax
        ret
#  endif

        .p2align 4,, 10
L(cmp_9_16):
#  ifdef USE_AS_MEMCMPEQ
        movq (%rsi), %rax
        subq (%rdi), %rax

        movq -8(%rsi, %rdx), %rcx
        subq -8(%rdi, %rdx), %rcx
        orq %rcx, %rax
        /* Convert 64 bit -> 32 bit boolean (we should have made the ABI
           return long).  */
        setnz %cl
        movzbl %cl, %eax
#  else
        movq (%rsi), %rcx
        movq (%rdi), %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)

        movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
        movq -8(%rdi, %rdx, CHAR_SIZE), %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)
        xorl %eax, %eax
#  endif
# endif
        ret

        .p2align 4,, 8
L(cmp_0_1):
        /* Flag set by earlier comparison against 1.  */
        jne L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
        movl (%rdi), %ecx
        xorl %edx, %edx
        cmpl (%rsi), %ecx
        je L(cmp_0_0)
        setg %dl
        leal -1(%rdx, %rdx), %eax
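        /* Return idiom used for all the wide-character paths below:
           with edx pre-zeroed, `setg %dl` leaves edx = 1 when the
           wchar_t from s1 compares (signed) greater and 0 otherwise,
           and `leal -1(%rdx, %rdx), %eax` computes 2 * edx - 1, i.e.
           +1 or -1, without a branch.  */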
# else
        movzbl (%rdi), %eax
        movzbl (%rsi), %ecx
        subl %ecx, %eax
# endif
        ret

        /* Fits in aligning bytes.  */
L(cmp_0_0):
        xorl %eax, %eax
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4
L(ret_nonzero_vec_start_0):
        bsfl %eax, %eax
        movl (%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
        ret
# else

#  ifndef USE_AS_MEMCMPEQ
        .p2align 4,, 14
L(ret_nonzero):
        /* Need to bswap to get proper return without branch.  */
        bswapq %rcx
        bswapq %rax
        subq %rcx, %rax
        sbbl %eax, %eax
        orl $1, %eax
        ret
#  endif
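        /* Why bswap works above: after the byte swap the byte at the
           lowest address is the most significant, so an unsigned
           compare of the swapped words orders the buffers the way
           memcmp must.  `subq; sbbl %eax, %eax' leaves eax = -1 when
           the first buffer is below and 0 otherwise, and `orl $1'
           turns that into -1 / +1.  Roughly (a C sketch, assuming
           bswap64 reverses the byte order):
               return bswap64 (a) < bswap64 (b) ? -1 : 1;  */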

        .p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
        /* No reason to add to dependency chain on rdx.  Saving the
           bytes here doesn't change the number of fetch blocks.  */
        cmpl $1, %edx
        jbe L(cmp_0_1)
#  else
        /* We need the smaller code size here to avoid taking an extra
           fetch block.  */
        decl %edx
        jle L(cmp_0_1)
#  endif
        movzwl (%rsi), %ecx
        movzwl (%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
        subl %ecx, %eax

        movzbl -1(%rsi, %rdx), %esi
        movzbl -1(%rdi, %rdx), %edi
        subl %edi, %esi
        orl %esi, %eax
#  else
        bswapl %ecx
        bswapl %eax

        /* Implicit right shift by one.  We just need to displace the
           sign bits.  */
        shrl %ecx
        shrl %eax

        /* Eat a partial register stall here.  Saves code stopping
           L(cmp_0_3) from bleeding into the next fetch block and
           saves an ALU.  */
        movb (%rsi, %rdx), %cl
        movzbl (%rdi, %rdx), %edi
        orl %edi, %eax
        subl %ecx, %eax
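        /* What the block above computes, roughly (a sketch; n is the
           original length in [2, 3] and rdx = n - 1, with load16 and
           bswap32 as hypothetical helpers):
               key (p) = (bswap32 ((uint32_t) load16 (p)) >> 1)
                         | p[n - 1];
           i.e. a 31-bit key that weights byte 0 most, then byte 1,
           then the last byte, so `key (s1) - key (s2)' has the sign
           memcmp requires and cannot overflow.  */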
#  endif
        ret
# endif

        .p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
        /* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
           in ecx for code size.  This is preferable to using `incw` as
           it avoids partial register stalls on older hardware (pre
           SnB).  */
        movl $0xffff, %ecx
# endif
        movups (%rsi), %xmm0
        movups (%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
        cmpq $(CHAR_PER_VEC * 2), %rdx
# else
        /* Offset rdx.  Saves just enough code size to keep the
           L(last_2x_vec) case and the non-zero return in a single
           cache line.  */
        subq $(CHAR_PER_VEC * 2), %rdx
# endif
        ja L(more_2x_vec)

        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
# ifndef USE_AS_MEMCMPEQ
        /* Don't use `incw ax` as machines this code runs on are liable
           to have a partial register stall.  */
        jnz L(ret_nonzero_vec_end_0)
# else
        /* Various return targets for memcmpeq.  Will always be hot in
           Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
        ret

# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
        .p2align 4
L(ret_nonzero_vec_end_0_adj):
        addl $3, %edx
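        /* The +3 compensates for the earlier `decl %edx` and for the
           -4 displacement used by the small-size wmemcmp loads, so the
           shared L(ret_nonzero_vec_end_0) code below addresses the
           same bytes that were just compared.  */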
#  else
        .p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        leal (%rax, %rdx, CHAR_SIZE), %eax
        movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        /* Use `addq` instead of `addl` here so that even if `rax` +
           `rdx` is negative, the sum is still usable as a 64-bit
           offset (a negative 32-bit number zero-extends to a large,
           often out-of-bounds 64-bit offset).  Note that `rax` +
           `rdx` >= 0 is an invariant when `memcmp` is used correctly,
           but if the input buffers `rsi`/`rdi` are concurrently
           modified while the function runs (a data race) it is
           possible for `rax` + `rdx` to be negative.  Given that
           there is virtually no extra cost to using `addq` instead of
           `addl`, we may as well protect the data-race case.  */
        addq %rdx, %rax
        movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
#  ifndef USE_AS_WMEMCMP
        .p2align 4,, 10
L(ret_nonzero_vec_start_0):
        bsfl %eax, %eax
        movzbl (%rsi, %rax), %ecx
        movzbl (%rdi, %rax), %eax
        subl %ecx, %eax
        ret
#  endif
# else
# endif

        .p2align 5
L(more_2x_vec):
        movups (VEC_SIZE * 1)(%rsi), %xmm0
        movups (VEC_SIZE * 1)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_1)

        cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
        jbe L(last_2x_vec)

        cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
        ja L(more_8x_vec)

        /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
           This can harm performance if the non-zero return is in
           [65, 80] or [97, 112], but it helps performance otherwise.
           Generally the zero return is hotter.  */
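        /* The two PCMPEQ results are AND-ed together so that a single
           pmovmskb / CHECK_CMP tests all 32 bytes at once; xmm1 (the
           first vector's result) is kept live so the return path can
           still tell which of the two vectors held the mismatch.  */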
        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 2)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 3)(%rsi), %xmm2
        movups (VEC_SIZE * 3)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
        jnz L(ret_nonzero_vec_start_2_3)

        cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
        jbe L(last_2x_vec)

        movups (VEC_SIZE * 4)(%rsi), %xmm0
        movups (VEC_SIZE * 4)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 5)(%rsi), %xmm2
        movups (VEC_SIZE * 5)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
        jz L(last_2x_vec)
        ret
# else
        jnz L(ret_nonzero_vec_start_4_5)
# endif
        .p2align 4
L(last_2x_vec):
        movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
        movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        subl %ecx, %eax
# ifdef USE_AS_MEMCMPEQ
        /* Various return targets for memcmpeq.  Will always be hot in
           Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
        ret
# else
        jnz L(ret_nonzero_vec_end_1)
        ret

        .p2align 4,, 8
L(ret_nonzero_vec_end_1):
        pmovmskb %xmm1, %ecx
        /* High 16 bits of eax guaranteed to be all ones.  Rotate them
           in so we can do `or + not` with just `xor`.  After the xor
           the low 16 bits hold the first vector's mismatch bits and
           the high 16 bits locate the second vector's first mismatch,
           so a single `bsf` finds the earliest mismatching byte of
           either vector.  */
        rorl $16, %eax
        xorl %ecx, %eax
        /* Partial register stall.  */

        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        leal (%rax, %rdx, CHAR_SIZE), %eax
        movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        addl %edx, %eax
        movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4
L(ret_nonzero_vec_start_4_5):
        pmovmskb %xmm1, %edx
        sall $16, %eax
        leal 1(%rax, %rdx), %eax
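        /* eax = (combined match mask of VEC(4) and VEC(5)) << 16
                 + (match mask of VEC(4)) + 1.  The `+ 1' carries
           through any trailing ones, so the lowest set bit of eax is
           the byte index (relative to VEC(4)) of the first mismatch,
           whether it lies in VEC(4) or VEC(5).  */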
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4,, 8
L(ret_nonzero_vec_start_1):
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
# endif

        .p2align 4
L(more_8x_vec):
        subq %rdi, %rsi
        leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
        andq $(VEC_SIZE * -1), %rdi
        addq %rdi, %rsi
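        /* rsi is first rebased to the distance (s2 - s1) and then
           re-added to the aligned rdi, so it moves back by exactly the
           same amount as rdi when rdi is rounded down to a VEC_SIZE
           boundary; the loop keeps comparing corresponding bytes
           (possibly re-checking a few that were already compared).
           rdx is set to (end of s1) - 6 * VEC_SIZE, so the loop, which
           handles the 4 vectors at offsets 2..5 from rdi, can stop on
           a plain `cmpq %rdi, %rdx' once at most 4 vectors remain past
           rdi + 2 * VEC_SIZE.  */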
        .p2align 4
L(loop_4x):
        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 3)(%rsi), %xmm1

        PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
        PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1

        movups (VEC_SIZE * 4)(%rsi), %xmm2
        movups (VEC_SIZE * 5)(%rsi), %xmm3

        PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
        PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3

        pand %xmm0, %xmm1
        pand %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_loop)

        addq $(VEC_SIZE * 4), %rdi
        addq $(VEC_SIZE * 4), %rsi
        cmpq %rdi, %rdx
        ja L(loop_4x)
        /* Get remaining length in edx.  */
        subl %edi, %edx
        /* Restore offset so we can reuse L(last_2x_vec).  */
        addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
        shrl $2, %edx
# endif
        cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
        jbe L(last_2x_vec)

        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 2)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 3)(%rsi), %xmm2
        movups (VEC_SIZE * 3)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
        jz L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
        ret
# else

        .p2align 4
L(ret_nonzero_vec_start_2_3):
        pmovmskb %xmm1, %edx
        sall $16, %eax
        leal 1(%rax, %rdx), %eax
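        /* Same mask-combining trick as in L(ret_nonzero_vec_start_4_5)
           above, here for the vectors at offsets 2 and 3.  */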

        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4
L(ret_nonzero_loop):
        pmovmskb %xmm0, %ecx
        pmovmskb %xmm1, %edx
        sall $(VEC_SIZE * 1), %edx
        leal 1(%rcx, %rdx), %edx
        pmovmskb %xmm2, %ecx
        /* High 16 bits of eax guaranteed to be all ones.  Rotate them
           in so we can do `or + not` with just `xor`.  */
        rorl $16, %eax
        xorl %ecx, %eax

        salq $32, %rax
        orq %rdx, %rax
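        /* rax is now a 64-bit mask covering all four vectors of the
           loop iteration: bits [0, 31] come from the vectors at
           offsets 2 and 3 (with the `+ 1' carry trick), bits [32, 47]
           are the mismatch bits of the vector at offset 4, and bits
           [48, 63] locate the first mismatch of the vector at offset
           5, so one `bsfq' yields the byte offset of the earliest
           mismatch relative to VEC_SIZE * 2.  */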

        bsfq %rax, %rax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
# endif
END(MEMCMP)
#endif
