/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# ifndef STRCMP_ISA
# define STRCMP_ISA _avx2
# endif

# include "strcmp-naming.h"

# include <sysdep.h>

# if defined USE_AS_STRCASECMP_L
# include "locale-defines.h"
# endif

# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif

# define PAGE_SIZE 4096

	/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE 32

# define VMOVU vmovdqu
# define VMOVA vmovdqa

# ifdef USE_AS_WCSCMP
	/* Compare packed dwords.  */
# define VPCMPEQ vpcmpeqd
	/* Compare packed dwords and store minimum.  */
# define VPMINU vpminud
	/* 1 dword char == 4 bytes.  */
# define SIZE_OF_CHAR 4
# else
	/* Compare packed bytes.  */
# define VPCMPEQ vpcmpeqb
	/* Compare packed bytes and store minimum.  */
# define VPMINU vpminub
	/* 1 byte char == 1 byte.  */
# define SIZE_OF_CHAR 1
# endif

# ifdef USE_AS_STRNCMP
# define LOOP_REG r9d
# define LOOP_REG64 r9

# define OFFSET_REG8 r9b
# define OFFSET_REG r9d
# define OFFSET_REG64 r9
# else
# define LOOP_REG edx
# define LOOP_REG64 rdx

# define OFFSET_REG8 dl
# define OFFSET_REG edx
# define OFFSET_REG64 rdx
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# if defined USE_AS_STRNCMP
# define VEC_OFFSET 0
# else
# define VEC_OFFSET (-VEC_SIZE)
# endif

# ifdef USE_AS_STRCASECMP_L
# define BYTE_LOOP_REG OFFSET_REG
# else
# define BYTE_LOOP_REG ecx
# endif

# ifdef USE_AS_STRCASECMP_L
# ifdef USE_AS_STRNCMP
# define LOCALE_REG rcx
# define LOCALE_REG_LP RCX_LP
# else
# define LOCALE_REG rdx
# define LOCALE_REG_LP RDX_LP
# endif
# endif

# define xmmZERO xmm15
# define ymmZERO ymm15

# define LCASE_MIN_ymm %ymm10
# define LCASE_MAX_ymm %ymm11
# define CASE_ADD_ymm %ymm12

# define LCASE_MIN_xmm %xmm10
# define LCASE_MAX_xmm %xmm11
# define CASE_ADD_xmm %xmm12

	/* r11 is never used elsewhere so this is safe to maintain.  */
# define TOLOWER_BASE %r11

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

# ifdef USE_AS_STRCASECMP_L
# define REG(x, y) x ## y
# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
	vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
	vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
	vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
	vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
	vpaddb REG(%ext, 8), reg1_in, reg1_out; \
	vpaddb REG(%ext, 9), reg2_in, reg2_out

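	/* The vpaddb/vpcmpgtb/vpandn sequence in TOLOWER above is a
	   branchless ASCII to-lower applied to every byte of a vector:
	   adding 0x3f (L(lcase_min)) biases 'A'..'Z' (0x41..0x5a) into
	   0x80..0x99, the only byte range that is not signed-greater than
	   0x99 (L(lcase_max)), so vpcmpgtb yields 0x00 exactly for
	   upper-case bytes and 0xff for everything else; vpandn against
	   0x20 (L(case_add)) then selects the bias that gets added back.
	   A rough scalar model of a single byte lane, for illustration
	   only and not part of the build:

	     static unsigned char tolower_lane (unsigned char c)
	     {
	       signed char biased = (signed char) (c + 0x3f);
	       unsigned char not_upper
		 = biased > (signed char) 0x99 ? 0xff : 0x00;
	       return c + (~not_upper & 0x20);
	     }
	 */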
# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)

# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
	TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
	VPCMPEQ scratch_reg, s2_reg, reg_out

# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
	VMOVU s2_mem, reg_out; \
	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)

# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)

# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)

# else
# define TOLOWER_gpr(...)
# define TOLOWER_ymm(...)
# define TOLOWER_xmm(...)

# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
	VPCMPEQ s2_reg, s1_reg, reg_out

# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)

# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison can
   be on either packed bytes or dwords depending on USE_AS_WCSCMP.  In
   order to check the null char, the algorithm keeps the matched
   bytes/dwords, requiring two more AVX2 instructions (VPMINU and
   VPCMPEQ).  In general, the cost of comparing VEC_SIZE bytes (32
   bytes) is two VPCMPEQ and one VPMINU instructions, together with
   movdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
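
/* A rough C model of one 32-byte step of the plain strcmp/strncmp
   flavor, written with AVX2 intrinsics purely for illustration; it is
   not part of the build and the function name is ad-hoc.  The return
   value is -1 when all 32 bytes are equal and non-null (the asm keeps
   going in that case; note how adding 1 to an all-ones mask wraps to
   zero, which is what the `incl %ecx; jz ...' idiom below tests),
   otherwise it is the index of the first mismatch or null CHAR:

	#include <immintrin.h>

	static int
	cmp_one_vec (const char *s1, const char *s2)
	{
	  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
	  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
	  __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
	  __m256i nul = _mm256_cmpeq_epi8 (v1, _mm256_setzero_si256 ());
	  unsigned int keep = (unsigned int)
	    _mm256_movemask_epi8 (_mm256_andnot_si256 (nul, eq));
	  return keep == 0xffffffff ? -1 : (int) __builtin_ctz (~keep);
	}
 */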

	.section SECTION(.text), "ax", @progbits
	.align 16
	.type STRCMP, @function
	.globl STRCMP

# ifdef USE_AS_STRCASECMP_L
ENTRY (STRCASECMP)
	movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov %fs:(%rax), %LOCALE_REG_LP

	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
	.p2align 4
END (STRCASECMP)
	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# endif

	.p2align 4
STRCMP:
	cfi_startproc
	_CET_ENDBR
	CALL_MCOUNT

# if defined USE_AS_STRCASECMP_L
	/* We have to fall back on the C implementation for locales with
	   encodings not matching ASCII for single bytes.  */
# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
	mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
# else
	mov (%LOCALE_REG), %RAX_LP
# endif
	testb $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
	jne STRCASECMP_L_NONASCII
	leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# endif

# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less).  Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	cmp $1, %RDX_LP
	/* Signed comparison intentional.  We use this branch to also
	   test cases where length >= 2^63.  These very large sizes can be
	   handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
	jle L(one_or_less)
# ifdef USE_AS_WCSCMP
	movq %rdx, %rcx

	/* Multiplying length by sizeof(wchar_t) can result in overflow.
	   Check if that is possible.  All cases where overflow is
	   possible are cases where length is large enough that it can
	   never be a bound on valid memory, so just use wcscmp.  */
	shrq $56, %rcx
	jnz OVERFLOW_STRCMP

	leaq (, %rdx, 4), %rdx
# endif
# endif
	vpxor %xmmZERO, %xmmZERO, %xmmZERO
# if defined USE_AS_STRCASECMP_L
	.section .rodata.cst32, "aM", @progbits, 32
	.align 32
L(lcase_min):
	.quad 0x3f3f3f3f3f3f3f3f
	.quad 0x3f3f3f3f3f3f3f3f
	.quad 0x3f3f3f3f3f3f3f3f
	.quad 0x3f3f3f3f3f3f3f3f
L(lcase_max):
	.quad 0x9999999999999999
	.quad 0x9999999999999999
	.quad 0x9999999999999999
	.quad 0x9999999999999999
L(case_add):
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.previous

	vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
	vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
	vmovdqa L(case_add)(%rip), CASE_ADD_ymm
# endif
	movl %edi, %eax
	orl %esi, %eax
	sall $20, %eax
	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
	cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
	ja L(page_cross)
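	/* The check above relies on PAGE_SIZE being 4096 (1 << 12):
	   shifting left by 20 keeps only the low 12 bits of (rdi | rsi),
	   i.e. an over-approximation of the larger of the two page
	   offsets, scaled into the top of the register.  The unsigned
	   compare is therefore equivalent to

	     ((s1 | s2) & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE * 4

	   which can over-report (false positives are filtered out again
	   in L(page_cross)) but never misses a real page cross in the
	   next 4x VEC loads.  */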

L(no_page_cross):
	/* Safe to compare 4x vectors.  */
	VMOVU (%rdi), %ymm0
	/* 1s where s1 and s2 are equal.  Just VPCMPEQ if it's not
	   strcasecmp.  Otherwise converts ymm0 and the load from rsi to
	   lower case.  ymm2 is scratch and ymm1 is the return.  */
	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
	/* 1s at null CHAR.  */
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	/* 1s where s1 and s2 equal AND not null CHAR.  */
	vpandn %ymm1, %ymm2, %ymm1

	/* All 1s -> keep going, any 0s -> return.  */
	vpmovmskb %ymm1, %ecx
# ifdef USE_AS_STRNCMP
	cmpq $VEC_SIZE, %rdx
	jbe L(vec_0_test_len)
# endif

	/* All 1s means everything compared equal.  incl will overflow to
	   zero in the all-equal case.  Otherwise the 1s will carry up to
	   the position of the first mismatch.  */
	incl %ecx
	jz L(more_3x_vec)

	.p2align 4,, 4
L(return_vec_0):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx), %edx
	je L(ret0)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN
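
	/* For the wide-character flavors the result cannot simply be the
	   difference of the two elements: wchar_t values are full 32-bit
	   quantities, subtracting them can overflow int, and the
	   comparison must be signed.  The setl/negl/orl sequence above
	   instead produces exactly -1, 0 or 1.  Roughly, in C
	   (illustration only, c1/c2 are ad-hoc names for the two
	   elements):

	     int ret = 0;
	     if (c1 != c2)
	       ret = c1 < c2 ? -1 : 1;
	 */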

# ifdef USE_AS_STRNCMP
	.p2align 4,, 8
L(vec_0_test_len):
	notl %ecx
	bzhil %edx, %ecx, %eax
	jnz L(return_vec_0)
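	/* notl turns the mask into 1s at mismatch/null positions; bzhi
	   clears all bits at and above the length in edx, so the result
	   is non-zero exactly when a difference or null CHAR occurs
	   within the first rdx bytes.  */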
	/* Align if we will cross a fetch block.  */
	.p2align 4,, 2
L(ret_zero):
	xorl %eax, %eax
	VZEROUPPER_RETURN

	.p2align 4,, 5
L(one_or_less):
# ifdef USE_AS_STRCASECMP_L
	/* Set locale argument for strcasecmp.  */
	movq %LOCALE_REG, %rdx
# endif
	jb L(ret_zero)
	/* 'nbe' covers the case where length is negative (large
	   unsigned).  */
	jnbe OVERFLOW_STRCMP
# ifdef USE_AS_WCSCMP
	movl (%rdi), %edx
	xorl %eax, %eax
	cmpl (%rsi), %edx
	je L(ret1)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret1):
	ret
# endif

	.p2align 4,, 10
L(return_vec_1):
	tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC so it is safe to subtract without
	   fear of overflow.  */
	addq $-VEC_SIZE, %rdx
	cmpq %rcx, %rdx
	jbe L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
	movl VEC_SIZE(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_SIZE(%rsi, %rcx), %edx
	je L(ret2)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl VEC_SIZE(%rdi, %rcx), %eax
	movzbl VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret2):
	VZEROUPPER_RETURN

	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_3):
	salq $32, %rcx
# endif
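	/* For strncmp, L(return_vec_3) shifts the VEC 3 mask into the
	   upper 32 bits so that the tzcntq below produces an index in
	   [32, 64).  That index is both checked against the remaining
	   length and used as an offset from the (VEC_SIZE * 2) base,
	   which lets VEC 3 share the VEC 2 return path.  */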

L(return_vec_2):
# ifndef USE_AS_STRNCMP
	tzcntl %ecx, %ecx
# else
	tzcntq %rcx, %rcx
	cmpq %rcx, %rdx
	jbe L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
	je L(ret3)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret3):
	VZEROUPPER_RETURN

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_3):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
	je L(ret4)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret4):
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 10
L(more_3x_vec):
	/* Safe to compare 4x vectors.  */
	VMOVU VEC_SIZE(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_1)

# ifdef USE_AS_STRNCMP
	subq $(VEC_SIZE * 2), %rdx
	jbe L(ret_zero)
# endif

	VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_2)

	VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_3)

# ifdef USE_AS_STRNCMP
	cmpq $(VEC_SIZE * 2), %rdx
	jbe L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	   */
	movl $2, %r8d

# else
	xorl %r8d, %r8d
# endif

	/* The prepare labels are various entry points from the page
	   cross logic.  */
L(prepare_loop):

# ifdef USE_AS_STRNCMP
	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
	   the loop.  */
	leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
# endif
L(prepare_loop_no_len):

	/* Align s1 and adjust s2 accordingly.  */
	subq %rdi, %rsi
	andq $-(VEC_SIZE * 4), %rdi
	addq %rdi, %rsi

# ifdef USE_AS_STRNCMP
	subq %rdi, %rdx
# endif

L(prepare_loop_aligned):
	/* eax stores distance from rsi to next page cross.  These cases
	   need to be handled specially as the 4x loop could potentially
	   read memory past the length of s1 or s2 and across a page
	   boundary.  */
	movl $-(VEC_SIZE * 4), %eax
	subl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax

	/* Loop 4x comparisons at a time.  */
	.p2align 4
L(loop):

	/* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
	subq $(VEC_SIZE * 4), %rdx
	jbe L(ret_zero)
# endif

	subq $-(VEC_SIZE * 4), %rdi
	subq $-(VEC_SIZE * 4), %rsi
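	/* subq $-(VEC_SIZE * 4) rather than addq $(VEC_SIZE * 4): -128
	   fits in a sign-extended 8-bit immediate while +128 does not,
	   so the sub form encodes 3 bytes shorter.  */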

	/* Check if rsi loads will cross a page boundary.  */
	addl $-(VEC_SIZE * 4), %eax
	jnb L(page_cross_during_loop)

	/* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
	VMOVA (VEC_SIZE * 0)(%rdi), %ymm0
	VMOVA (VEC_SIZE * 1)(%rdi), %ymm2
	VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
	VMOVA (VEC_SIZE * 3)(%rdi), %ymm6

	/* ymm1 all 1s where s1 and s2 equal.  All 0s otherwise.  */
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)

	/* A CHAR becomes 0 if there was a mismatch or a null in s1,
	   otherwise it stays non-zero.  */
	vpand %ymm0, %ymm1, %ymm1


	vpand %ymm2, %ymm3, %ymm3
	vpand %ymm4, %ymm5, %ymm5
	vpand %ymm6, %ymm7, %ymm7

	VPMINU %ymm1, %ymm3, %ymm3
	VPMINU %ymm5, %ymm7, %ymm7

	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
	VPMINU %ymm3, %ymm7, %ymm7

	/* If any 0 CHAR then done.  */
	VPCMPEQ %ymm7, %ymmZERO, %ymm7
	vpmovmskb %ymm7, %LOOP_REG
	testl %LOOP_REG, %LOOP_REG
	jz L(loop)
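
	/* Why the reduction above works: each vpand leaves a CHAR of 0
	   exactly where there was a mismatch or a null in s1, and an
	   unsigned minimum is 0 in a given lane iff one of its inputs is
	   0 there.  Roughly, per CHAR (illustration only):

	     merged = min (min (v0 & eq0, v1 & eq1), min (v2 & eq2, v3 & eq3));

	   so the VPCMPEQ against zero flags every lane where any of the
	   4x VEC hit a mismatch or end of string.  */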

	/* Find which VEC has the mismatch or end of string.  */
	VPCMPEQ %ymm1, %ymmZERO, %ymm1
	vpmovmskb %ymm1, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_0_end)


	VPCMPEQ %ymm3, %ymmZERO, %ymm3
	vpmovmskb %ymm3, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_1_end)

L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
	subq $(VEC_SIZE * 2), %rdx
	jbe L(ret_zero_end)
# endif

	VPCMPEQ %ymm5, %ymmZERO, %ymm5
	vpmovmskb %ymm5, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_2_end)

	/* LOOP_REG contains matches for null/mismatch from the loop.  If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must be entirely from VEC 3, which is fully
	   represented by LOOP_REG.  */
	tzcntl %LOOP_REG, %LOOP_REG

# ifdef USE_AS_STRNCMP
	subl $-(VEC_SIZE), %LOOP_REG
	cmpq %LOOP_REG64, %rdx
	jbe L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
	je L(ret5)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
	movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret5):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_end):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif


	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
	   they use the value of `r8` to negate the return value.  This is
	   because the page cross logic can swap `rdi` and `rsi`.  */
	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_1_end):
	salq $32, %rcx
# endif
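	/* Same trick as L(return_vec_3): shift the VEC 1 mask into the
	   upper 32 bits so the shared tzcntq below yields an offset
	   relative to VEC 0.  */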
L(return_vec_0_end):
# ifndef USE_AS_STRNCMP
	tzcntl %ecx, %ecx
# else
	tzcntq %rcx, %rcx
	cmpq %rcx, %rdx
	jbe L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx), %edx
	je L(ret6)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret6):
	VZEROUPPER_RETURN

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_1_end):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl VEC_SIZE(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_SIZE(%rsi, %rcx), %edx
	je L(ret7)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl VEC_SIZE(%rdi, %rcx), %eax
	movzbl VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret7):
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 10
L(return_vec_2_end):
	tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
	cmpq %rcx, %rdx
	jbe L(ret_zero_page_cross)
# endif
# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
	je L(ret11)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret11):
	VZEROUPPER_RETURN


	/* Page cross in rsi in next 4x VEC.  */

	/* TODO: Improve logic here.  */
	.p2align 4,, 10
L(page_cross_during_loop):
	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here.  */
	cmpl $-(VEC_SIZE * 4), %eax
	/* eax is not adjusted before jumping back to the loop, so we
	   will never hit the page cross case again.  */
	je L(loop_skip_page_cross_check)

	/* Check if we can safely load a VEC.  */
	cmpl $-(VEC_SIZE * 3), %eax
	jle L(less_1x_vec_till_page_cross)

	VMOVA (%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_0_end)

	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
	cmpl $-(VEC_SIZE * 2), %eax
	jg L(more_2x_vec_till_page_cross)

	.p2align 4,, 4
L(less_1x_vec_till_page_cross):
	subl $-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is the first iteration if the incoming s1 was
	   near the start of a page and s2 near the end.  If s1 was near
	   the start of the page we already aligned up to the nearest
	   VEC_SIZE * 4 so it is guaranteed safe to read back -VEC_SIZE.
	   If rdi is truly at the start of a page here, it means the
	   previous page (rdi - VEC_SIZE) has already been loaded earlier
	   so it must be valid.  */
	VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx

	/* Mask of potentially valid bits.  The lower bits can be out of
	   range comparisons (but safe regarding page crosses).  */
	movl $-1, %r10d
	shlxl %esi, %r10d, %r10d
	notl %ecx

# ifdef USE_AS_STRNCMP
	cmpq %rax, %rdx
	jbe L(return_page_cross_end_check)
# endif
	movl %eax, %OFFSET_REG
	addl $(PAGE_SIZE - VEC_SIZE * 4), %eax

	andl %r10d, %ecx
	jz L(loop_skip_page_cross_check)

	.p2align 4,, 3
L(return_page_cross_end):
	tzcntl %ecx, %ecx

# ifdef USE_AS_STRNCMP
	leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
L(return_page_cross_cmp_mem):
# else
	addl %OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
	movl VEC_OFFSET(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_OFFSET(%rsi, %rcx), %edx
	je L(ret8)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl VEC_OFFSET(%rdi, %rcx), %eax
	movzbl VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret8):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_page_cross_end_check):
	andl %r10d, %ecx
	tzcntl %ecx, %ecx
	leal -VEC_SIZE(%rax, %rcx), %ecx
	cmpl %ecx, %edx
	ja L(return_page_cross_cmp_mem)
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4,, 10
L(more_2x_vec_till_page_cross):
	/* If there are more than 2x VEC until the page cross we will
	   complete a full loop iteration here.  */

	VMOVU VEC_SIZE(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
	cmpq $(VEC_SIZE * 2), %rdx
	jbe L(ret_zero_in_loop_page_cross)
# endif

	subl $-(VEC_SIZE * 4), %eax

	/* Safe to include comparisons from lower bytes.  */
	VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_page_cross_0)

	VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_page_cross_1)

# ifdef USE_AS_STRNCMP
	/* Must check length here as the length might preclude reading
	   the next page.  */
	cmpq %rax, %rdx
	jbe L(ret_zero_in_loop_page_cross)
# endif

	/* Finish the loop.  */
	VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
	VMOVA (VEC_SIZE * 3)(%rdi), %ymm6

	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
	vpand %ymm4, %ymm5, %ymm5
	vpand %ymm6, %ymm7, %ymm7
	VPMINU %ymm5, %ymm7, %ymm7
	VPCMPEQ %ymm7, %ymmZERO, %ymm7
	vpmovmskb %ymm7, %LOOP_REG
	testl %LOOP_REG, %LOOP_REG
	jnz L(return_vec_2_3_end)

	/* Best for code size to use an unconditional jmp here.  If this
	   case were hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as the fall-through and jump back
	   to the loop on the mismatch comparison.  */
	subq $-(VEC_SIZE * 4), %rdi
	subq $-(VEC_SIZE * 4), %rsi
	addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
	subq $(VEC_SIZE * 4), %rdx
	ja L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# else
	jmp L(loop_skip_page_cross_check)
# endif


	.p2align 4,, 10
L(return_vec_page_cross_0):
	addl $-VEC_SIZE, %eax
L(return_vec_page_cross_1):
	tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
	leal -VEC_SIZE(%rax, %rcx), %ecx
	cmpq %rcx, %rdx
	jbe L(ret_zero_in_loop_page_cross)
# else
	addl %eax, %ecx
# endif

# ifdef USE_AS_WCSCMP
	movl VEC_OFFSET(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_OFFSET(%rsi, %rcx), %edx
	je L(ret9)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl VEC_OFFSET(%rdi, %rcx), %eax
	movzbl VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret9):
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(page_cross):
# ifndef USE_AS_STRNCMP
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp, where the stop condition is guaranteed
	   to be reachable by just reading memory.  */
	testl $((VEC_SIZE - 1) << 20), %eax
	jz L(no_page_cross)
# endif

	movl %edi, %eax
	movl %esi, %ecx
	andl $(PAGE_SIZE - 1), %eax
	andl $(PAGE_SIZE - 1), %ecx

	xorl %OFFSET_REG, %OFFSET_REG

	/* Check which is closer to page cross, s1 or s2.  */
	cmpl %eax, %ecx
	jg L(page_cross_s2)

	/* The previous page cross check has false positives.  Check for
	   the true positive, as the page cross logic is very expensive.  */
	subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
	jbe L(no_page_cross)

	/* Set r8 to not interfere with normal return value (rdi and rsi
	   did not swap).  */
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	   */
	movl $2, %r8d
# else
	xorl %r8d, %r8d
# endif

	/* Check if less than 1x VEC till page cross.  */
	subl $(VEC_SIZE * 3), %eax
	jg L(less_1x_vec_till_page)

	/* If more than 1x VEC till page cross, loop through safely
	   loadable memory until within 1x VEC of page cross.  */

	.p2align 4,, 10
L(page_cross_loop):

	VMOVU (%rdi, %OFFSET_REG64), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx

	jnz L(check_ret_vec_page_cross)
	addl $VEC_SIZE, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross)
# endif
	addl $VEC_SIZE, %eax
	jl L(page_cross_loop)

	subl %eax, %OFFSET_REG
	/* OFFSET_REG has distance to page cross - VEC_SIZE.  Guaranteed
	   to not cross page so is safe to load.  Since we have already
	   loaded at least 1 VEC from rsi it is also guaranteed to be
	   safe.  */

	VMOVU (%rdi, %OFFSET_REG64), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx

# ifdef USE_AS_STRNCMP
	leal VEC_SIZE(%OFFSET_REG64), %eax
	cmpq %rax, %rdx
	jbe L(check_ret_vec_page_cross2)
	addq %rdi, %rdx
# endif
	incl %ecx
	jz L(prepare_loop_no_len)

	.p2align 4,, 4
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
	tzcntl %ecx, %ecx
	addl %OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx), %edx
	je L(ret12)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret12):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(check_ret_vec_page_cross2):
	incl %ecx
L(check_ret_vec_page_cross):
	tzcntl %ecx, %ecx
	addl %OFFSET_REG, %ecx
	cmpq %rcx, %rdx
	ja L(ret_vec_page_cross_cont)
	.p2align 4,, 2
L(ret_zero_page_cross):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 4
L(page_cross_s2):
	/* Ensure this is a true page cross.  */
	subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
	jbe L(no_page_cross)


	movl %ecx, %eax
	movq %rdi, %rcx
	movq %rsi, %rdi
	movq %rcx, %rsi

	/* set r8 to negate return value as rdi and rsi swapped.  */
# ifdef USE_AS_WCSCMP
	movl $-4, %r8d
# else
	movl $-1, %r8d
# endif
	xorl %OFFSET_REG, %OFFSET_REG

	/* Check if more than 1x VEC till page cross.  */
	subl $(VEC_SIZE * 3), %eax
	jle L(page_cross_loop)

	.p2align 4,, 6
L(less_1x_vec_till_page):
	/* Find largest load size we can use.  */
	cmpl $16, %eax
	ja L(less_16_till_page)

	VMOVU (%rdi), %xmm0
	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incw %cx
	jnz L(check_ret_vec_page_cross)
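	/* Only the low 16 bits of this movemask can be set: the VEX xmm
	   compares zero bits 255:128 of the destination, so the upper
	   mask bits are always 0.  incw therefore wraps to zero exactly
	   when all 16 loaded bytes matched and none was a null CHAR.
	   The 8- and 4-byte paths below use `incb %cl' and `subl $0xf'
	   in the same way.  */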
	movl $16, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subl %eax, %OFFSET_REG
# else
	/* Explicit check for 16 byte alignment.  */
	subl %eax, %OFFSET_REG
	jz L(prepare_loop)
# endif

	VMOVU (%rdi, %OFFSET_REG64), %xmm0
	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incw %cx
	jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl $16, %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subq $-(VEC_SIZE * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case0):
	xorl %eax, %eax
	ret
# endif


	.p2align 4,, 10
L(less_16_till_page):
	/* Find largest load size we can use.  */
	cmpl $24, %eax
	ja L(less_8_till_page)

	vmovq (%rdi), %xmm0
	vmovq (%rsi), %xmm1
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incb %cl
	jnz L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
	cmpq $8, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
# endif
	movl $24, %OFFSET_REG
	/* Explicit check for 16 byte alignment.  */
	subl %eax, %OFFSET_REG



	vmovq (%rdi, %OFFSET_REG64), %xmm0
	vmovq (%rsi, %OFFSET_REG64), %xmm1
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incb %cl
	jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl $8, %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subq $-(VEC_SIZE * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp L(prepare_loop_aligned)


	.p2align 4,, 10
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
	/* If using wchar then this is the only check before we reach
	   the page boundary.  */
	movl (%rdi), %eax
	movl (%rsi), %ecx
	cmpl %ecx, %eax
	jnz L(ret_less_8_wcs)
# ifdef USE_AS_STRNCMP
	addq %rdi, %rdx
	/* We already checked for len <= 1 so cannot hit that case here.
	   */
# endif
	testl %eax, %eax
	jnz L(prepare_loop_no_len)
	ret

	.p2align 4,, 8
L(ret_less_8_wcs):
	setl %OFFSET_REG8
	negl %OFFSET_REG
	movl %OFFSET_REG, %eax
	xorl %r8d, %eax
	ret

# else

	/* Find largest load size we can use.  */
	cmpl $28, %eax
	ja L(less_4_till_page)

	vmovd (%rdi), %xmm0
	vmovd (%rsi), %xmm1
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	subl $0xf, %ecx
	jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	cmpq $4, %rdx
	jbe L(ret_zero_page_cross_slow_case1)
# endif
	movl $28, %OFFSET_REG
	/* Explicit check for 16 byte alignment.  */
	subl %eax, %OFFSET_REG



	vmovd (%rdi, %OFFSET_REG64), %xmm0
	vmovd (%rsi, %OFFSET_REG64), %xmm1
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	subl $0xf, %ecx
	jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl $4, %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case1)
	subq $-(VEC_SIZE * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case1):
	xorl %eax, %eax
	ret
# endif

	.p2align 4,, 10
L(less_4_till_page):
	subq %rdi, %rsi
	/* Extremely slow byte comparison loop.  */
L(less_4_loop):
	movzbl (%rdi), %eax
	movzbl (%rsi, %rdi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
	subl %BYTE_LOOP_REG, %eax
	jnz L(ret_less_4_loop)
	testl %ecx, %ecx
	jz L(ret_zero_4_loop)
# ifdef USE_AS_STRNCMP
	decq %rdx
	jz L(ret_zero_4_loop)
# endif
	incq %rdi
	/* End condition is reaching the page boundary (rdi is aligned).  */
	testl $31, %edi
	jnz L(less_4_loop)
	leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
	addq $-(VEC_SIZE * 4), %rdi
# ifdef USE_AS_STRNCMP
	subq $-(VEC_SIZE * 4), %rdx
# endif
	jmp L(prepare_loop_aligned)

L(ret_zero_4_loop):
	xorl %eax, %eax
	ret
L(ret_less_4_loop):
	xorl %r8d, %eax
	subl %r8d, %eax
	ret
# endif
	cfi_endproc
	.size STRCMP, .-STRCMP
#endif