/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# ifndef STRCMP_ISA
# define STRCMP_ISA _avx2
# endif

# include "strcmp-naming.h"

# include <sysdep.h>

# if defined USE_AS_STRCASECMP_L
# include "locale-defines.h"
# endif

# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif

# define PAGE_SIZE 4096

	/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE 32

# define VMOVU vmovdqu
# define VMOVA vmovdqa

# ifdef USE_AS_WCSCMP
	/* Compare packed dwords.  */
# define VPCMPEQ vpcmpeqd
	/* Compare packed dwords and store minimum.  */
# define VPMINU vpminud
	/* 1 dword char == 4 bytes.  */
# define SIZE_OF_CHAR 4
# else
	/* Compare packed bytes.  */
# define VPCMPEQ vpcmpeqb
	/* Compare packed bytes and store minimum.  */
# define VPMINU vpminub
	/* 1 byte char == 1 byte.  */
# define SIZE_OF_CHAR 1
# endif

# ifdef USE_AS_STRNCMP
# define LOOP_REG r9d
# define LOOP_REG64 r9

# define OFFSET_REG8 r9b
# define OFFSET_REG r9d
# define OFFSET_REG64 r9
# else
# define LOOP_REG edx
# define LOOP_REG64 rdx

# define OFFSET_REG8 dl
# define OFFSET_REG edx
# define OFFSET_REG64 rdx
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# if defined USE_AS_STRNCMP
# define VEC_OFFSET 0
# else
# define VEC_OFFSET (-VEC_SIZE)
# endif

# ifdef USE_AS_STRCASECMP_L
# define BYTE_LOOP_REG OFFSET_REG
# else
# define BYTE_LOOP_REG ecx
# endif

# ifdef USE_AS_STRCASECMP_L
# ifdef USE_AS_STRNCMP
# define LOCALE_REG rcx
# define LOCALE_REG_LP RCX_LP
# else
# define LOCALE_REG rdx
# define LOCALE_REG_LP RDX_LP
# endif
# endif

# define xmmZERO xmm15
# define ymmZERO ymm15

# define LCASE_MIN_ymm %ymm10
# define LCASE_MAX_ymm %ymm11
# define CASE_ADD_ymm %ymm12

# define LCASE_MIN_xmm %xmm10
# define LCASE_MAX_xmm %xmm11
# define CASE_ADD_xmm %xmm12

	/* r11 is never used elsewhere so this is safe to maintain.  */
# define TOLOWER_BASE %r11

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

# ifdef USE_AS_STRCASECMP_L
# define REG(x, y) x ## y
# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
	vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
	vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
	vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
	vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
	vpaddb REG(%ext, 8), reg1_in, reg1_out; \
	vpaddb REG(%ext, 9), reg2_in, reg2_out

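	/* The vpaddb/vpcmpgtb/vpandn sequence in TOLOWER above is a
	   branchless ASCII to-lower applied to every byte of a vector:
	   adding 0x3f (L(lcase_min)) biases 'A'..'Z' (0x41..0x5a) into
	   0x80..0x99, the only byte range that is not signed-greater than
	   0x99 (L(lcase_max)), so vpcmpgtb yields 0x00 exactly for
	   upper-case bytes and 0xff for everything else; vpandn against
	   0x20 (L(case_add)) then selects the bias that gets added back.
	   A rough scalar model of a single byte lane, for illustration
	   only and not part of the build:

	     static unsigned char tolower_lane (unsigned char c)
	     {
	       signed char biased = (signed char) (c + 0x3f);
	       unsigned char not_upper
		 = biased > (signed char) 0x99 ? 0xff : 0x00;
	       return c + (~not_upper & 0x20);
	     }
	 */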
# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)

# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
	TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
	VPCMPEQ scratch_reg, s2_reg, reg_out

# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
	VMOVU s2_mem, reg_out; \
	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)

# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)

# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)

# else
# define TOLOWER_gpr(...)
# define TOLOWER_ymm(...)
# define TOLOWER_xmm(...)

# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
	VPCMPEQ s2_reg, s1_reg, reg_out

# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)

# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison can
   be on either packed bytes or dwords depending on USE_AS_WCSCMP.  In
   order to check the null char, the algorithm keeps the matched
   bytes/dwords, requiring two more AVX2 instructions (VPMINU and
   VPCMPEQ).  In general, the cost of comparing VEC_SIZE bytes (32
   bytes) is two VPCMPEQ and one VPMINU instructions, together with
   movdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
   the same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
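
/* A rough C model of one 32-byte step of the plain strcmp/strncmp
   flavor, written with AVX2 intrinsics purely for illustration; it is
   not part of the build and the function name is ad-hoc.  The return
   value is -1 when all 32 bytes are equal and non-null (the asm keeps
   going in that case; note how adding 1 to an all-ones mask wraps to
   zero, which is what the `incl %ecx; jz ...' idiom below tests),
   otherwise it is the index of the first mismatch or null CHAR:

	#include <immintrin.h>

	static int
	cmp_one_vec (const char *s1, const char *s2)
	{
	  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
	  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
	  __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
	  __m256i nul = _mm256_cmpeq_epi8 (v1, _mm256_setzero_si256 ());
	  unsigned int keep = (unsigned int)
	    _mm256_movemask_epi8 (_mm256_andnot_si256 (nul, eq));
	  return keep == 0xffffffff ? -1 : (int) __builtin_ctz (~keep);
	}
 */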

	.section SECTION(.text), "ax", @progbits
	.align 16
	.type STRCMP, @function
	.globl STRCMP

# ifdef USE_AS_STRCASECMP_L
ENTRY (STRCASECMP)
	movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov %fs:(%rax), %LOCALE_REG_LP

	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
	.p2align 4
END (STRCASECMP)
	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# endif

	.p2align 4
STRCMP:
	cfi_startproc
	_CET_ENDBR
	CALL_MCOUNT

# if defined USE_AS_STRCASECMP_L
	/* We have to fall back on the C implementation for locales with
	   encodings not matching ASCII for single bytes.  */
# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
	mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
# else
	mov (%LOCALE_REG), %RAX_LP
# endif
	testb $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
	jne STRCASECMP_L_NONASCII
	leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# endif

# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less).  Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	cmp $1, %RDX_LP
	/* Signed comparison intentional.  We use this branch to also
	   test cases where length >= 2^63.  These very large sizes can be
	   handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
	jle L(one_or_less)
# ifdef USE_AS_WCSCMP
	movq %rdx, %rcx

	/* Multiplying length by sizeof(wchar_t) can result in overflow.
	   Check if that is possible.  All cases where overflow is
	   possible are cases where length is large enough that it can
	   never be a bound on valid memory, so just use wcscmp.  */
	shrq $56, %rcx
	jnz OVERFLOW_STRCMP

	leaq (, %rdx, 4), %rdx
# endif
# endif
	vpxor %xmmZERO, %xmmZERO, %xmmZERO
# if defined USE_AS_STRCASECMP_L
	.section .rodata.cst32, "aM", @progbits, 32
	.align 32
L(lcase_min):
	.quad 0x3f3f3f3f3f3f3f3f
	.quad 0x3f3f3f3f3f3f3f3f
	.quad 0x3f3f3f3f3f3f3f3f
	.quad 0x3f3f3f3f3f3f3f3f
L(lcase_max):
	.quad 0x9999999999999999
	.quad 0x9999999999999999
	.quad 0x9999999999999999
	.quad 0x9999999999999999
L(case_add):
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.quad 0x2020202020202020
	.previous

	vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
	vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
	vmovdqa L(case_add)(%rip), CASE_ADD_ymm
# endif
	movl %edi, %eax
	orl %esi, %eax
	sall $20, %eax
	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
	cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
	ja L(page_cross)
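	/* The check above relies on PAGE_SIZE being 4096 (1 << 12):
	   shifting left by 20 keeps only the low 12 bits of (rdi | rsi),
	   i.e. an over-approximation of the larger of the two page
	   offsets, scaled into the top of the register.  The unsigned
	   compare is therefore equivalent to

	     ((s1 | s2) & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE * 4

	   which can over-report (false positives are filtered out again
	   in L(page_cross)) but never misses a real page cross in the
	   next 4x VEC loads.  */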

L(no_page_cross):
	/* Safe to compare 4x vectors.  */
	VMOVU (%rdi), %ymm0
	/* 1s where s1 and s2 are equal.  Just VPCMPEQ if it's not
	   strcasecmp.  Otherwise converts ymm0 and the load from rsi to
	   lower case.  ymm2 is scratch and ymm1 is the return.  */
	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
	/* 1s at null CHAR.  */
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	/* 1s where s1 and s2 equal AND not null CHAR.  */
	vpandn %ymm1, %ymm2, %ymm1

	/* All 1s -> keep going, any 0s -> return.  */
	vpmovmskb %ymm1, %ecx
# ifdef USE_AS_STRNCMP
	cmpq $VEC_SIZE, %rdx
	jbe L(vec_0_test_len)
# endif

	/* All 1s means everything compared equal.  incl will overflow to
	   zero in the all-equal case.  Otherwise the 1s will carry up to
	   the position of the first mismatch.  */
	incl %ecx
	jz L(more_3x_vec)

	.p2align 4,, 4
L(return_vec_0):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx), %edx
	je L(ret0)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN
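
	/* For the wide-character flavors the result cannot simply be the
	   difference of the two elements: wchar_t values are full 32-bit
	   quantities, subtracting them can overflow int, and the
	   comparison must be signed.  The setl/negl/orl sequence above
	   instead produces exactly -1, 0 or 1.  Roughly, in C
	   (illustration only, c1/c2 are ad-hoc names for the two
	   elements):

	     int ret = 0;
	     if (c1 != c2)
	       ret = c1 < c2 ? -1 : 1;
	 */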

# ifdef USE_AS_STRNCMP
	.p2align 4,, 8
L(vec_0_test_len):
	notl %ecx
	bzhil %edx, %ecx, %eax
	jnz L(return_vec_0)
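	/* notl turns the mask into 1s at mismatch/null positions; bzhi
	   clears all bits at and above the length in edx, so the result
	   is non-zero exactly when a difference or null CHAR occurs
	   within the first rdx bytes.  */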
	/* Align if we will cross a fetch block.  */
	.p2align 4,, 2
L(ret_zero):
	xorl %eax, %eax
	VZEROUPPER_RETURN

	.p2align 4,, 5
L(one_or_less):
# ifdef USE_AS_STRCASECMP_L
	/* Set locale argument for strcasecmp.  */
	movq %LOCALE_REG, %rdx
# endif
	jb L(ret_zero)
	/* 'nbe' covers the case where length is negative (large
	   unsigned).  */
	jnbe OVERFLOW_STRCMP
# ifdef USE_AS_WCSCMP
	movl (%rdi), %edx
	xorl %eax, %eax
	cmpl (%rsi), %edx
	je L(ret1)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret1):
	ret
# endif

	.p2align 4,, 10
L(return_vec_1):
	tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC so it is safe to subtract without
	   fear of overflow.  */
	addq $-VEC_SIZE, %rdx
	cmpq %rcx, %rdx
	jbe L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
	movl VEC_SIZE(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_SIZE(%rsi, %rcx), %edx
	je L(ret2)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl VEC_SIZE(%rdi, %rcx), %eax
	movzbl VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret2):
	VZEROUPPER_RETURN

	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_3):
	salq $32, %rcx
# endif
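	/* For strncmp, L(return_vec_3) shifts the VEC 3 mask into the
	   upper 32 bits so that the tzcntq below produces an index in
	   [32, 64).  That index is both checked against the remaining
	   length and used as an offset from the (VEC_SIZE * 2) base,
	   which lets VEC 3 share the VEC 2 return path.  */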

L(return_vec_2):
# ifndef USE_AS_STRNCMP
	tzcntl %ecx, %ecx
# else
	tzcntq %rcx, %rcx
	cmpq %rcx, %rdx
	jbe L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
	je L(ret3)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret3):
	VZEROUPPER_RETURN

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_3):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
	je L(ret4)
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
# endif
L(ret4):
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 10
L(more_3x_vec):
	/* Safe to compare 4x vectors.  */
	VMOVU VEC_SIZE(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_1)

# ifdef USE_AS_STRNCMP
	subq $(VEC_SIZE * 2), %rdx
	jbe L(ret_zero)
# endif

	VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_2)

	VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_3)

# ifdef USE_AS_STRNCMP
	cmpq $(VEC_SIZE * 2), %rdx
	jbe L(ret_zero)
# endif

# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	   */
	movl $2, %r8d

# else
	xorl %r8d, %r8d
# endif

	/* The prepare labels are various entry points from the page
	   cross logic.  */
L(prepare_loop):

# ifdef USE_AS_STRNCMP
	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
	   the loop.  */
	leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
# endif
L(prepare_loop_no_len):

	/* Align s1 and adjust s2 accordingly.  */
	subq %rdi, %rsi
	andq $-(VEC_SIZE * 4), %rdi
	addq %rdi, %rsi

# ifdef USE_AS_STRNCMP
	subq %rdi, %rdx
# endif

L(prepare_loop_aligned):
	/* eax stores distance from rsi to next page cross.  These cases
	   need to be handled specially as the 4x loop could potentially
	   read memory past the length of s1 or s2 and across a page
	   boundary.  */
	movl $-(VEC_SIZE * 4), %eax
	subl %esi, %eax
	andl $(PAGE_SIZE - 1), %eax

	/* Loop 4x comparisons at a time.  */
	.p2align 4
L(loop):

	/* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
	subq $(VEC_SIZE * 4), %rdx
	jbe L(ret_zero)
# endif

	subq $-(VEC_SIZE * 4), %rdi
	subq $-(VEC_SIZE * 4), %rsi
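	/* subq $-(VEC_SIZE * 4) rather than addq $(VEC_SIZE * 4): -128
	   fits in a sign-extended 8-bit immediate while +128 does not,
	   so the sub form encodes 3 bytes shorter.  */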

	/* Check if rsi loads will cross a page boundary.  */
	addl $-(VEC_SIZE * 4), %eax
	jnb L(page_cross_during_loop)

	/* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
	VMOVA (VEC_SIZE * 0)(%rdi), %ymm0
	VMOVA (VEC_SIZE * 1)(%rdi), %ymm2
	VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
	VMOVA (VEC_SIZE * 3)(%rdi), %ymm6

	/* ymm1 all 1s where s1 and s2 equal.  All 0s otherwise.  */
	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)

	/* A CHAR becomes 0 if there was a mismatch or a null in s1,
	   otherwise it stays non-zero.  */
	vpand %ymm0, %ymm1, %ymm1


	vpand %ymm2, %ymm3, %ymm3
	vpand %ymm4, %ymm5, %ymm5
	vpand %ymm6, %ymm7, %ymm7

	VPMINU %ymm1, %ymm3, %ymm3
	VPMINU %ymm5, %ymm7, %ymm7

	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
	VPMINU %ymm3, %ymm7, %ymm7

	/* If any 0 CHAR then done.  */
	VPCMPEQ %ymm7, %ymmZERO, %ymm7
	vpmovmskb %ymm7, %LOOP_REG
	testl %LOOP_REG, %LOOP_REG
	jz L(loop)
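
	/* Why the reduction above works: each vpand leaves a CHAR of 0
	   exactly where there was a mismatch or a null in s1, and an
	   unsigned minimum is 0 in a given lane iff one of its inputs is
	   0 there.  Roughly, per CHAR (illustration only):

	     merged = min (min (v0 & eq0, v1 & eq1), min (v2 & eq2, v3 & eq3));

	   so the VPCMPEQ against zero flags every lane where any of the
	   4x VEC hit a mismatch or end of string.  */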

	/* Find which VEC has the mismatch or end of string.  */
	VPCMPEQ %ymm1, %ymmZERO, %ymm1
	vpmovmskb %ymm1, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_0_end)


	VPCMPEQ %ymm3, %ymmZERO, %ymm3
	vpmovmskb %ymm3, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_1_end)

L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
	subq $(VEC_SIZE * 2), %rdx
	jbe L(ret_zero_end)
# endif

	VPCMPEQ %ymm5, %ymmZERO, %ymm5
	vpmovmskb %ymm5, %ecx
	testl %ecx, %ecx
	jnz L(return_vec_2_end)

	/* LOOP_REG contains matches for null/mismatch from the loop.  If
	   VEC 0, 1, and 2 all have no null and no mismatches then the
	   mismatch must be entirely from VEC 3, which is fully
	   represented by LOOP_REG.  */
	tzcntl %LOOP_REG, %LOOP_REG

# ifdef USE_AS_STRNCMP
	subl $-(VEC_SIZE), %LOOP_REG
	cmpq %LOOP_REG64, %rdx
	jbe L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
	je L(ret5)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
	movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret5):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_end):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif


	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
	   they use the value of `r8` to negate the return value.  This is
	   because the page cross logic can swap `rdi` and `rsi`.  */
	.p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_1_end):
	salq $32, %rcx
# endif
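	/* Same trick as L(return_vec_3): shift the VEC 1 mask into the
	   upper 32 bits so the shared tzcntq below yields an offset
	   relative to VEC 0.  */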
L(return_vec_0_end):
# ifndef USE_AS_STRNCMP
	tzcntl %ecx, %ecx
# else
	tzcntq %rcx, %rcx
	cmpq %rcx, %rdx
	jbe L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx), %edx
	je L(ret6)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret6):
	VZEROUPPER_RETURN

# ifndef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_vec_1_end):
	tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
	movl VEC_SIZE(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_SIZE(%rsi, %rcx), %edx
	je L(ret7)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl VEC_SIZE(%rdi, %rcx), %eax
	movzbl VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret7):
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 10
L(return_vec_2_end):
	tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
	cmpq %rcx, %rdx
	jbe L(ret_zero_page_cross)
# endif
# ifdef USE_AS_WCSCMP
	movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
	je L(ret11)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret11):
	VZEROUPPER_RETURN


	/* Page cross in rsi in next 4x VEC.  */

	/* TODO: Improve logic here.  */
	.p2align 4,, 10
L(page_cross_during_loop):
	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here.  */
	cmpl $-(VEC_SIZE * 4), %eax
	/* eax is not adjusted before jumping back to the loop, so we
	   will never hit the page cross case again.  */
	je L(loop_skip_page_cross_check)

	/* Check if we can safely load a VEC.  */
	cmpl $-(VEC_SIZE * 3), %eax
	jle L(less_1x_vec_till_page_cross)

	VMOVA (%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_0_end)

	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
	cmpl $-(VEC_SIZE * 2), %eax
	jg L(more_2x_vec_till_page_cross)

	.p2align 4,, 4
L(less_1x_vec_till_page_cross):
	subl $-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is the first iteration if the incoming s1 was
	   near the start of a page and s2 near the end.  If s1 was near
	   the start of the page we already aligned up to the nearest
	   VEC_SIZE * 4 so it is guaranteed safe to read back -VEC_SIZE.
	   If rdi is truly at the start of a page here, it means the
	   previous page (rdi - VEC_SIZE) has already been loaded earlier
	   so it must be valid.  */
	VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx

	/* Mask of potentially valid bits.  The lower bits can be out of
	   range comparisons (but safe regarding page crosses).  */
	movl $-1, %r10d
	shlxl %esi, %r10d, %r10d
	notl %ecx

# ifdef USE_AS_STRNCMP
	cmpq %rax, %rdx
	jbe L(return_page_cross_end_check)
# endif
	movl %eax, %OFFSET_REG
	addl $(PAGE_SIZE - VEC_SIZE * 4), %eax

	andl %r10d, %ecx
	jz L(loop_skip_page_cross_check)

	.p2align 4,, 3
L(return_page_cross_end):
	tzcntl %ecx, %ecx

# ifdef USE_AS_STRNCMP
	leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
L(return_page_cross_cmp_mem):
# else
	addl %OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
	movl VEC_OFFSET(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_OFFSET(%rsi, %rcx), %edx
	je L(ret8)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl VEC_OFFSET(%rdi, %rcx), %eax
	movzbl VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret8):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(return_page_cross_end_check):
	andl %r10d, %ecx
	tzcntl %ecx, %ecx
	leal -VEC_SIZE(%rax, %rcx), %ecx
	cmpl %ecx, %edx
	ja L(return_page_cross_cmp_mem)
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4,, 10
L(more_2x_vec_till_page_cross):
	/* If there are more than 2x VEC until the page cross we will
	   complete a full loop iteration here.  */

	VMOVU VEC_SIZE(%rdi), %ymm0
	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
	cmpq $(VEC_SIZE * 2), %rdx
	jbe L(ret_zero_in_loop_page_cross)
# endif

	subl $-(VEC_SIZE * 4), %eax

	/* Safe to include comparisons from lower bytes.  */
	VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_page_cross_0)

	VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx
	jnz L(return_vec_page_cross_1)

# ifdef USE_AS_STRNCMP
	/* Must check length here as the length might preclude reading
	   the next page.  */
	cmpq %rax, %rdx
	jbe L(ret_zero_in_loop_page_cross)
# endif

	/* Finish the loop.  */
	VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
	VMOVA (VEC_SIZE * 3)(%rdi), %ymm6

	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
	vpand %ymm4, %ymm5, %ymm5
	vpand %ymm6, %ymm7, %ymm7
	VPMINU %ymm5, %ymm7, %ymm7
	VPCMPEQ %ymm7, %ymmZERO, %ymm7
	vpmovmskb %ymm7, %LOOP_REG
	testl %LOOP_REG, %LOOP_REG
	jnz L(return_vec_2_3_end)

	/* Best for code size to use an unconditional jmp here.  If this
	   case were hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as the fall-through and jump back
	   to the loop on the mismatch comparison.  */
	subq $-(VEC_SIZE * 4), %rdi
	subq $-(VEC_SIZE * 4), %rsi
	addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
	subq $(VEC_SIZE * 4), %rdx
	ja L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# else
	jmp L(loop_skip_page_cross_check)
# endif


	.p2align 4,, 10
L(return_vec_page_cross_0):
	addl $-VEC_SIZE, %eax
L(return_vec_page_cross_1):
	tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
	leal -VEC_SIZE(%rax, %rcx), %ecx
	cmpq %rcx, %rdx
	jbe L(ret_zero_in_loop_page_cross)
# else
	addl %eax, %ecx
# endif

# ifdef USE_AS_WCSCMP
	movl VEC_OFFSET(%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl VEC_OFFSET(%rsi, %rcx), %edx
	je L(ret9)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl VEC_OFFSET(%rdi, %rcx), %eax
	movzbl VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret9):
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(page_cross):
# ifndef USE_AS_STRNCMP
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp, where the stop condition is guaranteed
	   to be reachable by just reading memory.  */
	testl $((VEC_SIZE - 1) << 20), %eax
	jz L(no_page_cross)
# endif

	movl %edi, %eax
	movl %esi, %ecx
	andl $(PAGE_SIZE - 1), %eax
	andl $(PAGE_SIZE - 1), %ecx

	xorl %OFFSET_REG, %OFFSET_REG

	/* Check which is closer to page cross, s1 or s2.  */
	cmpl %eax, %ecx
	jg L(page_cross_s2)

	/* The previous page cross check has false positives.  Check for
	   the true positive, as the page cross logic is very expensive.  */
	subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
	jbe L(no_page_cross)

	/* Set r8 to not interfere with normal return value (rdi and rsi
	   did not swap).  */
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with 0x1.
	   */
	movl $2, %r8d
# else
	xorl %r8d, %r8d
# endif

	/* Check if less than 1x VEC till page cross.  */
	subl $(VEC_SIZE * 3), %eax
	jg L(less_1x_vec_till_page)

	/* If more than 1x VEC till page cross, loop through safely
	   loadable memory until within 1x VEC of page cross.  */

	.p2align 4,, 10
L(page_cross_loop):

	VMOVU (%rdi, %OFFSET_REG64), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx
	incl %ecx

	jnz L(check_ret_vec_page_cross)
	addl $VEC_SIZE, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross)
# endif
	addl $VEC_SIZE, %eax
	jl L(page_cross_loop)

	subl %eax, %OFFSET_REG
	/* OFFSET_REG has distance to page cross - VEC_SIZE.  Guaranteed
	   to not cross page so is safe to load.  Since we have already
	   loaded at least 1 VEC from rsi it is also guaranteed to be
	   safe.  */

	VMOVU (%rdi, %OFFSET_REG64), %ymm0
	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
	VPCMPEQ %ymm0, %ymmZERO, %ymm2
	vpandn %ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %ecx

# ifdef USE_AS_STRNCMP
	leal VEC_SIZE(%OFFSET_REG64), %eax
	cmpq %rax, %rdx
	jbe L(check_ret_vec_page_cross2)
	addq %rdi, %rdx
# endif
	incl %ecx
	jz L(prepare_loop_no_len)

	.p2align 4,, 4
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
	tzcntl %ecx, %ecx
	addl %OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
	movl (%rdi, %rcx), %edx
	xorl %eax, %eax
	cmpl (%rsi, %rcx), %edx
	je L(ret12)
	setl %al
	negl %eax
	xorl %r8d, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl %ecx, %eax
	xorl %r8d, %eax
	subl %r8d, %eax
# endif
L(ret12):
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
	.p2align 4,, 10
L(check_ret_vec_page_cross2):
	incl %ecx
L(check_ret_vec_page_cross):
	tzcntl %ecx, %ecx
	addl %OFFSET_REG, %ecx
	cmpq %rcx, %rdx
	ja L(ret_vec_page_cross_cont)
	.p2align 4,, 2
L(ret_zero_page_cross):
	xorl %eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 4
L(page_cross_s2):
	/* Ensure this is a true page cross.  */
	subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
	jbe L(no_page_cross)


	movl %ecx, %eax
	movq %rdi, %rcx
	movq %rsi, %rdi
	movq %rcx, %rsi

	/* set r8 to negate return value as rdi and rsi swapped.  */
# ifdef USE_AS_WCSCMP
	movl $-4, %r8d
# else
	movl $-1, %r8d
# endif
	xorl %OFFSET_REG, %OFFSET_REG

	/* Check if more than 1x VEC till page cross.  */
	subl $(VEC_SIZE * 3), %eax
	jle L(page_cross_loop)

	.p2align 4,, 6
L(less_1x_vec_till_page):
	/* Find largest load size we can use.  */
	cmpl $16, %eax
	ja L(less_16_till_page)

	VMOVU (%rdi), %xmm0
	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incw %cx
	jnz L(check_ret_vec_page_cross)
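	/* Only the low 16 bits of this movemask can be set: the VEX xmm
	   compares zero bits 255:128 of the destination, so the upper
	   mask bits are always 0.  incw therefore wraps to zero exactly
	   when all 16 loaded bytes matched and none was a null CHAR.
	   The 8- and 4-byte paths below use `incb %cl' and `subl $0xf'
	   in the same way.  */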
	movl $16, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subl %eax, %OFFSET_REG
# else
	/* Explicit check for 16 byte alignment.  */
	subl %eax, %OFFSET_REG
	jz L(prepare_loop)
# endif

	VMOVU (%rdi, %OFFSET_REG64), %xmm0
	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incw %cx
	jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl $16, %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subq $-(VEC_SIZE * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case0):
	xorl %eax, %eax
	ret
# endif


	.p2align 4,, 10
L(less_16_till_page):
	/* Find largest load size we can use.  */
	cmpl $24, %eax
	ja L(less_8_till_page)

	vmovq (%rdi), %xmm0
	vmovq (%rsi), %xmm1
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incb %cl
	jnz L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
	cmpq $8, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
# endif
	movl $24, %OFFSET_REG
	/* Explicit check for 16 byte alignment.  */
	subl %eax, %OFFSET_REG



	vmovq (%rdi, %OFFSET_REG64), %xmm0
	vmovq (%rsi, %OFFSET_REG64), %xmm1
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	incb %cl
	jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl $8, %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case0)
	subq $-(VEC_SIZE * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp L(prepare_loop_aligned)


	.p2align 4,, 10
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
	/* If using wchar then this is the only check before we reach
	   the page boundary.  */
	movl (%rdi), %eax
	movl (%rsi), %ecx
	cmpl %ecx, %eax
	jnz L(ret_less_8_wcs)
# ifdef USE_AS_STRNCMP
	addq %rdi, %rdx
	/* We already checked for len <= 1 so cannot hit that case here.
	   */
# endif
	testl %eax, %eax
	jnz L(prepare_loop_no_len)
	ret

	.p2align 4,, 8
L(ret_less_8_wcs):
	setl %OFFSET_REG8
	negl %OFFSET_REG
	movl %OFFSET_REG, %eax
	xorl %r8d, %eax
	ret

# else

	/* Find largest load size we can use.  */
	cmpl $28, %eax
	ja L(less_4_till_page)

	vmovd (%rdi), %xmm0
	vmovd (%rsi), %xmm1
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	subl $0xf, %ecx
	jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	cmpq $4, %rdx
	jbe L(ret_zero_page_cross_slow_case1)
# endif
	movl $28, %OFFSET_REG
	/* Explicit check for 16 byte alignment.  */
	subl %eax, %OFFSET_REG



	vmovd (%rdi, %OFFSET_REG64), %xmm0
	vmovd (%rsi, %OFFSET_REG64), %xmm1
	VPCMPEQ %xmm0, %xmmZERO, %xmm2
	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
	vpandn %xmm1, %xmm2, %xmm1
	vpmovmskb %ymm1, %ecx
	subl $0xf, %ecx
	jnz L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl $4, %OFFSET_REG
	subq %OFFSET_REG64, %rdx
	jbe L(ret_zero_page_cross_slow_case1)
	subq $-(VEC_SIZE * 4), %rdx

	leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# else
	leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
	leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
	jmp L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
	.p2align 4,, 2
L(ret_zero_page_cross_slow_case1):
	xorl %eax, %eax
	ret
# endif

	.p2align 4,, 10
L(less_4_till_page):
	subq %rdi, %rsi
	/* Extremely slow byte comparison loop.  */
L(less_4_loop):
	movzbl (%rdi), %eax
	movzbl (%rsi, %rdi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
	subl %BYTE_LOOP_REG, %eax
	jnz L(ret_less_4_loop)
	testl %ecx, %ecx
	jz L(ret_zero_4_loop)
# ifdef USE_AS_STRNCMP
	decq %rdx
	jz L(ret_zero_4_loop)
# endif
	incq %rdi
	/* End condition is reaching the page boundary (rdi is aligned).  */
	testl $31, %edi
	jnz L(less_4_loop)
	leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
	addq $-(VEC_SIZE * 4), %rdi
# ifdef USE_AS_STRNCMP
	subq $-(VEC_SIZE * 4), %rdx
# endif
	jmp L(prepare_loop_aligned)

L(ret_zero_4_loop):
	xorl %eax, %eax
	ret
L(ret_less_4_loop):
	xorl %r8d, %eax
	subl %r8d, %eax
	ret
# endif
	cfi_endproc
	.size STRCMP, .-STRCMP
#endif