1/* strcmp optimized with SSE4.2.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21#if ISA_SHOULD_BUILD (2)
22
23# include <sysdep.h>
24
25# define STRCMP_ISA _sse42
26# include "strcmp-naming.h"
27
28# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
29# include "locale-defines.h"
30# endif
31
32# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
33/* UPDATE_STRNCMP_COUNTER adjusts the remaining-length counter of the
 length-limited variants after the first, partial 16-byte chunk.
 It computes %r9 = %r11 + %rcx - 16 and, unless it branches away,
 stores the result back in %r11; %r9 and the flags are clobbered.
 Since the counter, %r11, is unsigned, we branch to strcmp_exitz
34 if the new counter > the old one or is 0. */
35# define UPDATE_STRNCMP_COUNTER \
36 /* r9 = number of bytes left to compare */ \
37 lea -16(%rcx, %r11), %r9; \
38 cmp %r9, %r11; \
39 jb LABEL(strcmp_exitz); \
40 test %r9, %r9; \
41 je LABEL(strcmp_exitz); \
42 mov %r9, %r11
43# else
/* Variants without a length limit have no counter to maintain. */
44# define UPDATE_STRNCMP_COUNTER
45# endif
46
47# define SECTION sse4.2
48
/* LABEL(l) pastes a ".L" prefix onto l; every branch target in this
 file is written through this macro. */
49# define LABEL(l) .L##l
50
51/* We use 0x1a:
52 _SIDD_SBYTE_OPS
53 | _SIDD_CMP_EQUAL_EACH
54 | _SIDD_NEGATIVE_POLARITY
55 | _SIDD_LEAST_SIGNIFICANT
56 on pcmpistri to find out if two 16byte data elements are the same
57 and the offset of the first different byte. There are 4 cases:
58
59 1. Both 16-byte data elements are valid (no EOS) and identical.
60 2. Both 16-byte data elements contain EOS and are identical.
61 3. Both 16-byte data elements are valid and they differ at offset X.
62 4. At least one 16-byte data element has EOS at offset X. The two
63 16-byte data elements must differ at or before offset X.
64
65 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
66
67 case ECX CFlag ZFlag SFlag
68 1 16 0 0 0
69 2 16 0 1 1
70 3 X 1 0 0
71 4 0 <= X 1 0/1 0/1
72
73 We exit from the loop for cases 2, 3 and 4 with jbe which branches
74 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
75 case 2. */
76
77 /* Put all SSE 4.2 functions together. */
78 .section .text.SECTION,"ax",@progbits
79 .align 16
80 .type STRCMP, @function
81 .globl STRCMP
82# ifdef USE_AS_STRCASECMP_L
 /* Entry point without an explicit locale argument: read the value
    stored at the thread-local __libc_tsd_LOCALE slot (its offset comes
    from the @gottpoff GOT entry and is applied to %fs) into %rdx,
    the register the code at STRCMP reads the locale from, then run
    straight on into STRCMP. */
83ENTRY (STRCASECMP)
84 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
85 mov %fs:(%rax),%RDX_LP
86
87 /* Either 1 or 5 bytes (depending if CET is enabled). */
88 .p2align 4
89END (STRCASECMP)
90 /* FALLTHROUGH to strcasecmp_l. */
91# endif
92# ifdef USE_AS_STRNCASECMP_L
 /* Same stub for the length-limited variant: the thread-local
    __libc_tsd_LOCALE value is loaded into %rcx, the register the code
    at STRCMP reads the locale from in this variant, and execution
    continues into STRCMP. */
93ENTRY (STRCASECMP)
94 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
95 mov %fs:(%rax),%RCX_LP
96
97 /* Either 1 or 5 bytes (depending if CET is enabled). */
98 .p2align 4
99END (STRCASECMP)
100 /* FALLTHROUGH to strncasecmp_l. */
101# endif
102
103
104# define arg arg
105
106STRCMP:
107 cfi_startproc
108 _CET_ENDBR
109 CALL_MCOUNT
110
111/*
112 * This implementation uses SSE to compare up to 16 bytes at a time.
 *
 * Registers on entry, as used by the code below:
 *   %rdi, %rsi  the two strings
 *   %rdx        byte limit (USE_AS_STRNCMP / USE_AS_STRNCASECMP_L)
 *               or locale pointer (USE_AS_STRCASECMP_L)
 *   %rcx        locale pointer (USE_AS_STRNCASECMP_L)
 * The result is returned in %eax.
113 */
114# ifdef USE_AS_STRCASECMP_L
115 /* We have to fall back on the C implementation for locales
116 with encodings not matching ASCII for single bytes. */
117# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
118 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
119# else
120 mov (%rdx), %RAX_LP
121# endif
 /* %rax = LC_CTYPE data of the locale; bit 0 of its
    _NL_CTYPE_NONASCII_CASE value selects the non-ASCII fallback. */
122 testb $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
123 jne __strcasecmp_l_nonascii
124# endif
125# ifdef USE_AS_STRNCASECMP_L
126 /* We have to fall back on the C implementation for locales
127 with encodings not matching ASCII for single bytes. */
128# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
129 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
130# else
131 mov (%rcx), %RAX_LP
132# endif
 /* Same test as above, with the locale pointer taken from %rcx. */
133 testb $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
134 jne __strncasecmp_l_nonascii
135# endif
136
137# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* A limit of 0 goes straight to strcmp_exitz and a limit of 1 to
    Byte0 (both defined outside this section); any other limit becomes
    the running counter in %r11. */
138 test %RDX_LP, %RDX_LP
139 je LABEL(strcmp_exitz)
140 cmp $1, %RDX_LP
141 je LABEL(Byte0)
142 mov %RDX_LP, %R11_LP
143# endif
144 mov %esi, %ecx
145 mov %edi, %eax
146/* Use 64bit AND here to avoid long NOP padding. */
147 and $0x3f, %rcx /* rsi alignment in cache line */
148 and $0x3f, %rax /* rdi alignment in cache line */
149# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Byte-replicated constants consumed by TOLOWER; they are emitted
    into .rodata.cst16 and kept in %xmm4-%xmm6 from here on. */
150 .section .rodata.cst16,"aM",@progbits,16
151 .align 16
152LABEL(lcase_min):
153 .quad 0x3f3f3f3f3f3f3f3f
154 .quad 0x3f3f3f3f3f3f3f3f
155LABEL(lcase_max):
156 .quad 0x9999999999999999
157 .quad 0x9999999999999999
158LABEL(case_add):
159 .quad 0x2020202020202020
160 .quad 0x2020202020202020
161 .previous
162 movdqa LABEL(lcase_min)(%rip), %xmm4
163# define LCASE_MIN_reg %xmm4
164 movdqa LABEL(lcase_max)(%rip), %xmm5
165# define LCASE_MAX_reg %xmm5
166 movdqa LABEL(case_add)(%rip), %xmm6
167# define CASE_ADD_reg %xmm6
168# endif
 /* An offset above 0x30 inside the 64-byte line means an unaligned
    16-byte load would run past the end of that line. */
169 cmp $0x30, %ecx
170 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
171 cmp $0x30, %eax
172 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
173 movdqu (%rdi), %xmm1
174 movdqu (%rsi), %xmm2
175# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* TOLOWER(reg1, reg2) lowercases ASCII 'A'..'Z' in both registers:
    adding 0x3f moves exactly those bytes into 0x80..0x99, the signed
    compare against 0x99 then yields 0 for them and 0xff for every
    other byte, pandn turns that into 0x20 / 0, and the final paddb
    applies the 0x20 case offset.  Clobbers %xmm7 and %xmm8. */
176# define TOLOWER(reg1, reg2) \
177 movdqa LCASE_MIN_reg, %xmm7; \
178 movdqa LCASE_MIN_reg, %xmm8; \
179 paddb reg1, %xmm7; \
180 paddb reg2, %xmm8; \
181 pcmpgtb LCASE_MAX_reg, %xmm7; \
182 pcmpgtb LCASE_MAX_reg, %xmm8; \
183 pandn CASE_ADD_reg, %xmm7; \
184 pandn CASE_ADD_reg, %xmm8; \
185 paddb %xmm7, reg1; \
186 paddb %xmm8, reg2
187
188 TOLOWER (%xmm1, %xmm2)
189# else
190# define TOLOWER(reg1, reg2)
191# endif
 /* After the psubb a byte of %xmm1 has its top bit set only where the
    two strings agree and the byte is not null, so the mask in %edx is
    0xffff exactly when all 16 bytes match without a terminator. */
192 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
193 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
194 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
195 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
196 pmovmskb %xmm1, %edx
197 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
198 jnz LABEL(less16bytes)/* If not, find different value or null char */
199# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
200 sub $16, %r11
201 jbe LABEL(strcmp_exitz)/* finish comparison */
202# endif
203 add $16, %rsi /* prepare to search next 16 bytes */
204 add $16, %rdi /* prepare to search next 16 bytes */
205
206 /*
207 * Determine source and destination string offsets from 16-byte
208 * alignment. Use relative offset difference between the two to
209 * determine which case below to use.
 *
 * Both pointers are rounded down to a 16-byte boundary and their
 * offsets kept in %ecx (rsi) and %eax (rdi).  If the rdi offset is
 * the larger one, the pointers and offsets are swapped so that %ecx
 * always holds the larger offset, and %r8d is set to 0xffff (it is
 * 0 otherwise) for the exit code outside this section.
210 */
211 .p2align 4
212LABEL(crosscache):
213 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
214 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
215 mov $0xffff, %edx /* for equivalent offset */
216 xor %r8d, %r8d
217 and $0xf, %ecx /* offset of rsi */
218 and $0xf, %eax /* offset of rdi */
219 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
220 cmp %eax, %ecx
221 je LABEL(ashr_0) /* rsi and rdi relative offset same */
222 ja LABEL(bigger)
223 mov %edx, %r8d /* r8d is offset flag for exit tail */
224 xchg %ecx, %eax
225 xchg %rsi, %rdi
226LABEL(bigger):
 /* Load the first aligned chunk of each string, then dispatch on
    %r9 = 15 + %rax - %rcx: unaligned_table (defined outside this
    section) holds 32-bit offsets relative to the table itself. */
227 movdqa (%rdi), %xmm2
228 movdqa (%rsi), %xmm1
229 lea 15(%rax), %r9
230 sub %rcx, %r9
231 lea LABEL(unaligned_table)(%rip), %r10
232 movslq (%r10, %r9,4), %r9
233 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
234 lea (%r10, %r9), %r10
235 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
236
237/*
238 * The following cases will be handled by ashr_0
239 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
240 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
 *
 * Both strings sit at the same offset inside their 16-byte chunks, so
 * no realignment is needed.  On entry %xmm0 is zero, %edx is 0xffff
 * and %cl is the common offset.
241 */
242 .p2align 4
243LABEL(ashr_0):
244
245 movdqa (%rsi), %xmm1
246 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
247# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
248 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
249# else
250 movdqa (%rdi), %xmm2
251 TOLOWER (%xmm1, %xmm2)
252 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
253# endif
254 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
255 pmovmskb %xmm1, %r9d
 /* Shifting by %cl discards the mask bits of the bytes that precede
    the start of the strings inside the chunk. */
256 shr %cl, %edx /* adjust 0xffff for offset */
257 shr %cl, %r9d /* adjust for 16-byte offset */
258 sub %r9d, %edx
259 /*
260 * edx must be the same with r9d if in left byte (16-rcx) is equal to
261 * the start from (16-rax) and no null char was seen.
262 */
263 jne LABEL(less32bytes) /* mismatch or null char */
264 UPDATE_STRNCMP_COUNTER
265 mov $16, %rcx
266 mov $16, %r9
267
268 /*
269 * Now both strings are aligned at 16-byte boundary. Loop over strings
270 * checking 32-bytes per iteration.
 * %rdx is the running byte index; it is advanced with lea so that
 * the flags produced by pcmpistri survive until the jbe.
271 */
272 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
273 .p2align 4
274LABEL(ashr_0_use):
275 movdqa (%rdi,%rdx), %xmm0
276# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
277 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
278# else
279 movdqa (%rsi,%rdx), %xmm1
280 TOLOWER (%xmm0, %xmm1)
281 pcmpistri $0x1a, %xmm1, %xmm0
282# endif
283 lea 16(%rdx), %rdx
284 jbe LABEL(ashr_0_exit_use)
285# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
286 sub $16, %r11
287 jbe LABEL(strcmp_exitz)
288# endif
289
 /* Second copy of the loop body (unrolled by two). */
290 movdqa (%rdi,%rdx), %xmm0
291# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
292 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
293# else
294 movdqa (%rsi,%rdx), %xmm1
295 TOLOWER (%xmm0, %xmm1)
296 pcmpistri $0x1a, %xmm1, %xmm0
297# endif
298 lea 16(%rdx), %rdx
299 jbe LABEL(ashr_0_exit_use)
300# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
301 sub $16, %r11
302 jbe LABEL(strcmp_exitz)
303# endif
304 jmp LABEL(ashr_0_use)
305
306
307 .p2align 4
 /* Exit of the ashr_0 loop, entered with the pcmpistri flags intact,
    %rcx = index reported by pcmpistri and %rdx already advanced past
    the chunk that was compared.  CF clear means the chunks matched up
    to the terminator, so the zero-result path is taken; otherwise the
    bytes at the reported index are loaded and their difference is
    returned in %eax. */
308LABEL(ashr_0_exit_use):
309 jnc LABEL(strcmp_exitz)
310# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* A difference at or beyond the remaining byte limit does not count. */
311 sub %rcx, %r11
312 jbe LABEL(strcmp_exitz)
313# endif
314 lea -16(%rdx, %rcx), %rcx
315 movzbl (%rdi, %rcx), %eax
316 movzbl (%rsi, %rcx), %edx
317# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Map both bytes through the C locale tolower table (4-byte entries,
    base biased by 128 entries) before subtracting. */
318 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
319 movl (%rcx,%rax,4), %eax
320 movl (%rcx,%rdx,4), %edx
321# endif
322 sub %edx, %eax
323 ret
324
325
326
327/*
328 * The following cases will be handled by ashr_1
329 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
330 * n(15) n -15 0(15 +(n-15) - n) ashr_1
331 */
332 .p2align 4
333LABEL(ashr_1):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 1 byte. */
334 pslldq $15, %xmm2 /* shift first string to align with second */
335 TOLOWER (%xmm1, %xmm2)
336 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
337 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
338 pmovmskb %xmm2, %r9d
339 shr %cl, %edx /* adjust 0xffff for offset */
340 shr %cl, %r9d /* adjust for 16-byte offset */
341 sub %r9d, %edx
342 jnz LABEL(less32bytes) /* mismatch or null char seen */
343 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
344 UPDATE_STRNCMP_COUNTER
345
346 mov $16, %rcx /* index for loads*/
347 mov $1, %r9d /* byte position left over from less32bytes case */
348 /*
349 * Setup %r10 value allows us to detect crossing a page boundary.
350 * When %r10 goes positive we have crossed a page boundary and
351 * need to do a nibble.
352 */
353 lea 1(%rdi), %r10
354 and $0xfff, %r10 /* offset into 4K page */
355 sub $0x1000, %r10 /* subtract 4K pagesize */
356 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
357
358 .p2align 4
359LABEL(loop_ashr_1_use):
360 add $16, %r10
361 jg LABEL(nibble_ashr_1_use)
362
363LABEL(nibble_ashr_1_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 1 byte into the previous chunk,
    lined up with the %rsi chunk at the same index. */
364 movdqa (%rdi, %rdx), %xmm0
365 palignr $1, -16(%rdi, %rdx), %xmm0
366# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
367 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
368# else
369 movdqa (%rsi,%rdx), %xmm1
370 TOLOWER (%xmm0, %xmm1)
371 pcmpistri $0x1a, %xmm1, %xmm0
372# endif
373 jbe LABEL(exit_use)
374# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
375 sub $16, %r11
376 jbe LABEL(strcmp_exitz)
377# endif
378
379 add $16, %rdx
380 add $16, %r10
381 jg LABEL(nibble_ashr_1_use)
382
 /* Second copy of the loop body (unrolled by two). */
383 movdqa (%rdi, %rdx), %xmm0
384 palignr $1, -16(%rdi, %rdx), %xmm0
385# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
386 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
387# else
388 movdqa (%rsi,%rdx), %xmm1
389 TOLOWER (%xmm0, %xmm1)
390 pcmpistri $0x1a, %xmm1, %xmm0
391# endif
392 jbe LABEL(exit_use)
393# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
394 sub $16, %r11
395 jbe LABEL(strcmp_exitz)
396# endif
397 add $16, %rdx
398 jmp LABEL(loop_ashr_1_use)
399
400 .p2align 4
401LABEL(nibble_ashr_1_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 15 tail bytes
    are non-null (%ecx > 14) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
402 sub $0x1000, %r10
403 movdqa -16(%rdi, %rdx), %xmm0
404 psrldq $1, %xmm0
405 pcmpistri $0x3a,%xmm0, %xmm0
406# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
407 cmp %r11, %rcx
408 jae LABEL(nibble_ashr_exit_use)
409# endif
410 cmp $14, %ecx
411 ja LABEL(nibble_ashr_1_restart_use)
412
413 jmp LABEL(nibble_ashr_exit_use)
414
415/*
416 * The following cases will be handled by ashr_2
417 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
418 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
419 */
420 .p2align 4
421LABEL(ashr_2):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 2 bytes. */
422 pslldq $14, %xmm2 /* shift the %rdi chunk up to line up with %xmm1 */
423 TOLOWER (%xmm1, %xmm2)
424 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
425 psubb %xmm0, %xmm2 /* clear the top bit where %xmm1 has a null */
426 pmovmskb %xmm2, %r9d
427 shr %cl, %edx /* adjust 0xffff for offset */
428 shr %cl, %r9d /* drop bits of bytes before the string start */
429 sub %r9d, %edx
430 jnz LABEL(less32bytes) /* mismatch or null char seen */
431 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
432 UPDATE_STRNCMP_COUNTER
433
434 mov $16, %rcx /* index for loads */
435 mov $2, %r9d /* byte position left over from less32bytes case */
436 /*
437 * Setup %r10 value allows us to detect crossing a page boundary.
438 * When %r10 goes positive we have crossed a page boundary and
439 * need to do a nibble.
440 */
441 lea 2(%rdi), %r10
442 and $0xfff, %r10 /* offset into 4K page */
443 sub $0x1000, %r10 /* subtract 4K pagesize */
444 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
445
446 .p2align 4
447LABEL(loop_ashr_2_use):
448 add $16, %r10
449 jg LABEL(nibble_ashr_2_use)
450
451LABEL(nibble_ashr_2_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 2 bytes into the previous chunk,
    lined up with the %rsi chunk at the same index. */
452 movdqa (%rdi, %rdx), %xmm0
453 palignr $2, -16(%rdi, %rdx), %xmm0
454# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
455 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
456# else
457 movdqa (%rsi,%rdx), %xmm1
458 TOLOWER (%xmm0, %xmm1)
459 pcmpistri $0x1a, %xmm1, %xmm0
460# endif
461 jbe LABEL(exit_use)
462# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
463 sub $16, %r11
464 jbe LABEL(strcmp_exitz)
465# endif
466
467 add $16, %rdx
468 add $16, %r10
469 jg LABEL(nibble_ashr_2_use)
470
 /* Second copy of the loop body (unrolled by two). */
471 movdqa (%rdi, %rdx), %xmm0
472 palignr $2, -16(%rdi, %rdx), %xmm0
473# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
474 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
475# else
476 movdqa (%rsi,%rdx), %xmm1
477 TOLOWER (%xmm0, %xmm1)
478 pcmpistri $0x1a, %xmm1, %xmm0
479# endif
480 jbe LABEL(exit_use)
481# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
482 sub $16, %r11
483 jbe LABEL(strcmp_exitz)
484# endif
485 add $16, %rdx
486 jmp LABEL(loop_ashr_2_use)
487
488 .p2align 4
489LABEL(nibble_ashr_2_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 14 tail bytes
    are non-null (%ecx > 13) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
490 sub $0x1000, %r10
491 movdqa -16(%rdi, %rdx), %xmm0
492 psrldq $2, %xmm0
493 pcmpistri $0x3a,%xmm0, %xmm0
494# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
495 cmp %r11, %rcx
496 jae LABEL(nibble_ashr_exit_use)
497# endif
498 cmp $13, %ecx
499 ja LABEL(nibble_ashr_2_restart_use)
500
501 jmp LABEL(nibble_ashr_exit_use)
502
503/*
504 * The following cases will be handled by ashr_3
505 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
506 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
507 */
508 .p2align 4
509LABEL(ashr_3):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 3 bytes. */
510 pslldq $13, %xmm2 /* shift the %rdi chunk up to line up with %xmm1 */
511 TOLOWER (%xmm1, %xmm2)
512 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
513 psubb %xmm0, %xmm2 /* clear the top bit where %xmm1 has a null */
514 pmovmskb %xmm2, %r9d
515 shr %cl, %edx /* adjust 0xffff for offset */
516 shr %cl, %r9d /* drop bits of bytes before the string start */
517 sub %r9d, %edx
518 jnz LABEL(less32bytes) /* mismatch or null char seen */
519 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
520
521 UPDATE_STRNCMP_COUNTER
522
523 mov $16, %rcx /* index for loads */
524 mov $3, %r9d /* byte position left over from less32bytes case */
525 /*
526 * Setup %r10 value allows us to detect crossing a page boundary.
527 * When %r10 goes positive we have crossed a page boundary and
528 * need to do a nibble.
529 */
530 lea 3(%rdi), %r10
531 and $0xfff, %r10 /* offset into 4K page */
532 sub $0x1000, %r10 /* subtract 4K pagesize */
533 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
534
535LABEL(loop_ashr_3_use):
536 add $16, %r10
537 jg LABEL(nibble_ashr_3_use)
538
539LABEL(nibble_ashr_3_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 3 bytes into the previous chunk,
    lined up with the %rsi chunk at the same index. */
540 movdqa (%rdi, %rdx), %xmm0
541 palignr $3, -16(%rdi, %rdx), %xmm0
542# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
543 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
544# else
545 movdqa (%rsi,%rdx), %xmm1
546 TOLOWER (%xmm0, %xmm1)
547 pcmpistri $0x1a, %xmm1, %xmm0
548# endif
549 jbe LABEL(exit_use)
550# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
551 sub $16, %r11
552 jbe LABEL(strcmp_exitz)
553# endif
554
555 add $16, %rdx
556 add $16, %r10
557 jg LABEL(nibble_ashr_3_use)
558
 /* Second copy of the loop body (unrolled by two). */
559 movdqa (%rdi, %rdx), %xmm0
560 palignr $3, -16(%rdi, %rdx), %xmm0
561# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
562 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
563# else
564 movdqa (%rsi,%rdx), %xmm1
565 TOLOWER (%xmm0, %xmm1)
566 pcmpistri $0x1a, %xmm1, %xmm0
567# endif
568 jbe LABEL(exit_use)
569# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
570 sub $16, %r11
571 jbe LABEL(strcmp_exitz)
572# endif
573 add $16, %rdx
574 jmp LABEL(loop_ashr_3_use)
575
576 .p2align 4
577LABEL(nibble_ashr_3_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 13 tail bytes
    are non-null (%ecx > 12) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
578 sub $0x1000, %r10
579 movdqa -16(%rdi, %rdx), %xmm0
580 psrldq $3, %xmm0
581 pcmpistri $0x3a,%xmm0, %xmm0
582# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
583 cmp %r11, %rcx
584 jae LABEL(nibble_ashr_exit_use)
585# endif
586 cmp $12, %ecx
587 ja LABEL(nibble_ashr_3_restart_use)
588
589 jmp LABEL(nibble_ashr_exit_use)
590
591/*
592 * The following cases will be handled by ashr_4
593 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
594 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
595 */
596 .p2align 4
597LABEL(ashr_4):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 4 bytes. */
598 pslldq $12, %xmm2 /* shift the %rdi chunk up to line up with %xmm1 */
599 TOLOWER (%xmm1, %xmm2)
600 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
601 psubb %xmm0, %xmm2 /* clear the top bit where %xmm1 has a null */
602 pmovmskb %xmm2, %r9d
603 shr %cl, %edx /* adjust 0xffff for offset */
604 shr %cl, %r9d /* drop bits of bytes before the string start */
605 sub %r9d, %edx
606 jnz LABEL(less32bytes) /* mismatch or null char seen */
607 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
608
609 UPDATE_STRNCMP_COUNTER
610
611 mov $16, %rcx /* index for loads */
612 mov $4, %r9d /* byte position left over from less32bytes case */
613 /*
614 * Setup %r10 value allows us to detect crossing a page boundary.
615 * When %r10 goes positive we have crossed a page boundary and
616 * need to do a nibble.
617 */
618 lea 4(%rdi), %r10
619 and $0xfff, %r10 /* offset into 4K page */
620 sub $0x1000, %r10 /* subtract 4K pagesize */
621 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
622
623 .p2align 4
624LABEL(loop_ashr_4_use):
625 add $16, %r10
626 jg LABEL(nibble_ashr_4_use)
627
628LABEL(nibble_ashr_4_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 4 bytes into the previous chunk,
    lined up with the %rsi chunk at the same index. */
629 movdqa (%rdi, %rdx), %xmm0
630 palignr $4, -16(%rdi, %rdx), %xmm0
631# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
632 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
633# else
634 movdqa (%rsi,%rdx), %xmm1
635 TOLOWER (%xmm0, %xmm1)
636 pcmpistri $0x1a, %xmm1, %xmm0
637# endif
638 jbe LABEL(exit_use)
639# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
640 sub $16, %r11
641 jbe LABEL(strcmp_exitz)
642# endif
643
644 add $16, %rdx
645 add $16, %r10
646 jg LABEL(nibble_ashr_4_use)
647
 /* Second copy of the loop body (unrolled by two). */
648 movdqa (%rdi, %rdx), %xmm0
649 palignr $4, -16(%rdi, %rdx), %xmm0
650# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
651 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
652# else
653 movdqa (%rsi,%rdx), %xmm1
654 TOLOWER (%xmm0, %xmm1)
655 pcmpistri $0x1a, %xmm1, %xmm0
656# endif
657 jbe LABEL(exit_use)
658# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
659 sub $16, %r11
660 jbe LABEL(strcmp_exitz)
661# endif
662 add $16, %rdx
663 jmp LABEL(loop_ashr_4_use)
664
665 .p2align 4
666LABEL(nibble_ashr_4_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 12 tail bytes
    are non-null (%ecx > 11) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
667 sub $0x1000, %r10
668 movdqa -16(%rdi, %rdx), %xmm0
669 psrldq $4, %xmm0
670 pcmpistri $0x3a,%xmm0, %xmm0
671# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
672 cmp %r11, %rcx
673 jae LABEL(nibble_ashr_exit_use)
674# endif
675 cmp $11, %ecx
676 ja LABEL(nibble_ashr_4_restart_use)
677
678 jmp LABEL(nibble_ashr_exit_use)
679
680/*
681 * The following cases will be handled by ashr_5
682 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
683 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
684 */
685 .p2align 4
686LABEL(ashr_5):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 5 bytes. */
687 pslldq $11, %xmm2 /* shift the %rdi chunk up to line up with %xmm1 */
688 TOLOWER (%xmm1, %xmm2)
689 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
690 psubb %xmm0, %xmm2 /* clear the top bit where %xmm1 has a null */
691 pmovmskb %xmm2, %r9d
692 shr %cl, %edx /* adjust 0xffff for offset */
693 shr %cl, %r9d /* drop bits of bytes before the string start */
694 sub %r9d, %edx
695 jnz LABEL(less32bytes) /* mismatch or null char seen */
696 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
697
698 UPDATE_STRNCMP_COUNTER
699
700 mov $16, %rcx /* index for loads */
701 mov $5, %r9d /* byte position left over from less32bytes case */
702 /*
703 * Setup %r10 value allows us to detect crossing a page boundary.
704 * When %r10 goes positive we have crossed a page boundary and
705 * need to do a nibble.
706 */
707 lea 5(%rdi), %r10
708 and $0xfff, %r10 /* offset into 4K page */
709 sub $0x1000, %r10 /* subtract 4K pagesize */
710 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
711
712 .p2align 4
713LABEL(loop_ashr_5_use):
714 add $16, %r10
715 jg LABEL(nibble_ashr_5_use)
716
717LABEL(nibble_ashr_5_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 5 bytes into the previous chunk,
    lined up with the %rsi chunk at the same index. */
718 movdqa (%rdi, %rdx), %xmm0
719 palignr $5, -16(%rdi, %rdx), %xmm0
720# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
721 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
722# else
723 movdqa (%rsi,%rdx), %xmm1
724 TOLOWER (%xmm0, %xmm1)
725 pcmpistri $0x1a, %xmm1, %xmm0
726# endif
727 jbe LABEL(exit_use)
728# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
729 sub $16, %r11
730 jbe LABEL(strcmp_exitz)
731# endif
732
733 add $16, %rdx
734 add $16, %r10
735 jg LABEL(nibble_ashr_5_use)
736
 /* Second copy of the loop body (unrolled by two). */
737 movdqa (%rdi, %rdx), %xmm0
738
739 palignr $5, -16(%rdi, %rdx), %xmm0
740# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
741 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
742# else
743 movdqa (%rsi,%rdx), %xmm1
744 TOLOWER (%xmm0, %xmm1)
745 pcmpistri $0x1a, %xmm1, %xmm0
746# endif
747 jbe LABEL(exit_use)
748# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
749 sub $16, %r11
750 jbe LABEL(strcmp_exitz)
751# endif
752 add $16, %rdx
753 jmp LABEL(loop_ashr_5_use)
754
755 .p2align 4
756LABEL(nibble_ashr_5_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 11 tail bytes
    are non-null (%ecx > 10) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
757 sub $0x1000, %r10
758 movdqa -16(%rdi, %rdx), %xmm0
759 psrldq $5, %xmm0
760 pcmpistri $0x3a,%xmm0, %xmm0
761# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
762 cmp %r11, %rcx
763 jae LABEL(nibble_ashr_exit_use)
764# endif
765 cmp $10, %ecx
766 ja LABEL(nibble_ashr_5_restart_use)
767
768 jmp LABEL(nibble_ashr_exit_use)
769
770/*
771 * The following cases will be handled by ashr_6
772 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
773 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
774 */
775 .p2align 4
776LABEL(ashr_6):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 6 bytes. */
777 pslldq $10, %xmm2 /* shift the %rdi chunk up to line up with %xmm1 */
778 TOLOWER (%xmm1, %xmm2)
779 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
780 psubb %xmm0, %xmm2 /* clear the top bit where %xmm1 has a null */
781 pmovmskb %xmm2, %r9d
782 shr %cl, %edx /* adjust 0xffff for offset */
783 shr %cl, %r9d /* drop bits of bytes before the string start */
784 sub %r9d, %edx
785 jnz LABEL(less32bytes) /* mismatch or null char seen */
786 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
787
788 UPDATE_STRNCMP_COUNTER
789
790 mov $16, %rcx /* index for loads */
791 mov $6, %r9d /* byte position left over from less32bytes case */
792 /*
793 * Setup %r10 value allows us to detect crossing a page boundary.
794 * When %r10 goes positive we have crossed a page boundary and
795 * need to do a nibble.
796 */
797 lea 6(%rdi), %r10
798 and $0xfff, %r10 /* offset into 4K page */
799 sub $0x1000, %r10 /* subtract 4K pagesize */
800 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
801
802 .p2align 4
803LABEL(loop_ashr_6_use):
804 add $16, %r10
805 jg LABEL(nibble_ashr_6_use)
806
807LABEL(nibble_ashr_6_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 6 bytes into the previous chunk,
    lined up with the %rsi chunk at the same index. */
808 movdqa (%rdi, %rdx), %xmm0
809 palignr $6, -16(%rdi, %rdx), %xmm0
810# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
811 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
812# else
813 movdqa (%rsi,%rdx), %xmm1
814 TOLOWER (%xmm0, %xmm1)
815 pcmpistri $0x1a, %xmm1, %xmm0
816# endif
817 jbe LABEL(exit_use)
818# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
819 sub $16, %r11
820 jbe LABEL(strcmp_exitz)
821# endif
822
823 add $16, %rdx
824 add $16, %r10
825 jg LABEL(nibble_ashr_6_use)
826
 /* Second copy of the loop body (unrolled by two). */
827 movdqa (%rdi, %rdx), %xmm0
828 palignr $6, -16(%rdi, %rdx), %xmm0
829# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
830 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
831# else
832 movdqa (%rsi,%rdx), %xmm1
833 TOLOWER (%xmm0, %xmm1)
834 pcmpistri $0x1a, %xmm1, %xmm0
835# endif
836 jbe LABEL(exit_use)
837# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
838 sub $16, %r11
839 jbe LABEL(strcmp_exitz)
840# endif
841 add $16, %rdx
842 jmp LABEL(loop_ashr_6_use)
843
844 .p2align 4
845LABEL(nibble_ashr_6_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 10 tail bytes
    are non-null (%ecx > 9) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
846 sub $0x1000, %r10
847 movdqa -16(%rdi, %rdx), %xmm0
848 psrldq $6, %xmm0
849 pcmpistri $0x3a,%xmm0, %xmm0
850# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
851 cmp %r11, %rcx
852 jae LABEL(nibble_ashr_exit_use)
853# endif
854 cmp $9, %ecx
855 ja LABEL(nibble_ashr_6_restart_use)
856
857 jmp LABEL(nibble_ashr_exit_use)
858
859/*
860 * The following cases will be handled by ashr_7
861 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
862 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
863 */
864 .p2align 4
865LABEL(ashr_7):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 7 bytes. */
866 pslldq $9, %xmm2 /* shift the %rdi chunk up to line up with %xmm1 */
867 TOLOWER (%xmm1, %xmm2)
868 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
869 psubb %xmm0, %xmm2 /* clear the top bit where %xmm1 has a null */
870 pmovmskb %xmm2, %r9d
871 shr %cl, %edx /* adjust 0xffff for offset */
872 shr %cl, %r9d /* drop bits of bytes before the string start */
873 sub %r9d, %edx
874 jnz LABEL(less32bytes) /* mismatch or null char seen */
875 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
876
877 UPDATE_STRNCMP_COUNTER
878
879 mov $16, %rcx /* index for loads */
880 mov $7, %r9d /* byte position left over from less32bytes case */
881 /*
882 * Setup %r10 value allows us to detect crossing a page boundary.
883 * When %r10 goes positive we have crossed a page boundary and
884 * need to do a nibble.
885 */
886 lea 7(%rdi), %r10
887 and $0xfff, %r10 /* offset into 4K page */
888 sub $0x1000, %r10 /* subtract 4K pagesize */
889 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
890
891 .p2align 4
892LABEL(loop_ashr_7_use):
893 add $16, %r10
894 jg LABEL(nibble_ashr_7_use)
895
896LABEL(nibble_ashr_7_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 7 bytes into the previous chunk,
    lined up with the %rsi chunk at the same index. */
897 movdqa (%rdi, %rdx), %xmm0
898 palignr $7, -16(%rdi, %rdx), %xmm0
899# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
900 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
901# else
902 movdqa (%rsi,%rdx), %xmm1
903 TOLOWER (%xmm0, %xmm1)
904 pcmpistri $0x1a, %xmm1, %xmm0
905# endif
906 jbe LABEL(exit_use)
907# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
908 sub $16, %r11
909 jbe LABEL(strcmp_exitz)
910# endif
911
912 add $16, %rdx
913 add $16, %r10
914 jg LABEL(nibble_ashr_7_use)
915
 /* Second copy of the loop body (unrolled by two). */
916 movdqa (%rdi, %rdx), %xmm0
917 palignr $7, -16(%rdi, %rdx), %xmm0
918# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
919 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
920# else
921 movdqa (%rsi,%rdx), %xmm1
922 TOLOWER (%xmm0, %xmm1)
923 pcmpistri $0x1a, %xmm1, %xmm0
924# endif
925 jbe LABEL(exit_use)
926# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
927 sub $16, %r11
928 jbe LABEL(strcmp_exitz)
929# endif
930 add $16, %rdx
931 jmp LABEL(loop_ashr_7_use)
932
933 .p2align 4
934LABEL(nibble_ashr_7_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 9 tail bytes
    are non-null (%ecx > 8) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
935 sub $0x1000, %r10
936 movdqa -16(%rdi, %rdx), %xmm0
937 psrldq $7, %xmm0
938 pcmpistri $0x3a,%xmm0, %xmm0
939# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
940 cmp %r11, %rcx
941 jae LABEL(nibble_ashr_exit_use)
942# endif
943 cmp $8, %ecx
944 ja LABEL(nibble_ashr_7_restart_use)
945
946 jmp LABEL(nibble_ashr_exit_use)
947
948/*
949 * The following cases will be handled by ashr_8
950 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
951 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
952 */
953 .p2align 4
954LABEL(ashr_8):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 8 bytes. */
955 pslldq $8, %xmm2 /* shift the %rdi chunk up to line up with %xmm1 */
956 TOLOWER (%xmm1, %xmm2)
957 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
958 psubb %xmm0, %xmm2 /* clear the top bit where %xmm1 has a null */
959 pmovmskb %xmm2, %r9d
960 shr %cl, %edx /* adjust 0xffff for offset */
961 shr %cl, %r9d /* drop bits of bytes before the string start */
962 sub %r9d, %edx
963 jnz LABEL(less32bytes) /* mismatch or null char seen */
964 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
965
966 UPDATE_STRNCMP_COUNTER
967
968 mov $16, %rcx /* index for loads */
969 mov $8, %r9d /* byte position left over from less32bytes case */
970 /*
971 * Setup %r10 value allows us to detect crossing a page boundary.
972 * When %r10 goes positive we have crossed a page boundary and
973 * need to do a nibble.
974 */
975 lea 8(%rdi), %r10
976 and $0xfff, %r10 /* offset into 4K page */
977 sub $0x1000, %r10 /* subtract 4K pagesize */
978 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
979
980 .p2align 4
981LABEL(loop_ashr_8_use):
982 add $16, %r10
983 jg LABEL(nibble_ashr_8_use)
984
985LABEL(nibble_ashr_8_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 8 bytes into the previous chunk,
    lined up with the %rsi chunk at the same index. */
986 movdqa (%rdi, %rdx), %xmm0
987 palignr $8, -16(%rdi, %rdx), %xmm0
988# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
989 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
990# else
991 movdqa (%rsi,%rdx), %xmm1
992 TOLOWER (%xmm0, %xmm1)
993 pcmpistri $0x1a, %xmm1, %xmm0
994# endif
995 jbe LABEL(exit_use)
996# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
997 sub $16, %r11
998 jbe LABEL(strcmp_exitz)
999# endif
1000
1001 add $16, %rdx
1002 add $16, %r10
1003 jg LABEL(nibble_ashr_8_use)
1004
 /* Second copy of the loop body (unrolled by two). */
1005 movdqa (%rdi, %rdx), %xmm0
1006 palignr $8, -16(%rdi, %rdx), %xmm0
1007# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1008 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1009# else
1010 movdqa (%rsi,%rdx), %xmm1
1011 TOLOWER (%xmm0, %xmm1)
1012 pcmpistri $0x1a, %xmm1, %xmm0
1013# endif
1014 jbe LABEL(exit_use)
1015# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1016 sub $16, %r11
1017 jbe LABEL(strcmp_exitz)
1018# endif
1019 add $16, %rdx
1020 jmp LABEL(loop_ashr_8_use)
1021
1022 .p2align 4
1023LABEL(nibble_ashr_8_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 8 tail bytes
    are non-null (%ecx > 7) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
1024 sub $0x1000, %r10
1025 movdqa -16(%rdi, %rdx), %xmm0
1026 psrldq $8, %xmm0
1027 pcmpistri $0x3a,%xmm0, %xmm0
1028# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
1029 cmp %r11, %rcx
1030 jae LABEL(nibble_ashr_exit_use)
1031# endif
1032 cmp $7, %ecx
1033 ja LABEL(nibble_ashr_8_restart_use)
1034
1035 jmp LABEL(nibble_ashr_exit_use)
1036
1037/*
1038 * The following cases will be handled by ashr_9
1039 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1040 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1041 */
1042 .p2align 4
1043LABEL(ashr_9):
 /* Reached through the jump table with %xmm1 = the aligned 16 bytes at
    (%rsi), %xmm2 = those at (%rdi), %xmm0 = null-byte mask of %xmm1,
    %edx = 0xffff and %cl = offset of the %rsi string in its chunk.
    The first partial chunk is checked here; the loop below compares
    the rest with the %rdi data realigned by 9 bytes. */
1044 pslldq $7, %xmm2 /* shift the %rdi chunk up to line up with %xmm1 */
1045 TOLOWER (%xmm1, %xmm2)
1046 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
1047 psubb %xmm0, %xmm2 /* clear the top bit where %xmm1 has a null */
1048 pmovmskb %xmm2, %r9d
1049 shr %cl, %edx /* adjust 0xffff for offset */
1050 shr %cl, %r9d /* drop bits of bytes before the string start */
1051 sub %r9d, %edx
1052 jnz LABEL(less32bytes) /* mismatch or null char seen */
1053 movdqa (%rdi), %xmm3 /* copy of the first %rdi chunk; not read again in this section */
1054
1055 UPDATE_STRNCMP_COUNTER
1056
1057 mov $16, %rcx /* index for loads */
1058 mov $9, %r9d /* byte position left over from less32bytes case */
1059 /*
1060 * Setup %r10 value allows us to detect crossing a page boundary.
1061 * When %r10 goes positive we have crossed a page boundary and
1062 * need to do a nibble.
1063 */
1064 lea 9(%rdi), %r10
1065 and $0xfff, %r10 /* offset into 4K page */
1066 sub $0x1000, %r10 /* subtract 4K pagesize */
1067 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1068
1069 .p2align 4
1070LABEL(loop_ashr_9_use):
1071 add $16, %r10
1072 jg LABEL(nibble_ashr_9_use)
1073
1074LABEL(nibble_ashr_9_restart_use):
 /* palignr joins the previous and the current %rdi chunk so that
    %xmm0 holds the 16 bytes starting 9 bytes into the previous chunk,
    lined up with the %rsi chunk at the same index. */
1075 movdqa (%rdi, %rdx), %xmm0
1076
1077 palignr $9, -16(%rdi, %rdx), %xmm0
1078# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1079 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1080# else
1081 movdqa (%rsi,%rdx), %xmm1
1082 TOLOWER (%xmm0, %xmm1)
1083 pcmpistri $0x1a, %xmm1, %xmm0
1084# endif
1085 jbe LABEL(exit_use)
1086# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1087 sub $16, %r11
1088 jbe LABEL(strcmp_exitz)
1089# endif
1090
1091 add $16, %rdx
1092 add $16, %r10
1093 jg LABEL(nibble_ashr_9_use)
1094
 /* Second copy of the loop body (unrolled by two). */
1095 movdqa (%rdi, %rdx), %xmm0
1096 palignr $9, -16(%rdi, %rdx), %xmm0
1097# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1098 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1099# else
1100 movdqa (%rsi,%rdx), %xmm1
1101 TOLOWER (%xmm0, %xmm1)
1102 pcmpistri $0x1a, %xmm1, %xmm0
1103# endif
1104 jbe LABEL(exit_use)
1105# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1106 sub $16, %r11
1107 jbe LABEL(strcmp_exitz)
1108# endif
1109 add $16, %rdx
1110 jmp LABEL(loop_ashr_9_use)
1111
1112 .p2align 4
1113LABEL(nibble_ashr_9_use):
 /* The next %rdi chunk lies on a new page.  Re-arm %r10, then look
    for a null byte in the unread tail of the previous chunk:
    pcmpistri $0x3a of a register with itself is used to get the
    index of the first null byte in %ecx.  Only if all 7 tail bytes
    are non-null (%ecx > 6) does the loop restart and load the next
    page; otherwise control leaves through nibble_ashr_exit_use. */
1114 sub $0x1000, %r10
1115 movdqa -16(%rdi, %rdx), %xmm0
1116 psrldq $9, %xmm0
1117 pcmpistri $0x3a,%xmm0, %xmm0
1118# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also leave when the byte limit ends within the tail. */
1119 cmp %r11, %rcx
1120 jae LABEL(nibble_ashr_exit_use)
1121# endif
1122 cmp $6, %ecx
1123 ja LABEL(nibble_ashr_9_restart_use)
1124
1125 jmp LABEL(nibble_ashr_exit_use)
1126
/*
 * The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(6~15)            n - 6           9(15 +(n - 6) - n)        ashr_10
 *
 * Registers used by the loop below:
 *   %rdx  running 16-byte index applied to both %rdi and %rsi
 *   %r9   constant 10, used by exit_use to rebuild the %rdi address
 *   %r10  page-crossing counter (see the setup comment below)
 *   %r11  remaining byte count (strncmp/strncasecmp builds only)
 * NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared before this
 * label, outside this excerpt; confirm their meaning against the prologue.
 */
	.p2align 4
LABEL(ashr_10):
	/* Head check: shift %xmm2 left by 6 (= 16 - 10) bytes, compare it
	   bytewise with %xmm1, subtract %xmm0 and turn the result into a
	   bit mask.  That mask and %edx are both shifted right by %cl; if
	   they differ, less32bytes resolves the result from %edx.  */
	pslldq	$6, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	/* Main loop, unrolled twice.  Each step builds 16 bytes of the
	   %rdi string from two adjacent aligned blocks (palignr $10) and
	   compares them with the 16 bytes at (%rsi,%rdx).  */
	.p2align 4
LABEL(loop_ashr_10_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10_use)	/* %r10 > 0: page-boundary check first */

LABEL(nibble_ashr_10_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = bytes 10..15 of the previous block followed by bytes
	   0..9 of the current block.  */
	palignr	$10, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	/* CF = a differing byte was found (index in %ecx), ZF = the %rsi
	   chunk contains a NUL; either one ends the loop.  */
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_10_use)

	/* Second copy of the loop body.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$10, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_10_use)

	/* Page-boundary check.  Before loading the next %rdi block, look at
	   the 6 (= 16 - 10) bytes of the previous block that have not been
	   compared yet.  psrldq shifts zeros in above them, so pcmpistri
	   $0x3a (the 0x1a control with masked negative polarity) on the
	   register against itself yields in %ecx the index of the first
	   NUL among those bytes, or 6 when none of them is NUL.  */
	.p2align 4
LABEL(nibble_ashr_10_use):
	sub	$0x1000, %r10		/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$10, %xmm0
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* index >= remaining count */
# endif
	cmp	$5, %ecx
	ja	LABEL(nibble_ashr_10_restart_use)	/* all 6 bytes non-NUL: continue */

	jmp	LABEL(nibble_ashr_exit_use)
1215
/*
 * The following cases will be handled by ashr_11
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(5~15)            n - 5           10(15 +(n - 5) - n)       ashr_11
 *
 * Registers used by the loop below:
 *   %rdx  running 16-byte index applied to both %rdi and %rsi
 *   %r9   constant 11, used by exit_use to rebuild the %rdi address
 *   %r10  page-crossing counter (see the setup comment below)
 *   %r11  remaining byte count (strncmp/strncasecmp builds only)
 * NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared before this
 * label, outside this excerpt; confirm their meaning against the prologue.
 */
	.p2align 4
LABEL(ashr_11):
	/* Head check: shift %xmm2 left by 5 (= 16 - 11) bytes, compare it
	   bytewise with %xmm1, subtract %xmm0 and turn the result into a
	   bit mask.  That mask and %edx are both shifted right by %cl; if
	   they differ, less32bytes resolves the result from %edx.  */
	pslldq	$5, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	/* Main loop, unrolled twice.  Each step builds 16 bytes of the
	   %rdi string from two adjacent aligned blocks (palignr $11) and
	   compares them with the 16 bytes at (%rsi,%rdx).  */
	.p2align 4
LABEL(loop_ashr_11_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11_use)	/* %r10 > 0: page-boundary check first */

LABEL(nibble_ashr_11_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = bytes 11..15 of the previous block followed by bytes
	   0..10 of the current block.  */
	palignr	$11, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	/* CF = a differing byte was found (index in %ecx), ZF = the %rsi
	   chunk contains a NUL; either one ends the loop.  */
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_11_use)

	/* Second copy of the loop body.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$11, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_11_use)

	/* Page-boundary check.  Before loading the next %rdi block, look at
	   the 5 (= 16 - 11) bytes of the previous block that have not been
	   compared yet.  psrldq shifts zeros in above them, so pcmpistri
	   $0x3a (the 0x1a control with masked negative polarity) on the
	   register against itself yields in %ecx the index of the first
	   NUL among those bytes, or 5 when none of them is NUL.  */
	.p2align 4
LABEL(nibble_ashr_11_use):
	sub	$0x1000, %r10		/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$11, %xmm0
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* index >= remaining count */
# endif
	cmp	$4, %ecx
	ja	LABEL(nibble_ashr_11_restart_use)	/* all 5 bytes non-NUL: continue */

	jmp	LABEL(nibble_ashr_exit_use)
1304
/*
 * The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(4~15)            n - 4           11(15 +(n - 4) - n)       ashr_12
 *
 * Registers used by the loop below:
 *   %rdx  running 16-byte index applied to both %rdi and %rsi
 *   %r9   constant 12, used by exit_use to rebuild the %rdi address
 *   %r10  page-crossing counter (see the setup comment below)
 *   %r11  remaining byte count (strncmp/strncasecmp builds only)
 * NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared before this
 * label, outside this excerpt; confirm their meaning against the prologue.
 */
	.p2align 4
LABEL(ashr_12):
	/* Head check: shift %xmm2 left by 4 (= 16 - 12) bytes, compare it
	   bytewise with %xmm1, subtract %xmm0 and turn the result into a
	   bit mask.  That mask and %edx are both shifted right by %cl; if
	   they differ, less32bytes resolves the result from %edx.  */
	pslldq	$4, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	/* Main loop, unrolled twice.  Each step builds 16 bytes of the
	   %rdi string from two adjacent aligned blocks (palignr $12) and
	   compares them with the 16 bytes at (%rsi,%rdx).  */
	.p2align 4
LABEL(loop_ashr_12_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use)	/* %r10 > 0: page-boundary check first */

LABEL(nibble_ashr_12_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = bytes 12..15 of the previous block followed by bytes
	   0..11 of the current block.  */
	palignr	$12, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	/* CF = a differing byte was found (index in %ecx), ZF = the %rsi
	   chunk contains a NUL; either one ends the loop.  */
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use)

	/* Second copy of the loop body.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$12, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_12_use)

	/* Page-boundary check.  Before loading the next %rdi block, look at
	   the 4 (= 16 - 12) bytes of the previous block that have not been
	   compared yet.  psrldq shifts zeros in above them, so pcmpistri
	   $0x3a (the 0x1a control with masked negative polarity) on the
	   register against itself yields in %ecx the index of the first
	   NUL among those bytes, or 4 when none of them is NUL.  */
	.p2align 4
LABEL(nibble_ashr_12_use):
	sub	$0x1000, %r10		/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$12, %xmm0
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* index >= remaining count */
# endif
	cmp	$3, %ecx
	ja	LABEL(nibble_ashr_12_restart_use)	/* all 4 bytes non-NUL: continue */

	jmp	LABEL(nibble_ashr_exit_use)
1393
/*
 * The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(3~15)            n - 3           12(15 +(n - 3) - n)       ashr_13
 *
 * Registers used by the loop below:
 *   %rdx  running 16-byte index applied to both %rdi and %rsi
 *   %r9   constant 13, used by exit_use to rebuild the %rdi address
 *   %r10  page-crossing counter (see the setup comment below)
 *   %r11  remaining byte count (strncmp/strncasecmp builds only)
 * NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared before this
 * label, outside this excerpt; confirm their meaning against the prologue.
 */
	.p2align 4
LABEL(ashr_13):
	/* Head check: shift %xmm2 left by 3 (= 16 - 13) bytes, compare it
	   bytewise with %xmm1, subtract %xmm0 and turn the result into a
	   bit mask.  That mask and %edx are both shifted right by %cl; if
	   they differ, less32bytes resolves the result from %edx.  */
	pslldq	$3, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	/* Main loop, unrolled twice.  Each step builds 16 bytes of the
	   %rdi string from two adjacent aligned blocks (palignr $13) and
	   compares them with the 16 bytes at (%rsi,%rdx).  */
	.p2align 4
LABEL(loop_ashr_13_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use)	/* %r10 > 0: page-boundary check first */

LABEL(nibble_ashr_13_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = bytes 13..15 of the previous block followed by bytes
	   0..12 of the current block.  */
	palignr	$13, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	/* CF = a differing byte was found (index in %ecx), ZF = the %rsi
	   chunk contains a NUL; either one ends the loop.  */
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use)

	/* Second copy of the loop body.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$13, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_13_use)

	/* Page-boundary check.  Before loading the next %rdi block, look at
	   the 3 (= 16 - 13) bytes of the previous block that have not been
	   compared yet.  psrldq shifts zeros in above them, so pcmpistri
	   $0x3a (the 0x1a control with masked negative polarity) on the
	   register against itself yields in %ecx the index of the first
	   NUL among those bytes, or 3 when none of them is NUL.  */
	.p2align 4
LABEL(nibble_ashr_13_use):
	sub	$0x1000, %r10		/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$13, %xmm0
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* index >= remaining count */
# endif
	cmp	$2, %ecx
	ja	LABEL(nibble_ashr_13_restart_use)	/* all 3 bytes non-NUL: continue */

	jmp	LABEL(nibble_ashr_exit_use)
1483
/*
 * The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(2~15)            n - 2           13(15 +(n - 2) - n)       ashr_14
 *
 * Registers used by the loop below:
 *   %rdx  running 16-byte index applied to both %rdi and %rsi
 *   %r9   constant 14, used by exit_use to rebuild the %rdi address
 *   %r10  page-crossing counter (see the setup comment below)
 *   %r11  remaining byte count (strncmp/strncasecmp builds only)
 * NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared before this
 * label, outside this excerpt; confirm their meaning against the prologue.
 */
	.p2align 4
LABEL(ashr_14):
	/* Head check: shift %xmm2 left by 2 (= 16 - 14) bytes, compare it
	   bytewise with %xmm1, subtract %xmm0 and turn the result into a
	   bit mask.  That mask and %edx are both shifted right by %cl; if
	   they differ, less32bytes resolves the result from %edx.  */
	pslldq	$2, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	/* Main loop, unrolled twice.  Each step builds 16 bytes of the
	   %rdi string from two adjacent aligned blocks (palignr $14) and
	   compares them with the 16 bytes at (%rsi,%rdx).  */
	.p2align 4
LABEL(loop_ashr_14_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use)	/* %r10 > 0: page-boundary check first */

LABEL(nibble_ashr_14_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = bytes 14..15 of the previous block followed by bytes
	   0..13 of the current block.  */
	palignr	$14, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	/* CF = a differing byte was found (index in %ecx), ZF = the %rsi
	   chunk contains a NUL; either one ends the loop.  */
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use)

	/* Second copy of the loop body.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$14, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_14_use)

	/* Page-boundary check.  Before loading the next %rdi block, look at
	   the 2 (= 16 - 14) bytes of the previous block that have not been
	   compared yet.  psrldq shifts zeros in above them, so pcmpistri
	   $0x3a (the 0x1a control with masked negative polarity) on the
	   register against itself yields in %ecx the index of the first
	   NUL among those bytes, or 2 when none of them is NUL.  */
	.p2align 4
LABEL(nibble_ashr_14_use):
	sub	$0x1000, %r10		/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$14, %xmm0
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* index >= remaining count */
# endif
	cmp	$1, %ecx
	ja	LABEL(nibble_ashr_14_restart_use)	/* both bytes non-NUL: continue */

	jmp	LABEL(nibble_ashr_exit_use)
1573
/*
 * The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(1~15)            n - 1           14(15 +(n - 1) - n)       ashr_15
 *
 * Registers used by the loop below:
 *   %rdx  running 16-byte index applied to both %rdi and %rsi
 *   %r9   constant 15, used by exit_use to rebuild the %rdi address
 *   %r10  page-crossing counter (see the setup comment below)
 *   %r11  remaining byte count (strncmp/strncasecmp builds only)
 * NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared before this
 * label, outside this excerpt; confirm their meaning against the prologue.
 */
	.p2align 4
LABEL(ashr_15):
	/* Head check: shift %xmm2 left by 1 (= 16 - 15) byte, compare it
	   bytewise with %xmm1, subtract %xmm0 and turn the result into a
	   bit mask.  That mask and %edx are both shifted right by %cl; if
	   they differ, less32bytes resolves the result from %edx.  */
	pslldq	$1, %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)

	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */

	sub	$0x1000, %r10	/* subtract 4K pagesize */

	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	/* Main loop, unrolled twice.  Each step builds 16 bytes of the
	   %rdi string from two adjacent aligned blocks (palignr $15) and
	   compares them with the 16 bytes at (%rsi,%rdx).  */
	.p2align 4
LABEL(loop_ashr_15_use):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use)	/* %r10 > 0: page-boundary check first */

LABEL(nibble_ashr_15_restart_use):
	movdqa	(%rdi, %rdx), %xmm0
	/* %xmm0 = byte 15 of the previous block followed by bytes 0..14
	   of the current block.  */
	palignr	$15, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	/* CF = a differing byte was found (index in %ecx), ZF = the %rsi
	   chunk contains a NUL; either one ends the loop.  */
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count exhausted with no difference */
# endif

	add	$16, %rdx
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use)

	/* Second copy of the loop body.  */
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$15, -16(%rdi, %rdx), %xmm0
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	jbe	LABEL(exit_use)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_15_use)

	/* Page-boundary check.  Before loading the next %rdi block, look at
	   the single (= 16 - 15) byte of the previous block that has not
	   been compared yet.  psrldq shifts zeros in above it, so pcmpistri
	   $0x3a (the 0x1a control with masked negative polarity) on the
	   register against itself yields %ecx = 0 when that byte is NUL
	   and 1 otherwise.  */
	.p2align 4
LABEL(nibble_ashr_15_use):
	sub	$0x1000, %r10		/* re-arm the page-crossing counter */
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$15, %xmm0
	pcmpistri $0x3a,%xmm0, %xmm0
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_exit_use)	/* index >= remaining count */
# endif
	/* Same decision as "cmp $0, %ecx; ja": continue only when %ecx is
	   non-zero, i.e. the leftover byte is not NUL.  */
	test	%ecx, %ecx
	jnz	LABEL(nibble_ashr_15_restart_use)
	/* Otherwise fall through into LABEL(nibble_ashr_exit_use).  */
1663
/* Shared exits of the ashr_N "_use" loops.

   nibble_ashr_exit_use: entered from a page-boundary check with the
   leftover %rdi bytes in %xmm0; compares them with the 16 bytes at
   (%rsi,%rdx) and continues into exit_use.

   exit_use: entered with the flags and %ecx of a pcmpistri $0x1a.
   CF clear means no differing byte was found, so the result is 0.
   Otherwise %ecx is the index of the first differing byte and the two
   bytes at that index are loaded, optionally swapped and subtracted.  */
LABEL(nibble_ashr_exit_use):
# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
# else
	movdqa	(%rsi,%rdx), %xmm1
	TOLOWER (%xmm0, %xmm1)
	pcmpistri $0x1a, %xmm1, %xmm0
# endif
	.p2align 4
LABEL(exit_use):
	jnc	LABEL(strcmp_exitz)
# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The difference lies at or beyond the remaining count: equal.  */
	sub	%rcx, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	add	%rcx, %rdx		/* %rdx = index of the differing byte */
	/* %r9 holds the N of the ashr_N loop; undo the alignment so that
	   (%rdi,%rdx) addresses the byte matching (%rsi,%rdx).  */
	lea	-16(%rdi, %r9), %rdi
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	/* Non-zero %r8d: exchange the two bytes before subtracting.
	   NOTE(review): %r8d is set outside this excerpt; presumably it
	   records that the operands were swapped on entry -- confirm.  */
	test	%r8d, %r8d
	jz	LABEL(ret_use)
	xchg	%eax, %edx
LABEL(ret_use):
# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the 4-byte-entry table at
	   _nl_C_LC_CTYPE_tolower + 128*4 before taking the difference.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
	movl	(%rcx,%rdx,4), %edx
	movl	(%rcx,%rax,4), %eax
# endif

	sub	%edx, %eax		/* return value = byte difference */
	ret
1695
/* Result found by a mask comparison.  less32bytes first adds the offsets
   in %rax and %rcx to %rdi and %rsi and, when %r8d is non-zero, swaps
   the two pointers back.  ret/less16bytes then take the lowest set bit
   of %rdx as the byte index, load the two bytes there and return their
   difference.  */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* The byte index lies at or beyond the remaining count: equal.  */
	sub	%rdx, %r11
	jbe	LABEL(strcmp_exitz)
# endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the 4-byte-entry table at
	   _nl_C_LC_CTYPE_tolower + 128*4 before taking the difference.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
# endif

	sub	%ecx, %eax		/* return value = byte difference */
	ret
1723
/* Common "equal" exit: return 0 in %eax.  */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret
1727
/* Return the difference of the first bytes at (%rdi) and (%rsi).
   XXX Same as the tail of LABEL(less16bytes) above, with index 0.  */
	.p2align 4
LABEL(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Map both bytes through the 4-byte-entry table at
	   _nl_C_LC_CTYPE_tolower + 128*4 before taking the difference.  */
	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
	movl	(%rdx,%rcx,4), %ecx
	movl	(%rdx,%rax,4), %eax
# endif

	sub	%ecx, %eax
	ret
/* End of STRCMP: close its CFI region and record the symbol size.  */
	cfi_endproc
	.size	STRCMP, .-STRCMP

/* Drop the helper macros used by the code above.  */
# undef UCLOW_reg
# undef UCHIGH_reg
# undef LCQWORD_reg
# undef TOLOWER
1749
/* Put all SSE 4.2 functions together.  */
	.section .rodata.SECTION,"a",@progbits
	.p2align 3
/* Table of 16 32-bit entries.  Each entry is the distance from the table
   itself to one ashr_N handler, so the table holds no absolute
   addresses.  Entries 0..14 are ashr_1..ashr_15 and entry 15 is ashr_0.
   NOTE(review): the code that indexes this table is outside this
   excerpt -- confirm the index computation there.  */
LABEL(unaligned_table):
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
1770
/* Remove the file-local macros (LABEL and SECTION are defined near the
   top of this file) and any macro definitions of the instruction names
   below, so that they do not leak into a file that includes this one.  */
# undef LABEL
# undef SECTION
# undef movdqa
# undef movdqu
# undef pmovmskb
# undef pcmpistri
# undef psubb
# undef pcmpeqb
# undef psrldq
# undef pslldq
# undef palignr
# undef pxor
# undef D
#endif	/* ISA_SHOULD_BUILD (2) */
1785

/* Source: glibc/sysdeps/x86_64/multiarch/strcmp-sse4_2.S */