1/* strcmp optimized with SSE2.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <isa-level.h>
20
21/* Continue building at ISA level 2 as the strcmp-sse42 is not always
22 preferable for ISA level == 2 CPUs. */
23#if ISA_SHOULD_BUILD (2)
24
25# define STRCMP_ISA _sse2
26# include "strcmp-naming.h"
27
28# include <sysdep.h>
29
30# undef UPDATE_STRNCMP_COUNTER
31
32# ifndef LABEL
33# define LABEL(l) L(l)
34# endif
35
36# ifdef USE_AS_STRNCMP
/* UPDATE_STRNCMP_COUNTER is expanded by each ashr_N entry once its first,
   partial 16-byte block has compared equal.  With %rcx holding the
   in-block offset of %rsi, the length-limited variants compute
   %r9 = %r11 - (16 - %rcx), i.e. the count left after the bytes just
   compared, and store it back to %r11 unless the comparison is finished.
   Clobbers %r9 and the flags.  In the variants without a length limit
   the macro expands to nothing.  */
37/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
38 if the new counter > the old one or is 0. */
39# define UPDATE_STRNCMP_COUNTER \
40 /* compute the number of bytes left to compare */ \
41 lea -16(%rcx, %r11), %r9; \
42 cmp %r9, %r11; \
43 jb LABEL(strcmp_exitz); \
44 test %r9, %r9; \
45 je LABEL(strcmp_exitz); \
46 mov %r9, %r11
47
48# elif defined USE_AS_STRCASECMP_L
49# include "locale-defines.h"
50
/* No length limit in this variant: nothing to update.  */
51# define UPDATE_STRNCMP_COUNTER
52# elif defined USE_AS_STRNCASECMP_L
53# include "locale-defines.h"
54
/* Same counter update as in the USE_AS_STRNCMP variant.  */
55# define UPDATE_STRNCMP_COUNTER \
56 /* compute the number of bytes left to compare */ \
57 lea -16(%rcx, %r11), %r9; \
58 cmp %r9, %r11; \
59 jb LABEL(strcmp_exitz); \
60 test %r9, %r9; \
61 je LABEL(strcmp_exitz); \
62 mov %r9, %r11
63# else
/* No length limit in this variant: nothing to update.  */
64# define UPDATE_STRNCMP_COUNTER
65# endif
66
67 .text
68# ifdef USE_AS_STRCASECMP_L
69# ifndef ENTRY2
70# define ENTRY2(name) ENTRY (name)
71# define END2(name) END (name)
72# endif
73
 /* Locale-less entry point.  It reads the thread-local variable
    __libc_tsd_LOCALE (GOT-provided TP offset, accessed through %fs)
    into %rdx, the register the code after ENTRY (STRCMP) reads the
    locale from, and then falls through into that code.  */
74ENTRY2 (STRCASECMP)
75 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
76 mov %fs:(%rax),%RDX_LP
77
78 /* Either 1 or 5 bytes (depending if CET is enabled). */
79 .p2align 4
80END2 (STRCASECMP)
81 /* FALLTHROUGH to strcasecmp_l. */
82# elif defined USE_AS_STRNCASECMP_L
83# ifndef ENTRY2
84# define ENTRY2(name) ENTRY (name)
85# define END2(name) END (name)
86# endif
87
 /* Locale-less entry point.  Same as the variant above, except that
    the locale is placed in %rcx, because %rdx carries the length
    limit in the strncasecmp variant.  */
88ENTRY2 (STRCASECMP)
89 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
90 mov %fs:(%rax),%RCX_LP
91
92 /* Either 1 or 5 bytes (depending if CET is enabled). */
93 .p2align 4
94END2 (STRCASECMP)
95 /* FALLTHROUGH to strncasecmp_l. */
96# endif
97
98ENTRY (STRCMP)
 /* Common entry.  %rdi and %rsi are the two strings.  In the
    case-insensitive variants the locale is expected in %rdx
    (strcasecmp_l) or %rcx (strncasecmp_l); its LC_CTYPE data is
    inspected first and the comparison is handed to the *_nonascii
    routine when bit 0 of the _NL_CTYPE_NONASCII_CASE value is set.  */
99# ifdef USE_AS_STRCASECMP_L
100 /* We have to fall back on the C implementation for locales
101 with encodings not matching ASCII for single bytes. */
102# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
103 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
104# else
105 mov (%rdx), %RAX_LP
106# endif
107 testb $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
108 jne __strcasecmp_l_nonascii
109# elif defined USE_AS_STRNCASECMP_L
110 /* We have to fall back on the C implementation for locales
111 with encodings not matching ASCII for single bytes. */
112# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
113 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
114# else
115 mov (%rcx), %RAX_LP
116# endif
117 testb $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
118 jne __strncasecmp_l_nonascii
119# endif
120
121/*
122 * This implementation uses SSE to compare up to 16 bytes at a time.
 *
 * Prologue: set up the length counter (length-limited variants only),
 * compute each pointer's offset inside its 64-byte cache line, load the
 * case-folding constants (case-insensitive variants only) and, when
 * neither 16-byte load would cross a cache line, load the first 16
 * bytes of both strings.
123 */
124# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
125 test %RDX_LP, %RDX_LP
126 je LABEL(strcmp_exitz) /* length limit is 0 */
127 cmp $1, %RDX_LP
128 je LABEL(Byte0) /* length limit is 1; Byte0 is outside this chunk */
129 mov %RDX_LP, %R11_LP /* %r11 = remaining-length counter */
130# endif
131 mov %esi, %ecx
132 mov %edi, %eax
133/* Use 64bit AND here to avoid long NOP padding. */
134 and $0x3f, %rcx /* rsi alignment in cache line */
135 and $0x3f, %rax /* rdi alignment in cache line */
136# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Per-byte constants consumed by the TOLOWER macro: 0x3f is the bias
    that moves 'A'..'Z' (0x41..0x5a) onto 0x80..0x99, 0x99 is the upper
    bound of that range, and 0x20 is the value added to an upper-case
    letter to obtain its lower-case form.  They stay in %xmm5-%xmm7.  */
137 .section .rodata.cst16,"aM",@progbits,16
138 .align 16
139.Llcase_min:
140 .quad 0x3f3f3f3f3f3f3f3f
141 .quad 0x3f3f3f3f3f3f3f3f
142.Llcase_max:
143 .quad 0x9999999999999999
144 .quad 0x9999999999999999
145.Lcase_add:
146 .quad 0x2020202020202020
147 .quad 0x2020202020202020
148 .previous
149 movdqa .Llcase_min(%rip), %xmm5
150# define LCASE_MIN_reg %xmm5
151 movdqa .Llcase_max(%rip), %xmm6
152# define LCASE_MAX_reg %xmm6
153 movdqa .Lcase_add(%rip), %xmm7
154# define CASE_ADD_reg %xmm7
155# endif
 /* An offset above 0x30 (48) leaves fewer than 16 bytes in the
    64-byte line, so the unaligned fast path is skipped.  */
156 cmp $0x30, %ecx
157 ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
158 cmp $0x30, %eax
159 ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
 /* Load 16 bytes of each string as two 8-byte halves.  */
160 movlpd (%rdi), %xmm1
161 movlpd (%rsi), %xmm2
162 movhpd 8(%rdi), %xmm1
163 movhpd 8(%rsi), %xmm2
164# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER (reg1, reg2): fold ASCII 'A'..'Z' to lower case in both
   registers, bytewise.  Adding 0x3f (LCASE_MIN_reg) maps 'A'..'Z'
   (0x41..0x5a) onto 0x80..0x99, the only sums that are not greater
   than 0x99 (LCASE_MAX_reg) when compared as signed bytes, so pcmpgtb
   yields 0x00 exactly for upper-case letters and 0xff elsewhere.
   pandn then selects 0x20 (CASE_ADD_reg) for the upper-case bytes and
   0 for all others, and the final paddb applies it.
   Clobbers %xmm8 and %xmm9.  In the case-sensitive variants the macro
   expands to nothing.  */
165# define TOLOWER(reg1, reg2) \
166 movdqa LCASE_MIN_reg, %xmm8; \
167 movdqa LCASE_MIN_reg, %xmm9; \
168 paddb reg1, %xmm8; \
169 paddb reg2, %xmm9; \
170 pcmpgtb LCASE_MAX_reg, %xmm8; \
171 pcmpgtb LCASE_MAX_reg, %xmm9; \
172 pandn CASE_ADD_reg, %xmm8; \
173 pandn CASE_ADD_reg, %xmm9; \
174 paddb %xmm8, reg1; \
175 paddb %xmm9, reg2
176 TOLOWER (%xmm1, %xmm2)
177# else
178# define TOLOWER(reg1, reg2)
179# endif
180 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
181 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
182 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
183 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
 /* After the psubb a byte keeps its top bit only if it compared equal
    and is not a null char, so the mask below is 0xffff iff all 16
    bytes match and none of them terminates the string.  */
184 pmovmskb %xmm1, %edx
185 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
186 jnz LABEL(less16bytes) /* If not, find different value or null char */
187# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
188 sub $16, %r11 /* 16 bytes of the length limit consumed */
189 jbe LABEL(strcmp_exitz) /* finish comparison */
190# endif
191 add $16, %rsi /* prepare to search next 16 bytes */
192 add $16, %rdi /* prepare to search next 16 bytes */
193
194 /*
195 * Determine source and destination string offsets from 16-byte alignment.
196 * Use relative offset difference between the two to determine which case
197 * below to use.
 *
 * State handed to the selected case:
 *   %rsi, %rdi  rounded down to a 16-byte boundary
 *   %ecx, %eax  offsets of %rsi / %rdi inside their 16-byte block
 *   %edx        0xffff
 *   %r8d        0, or 0xffff when the two strings were swapped so
 *               that %rcx holds the larger offset
 * Equal offsets go to ashr_0.  Otherwise %r9 = 15 + %rax - %rcx indexes
 * unaligned_table (defined outside this chunk), whose 32-bit entries
 * are added to the table address to form the jump target.
198 */
199 .p2align 4
200LABEL(crosscache):
201 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
202 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
203 mov $0xffff, %edx /* for equivalent offset */
204 xor %r8d, %r8d
205 and $0xf, %ecx /* offset of rsi */
206 and $0xf, %eax /* offset of rdi */
207 cmp %eax, %ecx
208 je LABEL(ashr_0) /* rsi and rdi relative offset same */
209 ja LABEL(bigger)
210 mov %edx, %r8d /* r8d is offset flag for exit tail */
211 xchg %ecx, %eax
212 xchg %rsi, %rdi
213LABEL(bigger):
214 lea 15(%rax), %r9
215 sub %rcx, %r9 /* %r9 = 15 + %rax - %rcx, the table index */
216 lea LABEL(unaligned_table)(%rip), %r10
217 movslq (%r10, %r9,4), %r9 /* sign-extended 32-bit table entry */
218 lea (%r10, %r9), %r10 /* entry is relative to the table address */
219 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
220
221/*
222 * The following cases will be handled by ashr_0
223 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
224 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
 *
 * Both strings start at the same offset inside their 16-byte block, so
 * the aligned blocks can be compared directly.  This entry checks the
 * first block from the string start (offset %cl) to its end.
225 */
226 .p2align 4
227LABEL(ashr_0):
228
229 movdqa (%rsi), %xmm1
230 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
231 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
232# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
233 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
234# else
235 movdqa (%rdi), %xmm2
236 TOLOWER (%xmm1, %xmm2)
237 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
238# endif
239 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
240 pmovmskb %xmm1, %r9d /* bit i set: byte i equal and not null */
241 shr %cl, %edx /* adjust 0xffff for offset */
242 shr %cl, %r9d /* adjust for 16-byte offset */
243 sub %r9d, %edx
244 /*
245 * %edx is now zero iff every byte from the string start to the end of
246 * this 16-byte block compared equal and no null char was seen.
247 */
248 jne LABEL(less32bytes) /* mismatch or null char */
249 UPDATE_STRNCMP_COUNTER
250 mov $16, %rcx /* index of the next block for the loop below */
251 mov $16, %r9 /* NOTE(review): presumably read by the exit path - confirm */
252 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
253
254 /*
255 * Now both strings are aligned at 16-byte boundary. Loop over strings
256 * checking 32-bytes per iteration.
 *
 * %rcx is the running byte index into both strings; %xmm0 is zero at
 * the top of each 16-byte step.  A mismatch or a null char leaves via
 * exit; in the length-limited variants the loop also leaves via
 * strcmp_exitz once %r11 has been used up.
257 */
258 .p2align 4
259LABEL(loop_ashr_0):
260 movdqa (%rsi, %rcx), %xmm1
261 movdqa (%rdi, %rcx), %xmm2
262 TOLOWER (%xmm1, %xmm2)
263
264 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
265 pcmpeqb %xmm2, %xmm1 /* 0xff where the two blocks agree */
266 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
267 pmovmskb %xmm1, %edx
268 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
269 jnz LABEL(exit) /* mismatch or null char seen */
270
271# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
272 sub $16, %r11
273 jbe LABEL(strcmp_exitz) /* length limit reached */
274# endif
275 add $16, %rcx
 /* Second, unrolled 16-byte step.  */
276 movdqa (%rsi, %rcx), %xmm1
277 movdqa (%rdi, %rcx), %xmm2
278 TOLOWER (%xmm1, %xmm2)
279
280 pcmpeqb %xmm1, %xmm0
281 pcmpeqb %xmm2, %xmm1
282 psubb %xmm0, %xmm1
283 pmovmskb %xmm1, %edx
284 sub $0xffff, %edx
285 jnz LABEL(exit) /* mismatch or null char seen */
286# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
287 sub $16, %r11
288 jbe LABEL(strcmp_exitz) /* length limit reached */
289# endif
290 add $16, %rcx
291 jmp LABEL(loop_ashr_0)
292
293/*
294 * The following cases will be handled by ashr_1
295 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
296 * n(15) n -15 0(15 +(n-15) - n) ashr_1
 *
 * Per the row above, the offset of %rsi exceeds the offset of %rdi by
 * 15.  The first aligned block of %rdi is shifted left by 15 bytes to
 * line it up with the first aligned block of %rsi; only the bytes from
 * the string start (offset %cl) onwards are checked.
297 */
298 .p2align 4
299LABEL(ashr_1):
300 pxor %xmm0, %xmm0
301 movdqa (%rdi), %xmm2
302 movdqa (%rsi), %xmm1
303 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
304 pslldq $15, %xmm2 /* shift first string to align with second */
305 TOLOWER (%xmm1, %xmm2)
306 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
307 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
308 pmovmskb %xmm2, %r9d
309 shr %cl, %edx /* adjust 0xffff for offset */
310 shr %cl, %r9d /* adjust for 16-byte offset */
311 sub %r9d, %edx
312 jnz LABEL(less32bytes) /* mismatch or null char seen */
313 movdqa (%rdi), %xmm3 /* unshifted %rdi block: bytes 1..15 still uncompared */
314 UPDATE_STRNCMP_COUNTER
315
316 pxor %xmm0, %xmm0
317 mov $16, %rcx /* index for loads*/
318 mov $1, %r9d /* byte position left over from less32bytes case */
319 /*
320 * Set up %r10 so that crossing a page boundary can be detected.
321 * When %r10 goes positive we have crossed a page boundary and
322 * need to do a nibble.
323 */
324 lea 1(%rdi), %r10
325 and $0xfff, %r10 /* offset into 4K page */
326 sub $0x1000, %r10 /* subtract 4K pagesize */
327
328 .p2align 4
 /*
  * Main loop for the ashr_1 case, two 16-byte steps per iteration.
  * %xmm3 holds the previous aligned %rdi block.  Its bytes 1..15 are
  * joined with byte 0 of the next %rdi block to form the 16 bytes that
  * correspond to the aligned %rsi block at index %rcx.  %xmm0 is zero
  * at the top of each step.
  */
329LABEL(loop_ashr_1):
330 add $16, %r10
331 jg LABEL(nibble_ashr_1) /* cross page boundary */
332
333LABEL(gobble_ashr_1):
334 movdqa (%rsi, %rcx), %xmm1
335 movdqa (%rdi, %rcx), %xmm2
336 movdqa %xmm2, %xmm4 /* store for next cycle */
337
338 psrldq $1, %xmm3 /* previous bytes 1..15 -> positions 0..14 */
339 pslldq $15, %xmm2 /* new byte 0 -> position 15 */
340 por %xmm3, %xmm2 /* merge into one 16byte value */
341
342 TOLOWER (%xmm1, %xmm2)
343
344 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
345 pcmpeqb %xmm2, %xmm1 /* 0xff where the bytes agree */
346 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
347 pmovmskb %xmm1, %edx
348 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
349 jnz LABEL(exit)
350
351# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
352 sub $16, %r11
353 jbe LABEL(strcmp_exitz) /* length limit reached */
354# endif
355 add $16, %rcx
356 movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
357
358 add $16, %r10
359 jg LABEL(nibble_ashr_1) /* cross page boundary */
360
 /* Second, unrolled step; identical to the one at gobble_ashr_1.  */
361 movdqa (%rsi, %rcx), %xmm1
362 movdqa (%rdi, %rcx), %xmm2
363 movdqa %xmm2, %xmm4 /* store for next cycle */
364
365 psrldq $1, %xmm3
366 pslldq $15, %xmm2
367 por %xmm3, %xmm2 /* merge into one 16byte value */
368
369 TOLOWER (%xmm1, %xmm2)
370
371 pcmpeqb %xmm1, %xmm0
372 pcmpeqb %xmm2, %xmm1
373 psubb %xmm0, %xmm1
374 pmovmskb %xmm1, %edx
375 sub $0xffff, %edx
376 jnz LABEL(exit)
377
378# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
379 sub $16, %r11
380 jbe LABEL(strcmp_exitz) /* length limit reached */
381# endif
382 add $16, %rcx
383 movdqa %xmm4, %xmm3
384 jmp LABEL(loop_ashr_1)
385
386 /*
387 * Nibble avoids loads across page boundary. This is to avoid a potential
388 * access into unmapped memory.
 *
 * %xmm3 holds the previous %rdi block, whose bytes 1..15 have not been
 * compared yet.  If one of them is a null char (mask 0xfffe) or, in the
 * length-limited variants, if at most 15 bytes remain, the comparison
 * is finished via ashr_1_exittail without loading further from %rdi.
 * Otherwise %r10 is re-armed for the next page and the loop resumes at
 * gobble_ashr_1.
389 */
390 .p2align 4
391LABEL(nibble_ashr_1):
392 pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
393 pmovmskb %xmm0, %edx
394 test $0xfffe, %edx
395 jnz LABEL(ashr_1_exittail) /* find null char*/
396
397# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
398 cmp $15, %r11
399 jbe LABEL(ashr_1_exittail)
400# endif
401
402 pxor %xmm0, %xmm0
403 sub $0x1000, %r10 /* subtract 4K from %r10 */
404 jmp LABEL(gobble_ashr_1)
405
406 /*
407 * Once find null char, determine if there is a string mismatch
408 * before the null char.
 *
 * Loads the current %rsi block and shifts both the null mask (%xmm0)
 * and the leftover %rdi bytes (%xmm3) down by 1 byte so that they line
 * up with it, then continues at aftertail (defined outside this chunk).
409 */
410 .p2align 4
411LABEL(ashr_1_exittail):
412 movdqa (%rsi, %rcx), %xmm1
413 psrldq $1, %xmm0
414 psrldq $1, %xmm3
415 jmp LABEL(aftertail)
416
417/*
418 * The following cases will be handled by ashr_2
419 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
420 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
 *
 * Per the row above, the offset of %rsi exceeds the offset of %rdi by
 * 14.  The first aligned block of %rdi is shifted left by 14 bytes to
 * line it up with the first aligned block of %rsi; only the bytes from
 * the string start (offset %cl) onwards are checked.
421 */
422 .p2align 4
423LABEL(ashr_2):
424 pxor %xmm0, %xmm0
425 movdqa (%rdi), %xmm2
426 movdqa (%rsi), %xmm1
427 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
428 pslldq $14, %xmm2 /* line the %rdi bytes up with %rsi */
429 TOLOWER (%xmm1, %xmm2)
430 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes agree */
431 psubb %xmm0, %xmm2 /* clear the top bit where a null was found */
432 pmovmskb %xmm2, %r9d
433 shr %cl, %edx /* 0xffff shifted by the offset of %rsi */
434 shr %cl, %r9d /* discard the bytes before the string start */
435 sub %r9d, %edx /* zero iff all remaining bytes equal, non-null */
436 jnz LABEL(less32bytes)
437 movdqa (%rdi), %xmm3 /* unshifted %rdi block: bytes 2..15 still uncompared */
438 UPDATE_STRNCMP_COUNTER
439
440 pxor %xmm0, %xmm0
441 mov $16, %rcx /* index for loads */
442 mov $2, %r9d /* byte position left over from less32bytes case */
443 /*
444 * Set up %r10 so that crossing a page boundary can be detected.
445 * When %r10 goes positive we have crossed a page boundary and
446 * need to do a nibble.
447 */
448 lea 2(%rdi), %r10
449 and $0xfff, %r10 /* offset into 4K page */
450 sub $0x1000, %r10 /* subtract 4K pagesize */
451
452 .p2align 4
 /*
  * Main loop for the ashr_2 case, two 16-byte steps per iteration.
  * %xmm3 holds the previous aligned %rdi block.  Its bytes 2..15 are
  * joined with bytes 0..1 of the next %rdi block to form the 16 bytes
  * that correspond to the aligned %rsi block at index %rcx.  %xmm0 is
  * zero at the top of each step.
  */
453LABEL(loop_ashr_2):
454 add $16, %r10
455 jg LABEL(nibble_ashr_2) /* cross page boundary */
456
457LABEL(gobble_ashr_2):
458 movdqa (%rsi, %rcx), %xmm1
459 movdqa (%rdi, %rcx), %xmm2
460 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next step */
461
462 psrldq $2, %xmm3 /* previous bytes 2..15 -> positions 0..13 */
463 pslldq $14, %xmm2 /* new bytes 0..1 -> positions 14..15 */
464 por %xmm3, %xmm2 /* merge into one 16byte value */
465
466 TOLOWER (%xmm1, %xmm2)
467
468 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
469 pcmpeqb %xmm2, %xmm1 /* 0xff where the bytes agree */
470 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
471 pmovmskb %xmm1, %edx
472 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
473 jnz LABEL(exit)
474
475# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
476 sub $16, %r11
477 jbe LABEL(strcmp_exitz) /* length limit reached */
478# endif
479
480 add $16, %rcx
481 movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
482
483 add $16, %r10
484 jg LABEL(nibble_ashr_2) /* cross page boundary */
485
 /* Second, unrolled step; identical to the one at gobble_ashr_2.  */
486 movdqa (%rsi, %rcx), %xmm1
487 movdqa (%rdi, %rcx), %xmm2
488 movdqa %xmm2, %xmm4
489
490 psrldq $2, %xmm3
491 pslldq $14, %xmm2
492 por %xmm3, %xmm2 /* merge into one 16byte value */
493
494 TOLOWER (%xmm1, %xmm2)
495
496 pcmpeqb %xmm1, %xmm0
497 pcmpeqb %xmm2, %xmm1
498 psubb %xmm0, %xmm1
499 pmovmskb %xmm1, %edx
500 sub $0xffff, %edx
501 jnz LABEL(exit)
502
503# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
504 sub $16, %r11
505 jbe LABEL(strcmp_exitz) /* length limit reached */
506# endif
507
508 add $16, %rcx
509 movdqa %xmm4, %xmm3
510 jmp LABEL(loop_ashr_2)
511
512 .p2align 4
 /*
  * Reached when %r10 signals that the next %rdi load lies across a page
  * boundary.  %xmm3 holds the previous %rdi block, whose bytes 2..15
  * have not been compared yet.  If one of them is a null char (mask
  * 0xfffc) or, in the length-limited variants, if at most 14 bytes
  * remain, finish via ashr_2_exittail without loading further from
  * %rdi.  Otherwise re-arm %r10 and resume at gobble_ashr_2.
  */
513LABEL(nibble_ashr_2):
514 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
515 pmovmskb %xmm0, %edx
516 test $0xfffc, %edx
517 jnz LABEL(ashr_2_exittail)
518
519# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
520 cmp $14, %r11
521 jbe LABEL(ashr_2_exittail)
522# endif
523
524 pxor %xmm0, %xmm0
525 sub $0x1000, %r10
526 jmp LABEL(gobble_ashr_2)
527
528 .p2align 4
 /*
  * Loads the current %rsi block and shifts both the null mask (%xmm0)
  * and the leftover %rdi bytes (%xmm3) down by 2 bytes so that they
  * line up with it, then continues at aftertail (defined outside this
  * chunk).
  */
529LABEL(ashr_2_exittail):
530 movdqa (%rsi, %rcx), %xmm1
531 psrldq $2, %xmm0
532 psrldq $2, %xmm3
533 jmp LABEL(aftertail)
534
535/*
536 * The following cases will be handled by ashr_3
537 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
538 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
 *
 * Per the row above, the offset of %rsi exceeds the offset of %rdi by
 * 13.  The first aligned block of %rdi is shifted left by 13 bytes to
 * line it up with the first aligned block of %rsi; only the bytes from
 * the string start (offset %cl) onwards are checked.
539 */
540 .p2align 4
541LABEL(ashr_3):
542 pxor %xmm0, %xmm0
543 movdqa (%rdi), %xmm2
544 movdqa (%rsi), %xmm1
545 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
546 pslldq $13, %xmm2 /* line the %rdi bytes up with %rsi */
547 TOLOWER (%xmm1, %xmm2)
548 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes agree */
549 psubb %xmm0, %xmm2 /* clear the top bit where a null was found */
550 pmovmskb %xmm2, %r9d
551 shr %cl, %edx /* 0xffff shifted by the offset of %rsi */
552 shr %cl, %r9d /* discard the bytes before the string start */
553 sub %r9d, %edx /* zero iff all remaining bytes equal, non-null */
554 jnz LABEL(less32bytes)
555 movdqa (%rdi), %xmm3 /* unshifted %rdi block: bytes 3..15 still uncompared */
556
557 UPDATE_STRNCMP_COUNTER
558
559 pxor %xmm0, %xmm0
560 mov $16, %rcx /* index for loads */
561 mov $3, %r9d /* byte position left over from less32bytes case */
562 /*
563 * Set up %r10 so that crossing a page boundary can be detected.
564 * When %r10 goes positive we have crossed a page boundary and
565 * need to do a nibble.
566 */
567 lea 3(%rdi), %r10
568 and $0xfff, %r10 /* offset into 4K page */
569 sub $0x1000, %r10 /* subtract 4K pagesize */
570
571 .p2align 4
 /*
  * Main loop for the ashr_3 case, two 16-byte steps per iteration.
  * %xmm3 holds the previous aligned %rdi block.  Its bytes 3..15 are
  * joined with bytes 0..2 of the next %rdi block to form the 16 bytes
  * that correspond to the aligned %rsi block at index %rcx.  %xmm0 is
  * zero at the top of each step.
  */
572LABEL(loop_ashr_3):
573 add $16, %r10
574 jg LABEL(nibble_ashr_3) /* cross page boundary */
575
576LABEL(gobble_ashr_3):
577 movdqa (%rsi, %rcx), %xmm1
578 movdqa (%rdi, %rcx), %xmm2
579 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next step */
580
581 psrldq $3, %xmm3 /* previous bytes 3..15 -> positions 0..12 */
582 pslldq $13, %xmm2 /* new bytes 0..2 -> positions 13..15 */
583 por %xmm3, %xmm2 /* merge into one 16byte value */
584
585 TOLOWER (%xmm1, %xmm2)
586
587 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
588 pcmpeqb %xmm2, %xmm1 /* 0xff where the bytes agree */
589 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
590 pmovmskb %xmm1, %edx
591 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
592 jnz LABEL(exit)
593
594# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
595 sub $16, %r11
596 jbe LABEL(strcmp_exitz) /* length limit reached */
597# endif
598
599 add $16, %rcx
600 movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
601
602 add $16, %r10
603 jg LABEL(nibble_ashr_3) /* cross page boundary */
604
 /* Second, unrolled step; identical to the one at gobble_ashr_3.  */
605 movdqa (%rsi, %rcx), %xmm1
606 movdqa (%rdi, %rcx), %xmm2
607 movdqa %xmm2, %xmm4
608
609 psrldq $3, %xmm3
610 pslldq $13, %xmm2
611 por %xmm3, %xmm2 /* merge into one 16byte value */
612
613 TOLOWER (%xmm1, %xmm2)
614
615 pcmpeqb %xmm1, %xmm0
616 pcmpeqb %xmm2, %xmm1
617 psubb %xmm0, %xmm1
618 pmovmskb %xmm1, %edx
619 sub $0xffff, %edx
620 jnz LABEL(exit)
621
622# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
623 sub $16, %r11
624 jbe LABEL(strcmp_exitz) /* length limit reached */
625# endif
626
627 add $16, %rcx
628 movdqa %xmm4, %xmm3
629 jmp LABEL(loop_ashr_3)
630
631 .p2align 4
 /*
  * Reached when %r10 signals that the next %rdi load lies across a page
  * boundary.  %xmm3 holds the previous %rdi block, whose bytes 3..15
  * have not been compared yet.  If one of them is a null char (mask
  * 0xfff8) or, in the length-limited variants, if at most 13 bytes
  * remain, finish via ashr_3_exittail without loading further from
  * %rdi.  Otherwise re-arm %r10 and resume at gobble_ashr_3.
  */
632LABEL(nibble_ashr_3):
633 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
634 pmovmskb %xmm0, %edx
635 test $0xfff8, %edx
636 jnz LABEL(ashr_3_exittail)
637
638# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
639 cmp $13, %r11
640 jbe LABEL(ashr_3_exittail)
641# endif
642
643 pxor %xmm0, %xmm0
644 sub $0x1000, %r10
645 jmp LABEL(gobble_ashr_3)
646
647 .p2align 4
 /*
  * Loads the current %rsi block and shifts both the null mask (%xmm0)
  * and the leftover %rdi bytes (%xmm3) down by 3 bytes so that they
  * line up with it, then continues at aftertail (defined outside this
  * chunk).
  */
648LABEL(ashr_3_exittail):
649 movdqa (%rsi, %rcx), %xmm1
650 psrldq $3, %xmm0
651 psrldq $3, %xmm3
652 jmp LABEL(aftertail)
653
654/*
655 * The following cases will be handled by ashr_4
656 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
657 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
 *
 * Per the row above, the offset of %rsi exceeds the offset of %rdi by
 * 12.  The first aligned block of %rdi is shifted left by 12 bytes to
 * line it up with the first aligned block of %rsi; only the bytes from
 * the string start (offset %cl) onwards are checked.
658 */
659 .p2align 4
660LABEL(ashr_4):
661 pxor %xmm0, %xmm0
662 movdqa (%rdi), %xmm2
663 movdqa (%rsi), %xmm1
664 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
665 pslldq $12, %xmm2 /* line the %rdi bytes up with %rsi */
666 TOLOWER (%xmm1, %xmm2)
667 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes agree */
668 psubb %xmm0, %xmm2 /* clear the top bit where a null was found */
669 pmovmskb %xmm2, %r9d
670 shr %cl, %edx /* 0xffff shifted by the offset of %rsi */
671 shr %cl, %r9d /* discard the bytes before the string start */
672 sub %r9d, %edx /* zero iff all remaining bytes equal, non-null */
673 jnz LABEL(less32bytes)
674 movdqa (%rdi), %xmm3 /* unshifted %rdi block: bytes 4..15 still uncompared */
675
676 UPDATE_STRNCMP_COUNTER
677
678 pxor %xmm0, %xmm0
679 mov $16, %rcx /* index for loads */
680 mov $4, %r9d /* byte position left over from less32bytes case */
681 /*
682 * Set up %r10 so that crossing a page boundary can be detected.
683 * When %r10 goes positive we have crossed a page boundary and
684 * need to do a nibble.
685 */
686 lea 4(%rdi), %r10
687 and $0xfff, %r10 /* offset into 4K page */
688 sub $0x1000, %r10 /* subtract 4K pagesize */
689
690 .p2align 4
 /*
  * Main loop for the ashr_4 case, two 16-byte steps per iteration.
  * %xmm3 holds the previous aligned %rdi block.  Its bytes 4..15 are
  * joined with bytes 0..3 of the next %rdi block to form the 16 bytes
  * that correspond to the aligned %rsi block at index %rcx.  %xmm0 is
  * zero at the top of each step.
  */
691LABEL(loop_ashr_4):
692 add $16, %r10
693 jg LABEL(nibble_ashr_4) /* cross page boundary */
694
695LABEL(gobble_ashr_4):
696 movdqa (%rsi, %rcx), %xmm1
697 movdqa (%rdi, %rcx), %xmm2
698 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next step */
699
700 psrldq $4, %xmm3 /* previous bytes 4..15 -> positions 0..11 */
701 pslldq $12, %xmm2 /* new bytes 0..3 -> positions 12..15 */
702 por %xmm3, %xmm2 /* merge into one 16byte value */
703
704 TOLOWER (%xmm1, %xmm2)
705
706 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
707 pcmpeqb %xmm2, %xmm1 /* 0xff where the bytes agree */
708 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
709 pmovmskb %xmm1, %edx
710 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
711 jnz LABEL(exit)
712
713# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
714 sub $16, %r11
715 jbe LABEL(strcmp_exitz) /* length limit reached */
716# endif
717
718 add $16, %rcx
719 movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
720
721 add $16, %r10
722 jg LABEL(nibble_ashr_4) /* cross page boundary */
723
 /* Second, unrolled step; identical to the one at gobble_ashr_4.  */
724 movdqa (%rsi, %rcx), %xmm1
725 movdqa (%rdi, %rcx), %xmm2
726 movdqa %xmm2, %xmm4
727
728 psrldq $4, %xmm3
729 pslldq $12, %xmm2
730 por %xmm3, %xmm2 /* merge into one 16byte value */
731
732 TOLOWER (%xmm1, %xmm2)
733
734 pcmpeqb %xmm1, %xmm0
735 pcmpeqb %xmm2, %xmm1
736 psubb %xmm0, %xmm1
737 pmovmskb %xmm1, %edx
738 sub $0xffff, %edx
739 jnz LABEL(exit)
740
741# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
742 sub $16, %r11
743 jbe LABEL(strcmp_exitz) /* length limit reached */
744# endif
745
746 add $16, %rcx
747 movdqa %xmm4, %xmm3
748 jmp LABEL(loop_ashr_4)
749
750 .p2align 4
 /*
  * Reached when %r10 signals that the next %rdi load lies across a page
  * boundary.  %xmm3 holds the previous %rdi block, whose bytes 4..15
  * have not been compared yet.  If one of them is a null char (mask
  * 0xfff0) or, in the length-limited variants, if at most 12 bytes
  * remain, finish via ashr_4_exittail without loading further from
  * %rdi.  Otherwise re-arm %r10 and resume at gobble_ashr_4.
  */
751LABEL(nibble_ashr_4):
752 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
753 pmovmskb %xmm0, %edx
754 test $0xfff0, %edx
755 jnz LABEL(ashr_4_exittail)
756
757# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
758 cmp $12, %r11
759 jbe LABEL(ashr_4_exittail)
760# endif
761
762 pxor %xmm0, %xmm0
763 sub $0x1000, %r10
764 jmp LABEL(gobble_ashr_4)
765
766 .p2align 4
 /*
  * Loads the current %rsi block and shifts both the null mask (%xmm0)
  * and the leftover %rdi bytes (%xmm3) down by 4 bytes so that they
  * line up with it, then continues at aftertail (defined outside this
  * chunk).
  */
767LABEL(ashr_4_exittail):
768 movdqa (%rsi, %rcx), %xmm1
769 psrldq $4, %xmm0
770 psrldq $4, %xmm3
771 jmp LABEL(aftertail)
772
773/*
774 * The following cases will be handled by ashr_5
775 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
776 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
 *
 * Per the row above, the offset of %rsi exceeds the offset of %rdi by
 * 11.  The first aligned block of %rdi is shifted left by 11 bytes to
 * line it up with the first aligned block of %rsi; only the bytes from
 * the string start (offset %cl) onwards are checked.
777 */
778 .p2align 4
779LABEL(ashr_5):
780 pxor %xmm0, %xmm0
781 movdqa (%rdi), %xmm2
782 movdqa (%rsi), %xmm1
783 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
784 pslldq $11, %xmm2 /* line the %rdi bytes up with %rsi */
785 TOLOWER (%xmm1, %xmm2)
786 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes agree */
787 psubb %xmm0, %xmm2 /* clear the top bit where a null was found */
788 pmovmskb %xmm2, %r9d
789 shr %cl, %edx /* 0xffff shifted by the offset of %rsi */
790 shr %cl, %r9d /* discard the bytes before the string start */
791 sub %r9d, %edx /* zero iff all remaining bytes equal, non-null */
792 jnz LABEL(less32bytes)
793 movdqa (%rdi), %xmm3 /* unshifted %rdi block: bytes 5..15 still uncompared */
794
795 UPDATE_STRNCMP_COUNTER
796
797 pxor %xmm0, %xmm0
798 mov $16, %rcx /* index for loads */
799 mov $5, %r9d /* byte position left over from less32bytes case */
800 /*
801 * Set up %r10 so that crossing a page boundary can be detected.
802 * When %r10 goes positive we have crossed a page boundary and
803 * need to do a nibble.
804 */
805 lea 5(%rdi), %r10
806 and $0xfff, %r10 /* offset into 4K page */
807 sub $0x1000, %r10 /* subtract 4K pagesize */
808
809 .p2align 4
 /*
  * Main loop for the ashr_5 case, two 16-byte steps per iteration.
  * %xmm3 holds the previous aligned %rdi block.  Its bytes 5..15 are
  * joined with bytes 0..4 of the next %rdi block to form the 16 bytes
  * that correspond to the aligned %rsi block at index %rcx.  %xmm0 is
  * zero at the top of each step.
  */
810LABEL(loop_ashr_5):
811 add $16, %r10
812 jg LABEL(nibble_ashr_5) /* cross page boundary */
813
814LABEL(gobble_ashr_5):
815 movdqa (%rsi, %rcx), %xmm1
816 movdqa (%rdi, %rcx), %xmm2
817 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next step */
818
819 psrldq $5, %xmm3 /* previous bytes 5..15 -> positions 0..10 */
820 pslldq $11, %xmm2 /* new bytes 0..4 -> positions 11..15 */
821 por %xmm3, %xmm2 /* merge into one 16byte value */
822
823 TOLOWER (%xmm1, %xmm2)
824
825 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
826 pcmpeqb %xmm2, %xmm1 /* 0xff where the bytes agree */
827 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
828 pmovmskb %xmm1, %edx
829 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
830 jnz LABEL(exit)
831
832# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
833 sub $16, %r11
834 jbe LABEL(strcmp_exitz) /* length limit reached */
835# endif
836
837 add $16, %rcx
838 movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
839
840 add $16, %r10
841 jg LABEL(nibble_ashr_5) /* cross page boundary */
842
 /* Second, unrolled step; identical to the one at gobble_ashr_5.  */
843 movdqa (%rsi, %rcx), %xmm1
844 movdqa (%rdi, %rcx), %xmm2
845 movdqa %xmm2, %xmm4
846
847 psrldq $5, %xmm3
848 pslldq $11, %xmm2
849 por %xmm3, %xmm2 /* merge into one 16byte value */
850
851 TOLOWER (%xmm1, %xmm2)
852
853 pcmpeqb %xmm1, %xmm0
854 pcmpeqb %xmm2, %xmm1
855 psubb %xmm0, %xmm1
856 pmovmskb %xmm1, %edx
857 sub $0xffff, %edx
858 jnz LABEL(exit)
859
860# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
861 sub $16, %r11
862 jbe LABEL(strcmp_exitz) /* length limit reached */
863# endif
864
865 add $16, %rcx
866 movdqa %xmm4, %xmm3
867 jmp LABEL(loop_ashr_5)
868
869 .p2align 4
 /*
  * Reached when %r10 signals that the next %rdi load lies across a page
  * boundary.  %xmm3 holds the previous %rdi block, whose bytes 5..15
  * have not been compared yet.  If one of them is a null char (mask
  * 0xffe0) or, in the length-limited variants, if at most 11 bytes
  * remain, finish via ashr_5_exittail without loading further from
  * %rdi.  Otherwise re-arm %r10 and resume at gobble_ashr_5.
  */
870LABEL(nibble_ashr_5):
871 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
872 pmovmskb %xmm0, %edx
873 test $0xffe0, %edx
874 jnz LABEL(ashr_5_exittail)
875
876# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
877 cmp $11, %r11
878 jbe LABEL(ashr_5_exittail)
879# endif
880
881 pxor %xmm0, %xmm0
882 sub $0x1000, %r10
883 jmp LABEL(gobble_ashr_5)
884
885 .p2align 4
 /*
  * Loads the current %rsi block and shifts both the null mask (%xmm0)
  * and the leftover %rdi bytes (%xmm3) down by 5 bytes so that they
  * line up with it, then continues at aftertail (defined outside this
  * chunk).
  */
886LABEL(ashr_5_exittail):
887 movdqa (%rsi, %rcx), %xmm1
888 psrldq $5, %xmm0
889 psrldq $5, %xmm3
890 jmp LABEL(aftertail)
891
892/*
893 * The following cases will be handled by ashr_6
894 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
895 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
 *
 * Per the row above, the offset of %rsi exceeds the offset of %rdi by
 * 10.  The first aligned block of %rdi is shifted left by 10 bytes to
 * line it up with the first aligned block of %rsi; only the bytes from
 * the string start (offset %cl) onwards are checked.
896 */
897 .p2align 4
898LABEL(ashr_6):
899 pxor %xmm0, %xmm0
900 movdqa (%rdi), %xmm2
901 movdqa (%rsi), %xmm1
902 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
903 pslldq $10, %xmm2 /* line the %rdi bytes up with %rsi */
904 TOLOWER (%xmm1, %xmm2)
905 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes agree */
906 psubb %xmm0, %xmm2 /* clear the top bit where a null was found */
907 pmovmskb %xmm2, %r9d
908 shr %cl, %edx /* 0xffff shifted by the offset of %rsi */
909 shr %cl, %r9d /* discard the bytes before the string start */
910 sub %r9d, %edx /* zero iff all remaining bytes equal, non-null */
911 jnz LABEL(less32bytes)
912 movdqa (%rdi), %xmm3 /* unshifted %rdi block: bytes 6..15 still uncompared */
913
914 UPDATE_STRNCMP_COUNTER
915
916 pxor %xmm0, %xmm0
917 mov $16, %rcx /* index for loads */
918 mov $6, %r9d /* byte position left over from less32bytes case */
919 /*
920 * Set up %r10 so that crossing a page boundary can be detected.
921 * When %r10 goes positive we have crossed a page boundary and
922 * need to do a nibble.
923 */
924 lea 6(%rdi), %r10
925 and $0xfff, %r10 /* offset into 4K page */
926 sub $0x1000, %r10 /* subtract 4K pagesize */
927
928 .p2align 4
 /*
  * Main loop for the ashr_6 case, two 16-byte steps per iteration.
  * %xmm3 holds the previous aligned %rdi block.  Its bytes 6..15 are
  * joined with bytes 0..5 of the next %rdi block to form the 16 bytes
  * that correspond to the aligned %rsi block at index %rcx.  %xmm0 is
  * zero at the top of each step.
  */
929LABEL(loop_ashr_6):
930 add $16, %r10
931 jg LABEL(nibble_ashr_6) /* cross page boundary */
932
933LABEL(gobble_ashr_6):
934 movdqa (%rsi, %rcx), %xmm1
935 movdqa (%rdi, %rcx), %xmm2
936 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next step */
937
938 psrldq $6, %xmm3 /* previous bytes 6..15 -> positions 0..9 */
939 pslldq $10, %xmm2 /* new bytes 0..5 -> positions 10..15 */
940 por %xmm3, %xmm2 /* merge into one 16byte value */
941
942 TOLOWER (%xmm1, %xmm2)
943
944 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
945 pcmpeqb %xmm2, %xmm1 /* 0xff where the bytes agree */
946 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
947 pmovmskb %xmm1, %edx
948 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
949 jnz LABEL(exit)
950
951# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
952 sub $16, %r11
953 jbe LABEL(strcmp_exitz) /* length limit reached */
954# endif
955
956 add $16, %rcx
957 movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
958
959 add $16, %r10
960 jg LABEL(nibble_ashr_6) /* cross page boundary */
961
 /* Second, unrolled step; identical to the one at gobble_ashr_6.  */
962 movdqa (%rsi, %rcx), %xmm1
963 movdqa (%rdi, %rcx), %xmm2
964 movdqa %xmm2, %xmm4
965
966 psrldq $6, %xmm3
967 pslldq $10, %xmm2
968 por %xmm3, %xmm2 /* merge into one 16byte value */
969
970 TOLOWER (%xmm1, %xmm2)
971
972 pcmpeqb %xmm1, %xmm0
973 pcmpeqb %xmm2, %xmm1
974 psubb %xmm0, %xmm1
975 pmovmskb %xmm1, %edx
976 sub $0xffff, %edx
977 jnz LABEL(exit)
978
979# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
980 sub $16, %r11
981 jbe LABEL(strcmp_exitz) /* length limit reached */
982# endif
983
984 add $16, %rcx
985 movdqa %xmm4, %xmm3
986 jmp LABEL(loop_ashr_6)
987
988 .p2align 4
 /*
  * Reached when %r10 signals that the next %rdi load lies across a page
  * boundary.  %xmm3 holds the previous %rdi block, whose bytes 6..15
  * have not been compared yet.  If one of them is a null char (mask
  * 0xffc0) or, in the length-limited variants, if at most 10 bytes
  * remain, finish via ashr_6_exittail without loading further from
  * %rdi.  Otherwise re-arm %r10 and resume at gobble_ashr_6.
  */
989LABEL(nibble_ashr_6):
990 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
991 pmovmskb %xmm0, %edx
992 test $0xffc0, %edx
993 jnz LABEL(ashr_6_exittail)
994
995# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
996 cmp $10, %r11
997 jbe LABEL(ashr_6_exittail)
998# endif
999
1000 pxor %xmm0, %xmm0
1001 sub $0x1000, %r10
1002 jmp LABEL(gobble_ashr_6)
1003
1004 .p2align 4
 /*
  * Loads the current %rsi block and shifts both the null mask (%xmm0)
  * and the leftover %rdi bytes (%xmm3) down by 6 bytes so that they
  * line up with it, then continues at aftertail (defined outside this
  * chunk).
  */
1005LABEL(ashr_6_exittail):
1006 movdqa (%rsi, %rcx), %xmm1
1007 psrldq $6, %xmm0
1008 psrldq $6, %xmm3
1009 jmp LABEL(aftertail)
1010
1011/*
1012 * The following cases will be handled by ashr_7
1013 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1014 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
 *
 * Per the row above, the offset of %rsi exceeds the offset of %rdi by
 * 9.  The first aligned block of %rdi is shifted left by 9 bytes to
 * line it up with the first aligned block of %rsi; only the bytes from
 * the string start (offset %cl) onwards are checked.
1015 */
1016 .p2align 4
1017LABEL(ashr_7):
1018 pxor %xmm0, %xmm0
1019 movdqa (%rdi), %xmm2
1020 movdqa (%rsi), %xmm1
1021 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
1022 pslldq $9, %xmm2 /* line the %rdi bytes up with %rsi */
1023 TOLOWER (%xmm1, %xmm2)
1024 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes agree */
1025 psubb %xmm0, %xmm2 /* clear the top bit where a null was found */
1026 pmovmskb %xmm2, %r9d
1027 shr %cl, %edx /* 0xffff shifted by the offset of %rsi */
1028 shr %cl, %r9d /* discard the bytes before the string start */
1029 sub %r9d, %edx /* zero iff all remaining bytes equal, non-null */
1030 jnz LABEL(less32bytes)
1031 movdqa (%rdi), %xmm3 /* unshifted %rdi block: bytes 7..15 still uncompared */
1032
1033 UPDATE_STRNCMP_COUNTER
1034
1035 pxor %xmm0, %xmm0
1036 mov $16, %rcx /* index for loads */
1037 mov $7, %r9d /* byte position left over from less32bytes case */
1038 /*
1039 * Set up %r10 so that crossing a page boundary can be detected.
1040 * When %r10 goes positive we have crossed a page boundary and
1041 * need to do a nibble.
1042 */
1043 lea 7(%rdi), %r10
1044 and $0xfff, %r10 /* offset into 4K page */
1045 sub $0x1000, %r10 /* subtract 4K pagesize */
1046
1047 .p2align 4
 /*
  * Main loop for the ashr_7 case, two 16-byte steps per iteration.
  * %xmm3 holds the previous aligned %rdi block.  Its bytes 7..15 are
  * joined with bytes 0..6 of the next %rdi block to form the 16 bytes
  * that correspond to the aligned %rsi block at index %rcx.  %xmm0 is
  * zero at the top of each step.
  */
1048LABEL(loop_ashr_7):
1049 add $16, %r10
1050 jg LABEL(nibble_ashr_7) /* cross page boundary */
1051
1052LABEL(gobble_ashr_7):
1053 movdqa (%rsi, %rcx), %xmm1
1054 movdqa (%rdi, %rcx), %xmm2
1055 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next step */
1056
1057 psrldq $7, %xmm3 /* previous bytes 7..15 -> positions 0..8 */
1058 pslldq $9, %xmm2 /* new bytes 0..6 -> positions 9..15 */
1059 por %xmm3, %xmm2 /* merge into one 16byte value */
1060
1061 TOLOWER (%xmm1, %xmm2)
1062
1063 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
1064 pcmpeqb %xmm2, %xmm1 /* 0xff where the bytes agree */
1065 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
1066 pmovmskb %xmm1, %edx
1067 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
1068 jnz LABEL(exit)
1069
1070# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1071 sub $16, %r11
1072 jbe LABEL(strcmp_exitz) /* length limit reached */
1073# endif
1074
1075 add $16, %rcx
1076 movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
1077
1078 add $16, %r10
1079 jg LABEL(nibble_ashr_7) /* cross page boundary */
1080
 /* Second, unrolled step; identical to the one at gobble_ashr_7.  */
1081 movdqa (%rsi, %rcx), %xmm1
1082 movdqa (%rdi, %rcx), %xmm2
1083 movdqa %xmm2, %xmm4
1084
1085 psrldq $7, %xmm3
1086 pslldq $9, %xmm2
1087 por %xmm3, %xmm2 /* merge into one 16byte value */
1088
1089 TOLOWER (%xmm1, %xmm2)
1090
1091 pcmpeqb %xmm1, %xmm0
1092 pcmpeqb %xmm2, %xmm1
1093 psubb %xmm0, %xmm1
1094 pmovmskb %xmm1, %edx
1095 sub $0xffff, %edx
1096 jnz LABEL(exit)
1097
1098# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1099 sub $16, %r11
1100 jbe LABEL(strcmp_exitz) /* length limit reached */
1101# endif
1102
1103 add $16, %rcx
1104 movdqa %xmm4, %xmm3
1105 jmp LABEL(loop_ashr_7)
1106
1107 .p2align 4
 /*
  * Reached when %r10 signals that the next %rdi load lies across a page
  * boundary.  %xmm3 holds the previous %rdi block, whose bytes 7..15
  * have not been compared yet.  If one of them is a null char (mask
  * 0xff80) or, in the length-limited variants, if at most 9 bytes
  * remain, finish via ashr_7_exittail without loading further from
  * %rdi.  Otherwise re-arm %r10 and resume at gobble_ashr_7.
  */
1108LABEL(nibble_ashr_7):
1109 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1110 pmovmskb %xmm0, %edx
1111 test $0xff80, %edx
1112 jnz LABEL(ashr_7_exittail)
1113
1114# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1115 cmp $9, %r11
1116 jbe LABEL(ashr_7_exittail)
1117# endif
1118
1119 pxor %xmm0, %xmm0
1120 sub $0x1000, %r10
1121 jmp LABEL(gobble_ashr_7)
1122
1123 .p2align 4
 /*
  * Loads the current %rsi block and shifts both the null mask (%xmm0)
  * and the leftover %rdi bytes (%xmm3) down by 7 bytes so that they
  * line up with it, then continues at aftertail (defined outside this
  * chunk).
  */
1124LABEL(ashr_7_exittail):
1125 movdqa (%rsi, %rcx), %xmm1
1126 psrldq $7, %xmm0
1127 psrldq $7, %xmm3
1128 jmp LABEL(aftertail)
1129
1130/*
1131 * The following cases will be handled by ashr_8
1132 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1133 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
 *
 * Per the row above, the offset of %rsi exceeds the offset of %rdi by
 * 8.  The first aligned block of %rdi is shifted left by 8 bytes to
 * line it up with the first aligned block of %rsi; only the bytes from
 * the string start (offset %cl) onwards are checked.
1134 */
1135 .p2align 4
1136LABEL(ashr_8):
1137 pxor %xmm0, %xmm0
1138 movdqa (%rdi), %xmm2
1139 movdqa (%rsi), %xmm1
1140 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
1141 pslldq $8, %xmm2 /* line the %rdi bytes up with %rsi */
1142 TOLOWER (%xmm1, %xmm2)
1143 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes agree */
1144 psubb %xmm0, %xmm2 /* clear the top bit where a null was found */
1145 pmovmskb %xmm2, %r9d
1146 shr %cl, %edx /* 0xffff shifted by the offset of %rsi */
1147 shr %cl, %r9d /* discard the bytes before the string start */
1148 sub %r9d, %edx /* zero iff all remaining bytes equal, non-null */
1149 jnz LABEL(less32bytes)
1150 movdqa (%rdi), %xmm3 /* unshifted %rdi block: bytes 8..15 still uncompared */
1151
1152 UPDATE_STRNCMP_COUNTER
1153
1154 pxor %xmm0, %xmm0
1155 mov $16, %rcx /* index for loads */
1156 mov $8, %r9d /* byte position left over from less32bytes case */
1157 /*
1158 * Set up %r10 so that crossing a page boundary can be detected.
1159 * When %r10 goes positive we have crossed a page boundary and
1160 * need to do a nibble.
1161 */
1162 lea 8(%rdi), %r10
1163 and $0xfff, %r10 /* offset into 4K page */
1164 sub $0x1000, %r10 /* subtract 4K pagesize */
1165
1166 .p2align 4
 /*
  * Main loop for the ashr_8 case, two 16-byte steps per iteration.
  * %xmm3 holds the previous aligned %rdi block.  Its bytes 8..15 are
  * joined with bytes 0..7 of the next %rdi block to form the 16 bytes
  * that correspond to the aligned %rsi block at index %rcx.  %xmm0 is
  * zero at the top of each step.
  */
1167LABEL(loop_ashr_8):
1168 add $16, %r10
1169 jg LABEL(nibble_ashr_8) /* cross page boundary */
1170
1171LABEL(gobble_ashr_8):
1172 movdqa (%rsi, %rcx), %xmm1
1173 movdqa (%rdi, %rcx), %xmm2
1174 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next step */
1175
1176 psrldq $8, %xmm3 /* previous bytes 8..15 -> positions 0..7 */
1177 pslldq $8, %xmm2 /* new bytes 0..7 -> positions 8..15 */
1178 por %xmm3, %xmm2 /* merge into one 16byte value */
1179
1180 TOLOWER (%xmm1, %xmm2)
1181
1182 pcmpeqb %xmm1, %xmm0 /* null bytes in the %rsi block */
1183 pcmpeqb %xmm2, %xmm1 /* 0xff where the bytes agree */
1184 psubb %xmm0, %xmm1 /* clear the top bit where a null was found */
1185 pmovmskb %xmm1, %edx
1186 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
1187 jnz LABEL(exit)
1188
1189# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1190 sub $16, %r11
1191 jbe LABEL(strcmp_exitz) /* length limit reached */
1192# endif
1193
1194 add $16, %rcx
1195 movdqa %xmm4, %xmm3 /* current %rdi block becomes the previous one */
1196
1197 add $16, %r10
1198 jg LABEL(nibble_ashr_8) /* cross page boundary */
1199
 /* Second, unrolled step; identical to the one at gobble_ashr_8.  */
1200 movdqa (%rsi, %rcx), %xmm1
1201 movdqa (%rdi, %rcx), %xmm2
1202 movdqa %xmm2, %xmm4
1203
1204 psrldq $8, %xmm3
1205 pslldq $8, %xmm2
1206 por %xmm3, %xmm2 /* merge into one 16byte value */
1207
1208 TOLOWER (%xmm1, %xmm2)
1209
1210 pcmpeqb %xmm1, %xmm0
1211 pcmpeqb %xmm2, %xmm1
1212 psubb %xmm0, %xmm1
1213 pmovmskb %xmm1, %edx
1214 sub $0xffff, %edx
1215 jnz LABEL(exit)
1216
1217# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1218 sub $16, %r11
1219 jbe LABEL(strcmp_exitz) /* length limit reached */
1220# endif
1221
1222 add $16, %rcx
1223 movdqa %xmm4, %xmm3
1224 jmp LABEL(loop_ashr_8)
1225
1226 .p2align 4
1227LABEL(nibble_ashr_8):
1228 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1229 pmovmskb %xmm0, %edx
1230 test $0xff00, %edx
1231 jnz LABEL(ashr_8_exittail)
1232
1233# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1234 cmp $8, %r11
1235 jbe LABEL(ashr_8_exittail)
1236# endif
1237
1238 pxor %xmm0, %xmm0
1239 sub $0x1000, %r10
1240 jmp LABEL(gobble_ashr_8)
1241
1242 .p2align 4
1243LABEL(ashr_8_exittail):
1244 movdqa (%rsi, %rcx), %xmm1
1245 psrldq $8, %xmm0
1246 psrldq $8, %xmm3
1247 jmp LABEL(aftertail)
1248
1249/*
1250 * The following cases will be handled by ashr_9
1251 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1252 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1253 */
1254 .p2align 4
1255LABEL(ashr_9):
 /* Entry: compare the first aligned %rsi block with the first %rdi
    block shifted left by 7 bytes, then run a loop that rebuilds each
    16-byte %rdi window from two aligned loads (previous block shifted
    right by 9 bytes, next block shifted left by 7 bytes).
    NOTE(review): %edx, %cl, %rax and %r8d are read here or at
    less32bytes but are set up before this excerpt; confirm their
    meaning against the dispatch code.  */
1256 pxor %xmm0, %xmm0
1257 movdqa (%rdi), %xmm2
1258 movdqa (%rsi), %xmm1
 /* %xmm0 = 0xff in every byte position where %xmm1 is a null byte.  */
1259 pcmpeqb %xmm1, %xmm0
1260 pslldq $7, %xmm2
1261 TOLOWER (%xmm1, %xmm2)
 /* Equality mask minus null mask: the top bit of a byte stays set only
    if the two bytes are equal and the %xmm1 byte is not null.  */
1262 pcmpeqb %xmm1, %xmm2
1263 psubb %xmm0, %xmm2
1264 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl and compare them; any difference is
    resolved at less32bytes.  */
1265 shr %cl, %edx
1266 shr %cl, %r9d
1267 sub %r9d, %edx
1268 jnz LABEL(less32bytes)
 /* Keep the unshifted first %rdi block for the merge in the loop.  */
1269 movdqa (%rdi), %xmm3
1270
 /* Length-limited variants update the remaining count in %r11 here and
    branch to strcmp_exitz when it is zero or wrapped; the macro is
    empty for the other variants.  */
1271 UPDATE_STRNCMP_COUNTER
1272
1273 pxor %xmm0, %xmm0
1274 mov $16, %rcx /* index for loads */
1275 mov $9, %r9d /* byte position left over from less32bytes case */
1276 /*
1277 * Setup %r10 value allows us to detect crossing a page boundary.
1278 * When %r10 goes positive we have crossed a page boundary and
1279 * need to do a nibble.
1280 */
1281 lea 9(%rdi), %r10
1282 and $0xfff, %r10 /* offset into 4K page */
1283 sub $0x1000, %r10 /* subtract 4K pagesize */
1284
1285 .p2align 4
1286LABEL(loop_ashr_9):
 /* Advance the page counter; a positive (signed) result diverts to the
    nibble check before the next %rdi load.  */
1287 add $16, %r10
1288 jg LABEL(nibble_ashr_9)
1289
1290LABEL(gobble_ashr_9):
1291 movdqa (%rsi, %rcx), %xmm1
1292 movdqa (%rdi, %rcx), %xmm2
 /* Save the raw %rdi block; it becomes %xmm3 for the next round.  */
1293 movdqa %xmm2, %xmm4
1294
 /* Upper 7 bytes of the previous block go to the low end, lower 9
    bytes of the new block to the high end.  */
1295 psrldq $9, %xmm3
1296 pslldq $7, %xmm2
1297 por %xmm3, %xmm2 /* merge into one 16byte value */
1298
1299 TOLOWER (%xmm1, %xmm2)
1300
 /* %edx ends up zero only when all 16 bytes are equal and non-null
    (mask == 0xffff); otherwise leave through exit.  */
1301 pcmpeqb %xmm1, %xmm0
1302 pcmpeqb %xmm2, %xmm1
1303 psubb %xmm0, %xmm1
1304 pmovmskb %xmm1, %edx
1305 sub $0xffff, %edx
1306 jnz LABEL(exit)
1307
1308# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched; return 0 once the count is used up.  */
1309 sub $16, %r11
1310 jbe LABEL(strcmp_exitz)
1311# endif
1312
1313 add $16, %rcx
1314 movdqa %xmm4, %xmm3
1315
 /* Second, unrolled copy of the loop body.  */
1316 add $16, %r10
1317 jg LABEL(nibble_ashr_9) /* cross page boundary */
1318
1319 movdqa (%rsi, %rcx), %xmm1
1320 movdqa (%rdi, %rcx), %xmm2
1321 movdqa %xmm2, %xmm4
1322
1323 psrldq $9, %xmm3
1324 pslldq $7, %xmm2
1325 por %xmm3, %xmm2 /* merge into one 16byte value */
1326
1327 TOLOWER (%xmm1, %xmm2)
1328
1329 pcmpeqb %xmm1, %xmm0
1330 pcmpeqb %xmm2, %xmm1
1331 psubb %xmm0, %xmm1
1332 pmovmskb %xmm1, %edx
1333 sub $0xffff, %edx
1334 jnz LABEL(exit)
1335
1336# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1337 sub $16, %r11
1338 jbe LABEL(strcmp_exitz)
1339# endif
1340
1341 add $16, %rcx
1342 movdqa %xmm4, %xmm3 /* store for next cycle */
1343 jmp LABEL(loop_ashr_9)
1344
1345 .p2align 4
1346LABEL(nibble_ashr_9):
 /* Bits 9..15 of the null mask cover the upper 7 bytes of %xmm3, which
    have not been compared yet.  If one of them is a terminator, finish
    in the exittail path, which does not load from %rdi again.  */
1347 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1348 pmovmskb %xmm0, %edx
1349 test $0xfe00, %edx
1350 jnz LABEL(ashr_9_exittail)
1351
1352# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish there when no more than 7 bytes remain to compare.  */
1353 cmp $7, %r11
1354 jbe LABEL(ashr_9_exittail)
1355# endif
1356
 /* No terminator: clear the null mask, rewind the page counter by one
    page and resume the main loop.  */
1357 pxor %xmm0, %xmm0
1358 sub $0x1000, %r10
1359 jmp LABEL(gobble_ashr_9)
1360
1361 .p2align 4
1362LABEL(ashr_9_exittail):
 /* Load the next %rsi block and shift the saved %rdi bytes and their
    null mask down by 9 bytes so that aftertail can compare them.  */
1363 movdqa (%rsi, %rcx), %xmm1
1364 psrldq $9, %xmm0
1365 psrldq $9, %xmm3
1366 jmp LABEL(aftertail)
1367
1368/*
1369 * The following cases will be handled by ashr_10
1370 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1371 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1372 */
1373 .p2align 4
1374LABEL(ashr_10):
 /* Entry: compare the first aligned %rsi block with the first %rdi
    block shifted left by 6 bytes, then run a loop that rebuilds each
    16-byte %rdi window from two aligned loads (previous block shifted
    right by 10 bytes, next block shifted left by 6 bytes).
    NOTE(review): %edx, %cl, %rax and %r8d are read here or at
    less32bytes but are set up before this excerpt; confirm their
    meaning against the dispatch code.  */
1375 pxor %xmm0, %xmm0
1376 movdqa (%rdi), %xmm2
1377 movdqa (%rsi), %xmm1
 /* %xmm0 = 0xff in every byte position where %xmm1 is a null byte.  */
1378 pcmpeqb %xmm1, %xmm0
1379 pslldq $6, %xmm2
1380 TOLOWER (%xmm1, %xmm2)
 /* Equality mask minus null mask: the top bit of a byte stays set only
    if the two bytes are equal and the %xmm1 byte is not null.  */
1381 pcmpeqb %xmm1, %xmm2
1382 psubb %xmm0, %xmm2
1383 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl and compare them; any difference is
    resolved at less32bytes.  */
1384 shr %cl, %edx
1385 shr %cl, %r9d
1386 sub %r9d, %edx
1387 jnz LABEL(less32bytes)
 /* Keep the unshifted first %rdi block for the merge in the loop.  */
1388 movdqa (%rdi), %xmm3
1389
 /* Length-limited variants update the remaining count in %r11 here and
    branch to strcmp_exitz when it is zero or wrapped; the macro is
    empty for the other variants.  */
1390 UPDATE_STRNCMP_COUNTER
1391
1392 pxor %xmm0, %xmm0
1393 mov $16, %rcx /* index for loads */
1394 mov $10, %r9d /* byte position left over from less32bytes case */
1395 /*
1396 * Setup %r10 value allows us to detect crossing a page boundary.
1397 * When %r10 goes positive we have crossed a page boundary and
1398 * need to do a nibble.
1399 */
1400 lea 10(%rdi), %r10
1401 and $0xfff, %r10 /* offset into 4K page */
1402 sub $0x1000, %r10 /* subtract 4K pagesize */
1403
1404 .p2align 4
1405LABEL(loop_ashr_10):
 /* Advance the page counter; a positive (signed) result diverts to the
    nibble check before the next %rdi load.  */
1406 add $16, %r10
1407 jg LABEL(nibble_ashr_10)
1408
1409LABEL(gobble_ashr_10):
1410 movdqa (%rsi, %rcx), %xmm1
1411 movdqa (%rdi, %rcx), %xmm2
 /* Save the raw %rdi block; it becomes %xmm3 for the next round.  */
1412 movdqa %xmm2, %xmm4
1413
 /* Upper 6 bytes of the previous block go to the low end, lower 10
    bytes of the new block to the high end.  */
1414 psrldq $10, %xmm3
1415 pslldq $6, %xmm2
1416 por %xmm3, %xmm2 /* merge into one 16byte value */
1417
1418 TOLOWER (%xmm1, %xmm2)
1419
 /* %edx ends up zero only when all 16 bytes are equal and non-null
    (mask == 0xffff); otherwise leave through exit.  */
1420 pcmpeqb %xmm1, %xmm0
1421 pcmpeqb %xmm2, %xmm1
1422 psubb %xmm0, %xmm1
1423 pmovmskb %xmm1, %edx
1424 sub $0xffff, %edx
1425 jnz LABEL(exit)
1426
1427# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched; return 0 once the count is used up.  */
1428 sub $16, %r11
1429 jbe LABEL(strcmp_exitz)
1430# endif
1431
1432 add $16, %rcx
1433 movdqa %xmm4, %xmm3
1434
 /* Second, unrolled copy of the loop body.  */
1435 add $16, %r10
1436 jg LABEL(nibble_ashr_10) /* cross page boundary */
1437
1438 movdqa (%rsi, %rcx), %xmm1
1439 movdqa (%rdi, %rcx), %xmm2
1440 movdqa %xmm2, %xmm4
1441
1442 psrldq $10, %xmm3
1443 pslldq $6, %xmm2
1444 por %xmm3, %xmm2 /* merge into one 16byte value */
1445
1446 TOLOWER (%xmm1, %xmm2)
1447
1448 pcmpeqb %xmm1, %xmm0
1449 pcmpeqb %xmm2, %xmm1
1450 psubb %xmm0, %xmm1
1451 pmovmskb %xmm1, %edx
1452 sub $0xffff, %edx
1453 jnz LABEL(exit)
1454
1455# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1456 sub $16, %r11
1457 jbe LABEL(strcmp_exitz)
1458# endif
1459
1460 add $16, %rcx
1461 movdqa %xmm4, %xmm3
1462 jmp LABEL(loop_ashr_10)
1463
1464 .p2align 4
1465LABEL(nibble_ashr_10):
 /* Bits 10..15 of the null mask cover the upper 6 bytes of %xmm3,
    which have not been compared yet.  If one of them is a terminator,
    finish in the exittail path, which does not load from %rdi again.  */
1466 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1467 pmovmskb %xmm0, %edx
1468 test $0xfc00, %edx
1469 jnz LABEL(ashr_10_exittail)
1470
1471# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish there when no more than 6 bytes remain to compare.  */
1472 cmp $6, %r11
1473 jbe LABEL(ashr_10_exittail)
1474# endif
1475
 /* No terminator: clear the null mask, rewind the page counter by one
    page and resume the main loop.  */
1476 pxor %xmm0, %xmm0
1477 sub $0x1000, %r10
1478 jmp LABEL(gobble_ashr_10)
1479
1480 .p2align 4
1481LABEL(ashr_10_exittail):
 /* Load the next %rsi block and shift the saved %rdi bytes and their
    null mask down by 10 bytes so that aftertail can compare them.  */
1482 movdqa (%rsi, %rcx), %xmm1
1483 psrldq $10, %xmm0
1484 psrldq $10, %xmm3
1485 jmp LABEL(aftertail)
1486
1487/*
1488 * The following cases will be handled by ashr_11
1489 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1490 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1491 */
1492 .p2align 4
1493LABEL(ashr_11):
 /* Entry: compare the first aligned %rsi block with the first %rdi
    block shifted left by 5 bytes, then run a loop that rebuilds each
    16-byte %rdi window from two aligned loads (previous block shifted
    right by 11 bytes, next block shifted left by 5 bytes).
    NOTE(review): %edx, %cl, %rax and %r8d are read here or at
    less32bytes but are set up before this excerpt; confirm their
    meaning against the dispatch code.  */
1494 pxor %xmm0, %xmm0
1495 movdqa (%rdi), %xmm2
1496 movdqa (%rsi), %xmm1
 /* %xmm0 = 0xff in every byte position where %xmm1 is a null byte.  */
1497 pcmpeqb %xmm1, %xmm0
1498 pslldq $5, %xmm2
1499 TOLOWER (%xmm1, %xmm2)
 /* Equality mask minus null mask: the top bit of a byte stays set only
    if the two bytes are equal and the %xmm1 byte is not null.  */
1500 pcmpeqb %xmm1, %xmm2
1501 psubb %xmm0, %xmm2
1502 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl and compare them; any difference is
    resolved at less32bytes.  */
1503 shr %cl, %edx
1504 shr %cl, %r9d
1505 sub %r9d, %edx
1506 jnz LABEL(less32bytes)
 /* Keep the unshifted first %rdi block for the merge in the loop.  */
1507 movdqa (%rdi), %xmm3
1508
 /* Length-limited variants update the remaining count in %r11 here and
    branch to strcmp_exitz when it is zero or wrapped; the macro is
    empty for the other variants.  */
1509 UPDATE_STRNCMP_COUNTER
1510
1511 pxor %xmm0, %xmm0
1512 mov $16, %rcx /* index for loads */
1513 mov $11, %r9d /* byte position left over from less32bytes case */
1514 /*
1515 * Setup %r10 value allows us to detect crossing a page boundary.
1516 * When %r10 goes positive we have crossed a page boundary and
1517 * need to do a nibble.
1518 */
1519 lea 11(%rdi), %r10
1520 and $0xfff, %r10 /* offset into 4K page */
1521 sub $0x1000, %r10 /* subtract 4K pagesize */
1522
1523 .p2align 4
1524LABEL(loop_ashr_11):
 /* Advance the page counter; a positive (signed) result diverts to the
    nibble check before the next %rdi load.  */
1525 add $16, %r10
1526 jg LABEL(nibble_ashr_11)
1527
1528LABEL(gobble_ashr_11):
1529 movdqa (%rsi, %rcx), %xmm1
1530 movdqa (%rdi, %rcx), %xmm2
 /* Save the raw %rdi block; it becomes %xmm3 for the next round.  */
1531 movdqa %xmm2, %xmm4
1532
 /* Upper 5 bytes of the previous block go to the low end, lower 11
    bytes of the new block to the high end.  */
1533 psrldq $11, %xmm3
1534 pslldq $5, %xmm2
1535 por %xmm3, %xmm2 /* merge into one 16byte value */
1536
1537 TOLOWER (%xmm1, %xmm2)
1538
 /* %edx ends up zero only when all 16 bytes are equal and non-null
    (mask == 0xffff); otherwise leave through exit.  */
1539 pcmpeqb %xmm1, %xmm0
1540 pcmpeqb %xmm2, %xmm1
1541 psubb %xmm0, %xmm1
1542 pmovmskb %xmm1, %edx
1543 sub $0xffff, %edx
1544 jnz LABEL(exit)
1545
1546# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched; return 0 once the count is used up.  */
1547 sub $16, %r11
1548 jbe LABEL(strcmp_exitz)
1549# endif
1550
1551 add $16, %rcx
1552 movdqa %xmm4, %xmm3
1553
 /* Second, unrolled copy of the loop body.  */
1554 add $16, %r10
1555 jg LABEL(nibble_ashr_11) /* cross page boundary */
1556
1557 movdqa (%rsi, %rcx), %xmm1
1558 movdqa (%rdi, %rcx), %xmm2
1559 movdqa %xmm2, %xmm4
1560
1561 psrldq $11, %xmm3
1562 pslldq $5, %xmm2
1563 por %xmm3, %xmm2 /* merge into one 16byte value */
1564
1565 TOLOWER (%xmm1, %xmm2)
1566
1567 pcmpeqb %xmm1, %xmm0
1568 pcmpeqb %xmm2, %xmm1
1569 psubb %xmm0, %xmm1
1570 pmovmskb %xmm1, %edx
1571 sub $0xffff, %edx
1572 jnz LABEL(exit)
1573
1574# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1575 sub $16, %r11
1576 jbe LABEL(strcmp_exitz)
1577# endif
1578
1579 add $16, %rcx
1580 movdqa %xmm4, %xmm3
1581 jmp LABEL(loop_ashr_11)
1582
1583 .p2align 4
1584LABEL(nibble_ashr_11):
 /* Bits 11..15 of the null mask cover the upper 5 bytes of %xmm3,
    which have not been compared yet.  If one of them is a terminator,
    finish in the exittail path, which does not load from %rdi again.  */
1585 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1586 pmovmskb %xmm0, %edx
1587 test $0xf800, %edx
1588 jnz LABEL(ashr_11_exittail)
1589
1590# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish there when no more than 5 bytes remain to compare.  */
1591 cmp $5, %r11
1592 jbe LABEL(ashr_11_exittail)
1593# endif
1594
 /* No terminator: clear the null mask, rewind the page counter by one
    page and resume the main loop.  */
1595 pxor %xmm0, %xmm0
1596 sub $0x1000, %r10
1597 jmp LABEL(gobble_ashr_11)
1598
1599 .p2align 4
1600LABEL(ashr_11_exittail):
 /* Load the next %rsi block and shift the saved %rdi bytes and their
    null mask down by 11 bytes so that aftertail can compare them.  */
1601 movdqa (%rsi, %rcx), %xmm1
1602 psrldq $11, %xmm0
1603 psrldq $11, %xmm3
1604 jmp LABEL(aftertail)
1605
1606/*
1607 * The following cases will be handled by ashr_12
1608 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1609 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1610 */
1611 .p2align 4
1612LABEL(ashr_12):
 /* Entry: compare the first aligned %rsi block with the first %rdi
    block shifted left by 4 bytes, then run a loop that rebuilds each
    16-byte %rdi window from two aligned loads (previous block shifted
    right by 12 bytes, next block shifted left by 4 bytes).
    NOTE(review): %edx, %cl, %rax and %r8d are read here or at
    less32bytes but are set up before this excerpt; confirm their
    meaning against the dispatch code.  */
1613 pxor %xmm0, %xmm0
1614 movdqa (%rdi), %xmm2
1615 movdqa (%rsi), %xmm1
 /* %xmm0 = 0xff in every byte position where %xmm1 is a null byte.  */
1616 pcmpeqb %xmm1, %xmm0
1617 pslldq $4, %xmm2
1618 TOLOWER (%xmm1, %xmm2)
 /* Equality mask minus null mask: the top bit of a byte stays set only
    if the two bytes are equal and the %xmm1 byte is not null.  */
1619 pcmpeqb %xmm1, %xmm2
1620 psubb %xmm0, %xmm2
1621 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl and compare them; any difference is
    resolved at less32bytes.  */
1622 shr %cl, %edx
1623 shr %cl, %r9d
1624 sub %r9d, %edx
1625 jnz LABEL(less32bytes)
 /* Keep the unshifted first %rdi block for the merge in the loop.  */
1626 movdqa (%rdi), %xmm3
1627
 /* Length-limited variants update the remaining count in %r11 here and
    branch to strcmp_exitz when it is zero or wrapped; the macro is
    empty for the other variants.  */
1628 UPDATE_STRNCMP_COUNTER
1629
1630 pxor %xmm0, %xmm0
1631 mov $16, %rcx /* index for loads */
1632 mov $12, %r9d /* byte position left over from less32bytes case */
1633 /*
1634 * Setup %r10 value allows us to detect crossing a page boundary.
1635 * When %r10 goes positive we have crossed a page boundary and
1636 * need to do a nibble.
1637 */
1638 lea 12(%rdi), %r10
1639 and $0xfff, %r10 /* offset into 4K page */
1640 sub $0x1000, %r10 /* subtract 4K pagesize */
1641
1642 .p2align 4
1643LABEL(loop_ashr_12):
 /* Advance the page counter; a positive (signed) result diverts to the
    nibble check before the next %rdi load.  */
1644 add $16, %r10
1645 jg LABEL(nibble_ashr_12)
1646
1647LABEL(gobble_ashr_12):
1648 movdqa (%rsi, %rcx), %xmm1
1649 movdqa (%rdi, %rcx), %xmm2
 /* Save the raw %rdi block; it becomes %xmm3 for the next round.  */
1650 movdqa %xmm2, %xmm4
1651
 /* Upper 4 bytes of the previous block go to the low end, lower 12
    bytes of the new block to the high end.  */
1652 psrldq $12, %xmm3
1653 pslldq $4, %xmm2
1654 por %xmm3, %xmm2 /* merge into one 16byte value */
1655
1656 TOLOWER (%xmm1, %xmm2)
1657
 /* %edx ends up zero only when all 16 bytes are equal and non-null
    (mask == 0xffff); otherwise leave through exit.  */
1658 pcmpeqb %xmm1, %xmm0
1659 pcmpeqb %xmm2, %xmm1
1660 psubb %xmm0, %xmm1
1661 pmovmskb %xmm1, %edx
1662 sub $0xffff, %edx
1663 jnz LABEL(exit)
1664
1665# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched; return 0 once the count is used up.  */
1666 sub $16, %r11
1667 jbe LABEL(strcmp_exitz)
1668# endif
1669
1670 add $16, %rcx
1671 movdqa %xmm4, %xmm3
1672
 /* Second, unrolled copy of the loop body.  */
1673 add $16, %r10
1674 jg LABEL(nibble_ashr_12) /* cross page boundary */
1675
1676 movdqa (%rsi, %rcx), %xmm1
1677 movdqa (%rdi, %rcx), %xmm2
1678 movdqa %xmm2, %xmm4
1679
1680 psrldq $12, %xmm3
1681 pslldq $4, %xmm2
1682 por %xmm3, %xmm2 /* merge into one 16byte value */
1683
1684 TOLOWER (%xmm1, %xmm2)
1685
1686 pcmpeqb %xmm1, %xmm0
1687 pcmpeqb %xmm2, %xmm1
1688 psubb %xmm0, %xmm1
1689 pmovmskb %xmm1, %edx
1690 sub $0xffff, %edx
1691 jnz LABEL(exit)
1692
1693# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1694 sub $16, %r11
1695 jbe LABEL(strcmp_exitz)
1696# endif
1697
1698 add $16, %rcx
1699 movdqa %xmm4, %xmm3
1700 jmp LABEL(loop_ashr_12)
1701
1702 .p2align 4
1703LABEL(nibble_ashr_12):
 /* Bits 12..15 of the null mask cover the upper 4 bytes of %xmm3,
    which have not been compared yet.  If one of them is a terminator,
    finish in the exittail path, which does not load from %rdi again.  */
1704 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1705 pmovmskb %xmm0, %edx
1706 test $0xf000, %edx
1707 jnz LABEL(ashr_12_exittail)
1708
1709# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish there when no more than 4 bytes remain to compare.  */
1710 cmp $4, %r11
1711 jbe LABEL(ashr_12_exittail)
1712# endif
1713
 /* No terminator: clear the null mask, rewind the page counter by one
    page and resume the main loop.  */
1714 pxor %xmm0, %xmm0
1715 sub $0x1000, %r10
1716 jmp LABEL(gobble_ashr_12)
1717
1718 .p2align 4
1719LABEL(ashr_12_exittail):
 /* Load the next %rsi block and shift the saved %rdi bytes and their
    null mask down by 12 bytes so that aftertail can compare them.  */
1720 movdqa (%rsi, %rcx), %xmm1
1721 psrldq $12, %xmm0
1722 psrldq $12, %xmm3
1723 jmp LABEL(aftertail)
1724
1725/*
1726 * The following cases will be handled by ashr_13
1727 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1728 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1729 */
1730 .p2align 4
1731LABEL(ashr_13):
 /* Entry: compare the first aligned %rsi block with the first %rdi
    block shifted left by 3 bytes, then run a loop that rebuilds each
    16-byte %rdi window from two aligned loads (previous block shifted
    right by 13 bytes, next block shifted left by 3 bytes).
    NOTE(review): %edx, %cl, %rax and %r8d are read here or at
    less32bytes but are set up before this excerpt; confirm their
    meaning against the dispatch code.  */
1732 pxor %xmm0, %xmm0
1733 movdqa (%rdi), %xmm2
1734 movdqa (%rsi), %xmm1
 /* %xmm0 = 0xff in every byte position where %xmm1 is a null byte.  */
1735 pcmpeqb %xmm1, %xmm0
1736 pslldq $3, %xmm2
1737 TOLOWER (%xmm1, %xmm2)
 /* Equality mask minus null mask: the top bit of a byte stays set only
    if the two bytes are equal and the %xmm1 byte is not null.  */
1738 pcmpeqb %xmm1, %xmm2
1739 psubb %xmm0, %xmm2
1740 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl and compare them; any difference is
    resolved at less32bytes.  */
1741 shr %cl, %edx
1742 shr %cl, %r9d
1743 sub %r9d, %edx
1744 jnz LABEL(less32bytes)
 /* Keep the unshifted first %rdi block for the merge in the loop.  */
1745 movdqa (%rdi), %xmm3
1746
 /* Length-limited variants update the remaining count in %r11 here and
    branch to strcmp_exitz when it is zero or wrapped; the macro is
    empty for the other variants.  */
1747 UPDATE_STRNCMP_COUNTER
1748
1749 pxor %xmm0, %xmm0
1750 mov $16, %rcx /* index for loads */
1751 mov $13, %r9d /* byte position left over from less32bytes case */
1752 /*
1753 * Setup %r10 value allows us to detect crossing a page boundary.
1754 * When %r10 goes positive we have crossed a page boundary and
1755 * need to do a nibble.
1756 */
1757 lea 13(%rdi), %r10
1758 and $0xfff, %r10 /* offset into 4K page */
1759 sub $0x1000, %r10 /* subtract 4K pagesize */
1760
1761 .p2align 4
1762LABEL(loop_ashr_13):
 /* Advance the page counter; a positive (signed) result diverts to the
    nibble check before the next %rdi load.  */
1763 add $16, %r10
1764 jg LABEL(nibble_ashr_13)
1765
1766LABEL(gobble_ashr_13):
1767 movdqa (%rsi, %rcx), %xmm1
1768 movdqa (%rdi, %rcx), %xmm2
 /* Save the raw %rdi block; it becomes %xmm3 for the next round.  */
1769 movdqa %xmm2, %xmm4
1770
 /* Upper 3 bytes of the previous block go to the low end, lower 13
    bytes of the new block to the high end.  */
1771 psrldq $13, %xmm3
1772 pslldq $3, %xmm2
1773 por %xmm3, %xmm2 /* merge into one 16byte value */
1774
1775 TOLOWER (%xmm1, %xmm2)
1776
 /* %edx ends up zero only when all 16 bytes are equal and non-null
    (mask == 0xffff); otherwise leave through exit.  */
1777 pcmpeqb %xmm1, %xmm0
1778 pcmpeqb %xmm2, %xmm1
1779 psubb %xmm0, %xmm1
1780 pmovmskb %xmm1, %edx
1781 sub $0xffff, %edx
1782 jnz LABEL(exit)
1783
1784# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched; return 0 once the count is used up.  */
1785 sub $16, %r11
1786 jbe LABEL(strcmp_exitz)
1787# endif
1788
1789 add $16, %rcx
1790 movdqa %xmm4, %xmm3
1791
 /* Second, unrolled copy of the loop body.  */
1792 add $16, %r10
1793 jg LABEL(nibble_ashr_13) /* cross page boundary */
1794
1795 movdqa (%rsi, %rcx), %xmm1
1796 movdqa (%rdi, %rcx), %xmm2
1797 movdqa %xmm2, %xmm4
1798
1799 psrldq $13, %xmm3
1800 pslldq $3, %xmm2
1801 por %xmm3, %xmm2 /* merge into one 16byte value */
1802
1803 TOLOWER (%xmm1, %xmm2)
1804
1805 pcmpeqb %xmm1, %xmm0
1806 pcmpeqb %xmm2, %xmm1
1807 psubb %xmm0, %xmm1
1808 pmovmskb %xmm1, %edx
1809 sub $0xffff, %edx
1810 jnz LABEL(exit)
1811
1812# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1813 sub $16, %r11
1814 jbe LABEL(strcmp_exitz)
1815# endif
1816
1817 add $16, %rcx
1818 movdqa %xmm4, %xmm3
1819 jmp LABEL(loop_ashr_13)
1820
1821 .p2align 4
1822LABEL(nibble_ashr_13):
 /* Bits 13..15 of the null mask cover the upper 3 bytes of %xmm3,
    which have not been compared yet.  If one of them is a terminator,
    finish in the exittail path, which does not load from %rdi again.  */
1823 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1824 pmovmskb %xmm0, %edx
1825 test $0xe000, %edx
1826 jnz LABEL(ashr_13_exittail)
1827
1828# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish there when no more than 3 bytes remain to compare.  */
1829 cmp $3, %r11
1830 jbe LABEL(ashr_13_exittail)
1831# endif
1832
 /* No terminator: clear the null mask, rewind the page counter by one
    page and resume the main loop.  */
1833 pxor %xmm0, %xmm0
1834 sub $0x1000, %r10
1835 jmp LABEL(gobble_ashr_13)
1836
1837 .p2align 4
1838LABEL(ashr_13_exittail):
 /* Load the next %rsi block and shift the saved %rdi bytes and their
    null mask down by 13 bytes so that aftertail can compare them.  */
1839 movdqa (%rsi, %rcx), %xmm1
1840 psrldq $13, %xmm0
1841 psrldq $13, %xmm3
1842 jmp LABEL(aftertail)
1843
1844/*
1845 * The following cases will be handled by ashr_14
1846 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1847 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1848 */
1849 .p2align 4
1850LABEL(ashr_14):
 /* Entry: compare the first aligned %rsi block with the first %rdi
    block shifted left by 2 bytes, then run a loop that rebuilds each
    16-byte %rdi window from two aligned loads (previous block shifted
    right by 14 bytes, next block shifted left by 2 bytes).
    NOTE(review): %edx, %cl, %rax and %r8d are read here or at
    less32bytes but are set up before this excerpt; confirm their
    meaning against the dispatch code.  */
1851 pxor %xmm0, %xmm0
1852 movdqa (%rdi), %xmm2
1853 movdqa (%rsi), %xmm1
 /* %xmm0 = 0xff in every byte position where %xmm1 is a null byte.  */
1854 pcmpeqb %xmm1, %xmm0
1855 pslldq $2, %xmm2
1856 TOLOWER (%xmm1, %xmm2)
 /* Equality mask minus null mask: the top bit of a byte stays set only
    if the two bytes are equal and the %xmm1 byte is not null.  */
1857 pcmpeqb %xmm1, %xmm2
1858 psubb %xmm0, %xmm2
1859 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl and compare them; any difference is
    resolved at less32bytes.  */
1860 shr %cl, %edx
1861 shr %cl, %r9d
1862 sub %r9d, %edx
1863 jnz LABEL(less32bytes)
 /* Keep the unshifted first %rdi block for the merge in the loop.  */
1864 movdqa (%rdi), %xmm3
1865
 /* Length-limited variants update the remaining count in %r11 here and
    branch to strcmp_exitz when it is zero or wrapped; the macro is
    empty for the other variants.  */
1866 UPDATE_STRNCMP_COUNTER
1867
1868 pxor %xmm0, %xmm0
1869 mov $16, %rcx /* index for loads */
1870 mov $14, %r9d /* byte position left over from less32bytes case */
1871 /*
1872 * Setup %r10 value allows us to detect crossing a page boundary.
1873 * When %r10 goes positive we have crossed a page boundary and
1874 * need to do a nibble.
1875 */
1876 lea 14(%rdi), %r10
1877 and $0xfff, %r10 /* offset into 4K page */
1878 sub $0x1000, %r10 /* subtract 4K pagesize */
1879
1880 .p2align 4
1881LABEL(loop_ashr_14):
 /* Advance the page counter; a positive (signed) result diverts to the
    nibble check before the next %rdi load.  */
1882 add $16, %r10
1883 jg LABEL(nibble_ashr_14)
1884
1885LABEL(gobble_ashr_14):
1886 movdqa (%rsi, %rcx), %xmm1
1887 movdqa (%rdi, %rcx), %xmm2
 /* Save the raw %rdi block; it becomes %xmm3 for the next round.  */
1888 movdqa %xmm2, %xmm4
1889
 /* Upper 2 bytes of the previous block go to the low end, lower 14
    bytes of the new block to the high end.  */
1890 psrldq $14, %xmm3
1891 pslldq $2, %xmm2
1892 por %xmm3, %xmm2 /* merge into one 16byte value */
1893
1894 TOLOWER (%xmm1, %xmm2)
1895
 /* %edx ends up zero only when all 16 bytes are equal and non-null
    (mask == 0xffff); otherwise leave through exit.  */
1896 pcmpeqb %xmm1, %xmm0
1897 pcmpeqb %xmm2, %xmm1
1898 psubb %xmm0, %xmm1
1899 pmovmskb %xmm1, %edx
1900 sub $0xffff, %edx
1901 jnz LABEL(exit)
1902
1903# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched; return 0 once the count is used up.  */
1904 sub $16, %r11
1905 jbe LABEL(strcmp_exitz)
1906# endif
1907
1908 add $16, %rcx
1909 movdqa %xmm4, %xmm3
1910
 /* Second, unrolled copy of the loop body.  */
1911 add $16, %r10
1912 jg LABEL(nibble_ashr_14) /* cross page boundary */
1913
1914 movdqa (%rsi, %rcx), %xmm1
1915 movdqa (%rdi, %rcx), %xmm2
1916 movdqa %xmm2, %xmm4
1917
1918 psrldq $14, %xmm3
1919 pslldq $2, %xmm2
1920 por %xmm3, %xmm2 /* merge into one 16byte value */
1921
1922 TOLOWER (%xmm1, %xmm2)
1923
1924 pcmpeqb %xmm1, %xmm0
1925 pcmpeqb %xmm2, %xmm1
1926 psubb %xmm0, %xmm1
1927 pmovmskb %xmm1, %edx
1928 sub $0xffff, %edx
1929 jnz LABEL(exit)
1930
 /* Logical OR, matching every other counter guard in this file.  */
1931# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1932 sub $16, %r11
1933 jbe LABEL(strcmp_exitz)
1934# endif
1935
1936 add $16, %rcx
1937 movdqa %xmm4, %xmm3
1938 jmp LABEL(loop_ashr_14)
1939
1940 .p2align 4
1941LABEL(nibble_ashr_14):
 /* Bits 14..15 of the null mask cover the upper 2 bytes of %xmm3,
    which have not been compared yet.  If one of them is a terminator,
    finish in the exittail path, which does not load from %rdi again.  */
1942 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1943 pmovmskb %xmm0, %edx
1944 test $0xc000, %edx
1945 jnz LABEL(ashr_14_exittail)
1946
1947# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish there when no more than 2 bytes remain to compare.  */
1948 cmp $2, %r11
1949 jbe LABEL(ashr_14_exittail)
1950# endif
1951
 /* No terminator: clear the null mask, rewind the page counter by one
    page and resume the main loop.  */
1952 pxor %xmm0, %xmm0
1953 sub $0x1000, %r10
1954 jmp LABEL(gobble_ashr_14)
1955
1956 .p2align 4
1957LABEL(ashr_14_exittail):
 /* Load the next %rsi block and shift the saved %rdi bytes and their
    null mask down by 14 bytes so that aftertail can compare them.  */
1958 movdqa (%rsi, %rcx), %xmm1
1959 psrldq $14, %xmm0
1960 psrldq $14, %xmm3
1961 jmp LABEL(aftertail)
1962
1963/*
1964 * The following cases will be handled by ashr_15
1965 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1966 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1967 */
1968 .p2align 4
1969LABEL(ashr_15):
 /* Entry: compare the first aligned %rsi block with the first %rdi
    block shifted left by 1 byte, then run a loop that rebuilds each
    16-byte %rdi window from two aligned loads (previous block shifted
    right by 15 bytes, next block shifted left by 1 byte).
    NOTE(review): %edx, %cl, %rax and %r8d are read here or at
    less32bytes but are set up before this excerpt; confirm their
    meaning against the dispatch code.  */
1970 pxor %xmm0, %xmm0
1971 movdqa (%rdi), %xmm2
1972 movdqa (%rsi), %xmm1
 /* %xmm0 = 0xff in every byte position where %xmm1 is a null byte.  */
1973 pcmpeqb %xmm1, %xmm0
1974 pslldq $1, %xmm2
1975 TOLOWER (%xmm1, %xmm2)
 /* Equality mask minus null mask: the top bit of a byte stays set only
    if the two bytes are equal and the %xmm1 byte is not null.  */
1976 pcmpeqb %xmm1, %xmm2
1977 psubb %xmm0, %xmm2
1978 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl and compare them; any difference is
    resolved at less32bytes.  */
1979 shr %cl, %edx
1980 shr %cl, %r9d
1981 sub %r9d, %edx
1982 jnz LABEL(less32bytes)
1983
 /* Keep the unshifted first %rdi block for the merge in the loop.  */
1984 movdqa (%rdi), %xmm3
1985
 /* Length-limited variants update the remaining count in %r11 here and
    branch to strcmp_exitz when it is zero or wrapped; the macro is
    empty for the other variants.  */
1986 UPDATE_STRNCMP_COUNTER
1987
1988 pxor %xmm0, %xmm0
1989 mov $16, %rcx /* index for loads */
1990 mov $15, %r9d /* byte position left over from less32bytes case */
1991 /*
1992 * Setup %r10 value allows us to detect crossing a page boundary.
1993 * When %r10 goes positive we have crossed a page boundary and
1994 * need to do a nibble.
1995 */
1996 lea 15(%rdi), %r10
1997 and $0xfff, %r10 /* offset into 4K page */
1998
1999 sub $0x1000, %r10 /* subtract 4K pagesize */
2000
2001 .p2align 4
2002LABEL(loop_ashr_15):
 /* Advance the page counter; a positive (signed) result diverts to the
    nibble check before the next %rdi load.  */
2003 add $16, %r10
2004 jg LABEL(nibble_ashr_15)
2005
2006LABEL(gobble_ashr_15):
2007 movdqa (%rsi, %rcx), %xmm1
2008 movdqa (%rdi, %rcx), %xmm2
 /* Save the raw %rdi block; it becomes %xmm3 for the next round.  */
2009 movdqa %xmm2, %xmm4
2010
 /* Top byte of the previous block goes to the low end, lower 15 bytes
    of the new block to the high end.  */
2011 psrldq $15, %xmm3
2012 pslldq $1, %xmm2
2013 por %xmm3, %xmm2 /* merge into one 16byte value */
2014
2015 TOLOWER (%xmm1, %xmm2)
2016
 /* %edx ends up zero only when all 16 bytes are equal and non-null
    (mask == 0xffff); otherwise leave through exit.  */
2017 pcmpeqb %xmm1, %xmm0
2018 pcmpeqb %xmm2, %xmm1
2019 psubb %xmm0, %xmm1
2020 pmovmskb %xmm1, %edx
2021 sub $0xffff, %edx
2022 jnz LABEL(exit)
2023
2024# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched; return 0 once the count is used up.  */
2025 sub $16, %r11
2026 jbe LABEL(strcmp_exitz)
2027# endif
2028
2029 add $16, %rcx
2030 movdqa %xmm4, %xmm3
2031
 /* Second, unrolled copy of the loop body.  */
2032 add $16, %r10
2033 jg LABEL(nibble_ashr_15) /* cross page boundary */
2034
2035 movdqa (%rsi, %rcx), %xmm1
2036 movdqa (%rdi, %rcx), %xmm2
2037 movdqa %xmm2, %xmm4
2038
2039 psrldq $15, %xmm3
2040 pslldq $1, %xmm2
2041 por %xmm3, %xmm2 /* merge into one 16byte value */
2042
2043 TOLOWER (%xmm1, %xmm2)
2044
2045 pcmpeqb %xmm1, %xmm0
2046 pcmpeqb %xmm2, %xmm1
2047 psubb %xmm0, %xmm1
2048 pmovmskb %xmm1, %edx
2049 sub $0xffff, %edx
2050 jnz LABEL(exit)
2051
2052# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2053 sub $16, %r11
2054 jbe LABEL(strcmp_exitz)
2055# endif
2056
2057 add $16, %rcx
2058 movdqa %xmm4, %xmm3
2059 jmp LABEL(loop_ashr_15)
2060
2061 .p2align 4
2062LABEL(nibble_ashr_15):
 /* Bit 15 of the null mask covers the top byte of %xmm3, which has not
    been compared yet.  If it is a terminator, finish in the exittail
    path, which does not load from %rdi again.  */
2063 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
2064 pmovmskb %xmm0, %edx
2065 test $0x8000, %edx
2066 jnz LABEL(ashr_15_exittail)
2067
2068# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish there when no more than 1 byte remains to compare.  */
2069 cmpq $1, %r11
2070 jbe LABEL(ashr_15_exittail)
2071# endif
2072
 /* No terminator: clear the null mask, rewind the page counter by one
    page and resume the main loop.  */
2073 pxor %xmm0, %xmm0
2074 sub $0x1000, %r10
2075 jmp LABEL(gobble_ashr_15)
2076
2077 .p2align 4
2078LABEL(ashr_15_exittail):
 /* Load the next %rsi block and shift the saved %rdi bytes and their
    null mask down by 15 bytes.  There is no jump here: execution
    continues into the code that follows this label's instructions.  */
2079 movdqa (%rsi, %rcx), %xmm1
2080 psrldq $15, %xmm3
2081 psrldq $15, %xmm0
2082
2083 .p2align 4
2084LABEL(aftertail):
 /* Common tail compare: %xmm1 is the last %rsi block, %xmm3 the shifted
    leftover %rdi bytes and %xmm0 their shifted null mask.  After the
    inversion, each of the low 16 bits of %edx is 1 where the bytes are
    not "equal and non-null".  */
2085 TOLOWER (%xmm1, %xmm3)
2086 pcmpeqb %xmm3, %xmm1
2087 psubb %xmm0, %xmm1
2088 pmovmskb %xmm1, %edx
2089 not %edx
2090
2091 .p2align 4
2092LABEL(exit):
 /* %rax = %r9 + %rcx - 16.  */
2093 lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
2094LABEL(less32bytes):
2095 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
2096 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
 /* Swap the two pointers back when %r8d is nonzero.  NOTE(review):
    %r8d is set before this excerpt; presumably it records that the
    operands were exchanged on entry - confirm.  */
2097 test %r8d, %r8d
2098 jz LABEL(ret)
2099 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
2100
2101 .p2align 4
2102LABEL(ret):
2103LABEL(less16bytes):
 /* On the paths through exit and aftertail, %edx is either an inverted
    mask or (mask - 0xffff); in both, the lowest set bit is at the first
    byte position that differs or terminates.  */
2104 bsf %rdx, %rdx /* find and store bit index in %rdx */
2105
2106# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Return 0 when that position is at or past the remaining count.  */
2107 sub %rdx, %r11
2108 jbe LABEL(strcmp_exitz)
2109# endif
 /* Load the deciding byte of each string, zero-extended.  */
2110 movzbl (%rsi, %rdx), %ecx
2111 movzbl (%rdi, %rdx), %eax
2112
2113# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Case-insensitive variants map both bytes through the 4-byte-entry
    table at _nl_C_LC_CTYPE_tolower+128*4 before subtracting.  */
2114 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
2115 movl (%rdx,%rcx,4), %ecx
2116 movl (%rdx,%rax,4), %eax
2117# endif
2118
 /* Result = byte from %rdi minus byte from %rsi.  */
2119 sub %ecx, %eax
2120 ret
2121
2122LABEL(strcmp_exitz):
 /* Return 0.  Reached from the counter checks of the length-limited
    variants.  */
2123 xor %eax, %eax
2124 ret
2125
2126 .p2align 4
2127LABEL(Byte0):
 /* Return the difference of the first bytes at %rdi and %rsi, both
    zero-extended.  */
2128 movzbl (%rsi), %ecx
2129 movzbl (%rdi), %eax
2130
2131# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Case-insensitive variants map both bytes through the 4-byte-entry
    table at _nl_C_LC_CTYPE_tolower+128*4 first.  */
2132 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
2133 movl (%rdx,%rcx,4), %ecx
2134 movl (%rdx,%rax,4), %eax
2135# endif
2136
2137 sub %ecx, %eax
2138 ret
2139END (STRCMP)
2140
 /* Read-only table of sixteen 32-bit offsets, each the distance from the
    table itself to one ashr_N entry point (ashr_1 .. ashr_15, then
    ashr_0 last).  NOTE(review): the code that indexes this table lies
    outside this excerpt; confirm how the index is derived.  */
2141 .section .rodata,"a",@progbits
2142 .p2align 3
2143LABEL(unaligned_table):
2144 .int LABEL(ashr_1) - LABEL(unaligned_table)
2145 .int LABEL(ashr_2) - LABEL(unaligned_table)
2146 .int LABEL(ashr_3) - LABEL(unaligned_table)
2147 .int LABEL(ashr_4) - LABEL(unaligned_table)
2148 .int LABEL(ashr_5) - LABEL(unaligned_table)
2149 .int LABEL(ashr_6) - LABEL(unaligned_table)
2150 .int LABEL(ashr_7) - LABEL(unaligned_table)
2151 .int LABEL(ashr_8) - LABEL(unaligned_table)
2152 .int LABEL(ashr_9) - LABEL(unaligned_table)
2153 .int LABEL(ashr_10) - LABEL(unaligned_table)
2154 .int LABEL(ashr_11) - LABEL(unaligned_table)
2155 .int LABEL(ashr_12) - LABEL(unaligned_table)
2156 .int LABEL(ashr_13) - LABEL(unaligned_table)
2157 .int LABEL(ashr_14) - LABEL(unaligned_table)
2158 .int LABEL(ashr_15) - LABEL(unaligned_table)
2159 .int LABEL(ashr_0) - LABEL(unaligned_table)
2160#endif
2161

/* source code of glibc/sysdeps/x86_64/multiarch/strcmp-sse2.S */