/* memcmp with SSE2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

#include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ pcmpeqd
#  define CHAR_SIZE 4
#  define SIZE_OFFSET (0)
# else
#  define PCMPEQ pcmpeqb
#  define CHAR_SIZE 1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET (0)
#  define CHECK_CMP(x, y) subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET (CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y) cmpl x, y
# endif

# define VEC_SIZE 16
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP memcmp
# endif
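
/* Rough outline of the strategy below (a sketch, not a contract):
     n <= CHAR_PER_VEC:      scalar 1/2/4/8-byte loads.
     n <= 2 * CHAR_PER_VEC:  one vector from the start, one from the end.
     n <= 8 * CHAR_PER_VEC:  2x-vector blocks, then the last two vectors
                             taken from the end.
     otherwise:              align rdi down and loop 4x vectors at a
                             time, finishing via L(last_2x_vec).
   For plain memcmp, SIZE_OFFSET (CHAR_PER_VEC * 2) is subtracted from
   rdx once n > CHAR_PER_VEC; later length checks and end-of-buffer
   addressing are written against the biased value so their immediates
   and displacements stay small (e.g. the 8x bound becomes
   CHAR_PER_VEC * 8 - SIZE_OFFSET = 96, which still fits in a
   sign-extended 8-bit immediate).  For memcmpeq and wmemcmp
   SIZE_OFFSET is 0 and rdx is left unbiased.  */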

        .text
ENTRY(MEMCMP)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
        /* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
           in ecx for code size.  This is preferable to using `incw` as
           it avoids partial register stalls on older hardware (pre
           SnB).  */
        movl $0xffff, %ecx
# endif
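        /* Throughout: PCMPEQ sets a byte to 0xff where the two vectors
           match, pmovmskb collects those bytes' sign bits into a 16-bit
           mask, and `subl %ecx, %eax` (ecx = 0xffff) is zero iff all
           bytes matched.  Since x and -x share their lowest set bit,
           `bsf` of the difference still yields the index of the first
           mismatching byte.  Roughly, in C with SSE2 intrinsics (a
           sketch, not the exact code generated here):
               int mask = _mm_movemask_epi8 (
                   _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) s1),
                                   _mm_loadu_si128 ((const __m128i *) s2)));
               if (mask != 0xffff)
                 first_diff = __builtin_ctz (mask ^ 0xffff);  */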
        cmpq $CHAR_PER_VEC, %rdx
        ja L(more_1x_vec)

# ifdef USE_AS_WMEMCMP
        /* Saves a byte of code keeping the fall through path n = [2, 4]
           in the initial cache line.  */
        decl %edx
        jle L(cmp_0_1)

        movq (%rsi), %xmm0
        movq (%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_0)

        movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
        movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_end_0_adj)
# else
        cmpl $8, %edx
        ja L(cmp_9_16)

        cmpl $4, %edx
        jb L(cmp_0_3)

#  ifdef USE_AS_MEMCMPEQ
        movl (%rsi), %eax
        subl (%rdi), %eax

        movl -4(%rsi, %rdx), %esi
        subl -4(%rdi, %rdx), %esi

        orl %esi, %eax
        ret
#  else
        /* Combine comparisons for lo and hi 4-byte comparisons.  */
        movl -4(%rsi, %rdx), %ecx
        movl -4(%rdi, %rdx), %eax
        shlq $32, %rcx
        shlq $32, %rax
        movl (%rsi), %esi
        movl (%rdi), %edi
        orq %rsi, %rcx
        orq %rdi, %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)
        xorl %eax, %eax
        ret
#  endif

        .p2align 4,, 10
L(cmp_9_16):
#  ifdef USE_AS_MEMCMPEQ
        movq (%rsi), %rax
        subq (%rdi), %rax

        movq -8(%rsi, %rdx), %rcx
        subq -8(%rdi, %rdx), %rcx
        orq %rcx, %rax
        /* Convert 64 bit -> 32 bit boolean (we should have made the ABI
           return long).  */
        setnz %cl
        movzbl %cl, %eax
#  else
        movq (%rsi), %rcx
        movq (%rdi), %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)

        movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
        movq -8(%rdi, %rdx, CHAR_SIZE), %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)
        xorl %eax, %eax
#  endif
# endif
        ret

        .p2align 4,, 8
L(cmp_0_1):
        /* Flag set by earlier comparison against 1.  */
        jne L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
        movl (%rdi), %ecx
        xorl %edx, %edx
        cmpl (%rsi), %ecx
        je L(cmp_0_0)
        setg %dl
        leal -1(%rdx, %rdx), %eax
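        /* Return idiom used for all the wide-character paths below:
           with edx pre-zeroed, `setg %dl` leaves edx = 1 when the
           wchar_t from s1 compares (signed) greater and 0 otherwise,
           and `leal -1(%rdx, %rdx), %eax` computes 2 * edx - 1, i.e.
           +1 or -1, without a branch.  */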
# else
        movzbl (%rdi), %eax
        movzbl (%rsi), %ecx
        subl %ecx, %eax
# endif
        ret

        /* Fits in aligning bytes.  */
L(cmp_0_0):
        xorl %eax, %eax
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4
L(ret_nonzero_vec_start_0):
        bsfl %eax, %eax
        movl (%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
        ret
# else

#  ifndef USE_AS_MEMCMPEQ
        .p2align 4,, 14
L(ret_nonzero):
        /* Need to bswap to get proper return without branch.  */
        bswapq %rcx
        bswapq %rax
        subq %rcx, %rax
        sbbl %eax, %eax
        orl $1, %eax
        ret
#  endif
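        /* Why bswap works above: after the byte swap the byte at the
           lowest address is the most significant, so an unsigned
           compare of the swapped words orders the buffers the way
           memcmp must.  `subq; sbbl %eax, %eax' leaves eax = -1 when
           the first buffer is below and 0 otherwise, and `orl $1'
           turns that into -1 / +1.  Roughly (a C sketch, assuming
           bswap64 reverses the byte order):
               return bswap64 (a) < bswap64 (b) ? -1 : 1;  */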

        .p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
        /* No reason to add to dependency chain on rdx.  Saving the
           bytes here doesn't change the number of fetch blocks.  */
        cmpl $1, %edx
        jbe L(cmp_0_1)
#  else
        /* We need the smaller code size here to avoid taking an extra
           fetch block.  */
        decl %edx
        jle L(cmp_0_1)
#  endif
        movzwl (%rsi), %ecx
        movzwl (%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
        subl %ecx, %eax

        movzbl -1(%rsi, %rdx), %esi
        movzbl -1(%rdi, %rdx), %edi
        subl %edi, %esi
        orl %esi, %eax
#  else
        bswapl %ecx
        bswapl %eax

        /* Implicit right shift by one.  We just need to displace the
           sign bits.  */
        shrl %ecx
        shrl %eax

        /* Eat a partial register stall here.  Saves code stopping
           L(cmp_0_3) from bleeding into the next fetch block and
           saves an ALU.  */
        movb (%rsi, %rdx), %cl
        movzbl (%rdi, %rdx), %edi
        orl %edi, %eax
        subl %ecx, %eax
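        /* What the block above computes, roughly (a sketch; n is the
           original length in [2, 3] and rdx = n - 1, with load16 and
           bswap32 as hypothetical helpers):
               key (p) = (bswap32 ((uint32_t) load16 (p)) >> 1)
                         | p[n - 1];
           i.e. a 31-bit key that weights byte 0 most, then byte 1,
           then the last byte, so `key (s1) - key (s2)' has the sign
           memcmp requires and cannot overflow.  */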
#  endif
        ret
# endif

        .p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
        /* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
           in ecx for code size.  This is preferable to using `incw` as
           it avoids partial register stalls on older hardware (pre
           SnB).  */
        movl $0xffff, %ecx
# endif
        movups (%rsi), %xmm0
        movups (%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
        cmpq $(CHAR_PER_VEC * 2), %rdx
# else
        /* Offset rdx.  Saves just enough code size to keep the
           L(last_2x_vec) case and the non-zero return in a single
           cache line.  */
        subq $(CHAR_PER_VEC * 2), %rdx
# endif
        ja L(more_2x_vec)

        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
# ifndef USE_AS_MEMCMPEQ
        /* Don't use `incw ax` as machines this code runs on are liable
           to have a partial register stall.  */
        jnz L(ret_nonzero_vec_end_0)
# else
        /* Various return targets for memcmpeq.  Will always be hot in
           Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
        ret

# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
        .p2align 4
L(ret_nonzero_vec_end_0_adj):
        addl $3, %edx
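        /* The +3 compensates for the earlier `decl %edx` and for the
           -4 displacement used by the small-size wmemcmp loads, so the
           shared L(ret_nonzero_vec_end_0) code below addresses the
           same bytes that were just compared.  */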
#  else
        .p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        leal (%rax, %rdx, CHAR_SIZE), %eax
        movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        /* Use `addq` instead of `addl` here so that even if `rax` +
           `rdx` is negative, the sum is still usable as a 64-bit
           offset (a negative 32-bit number zero-extends to a large,
           often out-of-bounds 64-bit offset).  Note that `rax` +
           `rdx` >= 0 is an invariant when `memcmp` is used correctly,
           but if the input buffers `rsi`/`rdi` are concurrently
           modified while the function runs (a data race) it is
           possible for `rax` + `rdx` to be negative.  Given that
           there is virtually no extra cost to using `addq` instead of
           `addl`, we may as well protect the data-race case.  */
        addq %rdx, %rax
        movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
#  ifndef USE_AS_WMEMCMP
        .p2align 4,, 10
L(ret_nonzero_vec_start_0):
        bsfl %eax, %eax
        movzbl (%rsi, %rax), %ecx
        movzbl (%rdi, %rax), %eax
        subl %ecx, %eax
        ret
#  endif
# else
# endif

        .p2align 5
L(more_2x_vec):
        movups (VEC_SIZE * 1)(%rsi), %xmm0
        movups (VEC_SIZE * 1)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_1)

        cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
        jbe L(last_2x_vec)

        cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
        ja L(more_8x_vec)

        /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
           This can harm performance if the non-zero return is in
           [65, 80] or [97, 112], but it helps performance otherwise.
           Generally the zero return is hotter.  */
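        /* The two PCMPEQ results are AND-ed together so that a single
           pmovmskb / CHECK_CMP tests all 32 bytes at once; xmm1 (the
           first vector's result) is kept live so the return path can
           still tell which of the two vectors held the mismatch.  */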
        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 2)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 3)(%rsi), %xmm2
        movups (VEC_SIZE * 3)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
        jnz L(ret_nonzero_vec_start_2_3)

        cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
        jbe L(last_2x_vec)

        movups (VEC_SIZE * 4)(%rsi), %xmm0
        movups (VEC_SIZE * 4)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 5)(%rsi), %xmm2
        movups (VEC_SIZE * 5)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
        jz L(last_2x_vec)
        ret
# else
        jnz L(ret_nonzero_vec_start_4_5)
# endif
        .p2align 4
L(last_2x_vec):
        movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
        movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        subl %ecx, %eax
# ifdef USE_AS_MEMCMPEQ
        /* Various return targets for memcmpeq.  Will always be hot in
           Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
        ret
# else
        jnz L(ret_nonzero_vec_end_1)
        ret

        .p2align 4,, 8
L(ret_nonzero_vec_end_1):
        pmovmskb %xmm1, %ecx
        /* High 16 bits of eax guaranteed to be all ones.  Rotate them
           in so we can do `or + not` with just `xor`.  After the xor
           the low 16 bits hold the first vector's mismatch bits and
           the high 16 bits locate the second vector's first mismatch,
           so a single `bsf` finds the earliest mismatching byte of
           either vector.  */
        rorl $16, %eax
        xorl %ecx, %eax
        /* Partial register stall.  */

        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        leal (%rax, %rdx, CHAR_SIZE), %eax
        movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        addl %edx, %eax
        movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4
L(ret_nonzero_vec_start_4_5):
        pmovmskb %xmm1, %edx
        sall $16, %eax
        leal 1(%rax, %rdx), %eax
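        /* eax = (combined match mask of VEC(4) and VEC(5)) << 16
                 + (match mask of VEC(4)) + 1.  The `+ 1' carries
           through any trailing ones, so the lowest set bit of eax is
           the byte index (relative to VEC(4)) of the first mismatch,
           whether it lies in VEC(4) or VEC(5).  */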
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4,, 8
L(ret_nonzero_vec_start_1):
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
# endif

        .p2align 4
L(more_8x_vec):
        subq %rdi, %rsi
        leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
        andq $(VEC_SIZE * -1), %rdi
        addq %rdi, %rsi
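        /* rsi is first rebased to the distance (s2 - s1) and then
           re-added to the aligned rdi, so it moves back by exactly the
           same amount as rdi when rdi is rounded down to a VEC_SIZE
           boundary; the loop keeps comparing corresponding bytes
           (possibly re-checking a few that were already compared).
           rdx is set to (end of s1) - 6 * VEC_SIZE, so the loop, which
           handles the 4 vectors at offsets 2..5 from rdi, can stop on
           a plain `cmpq %rdi, %rdx' once at most 4 vectors remain past
           rdi + 2 * VEC_SIZE.  */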
        .p2align 4
L(loop_4x):
        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 3)(%rsi), %xmm1

        PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
        PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1

        movups (VEC_SIZE * 4)(%rsi), %xmm2
        movups (VEC_SIZE * 5)(%rsi), %xmm3

        PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
        PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3

        pand %xmm0, %xmm1
        pand %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_loop)

        addq $(VEC_SIZE * 4), %rdi
        addq $(VEC_SIZE * 4), %rsi
        cmpq %rdi, %rdx
        ja L(loop_4x)
        /* Get remaining length in edx.  */
        subl %edi, %edx
        /* Restore offset so we can reuse L(last_2x_vec).  */
        addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
        shrl $2, %edx
# endif
        cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
        jbe L(last_2x_vec)

        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 2)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 3)(%rsi), %xmm2
        movups (VEC_SIZE * 3)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
        jz L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
        ret
# else

        .p2align 4
L(ret_nonzero_vec_start_2_3):
        pmovmskb %xmm1, %edx
        sall $16, %eax
        leal 1(%rax, %rdx), %eax
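        /* Same mask-combining trick as in L(ret_nonzero_vec_start_4_5)
           above, here for the vectors at offsets 2 and 3.  */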

        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4
L(ret_nonzero_loop):
        pmovmskb %xmm0, %ecx
        pmovmskb %xmm1, %edx
        sall $(VEC_SIZE * 1), %edx
        leal 1(%rcx, %rdx), %edx
        pmovmskb %xmm2, %ecx
        /* High 16 bits of eax guaranteed to be all ones.  Rotate them
           in so we can do `or + not` with just `xor`.  */
        rorl $16, %eax
        xorl %ecx, %eax

        salq $32, %rax
        orq %rdx, %rax
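        /* rax is now a 64-bit mask covering all four vectors of the
           loop iteration: bits [0, 31] come from the vectors at
           offsets 2 and 3 (with the `+ 1' carry trick), bits [32, 47]
           are the mismatch bits of the vector at offset 4, and bits
           [48, 63] locate the first mismatch of the vector at offset
           5, so one `bsfq' yields the byte offset of the earliest
           mismatch relative to VEC_SIZE * 2.  */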

        bsfq %rax, %rax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
# endif
END(MEMCMP)
#endif
