/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

/* memcmp/wmemcmp is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE
      and loading from either s1 or s2 would cause a page cross.
   2. For size from 2 to 7 bytes on page cross, load as big endian
      with movbe and bswap to avoid branches.
   3. Use xmm vector compare when size >= 4 bytes for memcmp or
      size >= 8 bytes for wmemcmp.
   4. Optimistically compare up to the first 4 * VEC_SIZE one VEC at a
      time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   5. If size is 8 * VEC_SIZE or less, unroll the loop.
   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
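
/* Illustrative (not compiled) C-like outline of the size dispatch
   implemented below, assuming VEC_SIZE == 32 and byte elements:

     if (size < VEC_SIZE)
       handle sizes [0, VEC_SIZE) separately (L(less_vec));
     if (first VEC mismatches)            return the difference;
     if (size <= 2 * VEC_SIZE)            return compare of last VEC;
     if (second VEC mismatches)           return the difference;
     if (size <= 4 * VEC_SIZE)            return compare of last 2 VEC;
     if (third or fourth VEC mismatches)  return the difference;
     if (size <= 8 * VEC_SIZE)            return compare of last 4 VEC;
     otherwise run the aligned 4 * VEC_SIZE loop (L(loop_4x_vec)).  */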


# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define CHAR_SIZE 4
#  define VPCMPEQ vpcmpeqd
# else
#  define CHAR_SIZE 1
#  define VPCMPEQ vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
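
/* For example, with the bytes 0x01 and 0xff, memcmp compares them as
   unsigned char (1 < 255) and returns a negative value, while wmemcmp
   comparing the wchar_t values 0x00000001 and 0xffffffff compares them
   as signed int (1 > -1) and returns a positive value.  This is why
   the wmemcmp paths below use cmpl/setg instead of a plain
   subtraction.  */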

        .section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
        shl $2, %RDX_LP
# elif defined __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
        cmp $VEC_SIZE, %RDX_LP
        jb L(less_vec)

        /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
        vmovdqu (%rsi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1
        vpmovmskb %ymm1, %eax
        /* NB: eax must be the destination register if going to
           L(return_vec_[0,2]).  For L(return_vec_3) the destination
           register must be ecx.  */
        incl %eax
        jnz L(return_vec_0)

        cmpq $(VEC_SIZE * 2), %rdx
        jbe L(last_1x_vec)

        /* Check second VEC no matter what.  */
        vmovdqu VEC_SIZE(%rsi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        /* If all bytes in the VEC were equal, eax will be all 1s so
           incl will overflow and set the zero flag.  */
        incl %eax
        jnz L(return_vec_1)

        /* Less than 4 * VEC.  */
        cmpq $(VEC_SIZE * 4), %rdx
        jbe L(last_2x_vec)

        /* Check third and fourth VEC no matter what.  */
        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
        vpmovmskb %ymm3, %eax
        incl %eax
        jnz L(return_vec_2)
        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
        vpmovmskb %ymm4, %ecx
        incl %ecx
        jnz L(return_vec_3)

        /* Go to 4x VEC loop.  */
        cmpq $(VEC_SIZE * 8), %rdx
        ja L(more_8x_vec)

        /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
           branches.  */

        /* Load first two VEC from s2 before adjusting addresses.  */
        vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
        vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
        leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
        leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi

        /* Wait to load from s1 until the addresses have been adjusted;
           with the complex address mode the load would un-laminate from
           the compare and lose micro-fusion.  */
        VPCMPEQ (%rdi), %ymm1, %ymm1
        VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2

        vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

        /* Reduce VEC1 - VEC4.  */
        vpand %ymm1, %ymm2, %ymm5
        vpand %ymm3, %ymm4, %ymm6
        vpand %ymm5, %ymm6, %ymm7
        vpmovmskb %ymm7, %ecx
        incl %ecx
        jnz L(return_vec_0_1_2_3)
        /* NB: eax must be zero to reach here.  */
        VZEROUPPER_RETURN

        .p2align 4
L(return_vec_0):
        tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl (%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (%rsi, %rax), %ecx
        /* NB: no partial register stall here because of the xorl zero
           idiom above.  */
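        /* setg/lea idiom: %dl becomes 1 if the s1 element is greater
           (signed), 0 otherwise; 2 * %rdx - 1 then yields +1 or -1
           without a branch (the elements are known to differ here).  */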
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (%rsi, %rax), %ecx
        movzbl (%rdi, %rax), %eax
        subl %ecx, %eax
# endif
L(return_vzeroupper):
        ZERO_UPPER_VEC_REGISTERS_RETURN

        .p2align 4
L(return_vec_1):
        tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl VEC_SIZE(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl VEC_SIZE(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl VEC_SIZE(%rsi, %rax), %ecx
        movzbl VEC_SIZE(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(return_vec_2):
        tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
        .p2align 5
L(8x_return_vec_0_1_2_3):
        /* Returning from L(more_8x_vec) requires restoring rsi.  */
        addq %rdi, %rsi
L(return_vec_0_1_2_3):
        vpmovmskb %ymm1, %eax
        incl %eax
        jnz L(return_vec_0)

        vpmovmskb %ymm2, %eax
        incl %eax
        jnz L(return_vec_1)

        vpmovmskb %ymm3, %eax
        incl %eax
        jnz L(return_vec_2)
L(return_vec_3):
        tzcntl %ecx, %ecx
# ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
        xorl %edx, %edx
        cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(more_8x_vec):
        /* Set end of s1 in rdx.  */
        leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
        /* rsi stores s2 - s1.  This allows the loop to only update one
           pointer.  */
        subq %rdi, %rsi
        /* Align s1 pointer.  */
        andq $-VEC_SIZE, %rdi
        /* Adjust because first 4x vec were checked already.  */
        subq $-(VEC_SIZE * 4), %rdi
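
        /* Illustrative (not compiled) C sketch of the loop below, using
           hypothetical names: diff is s2 - s1 (kept in %rsi), s1 is the
           aligned pointer in %rdi, and s1_loop_end is the bound set in
           %rdx above:

             while (s1 < s1_loop_end)
               {
                 if (any of the 4 VEC at s1 differs from s1 + diff)
                   goto 8x_return_vec_0_1_2_3;
                 s1 += 4 * VEC_SIZE;
               }  */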
        .p2align 4
L(loop_4x_vec):
        /* rsi has s2 - s1 so get correct address by adding s1 (in
           rdi).  */
        vmovdqu (%rsi, %rdi), %ymm1
        VPCMPEQ (%rdi), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
        VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2

        vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

        vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

        vpand %ymm1, %ymm2, %ymm5
        vpand %ymm3, %ymm4, %ymm6
        vpand %ymm5, %ymm6, %ymm7
        vpmovmskb %ymm7, %ecx
        incl %ecx
        jnz L(8x_return_vec_0_1_2_3)
        subq $-(VEC_SIZE * 4), %rdi
        /* Check if s1 pointer at end.  */
        cmpq %rdx, %rdi
        jb L(loop_4x_vec)

        subq %rdx, %rdi
        /* rdi has 4 * VEC_SIZE - remaining length.  */
        cmpl $(VEC_SIZE * 3), %edi
        jae L(8x_last_1x_vec)
        /* Load regardless of branch.  */
        vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
        cmpl $(VEC_SIZE * 2), %edi
        jae L(8x_last_2x_vec)

        /* Check last 4 VEC.  */
        vmovdqu (%rsi, %rdx), %ymm1
        VPCMPEQ (%rdx), %ymm1, %ymm1

        vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
        VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2

        VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3

        vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4

        vpand %ymm1, %ymm2, %ymm5
        vpand %ymm3, %ymm4, %ymm6
        vpand %ymm5, %ymm6, %ymm7
        vpmovmskb %ymm7, %ecx
        /* Restore s1 pointer to rdi.  */
        movq %rdx, %rdi
        incl %ecx
        jnz L(8x_return_vec_0_1_2_3)
        /* NB: eax must be zero to reach here.  */
        VZEROUPPER_RETURN

        /* Only entry is from L(more_8x_vec).  */
        .p2align 4
L(8x_last_2x_vec):
        /* Check second to last VEC.  rdx stores the end pointer of s1
           and ymm3 has already been loaded with the second to last VEC
           from s2.  */
        VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
        vpmovmskb %ymm3, %eax
        incl %eax
        jnz L(8x_return_vec_2)
        /* Check last VEC.  */
        .p2align 4
L(8x_last_1x_vec):
        vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
        VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
        vpmovmskb %ymm4, %eax
        incl %eax
        jnz L(8x_return_vec_3)
        VZEROUPPER_RETURN

        .p2align 4
L(last_2x_vec):
        /* Check second to last VEC.  */
        vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
        VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
        vpmovmskb %ymm1, %eax
        incl %eax
        jnz L(return_vec_1_end)
        /* Check last VEC.  */
L(last_1x_vec):
        vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
        VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
        vpmovmskb %ymm1, %eax
        incl %eax
        jnz L(return_vec_0_end)
        VZEROUPPER_RETURN

        .p2align 4
L(8x_return_vec_2):
        subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
        tzcntl %eax, %eax
        addq %rdx, %rax
# ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 3)(%rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 3)(%rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(return_vec_1_end):
        tzcntl %eax, %eax
        addl %edx, %eax
# ifdef USE_AS_WMEMCMP
        movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(return_vec_0_end):
        tzcntl %eax, %eax
        addl %edx, %eax
# ifdef USE_AS_WMEMCMP
        movl -VEC_SIZE(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl -VEC_SIZE(%rsi, %rax), %ecx
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl -VEC_SIZE(%rsi, %rax), %ecx
        movzbl -VEC_SIZE(%rdi, %rax), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER_RETURN

        .p2align 4
L(less_vec):
        /* Check if one or less CHAR.  This is necessary for size = 0
           but is also faster for size = CHAR_SIZE.  */
        cmpl $CHAR_SIZE, %edx
        jbe L(one_or_less)

        /* Check if loading one VEC from either s1 or s2 could cause a
           page cross.  This can have false positives but is by far the
           fastest method.  */
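        /* Roughly equivalent C (illustrative only):
             if ((((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1))
                 > PAGE_SIZE - VEC_SIZE)
               goto page_cross_less_vec;
           A false positive only costs taking the slower byte/word
           path below.  */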
        movl %edi, %eax
        orl %esi, %eax
        andl $(PAGE_SIZE - 1), %eax
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        jg L(page_cross_less_vec)

        /* No page cross possible.  */
        vmovdqu (%rsi), %ymm2
        VPCMPEQ (%rdi), %ymm2, %ymm2
        vpmovmskb %ymm2, %eax
        incl %eax
        /* Result will be zero if s1 and s2 match within the first size
           bytes; bzhi clears the mask bits at and above size so bytes
           loaded beyond the buffers are ignored.  Otherwise the first
           set bit will be the first mismatch.  */
        bzhil %edx, %eax, %edx
        jnz L(return_vec_0)
        xorl %eax, %eax
        VZEROUPPER_RETURN

        .p2align 4
L(page_cross_less_vec):
        /* If USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
           bytes.  */
        cmpl $16, %edx
        jae L(between_16_31)
# ifndef USE_AS_WMEMCMP
        cmpl $8, %edx
        jae L(between_8_15)
        /* Fall through for [4, 7].  */
        cmpl $4, %edx
        jb L(between_2_3)

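        /* For sizes 4 to 7, build one 64-bit big-endian key per input
           from the first 4 and last 4 bytes (the two loads overlap for
           sizes below 8, which is harmless since both inputs overlap
           the same way); a single subtraction then orders the inputs.  */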
        movbe (%rdi), %eax
        movbe (%rsi), %ecx
        shlq $32, %rax
        shlq $32, %rcx
        movbe -4(%rdi, %rdx), %edi
        movbe -4(%rsi, %rdx), %esi
        orq %rdi, %rax
        orq %rsi, %rcx
        subq %rcx, %rax
        /* Fast path for return zero.  */
        jnz L(ret_nonzero)
        /* No ymm register was touched.  */
        ret

        .p2align 4
L(one_or_less):
        jb L(zero)
        movzbl (%rsi), %ecx
        movzbl (%rdi), %eax
        subl %ecx, %eax
        /* No ymm register was touched.  */
        ret

        .p2align 4,, 5
L(ret_nonzero):
        sbbl %eax, %eax
        orl $1, %eax
        /* No ymm register was touched.  */
        ret

        .p2align 4,, 2
L(zero):
        xorl %eax, %eax
        /* No ymm register was touched.  */
        ret

        .p2align 4
L(between_8_15):
        movbe (%rdi), %rax
        movbe (%rsi), %rcx
        subq %rcx, %rax
        jnz L(ret_nonzero)
        movbe -8(%rdi, %rdx), %rax
        movbe -8(%rsi, %rdx), %rcx
        subq %rcx, %rax
        /* Fast path for return zero.  */
        jnz L(ret_nonzero)
        /* No ymm register was touched.  */
        ret
# else
        /* If USE_AS_WMEMCMP fall through into the 8-15 byte case.  */
        vmovq (%rdi), %xmm1
        vmovq (%rsi), %xmm2
        VPCMPEQ %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(return_vec_0)
        /* Use overlapping loads to avoid branches.  */
        leaq -8(%rdi, %rdx), %rdi
        leaq -8(%rsi, %rdx), %rsi
        vmovq (%rdi), %xmm1
        vmovq (%rsi), %xmm2
        VPCMPEQ %xmm1, %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        /* Fast path for return zero.  */
        jnz L(return_vec_0)
        /* No ymm register was touched.  */
        ret
# endif

        .p2align 4,, 10
L(between_16_31):
        /* From 16 to 31 bytes.  No branch when size == 16.  */
        vmovdqu (%rsi), %xmm2
        VPCMPEQ (%rdi), %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        jnz L(return_vec_0)

        /* Use overlapping loads to avoid branches.  */

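        /* E.g. for size == 20 the check above covered bytes [0, 16)
           and the check below covers bytes [4, 20); re-checking the
           12-byte overlap is cheaper than branching on the exact
           size.  */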
        vmovdqu -16(%rsi, %rdx), %xmm2
        leaq -16(%rdi, %rdx), %rdi
        leaq -16(%rsi, %rdx), %rsi
        VPCMPEQ (%rdi), %xmm2, %xmm2
        vpmovmskb %xmm2, %eax
        subl $0xffff, %eax
        /* Fast path for return zero.  */
        jnz L(return_vec_0)
        /* No ymm register was touched.  */
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4,, 2
L(zero):
        xorl %eax, %eax
        ret

        .p2align 4
L(one_or_less):
        jb L(zero)
        movl (%rdi), %ecx
        xorl %edx, %edx
        cmpl (%rsi), %ecx
        je L(zero)
        setg %dl
        leal -1(%rdx, %rdx), %eax
        /* No ymm register was touched.  */
        ret
# else

        .p2align 4
L(between_2_3):
        /* Load as big endian to avoid branches.  */
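        /* Each input becomes (first_two_bytes_big_endian >> 1)
           | last_byte, e.g. for size == 3 that is
           (b0 << 23) | (b1 << 15) | b2.  The shift keeps the top bit
           clear so the final subtraction cannot wrap, and the sign of
           the result is the sign of the big-endian comparison.  For
           size == 2 the "last byte" is b1 again, which overlaps
           identically on both inputs.  */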
        movzwl (%rdi), %eax
        movzwl (%rsi), %ecx
        bswap %eax
        bswap %ecx
        shrl %eax
        shrl %ecx
        movzbl -1(%rdi, %rdx), %edi
        movzbl -1(%rsi, %rdx), %esi
        orl %edi, %eax
        orl %esi, %ecx
        /* Subtraction is okay because the upper bit is zero.  */
        subl %ecx, %eax
        /* No ymm register was touched.  */
        ret
# endif

END (MEMCMP)
#endif

