/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */
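
/* Roadmap of the code below: sizes up to 2 * VEC_SIZE are handled at
   the entry points and in L(less_vec); (2 * VEC_SIZE, 8 * VEC_SIZE]
   in L(more_2x_vec) and L(last_4x_vec); the REP MOVSB range in
   L(movsb), reached only from the ERMS entry points; copies above
   8 * VEC_SIZE that do not use non-temporal stores in
   L(more_8x_vec_forward) and L(more_8x_vec_backward); and the
   non-temporal cases in L(large_memcpy_2x) and L(large_memcpy_4x).  */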

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want 64-byte
   alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
#define ALIGN_MOVSB (VEC_SIZE > 16)
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO 64

#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
#define LARGE_MOV_SIZE (MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET (4)
#else
# define SMALL_SIZE_OFFSET (0)
#endif
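
/* SMALL_SIZE_OFFSET accounts for the fact that, when LARGE_MOV_SIZE,
   L(less_vec) below dispatches to L(between_4_7) with "subq $4, %rdx"
   rather than a plain compare, so %rdx is biased by -4 when the
   [0, 3]-byte tail code runs; the compare and displacements there add
   the offset back.  */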

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes loaded per page per iteration of the large_memcpy inner
   loop.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift __x86_shared_non_temporal_threshold by for
   the bound for L(large_memcpy_4x).  This is essentially used to
   indicate that the copy is far beyond the scope of L3
   (assuming no user-configured x86_non_temporal_threshold) and to
   use a more aggressively unrolled loop.  NB: before increasing
   the value also update the initialization of
   x86_non_temporal_threshold.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif

/* Avoid short distance rep movsb only with non-SSE vectors.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1; \
	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif
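
/* For example, with VEC_SIZE == 32 (so LARGE_LOAD_SIZE == 128) the
   inner loops below use these macros as

	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))

   which expand to four unaligned VMOVU loads from 0(%rsi), 32(%rsi),
   64(%rsi) and 96(%rsi) and four non-temporal VMOVNT stores to the
   same offsets from %rdi, i.e. one LARGE_LOAD_SIZE chunk of a single
   page per invocation.  */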

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VMM(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VMM(1)
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(1), -VEC_SIZE(%rdi,%rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VMM(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rdx)
L(return_vzeroupper):
# if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
# else
	ret
# endif
#endif

#if LARGE_MOV_SIZE
	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
	   ENTRY block and L(less_vec).  */
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	(%rsi), %ecx
	movl	(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, (%rdi, %rdx)
	ret
#endif

	.p2align 4
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jae	L(between_16_31)
#endif
	cmpl	$8, %edx
	jae	L(between_8_15)
#if SMALL_MOV_SIZE
	cmpl	$4, %edx
#else
	subq	$4, %rdx
#endif
	jae	L(between_4_7)
	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
	jl	L(copy_0)
	movb	(%rsi), %cl
	je	L(copy_1)
	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
	movb	%cl, (%rdi)
L(copy_0):
	ret

#if SMALL_MOV_SIZE
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi, %rdx)
	movl	%esi, (%rdi)
	ret
#endif

#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
	.p2align 4,, 8
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi, %rdx)
	/* No ymm registers have been touched.  */
	ret
#endif

#if VEC_SIZE > 32
	.p2align 4,, 10
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	VMOVU	(%rsi), %VMM_256(0)
	VMOVU	-32(%rsi, %rdx), %VMM_256(1)
	VMOVU	%VMM_256(0), (%rdi)
	VMOVU	%VMM_256(1), -32(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 10
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -8(%rdi, %rdx)
	ret
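
	/* The fixed-size cases above ([2, 3], [4, 7], [8, 15], [16, 31]
	   and [32, 63] bytes) use the same branch-free idea: read the
	   first and the last chunk (which may overlap) before writing
	   either, so overlapping src/dst needs no extra handling.  A C
	   sketch of the 8-15 byte case, for illustration only (not part
	   of the build):

		#include <stddef.h>
		#include <string.h>

		static void
		copy_8_to_15 (char *dst, const char *src, size_t n)
		{
		  unsigned long long head, tail;
		  memcpy (&head, src, 8);	    // movq (%rsi), ...
		  memcpy (&tail, src + n - 8, 8);   // movq -8(%rsi, %rdx), ...
		  memcpy (dst, &head, 8);	    // movq ..., (%rdi)
		  memcpy (dst + n - 8, &tail, 8);   // movq ..., -8(%rdi, %rdx)
		}
	   */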

	.p2align 4,, 10
L(last_4x_vec):
	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

	/* VEC(0) and VEC(1) have already been loaded.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(1), VEC_SIZE(%rdi)
	VMOVU	%VMM(2), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)
	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(1), VEC_SIZE(%rdi)
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VMM(4), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(more_8x_vec):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* If there is any overlap, go to the backward temporal copy:
	   backward REP MOVSB is slow and we don't want to use NT stores
	   when there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
	/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_memcpy_2x)
#endif
	/* To reach this point the copy cannot both overlap and have
	   dst > src.  So check for the overlapping src > dst case, in
	   which case correctness requires a forward copy.  Otherwise
	   decide between backward/forward copy depending on address
	   aliasing.  */

	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
	   but less than __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
	/* rcx contains dst - src.  Add back length (rdx).  */
	leaq	(%rcx, %rdx), %r8
	/* If r8 has a different sign than rcx then there is overlap so
	   we must do a forward copy.  */
	xorq	%rcx, %r8
	/* Isolate just the sign bit of r8.  */
	shrq	$63, %r8
	/* Get 4k difference dst - src.  */
	andl	$(PAGE_SIZE - 256), %ecx
	/* If r8 is non-zero we must do a forward copy for correctness.
	   Otherwise, if ecx is zero, dst and src 4k alias, which would
	   make the forward loop suffer from false dependencies, so do a
	   backward copy; if ecx is non-zero do a forward copy.  */
	addl	%r8d, %ecx
	jz	L(more_8x_vec_backward)
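
	/* The test above corresponds roughly to the following C sketch
	   (for illustration only; assumes 64-bit pointers):

		#include <stdint.h>
		#include <stddef.h>

		static int
		use_forward_copy (uintptr_t dst, uintptr_t src, size_t len)
		{
		  uint64_t diff = (uint64_t) dst - (uint64_t) src;  // rcx
		  // The sign bit of diff + len differs from that of diff
		  // only when src > dst and the ranges overlap.
		  uint64_t overlap = ((diff + len) ^ diff) >> 63;   // r8
		  // Page-offset bits of dst - src; zero means the forward
		  // loop would suffer 4k false aliasing.
		  uint64_t alias = diff & (4096 - 256);
		  return (overlap + alias) != 0;
		}
	   */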

	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
	   but there is overlap, or from the short distance movsb check.  */
L(more_8x_vec_forward):
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */

	/* First vec was already loaded into VEC(0).  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
	/* Save beginning of dst.  */
	movq	%rdi, %rcx
	/* Align dst to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)

	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rcx, %rsi
	/* Finish aligning dst.  */
	incq	%rdi
	/* Restore src adjusted with new value for aligned dst.  */
	addq	%rdi, %rsi
	/* Store end of buffer minus tail in rdx.  */
	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VMM(1)
	VMOVU	VEC_SIZE(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	%VMM(1), (%rdi)
	VMOVA	%VMM(2), VEC_SIZE(%rdi)
	VMOVA	%VMM(3), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VMM(4), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VMM(5), (VEC_SIZE * 3)(%rdx)
	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdx)
	VMOVU	%VMM(7), VEC_SIZE(%rdx)
	VMOVU	%VMM(8), (%rdx)
	/* Store the first VEC.  */
	VMOVU	%VMM(0), (%rcx)
	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
	 */
L(nop_backward):
	VZEROUPPER_RETURN
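
	/* Illustrative C sketch of the forward path above (not part of
	   the build; assumes dst < src or no overlap and n > 8 * VEC,
	   with VEC standing in for VEC_SIZE and a chunk memmove for
	   the 4 unaligned loads + 4 aligned stores):

		#include <stdint.h>
		#include <stddef.h>
		#include <string.h>

		#define VEC 32	// placeholder vector size

		static void
		forward_copy_sketch (char *dst, const char *src, size_t n)
		{
		  // Save the first VEC and last 4 * VEC up front so they
		  // can be stored after the loop (overlap support).
		  char head[VEC], tail[4 * VEC];
		  memcpy (head, src, VEC);
		  memcpy (tail, src + n - 4 * VEC, 4 * VEC);
		  // Round dst up to the next multiple of VEC, mirroring
		  // "orq $(VEC_SIZE - 1), %rdi; incq %rdi".
		  char *adst = (char *) (((uintptr_t) dst | (VEC - 1)) + 1);
		  const char *asrc = src + (adst - dst);
		  char *end = dst + n - 4 * VEC;
		  while (adst < end)
		    {
		      memmove (adst, asrc, 4 * VEC);
		      adst += 4 * VEC;
		      asrc += 4 * VEC;
		    }
		  memcpy (dst + n - 4 * VEC, tail, 4 * VEC);
		  memcpy (dst, head, VEC);
		}
	   */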

	.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
	/* rcx contains dst - src.  Test for dst == src to skip all of
	   memmove.  */
	testq	%rcx, %rcx
	jz	L(nop_backward)
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */

	/* First vec was also loaded into VEC(0).  */
	VMOVU	VEC_SIZE(%rsi), %VMM(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(6)
	/* Beginning of region for 4x backward copy stored in rcx.  */
	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(8)
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Align dst.  */
	andq	$-(VEC_SIZE), %rcx
	/* Restore src.  */
	addq	%rcx, %rsi

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(4)
	addq	$(VEC_SIZE * -4), %rsi
	VMOVA	%VMM(1), (VEC_SIZE * 3)(%rcx)
	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VMM(3), (VEC_SIZE * 1)(%rcx)
	VMOVA	%VMM(4), (VEC_SIZE * 0)(%rcx)
	addq	$(VEC_SIZE * -4), %rcx
	cmpq	%rcx, %rdi
	jb	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(5), VEC_SIZE(%rdi)
	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VMM(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VMM(8), -VEC_SIZE(%rdx, %rdi)
	VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
	/* L(skip_short_movsb_check) is only used with ERMS.  Not for
	   FSRM.  */
	.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
	testl	$(PAGE_SIZE - 512), %ecx
	jnz	L(movsb_align_dst)
	/* Fall through.  dst and src 4k alias.  It's better to align
	   src here because the bottleneck will be loads due to the
	   false dependency on dst.  */

	/* rcx already has dst - src.  */
	movq	%rcx, %r9
	/* Add src to len.  Subtract back after src aligned.  -1 because
	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
	leaq	-1(%rsi, %rdx), %rcx
	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
	orq	$(MOVSB_ALIGN_TO - 1), %rsi
	/* Restore dst and len adjusted with new values for aligned src.
	 */
	leaq	1(%rsi, %r9), %rdi
	subq	%rsi, %rcx
	/* Finish aligning src.  */
	incq	%rsi

	rep	movsb

	VMOVU	%VMM(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VMM(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 12
L(movsb):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* If there is any overlap, go to the backward temporal copy:
	   backward REP MOVSB is slow and we don't want to use NT stores
	   when there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
	/* Save dest for storing aligning VECs later.  */
	movq	%rdi, %r8
# endif
	/* If above __x86_rep_movsb_stop_threshold, this is most likely a
	   candidate for NT moves as well.  */
	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
	jae	L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
	/* Only avoid short movsb if CPU has FSRM.  */
#  if X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB < 256
	testb	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
#  else
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
#  endif
	jz	L(skip_short_movsb_check)
#  if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */

	/* ecx contains dst - src.  The early check for the backward
	   copy conditions means the only slow-movsb case left, src =
	   dst + [0, 63], has ecx in [-63, 0].  Use an unsigned
	   comparison with -64 to check for that case.  */
	cmpl	$-64, %ecx
	ja	L(more_8x_vec_forward)
#  endif
# endif
# if ALIGN_MOVSB
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* Fall through means the CPU has FSRM.  In that case exclusively
	   align the destination.  */
L(movsb_align_dst):
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
	addq	$(MOVSB_ALIGN_TO - 1), %rdi
	/* Add dst to len.  Subtract back after dst aligned.  */
	leaq	(%r8, %rdx), %rcx
	/* Finish aligning dst.  */
	andq	$-(MOVSB_ALIGN_TO), %rdi
	/* Restore src and len adjusted with new values for aligned dst.
	 */
	addq	%rdi, %rsi
	subq	%rdi, %rcx

	rep	movsb

	/* Store VECs loaded for aligning.  */
	VMOVU	%VMM(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VMM(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
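
	/* Illustrative C sketch of the destination-alignment scheme
	   above (not part of the build; assumes a forward copy is valid
	   and n >= ALIGN_TO; memmove stands in for rep movsb and the
	   head store for the VEC(0)/VEC(1) stores):

		#include <stdint.h>
		#include <stddef.h>
		#include <string.h>

		#define ALIGN_TO 64	// MOVSB_ALIGN_TO

		static void
		movsb_align_dst_sketch (char *dst, const char *src, size_t n)
		{
		  // Copy the first ALIGN_TO bytes via registers at the
		  // end so rep movsb can start on an aligned destination.
		  char head[ALIGN_TO];
		  memcpy (head, src, ALIGN_TO);
		  char *adst = (char *) (((uintptr_t) dst + ALIGN_TO - 1)
					 & ~(uintptr_t) (ALIGN_TO - 1));
		  const char *asrc = src + (adst - dst);
		  size_t rem = (size_t) (dst + n - adst);
		  memmove (adst, asrc, rem);	 // rep movsb
		  memcpy (dst, head, ALIGN_TO);	 // store VEC(0)/VEC(1)
		}
	   */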
# else	/* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
	mov	%RDX_LP, %RCX_LP
	rep	movsb
	ret
# endif
#endif

	.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
	/* Entry from L(large_memcpy_2x) has a redundant load of
	   __x86_shared_non_temporal_threshold(%rip).  L(large_memcpy_2x)
	   is only used for the non-ERMS memmove which is generally less
	   common.  */
L(large_memcpy_2x):
	mov	__x86_shared_non_temporal_threshold(%rip), %R11_LP
	cmp	%R11_LP, %RDX_LP
	jb	L(more_8x_vec_check)
	/* To reach this point it is impossible to both have dst > src
	   and overlap.  Remaining to check is src > dst with overlap.
	   rcx already contains dst - src.  Negate rcx to get src - dst.
	   If length > rcx then there is overlap and a forward copy is
	   best.  */
	negq	%rcx
	cmpq	%rcx, %rdx
	ja	L(more_8x_vec_forward)

	/* Cache align destination.  First store the first 64 bytes then
	   adjust alignments.  */

	/* First vec was also loaded into VEC(0).  */
# if VEC_SIZE < 64
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
#  if VEC_SIZE < 32
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
#  endif
# endif
	VMOVU	%VMM(0), (%rdi)
# if VEC_SIZE < 64
	VMOVU	%VMM(1), VEC_SIZE(%rdi)
#  if VEC_SIZE < 32
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
#  endif
# endif

	/* Adjust source, destination, and size.  */
	movq	%rdi, %r8
	andq	$63, %r8
	/* Get the negative of offset for alignment.  */
	subq	$64, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx

	/* Test if source and destination addresses will alias.  If they
	   do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */

	/* ecx contains -(dst - src).  notl %ecx gives dst - src - 1,
	   which works for testing aliasing.  */
	notl	%ecx
	movq	%rdx, %r10
	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz	L(large_memcpy_4x)

	/* r11 has __x86_shared_non_temporal_threshold.  Shift it left
	   by LOG_4X_MEMCPY_THRESH to get the L(large_memcpy_4x)
	   threshold.  */
	shlq	$LOG_4X_MEMCPY_THRESH, %r11
	cmp	%r11, %rdx
	jae	L(large_memcpy_4x)

	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 2 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 1), %r10
	/* Copy 4x VEC at a time from 2 pages.  */
	.p2align 4
L(loop_large_memcpy_2x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_2x_inner)
	addq	$PAGE_SIZE, %rdi
	addq	$PAGE_SIZE, %rsi
	decq	%r10
	jne	L(loop_large_memcpy_2x_outer)
	sfence

	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_2x_end)

	/* Handle the last 2 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_2x_tail):
	/* Copy 4 * VEC at a time forward with regular (not non-temporal)
	   stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VMM(0)
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VMM(0), (%rdi)
	VMOVA	%VMM(1), VEC_SIZE(%rdi)
	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)

	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
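
	/* Illustrative C sketch of the 2-page interleaved main loop
	   above (not part of the build; memcpy stands in for the
	   unaligned loads and non-temporal stores, and pairs is the
	   outer-loop count, length / (2 * PAGE), held in r10):

		#include <stddef.h>
		#include <string.h>

		#define PAGE 4096	// PAGE_SIZE
		#define CHUNK 128	// LARGE_LOAD_SIZE placeholder

		static void
		large_copy_2x_sketch (char *dst, const char *src, size_t pairs)
		{
		  // Each outer iteration copies 2 pages, alternating one
		  // CHUNK from each page per inner iteration so the
		  // non-temporal stores stream to two pages at once.
		  for (size_t o = 0; o < pairs; o++)
		    {
		      for (size_t i = 0; i < PAGE / CHUNK; i++)
			{
			  memcpy (dst, src, CHUNK);
			  memcpy (dst + PAGE, src + PAGE, CHUNK);
			  dst += CHUNK;
			  src += CHUNK;
			}
		      // Skip the second page, already copied interleaved.
		      dst += PAGE;
		      src += PAGE;
		    }
		}
	   */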

	.p2align 4
L(large_memcpy_4x):
	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 4 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 2), %r10
	/* Copy 4x VEC at a time from 4 pages.  */
	.p2align 4
L(loop_large_memcpy_4x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
	/* Only one prefetch set per page as doing 4 pages gives more
	   time for the prefetcher to keep up.  */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_4x_inner)
	addq	$(PAGE_SIZE * 3), %rdi
	addq	$(PAGE_SIZE * 3), %rsi
	decq	%r10
	jne	L(loop_large_memcpy_4x_outer)
	sfence
	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_4x_end)

	/* Handle the last 4 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_4x_tail):
	/* Copy 4 * VEC at a time forward with regular (not non-temporal)
	   stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VMM(0)
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VMM(0), (%rdi)
	VMOVA	%VMM(1), VEC_SIZE(%rdi)
	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)

	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))

