/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if the destination's page alignment minus the
      source's page alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */
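
/* Editorial note: a rough C-style sketch of the dispatch described
   above, added only to make the control flow easier to follow.  The
   helper names are hypothetical, and the real code orders some checks
   differently (the < VEC_SIZE and <= 2 * VEC_SIZE cases are handled
   first); this sketch is not part of the build.

     void *memmove_sketch (void *dst, const void *src, size_t n)
     {
       if (n <= 2 * VEC_SIZE)
         copy_head_tail_overlapping (dst, src, n);       // points 1-2
       else if (n <= 8 * VEC_SIZE)
         copy_first4_last4_vecs (dst, src, n);           // point 3
       else if (have_erms && n >= __x86_rep_movsb_threshold
                && n < __x86_rep_movsb_stop_threshold)
         rep_movsb_copy (dst, src, n);                   // point 6
       else if (n >= __x86_shared_non_temporal_threshold
                && !ranges_overlap (dst, src, n))
         nt_copy_2_or_4_pages_at_a_time (dst, src, n);   // points 7-9
       else if ((char *) dst > (const char *) src)
         backward_4x_vec_loop (dst, src, n);             // point 4
       else
         forward_4x_vec_loop (dst, src, n);              // point 5
       return dst;
     }  */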

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0 xmm0
#endif

#ifndef YMM0
# define YMM0 ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want 64-byte
   alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
#define ALIGN_MOVSB (VEC_SIZE > 16)
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO 64

#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
#define LARGE_MOV_SIZE (MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET (4)
#else
# define SMALL_SIZE_OFFSET (0)
#endif
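
/* Editorial note: SMALL_SIZE_OFFSET compensates for the "subq $4, %rdx"
   that the LARGE_MOV_SIZE variant performs in L(less_vec) before
   dispatching to L(between_4_7); the 0-3 byte tail code in L(less_vec)
   then adds the 4 back via its displacement and comparison constants,
   so both MOV_SIZE variants can share the same tail code.  */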

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes loaded from each page per iteration of the large_memcpy inner
   loop.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift __x86_shared_non_temporal_threshold by to get the
   bound for memcpy_large_4x.  This is essentially used to indicate
   that the copy is far beyond the scope of L3 (assuming no
   user-configured x86_non_temporal_threshold) and that a more
   aggressively unrolled loop should be used.  NB: before increasing
   the value also update the initialization of
   x86_non_temporal_threshold.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif
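
/* Editorial note: with the default LOG_4X_MEMCPY_THRESH of 4, the
   L(large_memcpy_4x) path is taken once the copy size reaches
   (__x86_shared_non_temporal_threshold << 4), i.e. 16 times the
   non-temporal threshold, matching points 8) and 9) in the header
   comment (or earlier, if source and destination page alias).  */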

/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVU (offset)base, vec0; \
	VMOVU ((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVNT vec0, (offset)base; \
	VMOVNT vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVU (offset)base, vec0; \
	VMOVU ((offset) + VEC_SIZE)base, vec1; \
	VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
	VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVNT vec0, (offset)base; \
	VMOVNT vec1, ((offset) + VEC_SIZE)base; \
	VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
	VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif
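
/* Editorial note: an illustrative expansion, assuming VEC_SIZE == 32
   (so LARGE_LOAD_SIZE == 128):

     LOAD_ONE_SET ((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))

   becomes four unaligned 32-byte VMOVU loads from 0(%rsi), 32(%rsi),
   64(%rsi) and 96(%rsi), and the matching STORE_ONE_SET becomes four
   VMOVNT non-temporal stores to the same offsets from the destination
   base.  */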

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size ==
	   VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size ==
	   VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
# if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
# else
	ret
# endif
#endif

#if LARGE_MOV_SIZE
	/* If LARGE_MOV_SIZE, this fits in the alignment padding between
	   the ENTRY block and L(less_vec).  */
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	(%rsi), %ecx
	movl	(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, (%rdi, %rdx)
	ret
#endif

	.p2align 4
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jae	L(between_16_31)
#endif
	cmpl	$8, %edx
	jae	L(between_8_15)
#if SMALL_MOV_SIZE
	cmpl	$4, %edx
#else
	subq	$4, %rdx
#endif
	jae	L(between_4_7)
	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
	jl	L(copy_0)
	movb	(%rsi), %cl
	je	L(copy_1)
	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
	movb	%cl, (%rdi)
L(copy_0):
	ret

#if SMALL_MOV_SIZE
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi, %rdx)
	movl	%esi, (%rdi)
	ret
#endif
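
	/* Editorial note: both L(between_4_7) variants rely on the
	   overlapping load/store trick from point 1) of the header
	   comment.  For example, a 6-byte copy loads source bytes
	   [0, 3] and [2, 5] and stores them to the same offsets of the
	   destination, so every byte is written with the correct value
	   and no branch on the exact size is needed.  */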

#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
	.p2align 4,, 8
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi, %rdx)
	/* No ymm registers have been touched.  */
	ret
#endif

#if VEC_SIZE > 32
	.p2align 4,, 10
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	VMOVU	(%rsi), %YMM0
	VMOVU	-32(%rsi, %rdx), %YMM1
	VMOVU	%YMM0, (%rdi)
	VMOVU	%YMM1, -32(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 10
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -8(%rdi, %rdx)
	ret

	.p2align 4,, 10
L(last_4x_vec):
	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

	/* VEC(0) and VEC(1) have already been loaded.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)
	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(more_8x_vec):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* Go to the backward temporal copy whenever there is overlap, as
	   backward REP MOVSB is slow and we don't want to use NT stores
	   if there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
	/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_memcpy_2x)
#endif
	/* To reach this point there cannot be overlap with dst > src.
	   So check for overlap with src > dst, in which case correctness
	   requires a forward copy.  Otherwise decide between
	   backward/forward copy depending on address aliasing.  */

	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
	   but less than __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
	/* rcx contains dst - src.  Add back length (rdx).  */
	leaq	(%rcx, %rdx), %r8
	/* If r8 has a different sign than rcx then there is overlap so
	   we must do a forward copy.  */
	xorq	%rcx, %r8
	/* Isolate just the sign bit of r8.  */
	shrq	$63, %r8
	/* Get the 4k page-offset difference of dst - src.  */
	andl	$(PAGE_SIZE - 256), %ecx
	/* If r8 is non-zero we must copy forward for correctness.
	   Otherwise, if ecx is zero, dst and src share nearly the same
	   4k page offset, so the forward loop's loads would falsely
	   alias its stores; do a backward copy in that case.  The sum
	   below is zero only when both backward-copy conditions hold.  */
	addl	%r8d, %ecx
	jz	L(more_8x_vec_backward)
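
	/* Editorial note: two worked examples of the check above,
	   assuming a copy large enough to reach this path.
	   1) dst = src - 0x800, len = 0x1000: the regions overlap with
	      src > dst, so rcx (-0x800) and rcx + rdx (0x800) differ in
	      sign, r8 becomes 1 and the forward copy is forced.
	   2) dst = src - 0x4000, len = 0x1000: no overlap, r8 is 0, and
	      (dst - src) & 0xf00 is also 0 because both pointers share
	      the same 4k page offset, so the backward copy is chosen to
	      avoid 4k aliasing stalls in the forward loop.  */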

	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
	   but there is overlap, or from the short-distance movsb check.  */
L(more_8x_vec_forward):
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */

	/* First vec was already loaded into VEC(0).  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	/* Save beginning of dst.  */
	movq	%rdi, %rcx
	/* Align dst to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)

	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rcx, %rsi
	/* Finish aligning dst.  */
	incq	%rdi
	/* Restore src adjusted with the new value for the aligned dst.  */
	addq	%rdi, %rsi
	/* Store end of buffer minus tail in rdx.  */
	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
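
	/* Editorial note: the alignment trick above, with VEC_SIZE == 32
	   as an example: if dst is 0x100405, "orq $31, %rdi" gives
	   0x10041f and "incq %rdi" gives 0x100420, i.e. dst rounded up
	   to the next 32-byte boundary (the first VEC_SIZE bytes at the
	   original dst are covered by VEC(0), stored after the loop).
	   rsi is carried as src - dst across the adjustment, so it ends
	   up advanced by the same amount as dst.  */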

	/* Don't use multi-byte nops to align.  */
	.p2align 4,, 11
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(1)
	VMOVU	VEC_SIZE(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	%VEC(1), (%rdi)
	VMOVA	%VEC(2), VEC_SIZE(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
	VMOVU	%VEC(7), VEC_SIZE(%rdx)
	VMOVU	%VEC(8), (%rdx)
	/* Store the first VEC.  */
	VMOVU	%VEC(0), (%rcx)
	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.  */
L(nop_backward):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
	/* rcx contains dst - src.  Test for dst == src to skip all of
	   memmove.  */
	testq	%rcx, %rcx
	jz	L(nop_backward)
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */

	/* First vec was already loaded into VEC(0).  */
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	/* Beginning of region for 4x backward copy stored in rcx.  */
	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Align dst.  */
	andq	$-(VEC_SIZE), %rcx
	/* Restore src.  */
	addq	%rcx, %rsi

	/* Don't use multi-byte nops to align.  */
	.p2align 4,, 11
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
	addq	$(VEC_SIZE * -4), %rsi
	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
	addq	$(VEC_SIZE * -4), %rcx
	cmpq	%rcx, %rdi
	jb	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
	VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
	/* L(skip_short_movsb_check) is only used with ERMS.  Not for
	   FSRM.  */
	.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
# if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
# endif
# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
# error Unsupported MOVSB_ALIGN_TO
# endif
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
	testl	$(PAGE_SIZE - 512), %ecx
	jnz	L(movsb_align_dst)
	/* Fall through.  dst and src 4k alias.  It's better to align src
	   here because the bottleneck will be loads due to the false
	   dependency on dst.  */

	/* rcx already has dst - src.  */
	movq	%rcx, %r9
	/* Add src to len.  Subtract back after src aligned.  -1 because
	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
	leaq	-1(%rsi, %rdx), %rcx
	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
	orq	$(MOVSB_ALIGN_TO - 1), %rsi
	/* Restore dst and len adjusted with the new value for the
	   aligned src.  */
	leaq	1(%rsi, %r9), %rdi
	subq	%rsi, %rcx
	/* Finish aligning src.  */
	incq	%rsi

	rep	movsb

	VMOVU	%VEC(0), (%r8)
# if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
# endif
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 12
L(movsb):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* Go to the backward temporal copy whenever there is overlap, as
	   backward REP MOVSB is slow and we don't want to use NT stores
	   if there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
	/* Save dest for storing aligning VECs later.  */
	movq	%rdi, %r8
# endif
	/* If above __x86_rep_movsb_stop_threshold, this is most likely a
	   candidate for NT moves as well.  */
	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
	jae	L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
	/* Only avoid short movsb if the CPU has FSRM.  */
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	L(skip_short_movsb_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */

	/* ecx contains dst - src.  The early check for the backward-copy
	   conditions means the only remaining slow-movsb case,
	   src = dst + [0, 63], has ecx in [-63, 0].  Use an unsigned
	   comparison with -64 to check for that case.  */
	cmpl	$-64, %ecx
	ja	L(more_8x_vec_forward)
# endif
# endif
# if ALIGN_MOVSB
# if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
# endif
# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
# error Unsupported MOVSB_ALIGN_TO
# endif
	/* Falling through means the CPU has FSRM.  In that case
	   exclusively align the destination.  */
L(movsb_align_dst):
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
	addq	$(MOVSB_ALIGN_TO - 1), %rdi
	/* Add dst to len.  Subtract back after dst aligned.  */
	leaq	(%r8, %rdx), %rcx
	/* Finish aligning dst.  */
	andq	$-(MOVSB_ALIGN_TO), %rdi
	/* Restore src and len adjusted with the new value for the
	   aligned dst.  */
	addq	%rdi, %rsi
	subq	%rdi, %rcx

	rep	movsb

	/* Store VECs loaded for aligning.  */
	VMOVU	%VEC(0), (%r8)
# if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
# endif
	VZEROUPPER_RETURN
# else	/* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
	mov	%RDX_LP, %RCX_LP
	rep	movsb
	ret
# endif
#endif

	.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
	/* Entry via L(large_memcpy_2x) has a redundant load of
	   __x86_shared_non_temporal_threshold(%rip).  L(large_memcpy_2x)
	   is only used for the non-ERMS memmove, which is generally less
	   common.  */
L(large_memcpy_2x):
	mov	__x86_shared_non_temporal_threshold(%rip), %R11_LP
	cmp	%R11_LP, %RDX_LP
	jb	L(more_8x_vec_check)
	/* To reach this point it is impossible to have both dst > src
	   and overlap.  What remains to check is src > dst with overlap.
	   rcx already contains dst - src; negate rcx to get src - dst.
	   If length > rcx then there is overlap and forward copy is
	   best.  */
	negq	%rcx
	cmpq	%rcx, %rdx
	ja	L(more_8x_vec_forward)

	/* Cache-align the destination.  First store the first 64 bytes,
	   then adjust alignment.  */

	/* First vec was already loaded into VEC(0).  */
# if VEC_SIZE < 64
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
# if VEC_SIZE < 32
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
# endif
# endif
	VMOVU	%VEC(0), (%rdi)
# if VEC_SIZE < 64
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
# if VEC_SIZE < 32
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
# endif
# endif

	/* Adjust source, destination, and size.  */
	movq	%rdi, %r8
	andq	$63, %r8
	/* Get the negative of the offset for alignment.  */
	subq	$64, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination, which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx

	/* Test if source and destination addresses will alias.  If they
	   do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */

	/* ecx contains -(dst - src).  NOT of ecx gives dst - src - 1,
	   which works for testing aliasing.  */
	notl	%ecx
	movq	%rdx, %r10
	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz	L(large_memcpy_4x)
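
	/* Editorial note: with VEC_SIZE == 32 the mask above is 0xf00,
	   so after the NOT, ecx = dst - src - 1 and the jz is taken when
	   (dst - src) mod 4096 is in [1, 256], i.e. the destination's
	   page offset is 1 to 8 * VEC_SIZE bytes above the source's.
	   For example, dst - src = 0x20100 gives ecx = 0x200ff and
	   0x200ff & 0xf00 == 0, so the 4-page variant is used, while
	   dst - src = 0x20500 gives 0x204ff & 0xf00 == 0x400, so the
	   2-page variant is tried first.  */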

	/* r11 has __x86_shared_non_temporal_threshold.  Shift it left
	   by LOG_4X_MEMCPY_THRESH to get the L(large_memcpy_4x)
	   threshold.  */
	shlq	$LOG_4X_MEMCPY_THRESH, %r11
	cmp	%r11, %rdx
	jae	L(large_memcpy_4x)

	/* edx will store the remainder size for copying the tail.  */
	andl	$(PAGE_SIZE * 2 - 1), %edx
	/* r10 stores the outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 1), %r10
	/* Copy 4x VEC at a time from 2 pages.  */
	.p2align 4
L(loop_large_memcpy_2x_outer):
	/* ecx stores the inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_2x_inner)
	addq	$PAGE_SIZE, %rdi
	addq	$PAGE_SIZE, %rsi
	decq	%r10
	jne	L(loop_large_memcpy_2x_outer)
	sfence

	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_2x_end)

	/* Handle the remaining bytes (less than 2 * PAGE_SIZE).  */
L(loop_large_memcpy_2x_tail):
	/* Copy 4 * VEC at a time forward with regular (temporal) aligned
	   stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
L(large_memcpy_4x):
	/* edx will store the remainder size for copying the tail.  */
	andl	$(PAGE_SIZE * 4 - 1), %edx
	/* r10 stores the outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 2), %r10
	/* Copy 4x VEC at a time from 4 pages.  */
	.p2align 4
L(loop_large_memcpy_4x_outer):
	/* ecx stores the inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
	/* Only one prefetch set per page as doing 4 pages gives the
	   prefetcher more time to keep up.  */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_4x_inner)
	addq	$(PAGE_SIZE * 3), %rdi
	addq	$(PAGE_SIZE * 3), %rsi
	decq	%r10
	jne	L(loop_large_memcpy_4x_outer)
	sfence
	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_4x_end)

	/* Handle the remaining bytes (less than 4 * PAGE_SIZE).  */
L(loop_large_memcpy_4x_tail):
	/* Copy 4 * VEC at a time forward with regular (temporal) aligned
	   stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
# ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
# endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))
