/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */
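
/* Roadmap of the code below: sizes up to 2 * VEC_SIZE are handled at
   the entry points and in L(less_vec); (2 * VEC_SIZE, 8 * VEC_SIZE]
   in L(more_2x_vec) and L(last_4x_vec); the REP MOVSB range in
   L(movsb), reached only from the ERMS entry points; copies above
   8 * VEC_SIZE that do not use non-temporal stores in
   L(more_8x_vec_forward) and L(more_8x_vec_backward); and the
   non-temporal cases in L(large_memcpy_2x) and L(large_memcpy_4x).  */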

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want 64-byte
   alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
#define ALIGN_MOVSB (VEC_SIZE > 16)
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO 64

#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
#define LARGE_MOV_SIZE (MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET (4)
#else
# define SMALL_SIZE_OFFSET (0)
#endif
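
/* SMALL_SIZE_OFFSET accounts for the fact that, when LARGE_MOV_SIZE,
   L(less_vec) below dispatches to L(between_4_7) with "subq $4, %rdx"
   rather than a plain compare, so %rdx is biased by -4 when the
   [0, 3]-byte tail code runs; the compare and displacements there add
   the offset back.  */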

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes loaded per page per iteration of the large_memcpy inner
   loop.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift __x86_shared_non_temporal_threshold by for
   the bound for L(large_memcpy_4x).  This is essentially used to
   indicate that the copy is far beyond the scope of L3
   (assuming no user-configured x86_non_temporal_threshold) and to
   use a more aggressively unrolled loop.  NB: before increasing
   the value also update the initialization of
   x86_non_temporal_threshold.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif

/* Avoid short distance rep movsb only with non-SSE vectors.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1; \
	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif
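
/* For example, with VEC_SIZE == 32 (so LARGE_LOAD_SIZE == 128) the
   inner loops below use these macros as

	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))

   which expand to four unaligned VMOVU loads from 0(%rsi), 32(%rsi),
   64(%rsi) and 96(%rsi) and four non-temporal VMOVNT stores to the
   same offsets from %rdi, i.e. one LARGE_LOAD_SIZE chunk of a single
   page per invocation.  */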

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VMM(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VMM(1)
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(1), -VEC_SIZE(%rdi,%rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VMM(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(1), -VEC_SIZE(%rdi, %rdx)
L(return_vzeroupper):
# if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
# else
	ret
# endif
#endif

#if LARGE_MOV_SIZE
	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
	   ENTRY block and L(less_vec).  */
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	(%rsi), %ecx
	movl	(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, (%rdi, %rdx)
	ret
#endif

	.p2align 4
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jae	L(between_16_31)
#endif
	cmpl	$8, %edx
	jae	L(between_8_15)
#if SMALL_MOV_SIZE
	cmpl	$4, %edx
#else
	subq	$4, %rdx
#endif
	jae	L(between_4_7)
	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
	jl	L(copy_0)
	movb	(%rsi), %cl
	je	L(copy_1)
	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
	movb	%cl, (%rdi)
L(copy_0):
	ret

#if SMALL_MOV_SIZE
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi, %rdx)
	movl	%esi, (%rdi)
	ret
#endif

#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
	.p2align 4,, 8
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi, %rdx)
	/* No ymm registers have been touched.  */
	ret
#endif

#if VEC_SIZE > 32
	.p2align 4,, 10
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	VMOVU	(%rsi), %VMM_256(0)
	VMOVU	-32(%rsi, %rdx), %VMM_256(1)
	VMOVU	%VMM_256(0), (%rdi)
	VMOVU	%VMM_256(1), -32(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 10
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -8(%rdi, %rdx)
	ret
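
	/* The fixed-size cases above ([2, 3], [4, 7], [8, 15], [16, 31]
	   and [32, 63] bytes) use the same branch-free idea: read the
	   first and the last chunk (which may overlap) before writing
	   either, so overlapping src/dst needs no extra handling.  A C
	   sketch of the 8-15 byte case, for illustration only (not part
	   of the build):

		#include <stddef.h>
		#include <string.h>

		static void
		copy_8_to_15 (char *dst, const char *src, size_t n)
		{
		  unsigned long long head, tail;
		  memcpy (&head, src, 8);	    // movq (%rsi), ...
		  memcpy (&tail, src + n - 8, 8);   // movq -8(%rsi, %rdx), ...
		  memcpy (dst, &head, 8);	    // movq ..., (%rdi)
		  memcpy (dst + n - 8, &tail, 8);   // movq ..., -8(%rdi, %rdx)
		}
	   */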

	.p2align 4,, 10
L(last_4x_vec):
	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

	/* VEC(0) and VEC(1) have already been loaded.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(1), VEC_SIZE(%rdi)
	VMOVU	%VMM(2), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)
	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7)
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(1), VEC_SIZE(%rdi)
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VMM(4), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(more_8x_vec):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* If there is any overlap, go to the backward temporal copy:
	   backward REP MOVSB is slow and we don't want to use NT stores
	   when there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
	/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_memcpy_2x)
#endif
	/* To reach this point the copy cannot both overlap and have
	   dst > src.  So check for the overlapping src > dst case, in
	   which case correctness requires a forward copy.  Otherwise
	   decide between backward/forward copy depending on address
	   aliasing.  */

	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
	   but less than __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
	/* rcx contains dst - src.  Add back length (rdx).  */
	leaq	(%rcx, %rdx), %r8
	/* If r8 has a different sign than rcx then there is overlap so
	   we must do a forward copy.  */
	xorq	%rcx, %r8
	/* Isolate just the sign bit of r8.  */
	shrq	$63, %r8
	/* Get 4k difference dst - src.  */
	andl	$(PAGE_SIZE - 256), %ecx
	/* If r8 is non-zero we must do a forward copy for correctness.
	   Otherwise, if ecx is zero, dst and src 4k alias, which would
	   make the forward loop suffer from false dependencies, so do a
	   backward copy; if ecx is non-zero do a forward copy.  */
	addl	%r8d, %ecx
	jz	L(more_8x_vec_backward)
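
	/* The test above corresponds roughly to the following C sketch
	   (for illustration only; assumes 64-bit pointers):

		#include <stdint.h>
		#include <stddef.h>

		static int
		use_forward_copy (uintptr_t dst, uintptr_t src, size_t len)
		{
		  uint64_t diff = (uint64_t) dst - (uint64_t) src;  // rcx
		  // The sign bit of diff + len differs from that of diff
		  // only when src > dst and the ranges overlap.
		  uint64_t overlap = ((diff + len) ^ diff) >> 63;   // r8
		  // Page-offset bits of dst - src; zero means the forward
		  // loop would suffer 4k false aliasing.
		  uint64_t alias = diff & (4096 - 256);
		  return (overlap + alias) != 0;
		}
	   */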

	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
	   but there is overlap, or from the short distance movsb check.  */
L(more_8x_vec_forward):
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */

	/* First vec was already loaded into VEC(0).  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6)
	/* Save beginning of dst.  */
	movq	%rdi, %rcx
	/* Align dst to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8)

	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rcx, %rsi
	/* Finish aligning dst.  */
	incq	%rdi
	/* Restore src adjusted with new value for aligned dst.  */
	addq	%rdi, %rsi
	/* Store end of buffer minus tail in rdx.  */
	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VMM(1)
	VMOVU	VEC_SIZE(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(3)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(4)
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	%VMM(1), (%rdi)
	VMOVA	%VMM(2), VEC_SIZE(%rdi)
	VMOVA	%VMM(3), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VMM(4), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VMM(5), (VEC_SIZE * 3)(%rdx)
	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdx)
	VMOVU	%VMM(7), VEC_SIZE(%rdx)
	VMOVU	%VMM(8), (%rdx)
	/* Store the first VEC.  */
	VMOVU	%VMM(0), (%rcx)
	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
	 */
L(nop_backward):
	VZEROUPPER_RETURN
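
	/* Illustrative C sketch of the forward path above (not part of
	   the build; assumes dst < src or no overlap and n > 8 * VEC,
	   with VEC standing in for VEC_SIZE and a chunk memmove for
	   the 4 unaligned loads + 4 aligned stores):

		#include <stdint.h>
		#include <stddef.h>
		#include <string.h>

		#define VEC 32	// placeholder vector size

		static void
		forward_copy_sketch (char *dst, const char *src, size_t n)
		{
		  // Save the first VEC and last 4 * VEC up front so they
		  // can be stored after the loop (overlap support).
		  char head[VEC], tail[4 * VEC];
		  memcpy (head, src, VEC);
		  memcpy (tail, src + n - 4 * VEC, 4 * VEC);
		  // Round dst up to the next multiple of VEC, mirroring
		  // "orq $(VEC_SIZE - 1), %rdi; incq %rdi".
		  char *adst = (char *) (((uintptr_t) dst | (VEC - 1)) + 1);
		  const char *asrc = src + (adst - dst);
		  char *end = dst + n - 4 * VEC;
		  while (adst < end)
		    {
		      memmove (adst, asrc, 4 * VEC);
		      adst += 4 * VEC;
		      asrc += 4 * VEC;
		    }
		  memcpy (dst + n - 4 * VEC, tail, 4 * VEC);
		  memcpy (dst, head, VEC);
		}
	   */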

	.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
	/* rcx contains dst - src.  Test for dst == src to skip all of
	   memmove.  */
	testq	%rcx, %rcx
	jz	L(nop_backward)
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */

	/* First vec was also loaded into VEC(0).  */
	VMOVU	VEC_SIZE(%rsi), %VMM(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(6)
	/* Beginning of region for 4x backward copy stored in rcx.  */
	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(8)
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Align dst.  */
	andq	$-(VEC_SIZE), %rcx
	/* Restore src.  */
	addq	%rcx, %rsi

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(4)
	addq	$(VEC_SIZE * -4), %rsi
	VMOVA	%VMM(1), (VEC_SIZE * 3)(%rcx)
	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VMM(3), (VEC_SIZE * 1)(%rcx)
	VMOVA	%VMM(4), (VEC_SIZE * 0)(%rcx)
	addq	$(VEC_SIZE * -4), %rcx
	cmpq	%rcx, %rdi
	jb	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VMM(0), (%rdi)
	VMOVU	%VMM(5), VEC_SIZE(%rdi)
	VMOVU	%VMM(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VMM(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VMM(8), -VEC_SIZE(%rdx, %rdi)
	VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
	/* L(skip_short_movsb_check) is only used with ERMS.  Not for
	   FSRM.  */
	.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
	testl	$(PAGE_SIZE - 512), %ecx
	jnz	L(movsb_align_dst)
	/* Fall through.  dst and src 4k alias.  It's better to align
	   src here because the bottleneck will be loads due to the
	   false dependency on dst.  */

	/* rcx already has dst - src.  */
	movq	%rcx, %r9
	/* Add src to len.  Subtract back after src aligned.  -1 because
	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
	leaq	-1(%rsi, %rdx), %rcx
	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
	orq	$(MOVSB_ALIGN_TO - 1), %rsi
	/* Restore dst and len adjusted with new values for aligned src.
	 */
	leaq	1(%rsi, %r9), %rdi
	subq	%rsi, %rcx
	/* Finish aligning src.  */
	incq	%rsi

	rep	movsb

	VMOVU	%VMM(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VMM(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 12
L(movsb):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* If there is any overlap, go to the backward temporal copy:
	   backward REP MOVSB is slow and we don't want to use NT stores
	   when there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
	/* Save dest for storing aligning VECs later.  */
	movq	%rdi, %r8
# endif
	/* If above __x86_rep_movsb_stop_threshold, this is most likely a
	   candidate for NT moves as well.  */
	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
	jae	L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
	/* Only avoid short movsb if CPU has FSRM.  */
#  if X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB < 256
	testb	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
#  else
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
#  endif
	jz	L(skip_short_movsb_check)
#  if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */

	/* ecx contains dst - src.  The early check for the backward
	   copy conditions means the only slow-movsb case left, src =
	   dst + [0, 63], has ecx in [-63, 0].  Use an unsigned
	   comparison with -64 to check for that case.  */
	cmpl	$-64, %ecx
	ja	L(more_8x_vec_forward)
#  endif
# endif
# if ALIGN_MOVSB
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* Fall through means the CPU has FSRM.  In that case exclusively
	   align the destination.  */
L(movsb_align_dst):
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
	addq	$(MOVSB_ALIGN_TO - 1), %rdi
	/* Add dst to len.  Subtract back after dst aligned.  */
	leaq	(%r8, %rdx), %rcx
	/* Finish aligning dst.  */
	andq	$-(MOVSB_ALIGN_TO), %rdi
	/* Restore src and len adjusted with new values for aligned dst.
	 */
	addq	%rdi, %rsi
	subq	%rdi, %rcx

	rep	movsb

	/* Store VECs loaded for aligning.  */
	VMOVU	%VMM(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VMM(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
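
	/* Illustrative C sketch of the destination-alignment scheme
	   above (not part of the build; assumes a forward copy is valid
	   and n >= ALIGN_TO; memmove stands in for rep movsb and the
	   head store for the VEC(0)/VEC(1) stores):

		#include <stdint.h>
		#include <stddef.h>
		#include <string.h>

		#define ALIGN_TO 64	// MOVSB_ALIGN_TO

		static void
		movsb_align_dst_sketch (char *dst, const char *src, size_t n)
		{
		  // Copy the first ALIGN_TO bytes via registers at the
		  // end so rep movsb can start on an aligned destination.
		  char head[ALIGN_TO];
		  memcpy (head, src, ALIGN_TO);
		  char *adst = (char *) (((uintptr_t) dst + ALIGN_TO - 1)
					 & ~(uintptr_t) (ALIGN_TO - 1));
		  const char *asrc = src + (adst - dst);
		  size_t rem = (size_t) (dst + n - adst);
		  memmove (adst, asrc, rem);	 // rep movsb
		  memcpy (dst, head, ALIGN_TO);	 // store VEC(0)/VEC(1)
		}
	   */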
# else	/* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
	mov	%RDX_LP, %RCX_LP
	rep	movsb
	ret
# endif
#endif

	.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
	/* Entry from L(large_memcpy_2x) has a redundant load of
	   __x86_shared_non_temporal_threshold(%rip).  L(large_memcpy_2x)
	   is only used for the non-ERMS memmove which is generally less
	   common.  */
L(large_memcpy_2x):
	mov	__x86_shared_non_temporal_threshold(%rip), %R11_LP
	cmp	%R11_LP, %RDX_LP
	jb	L(more_8x_vec_check)
	/* To reach this point it is impossible to both have dst > src
	   and overlap.  Remaining to check is src > dst with overlap.
	   rcx already contains dst - src.  Negate rcx to get src - dst.
	   If length > rcx then there is overlap and a forward copy is
	   best.  */
	negq	%rcx
	cmpq	%rcx, %rdx
	ja	L(more_8x_vec_forward)

	/* Cache align destination.  First store the first 64 bytes then
	   adjust alignments.  */

	/* First vec was also loaded into VEC(0).  */
# if VEC_SIZE < 64
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
#  if VEC_SIZE < 32
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
#  endif
# endif
	VMOVU	%VMM(0), (%rdi)
# if VEC_SIZE < 64
	VMOVU	%VMM(1), VEC_SIZE(%rdi)
#  if VEC_SIZE < 32
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
#  endif
# endif

	/* Adjust source, destination, and size.  */
	movq	%rdi, %r8
	andq	$63, %r8
	/* Get the negative of offset for alignment.  */
	subq	$64, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx

	/* Test if source and destination addresses will alias.  If they
	   do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */

	/* ecx contains -(dst - src).  notl %ecx gives dst - src - 1,
	   which works for testing aliasing.  */
	notl	%ecx
	movq	%rdx, %r10
	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz	L(large_memcpy_4x)

	/* r11 has __x86_shared_non_temporal_threshold.  Shift it left
	   by LOG_4X_MEMCPY_THRESH to get the L(large_memcpy_4x)
	   threshold.  */
	shlq	$LOG_4X_MEMCPY_THRESH, %r11
	cmp	%r11, %rdx
	jae	L(large_memcpy_4x)

	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 2 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 1), %r10
	/* Copy 4x VEC at a time from 2 pages.  */
	.p2align 4
L(loop_large_memcpy_2x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_2x_inner)
	addq	$PAGE_SIZE, %rdi
	addq	$PAGE_SIZE, %rsi
	decq	%r10
	jne	L(loop_large_memcpy_2x_outer)
	sfence

	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_2x_end)

	/* Handle the last 2 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_2x_tail):
	/* Copy 4 * VEC at a time forward with regular (not non-temporal)
	   stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VMM(0)
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VMM(0), (%rdi)
	VMOVA	%VMM(1), VEC_SIZE(%rdi)
	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)

	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
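
	/* Illustrative C sketch of the 2-page interleaved main loop
	   above (not part of the build; memcpy stands in for the
	   unaligned loads and non-temporal stores, and pairs is the
	   outer-loop count, length / (2 * PAGE), held in r10):

		#include <stddef.h>
		#include <string.h>

		#define PAGE 4096	// PAGE_SIZE
		#define CHUNK 128	// LARGE_LOAD_SIZE placeholder

		static void
		large_copy_2x_sketch (char *dst, const char *src, size_t pairs)
		{
		  // Each outer iteration copies 2 pages, alternating one
		  // CHUNK from each page per inner iteration so the
		  // non-temporal stores stream to two pages at once.
		  for (size_t o = 0; o < pairs; o++)
		    {
		      for (size_t i = 0; i < PAGE / CHUNK; i++)
			{
			  memcpy (dst, src, CHUNK);
			  memcpy (dst + PAGE, src + PAGE, CHUNK);
			  dst += CHUNK;
			  src += CHUNK;
			}
		      // Skip the second page, already copied interleaved.
		      dst += PAGE;
		      src += PAGE;
		    }
		}
	   */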

	.p2align 4
L(large_memcpy_4x):
	/* edx will store remainder size for copying tail.  */
	andl	$(PAGE_SIZE * 4 - 1), %edx
	/* r10 stores outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 2), %r10
	/* Copy 4x VEC at a time from 4 pages.  */
	.p2align 4
L(loop_large_memcpy_4x_outer):
	/* ecx stores inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
	/* Only one prefetch set per page as doing 4 pages gives more
	   time for the prefetcher to keep up.  */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_4x_inner)
	addq	$(PAGE_SIZE * 3), %rdi
	addq	$(PAGE_SIZE * 3), %rsi
	decq	%r10
	jne	L(loop_large_memcpy_4x_outer)
	sfence
	/* Check if only last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_4x_end)

	/* Handle the last 4 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_4x_tail):
	/* Copy 4 * VEC at a time forward with regular (not non-temporal)
	   stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VMM(0)
	VMOVU	VEC_SIZE(%rsi), %VMM(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VMM(0), (%rdi)
	VMOVA	%VMM(1), VEC_SIZE(%rdi)
	VMOVA	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VMM(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VMM(3)

	VMOVU	%VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VMM(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))

