/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"
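
/*
 * Note: this lives in .noinstr.text so that it is safe to call from
 * code that must not be instrumented.
 */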

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the destination
 * input, which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

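	/*
	 * On CPUs with FSRM (Fast Short REP MOVSB) the 'jmp' in the
	 * ALTERNATIVE above is patched out at boot and execution falls
	 * through to here: stash the return value (memcpy() returns the
	 * original destination) and let 'rep movsb' copy %rcx bytes
	 * from (%rsi) to (%rdi).
	 */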
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
SYM_PIC_ALIAS(memcpy)
EXPORT_SYMBOL(memcpy)

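/*
 * memcpy_orig - fallback for CPUs without FSRM.
 *
 * Copies in 32-byte (4x8) blocks, forward or backward depending on the
 * low bytes of the pointers, then finishes the 0..31 byte tail with
 * overlapping moves.
 */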
SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur: loads
	 * can falsely appear to depend on earlier stores whose
	 * addresses match in their low bits, so compare the low bytes
	 * of source and destination and pick the copy direction
	 * accordingly.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
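	/*
	 * The count is pre-biased by 0x20 so that the 'subq' at the top
	 * of the loop both decrements it and sets CF once fewer than
	 * 0x20 bytes remain; the 'leaq's below do not modify the flags,
	 * so the 'jae' still tests that subtraction.
	 */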
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
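	/* Undo the pre-bias: %edx ends up with the 0..31 tail count. */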
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Point both registers one past the end of the buffer, so the
	 * copy can run backward from the tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
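	/*
	 * As in the forward loop, the count is pre-biased by 0x20 so
	 * the 'jae' below exits once fewer than 0x20 bytes remain.
	 */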
	/*
	 * At most 3 ALU operations issue in one cycle, so pad with NOPs
	 * (via .p2align) to keep the loop within aligned 16-byte
	 * chunks.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Undo the pre-bias and move the pointers back to the head so
	 * the remaining 0..31 tail bytes are copied forward.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes
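	/*
	 * All of the tails below use (possibly) overlapping windows:
	 * one group of moves anchored at the start of the range and one
	 * at its end. E.g. for %rdx = 20, bytes 0..15 and bytes 4..19
	 * together cover the whole range, with bytes 4..15 simply
	 * written twice.
	 */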

	/*
	 * Copy a 16..31 byte tail: two 16-byte (2x8) windows, one at
	 * the start and one at the end of the range.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Copy an 8..15 byte tail: two possibly overlapping 8-byte
	 * moves.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Copy a 4..7 byte tail: two possibly overlapping 4-byte moves.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy a 1..3 byte tail: first, second, and last byte (the
	 * latter two overlap for a 2-byte copy). %edx now holds
	 * count - 1; 'movzbl' does not modify the flags, so the 'jz'
	 * below still tests the 'subl' above (i.e. the count was 1).
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)