memset_64.S source code [linux/tools/arch/x86/lib/memset_64.S]

/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */
3
4	#include <linux/export.h>
5	#include <linux/linkage.h>
6	#include <linux/cfi_types.h>
7	#include <asm/cpufeatures.h>
8	#include <asm/alternative.h>
9
.section .noinstr.text, "ax"

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
 *
 * Input:
 *   rdi   destination
 *   rsi   value (char; only the low byte, %sil, is read)
 *   rdx   count (bytes)
 *
 * Output:
 *   rax   original destination
 *
 * Registers written on the 'rep stosb' path: rax, rcx, rdi, r9.
 *
 * The FSRS alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep stosb' itself is small enough to replace the call, but all
 * the register moves blow up the code. And two of them are "needed"
 * only for the return value that is the same as the destination input,
 * which the compiler could/should do much better anyway.
 */
SYM_TYPED_FUNC_START(__memset)
	/*
	 * As assembled, this is a jump to the open-coded memset_orig.
	 * The ALTERNATIVE macro (asm/alternative.h) is expected to patch
	 * the jump out on CPUs that advertise X86_FEATURE_FSRS (fast short
	 * 'rep stosb'), so execution falls through to the sequence below.
	 */
	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

	movq %rdi,%r9		/* save dst: rep stosb advances %rdi */
	movb %sil,%al		/* byte to store */
	movq %rdx,%rcx		/* repeat count = number of bytes */
	rep stosb
	movq %r9,%rax		/* return the original destination */
	RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)

/* Make the same routine available under the plain 'memset' name. */
SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
SYM_PIC_ALIAS(memset)
EXPORT_SYMBOL(memset)
47
/*
 * memset_orig - memset built from explicit 8-byte and 1-byte stores.
 *
 * __memset reaches this routine through the "jmp memset_orig" in its
 * ALTERNATIVE. The arguments are in the same registers:
 *
 *   rdi   destination
 *   rsi   value (only the low byte, %sil, is read)
 *   rdx   count (bytes)
 *
 * Returns the original destination in rax.
 *
 * Register use inside the routine:
 *   rax   fill byte replicated into all eight bytes of the register
 *   rcx   scratch, then loop counters
 *   r8    number of bytes up to the next 8-byte boundary
 *   r9    destination & 7
 *   r10   saved destination for the return value
 * rdi is advanced and rdx is reduced as the buffer is filled.
 */
SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10

	/*
	 * expand byte value: multiplying the zero-extended byte by
	 * 0x0101010101010101 copies it into every byte of %rax
	 */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq  %rcx,%rax

	/* align dst */
	movl  %edi,%r9d
	andl  $7,%r9d
	jnz  .Lbad_alignment
.Lafter_bad_alignment:

	/* %rcx = number of whole 64-byte blocks */
	movq  %rdx,%rcx
	shrq  $6,%rcx
	jz	 .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * The jnz at the bottom tests the flags set by this decq; the movq
	 * stores and the leaq in between leave the flags untouched.
	 */
	decq  %rcx
	movq  %rax,(%rdi)
	movq  %rax,8(%rdi)
	movq  %rax,16(%rdi)
	movq  %rax,24(%rdi)
	movq  %rax,32(%rdi)
	movq  %rax,40(%rdi)
	movq  %rax,48(%rdi)
	movq  %rax,56(%rdi)
	leaq  64(%rdi),%rdi
	jnz    .Lloop_64

	/*
	 * Handle tail in loops. The loops should be faster than hard
	 * to predict jump tables.
	 */
	.p2align 4
.Lhandle_tail:
	/* bytes left over from the 64-byte blocks, rounded down to 8 */
	movl	%edx,%ecx
	andl    $63&(~7),%ecx
	jz	.Lhandle_7
	shrl	$3,%ecx		/* bytes -> number of 8-byte stores */
	.p2align 4
.Lloop_8:
	decl   %ecx
	movq  %rax,(%rdi)
	leaq  8(%rdi),%rdi
	jnz    .Lloop_8

.Lhandle_7:
	/* last count & 7 bytes, one byte at a time */
	andl	$7,%edx
	jz      .Lende
	.p2align 4
.Lloop_1:
	decl    %edx
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz     .Lloop_1

.Lende:
	movq	%r10,%rax
	RET

.Lbad_alignment:
	/* too short for an 8-byte store: use the byte loop for everything */
	cmpq $7,%rdx
	jbe	.Lhandle_7
	/*
	 * Fill the first 8 bytes with one unaligned store, then move dst
	 * up to the next 8-byte boundary (8 - (dst & 7) bytes) and shrink
	 * the count to match. Bytes past that boundary that this store
	 * already covered may be written again with the same value.
	 */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
	/* .Lfinal is not referenced by any instruction in this routine */
.Lfinal:
SYM_FUNC_END(memset_orig)
120

source code of linux/tools/arch/x86/lib/memset_64.S