/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * the fast-string "rep stosb" instruction to get better performance than
 * the original function. The code is simpler and shorter than the
 * original function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 *
 * The FSRS alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep stosb' itself is small enough to replace the call, but all
 * the register moves blow up the code. And two of them are "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 */
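/*
 * For reference, a rough C equivalent of the FSRS fast path below (an
 * illustrative sketch only, not the code the kernel actually compiles):
 *
 *	void *memset(void *dest, int c, size_t n)
 *	{
 *		void *ret = dest;
 *		asm volatile("rep stosb"
 *			     : "+D" (dest), "+c" (n)
 *			     : "a" (c)
 *			     : "memory");
 *		return ret;
 *	}
 */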
SYM_FUNC_START(__memset)
	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

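	/*
	 * On CPUs with X86_FEATURE_FSRS ("Fast Short REP STOSB") the
	 * "jmp memset_orig" above is patched out at boot and the
	 * "rep stosb" below handles all sizes. It takes the byte value
	 * in %al and the count in %rcx; %r9 preserves the original
	 * destination for the return value.
	 */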
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
EXPORT_SYMBOL(memset)

SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax
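	/*
	 * The multiply above broadcasts the byte into all eight byte
	 * lanes: e.g. 0xAB * 0x0101010101010101 == 0xABABABABABABABAB.
	 */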

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

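	/* Store 64 bytes per iteration; %rcx counts 64-byte blocks. */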
	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
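	/*
	 * Up to 63 bytes remain. 63 & ~7 == 56 masks the tail count down
	 * to its full-qword part, which .Lloop_8 stores 8 bytes at a time.
	 */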
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
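	/* Final 0..7 bytes, stored one at a time. */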
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	RET

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
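	/*
	 * More than 7 bytes: do one unaligned 8-byte store at the start,
	 * then advance %rdi to the next 8-byte boundary (%r8 = 8 - (dst & 7)).
	 * The overlapping bytes are harmlessly rewritten by the aligned path.
	 */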
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
SYM_FUNC_END(memset_orig)