/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * the fast-string "rep stosb" instruction to get better performance than
 * the original function. The code is simpler and shorter than the
 * original function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 *
 * The FSRS alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep stosb' itself is small enough to replace the call, but all
 * the register moves blow up the code. And two of them are "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 */
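/*
 * For reference, a rough C equivalent of the FSRS fast path below (an
 * illustrative sketch only, not the code the kernel actually compiles):
 *
 *	void *memset(void *dest, int c, size_t n)
 *	{
 *		void *ret = dest;
 *		asm volatile("rep stosb"
 *			     : "+D" (dest), "+c" (n)
 *			     : "a" (c)
 *			     : "memory");
 *		return ret;
 *	}
 */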
SYM_FUNC_START(__memset)
	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

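	/*
	 * On CPUs with X86_FEATURE_FSRS ("Fast Short REP STOSB") the
	 * "jmp memset_orig" above is patched out at boot and the
	 * "rep stosb" below handles all sizes. It takes the byte value
	 * in %al and the count in %rcx; %r9 preserves the original
	 * destination for the return value.
	 */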
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
EXPORT_SYMBOL(memset)

SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax
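	/*
	 * The multiply above broadcasts the byte into all eight byte
	 * lanes: e.g. 0xAB * 0x0101010101010101 == 0xABABABABABABABAB.
	 */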

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

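	/* Store 64 bytes per iteration; %rcx counts 64-byte blocks. */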
	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
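	/*
	 * Up to 63 bytes remain. 63 & ~7 == 56 masks the tail count down
	 * to its full-qword part, which .Lloop_8 stores 8 bytes at a time.
	 */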
	movl %edx,%ecx
	andl $63&(~7),%ecx
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
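	/* Final 0..7 bytes, stored one at a time. */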
	andl $7,%edx
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax
	RET

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7
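	/*
	 * More than 7 bytes: do one unaligned 8-byte store at the start,
	 * then advance %rdi to the next 8-byte boundary (%r8 = 8 - (dst & 7)).
	 * The overlapping bytes are harmlessly rewritten by the aligned path.
	 */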
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
SYM_FUNC_END(memset_orig)