1 | /* |
2 | * memset - fill memory with a constant byte |
3 | * |
4 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
5 | * See https://llvm.org/LICENSE.txt for license information. |
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
7 | */ |
8 | |
9 | /* Assumptions: |
10 | * |
11 | * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. |
12 | * |
13 | */ |
14 | |
15 | #include "../asmdefs.h" |
16 | |
17 | #define dstin x0 |
18 | #define val x1 |
19 | #define valw w1 |
20 | #define count x2 |
21 | #define dst x3 |
22 | #define dstend x4 |
23 | #define zva_val x5 |
24 | |
ENTRY (__memset_aarch64)

	/* void *__memset_aarch64 (void *dstin, int val, size_t count)
	 *
	 * In:   x0 (dstin) = destination pointer
	 *       w1 (valw)  = fill byte (low 8 bits used)
	 *       x2 (count) = number of bytes to fill
	 * Out:  x0 unchanged (== dstin), per the memset contract.
	 * Clobbers: x1-x5, v0, NZCV.
	 *
	 * Strategy: replicate the fill byte across v0, then dispatch on
	 * size.  Small/medium sizes use possibly-overlapping stores from
	 * both ends of the buffer (relies on unaligned-access support,
	 * see the assumptions in the file header).  Large zero fills use
	 * DC ZVA to zero whole 64-byte cache lines.  */

	dup	v0.16B, valw		/* v0 = fill byte in all 16 lanes.  */
	add	dstend, dstin, count	/* dstend = one past the last byte.  */

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)		/* unsigned >=, so count == 16 goes here too.  */
	mov	val, v0.D[0]		/* val = fill byte replicated to 64 bits.  */

	/* Set 0..15 bytes.  Test size bits of count individually and use
	   overlapping stores from both ends to cover the range.  */
	tbz	count, 3, 1f
	str	val, [dstin]		/* count is 8..15: two 8-byte stores cover it.  */
	str	val, [dstend, -8]
	ret
	nop				/* Padding (branch-target spacing).  */
1:	tbz	count, 2, 2f
	str	valw, [dstin]		/* count is 4..7: two 4-byte stores.  */
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]		/* count is 1..3: byte at the start...  */
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]	/* ...plus 2 bytes at the end when count >= 2.  */
3:	ret

	/* Set 16..96 bytes.  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)	/* count 64..96 handled separately.  */
	str	q0, [dstend, -16]	/* count 16..63: first and last 16 bytes...  */
	tbz	count, 5, 1f
	str	q0, [dstin, 16]		/* ...plus the middle 32 when count >= 32.  */
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 4
L(set_long):
	/* More than 96 bytes.  Use DC ZVA only when the fill value is
	   zero, the region is at least 160 bytes, and the CPU's ZVA
	   block size is 64 bytes; otherwise take the plain store loop.  */
	and	valw, valw, 255		/* Isolate the fill byte for the zero test.  */
	bic	dst, dstin, 15		/* dst = dstin aligned down to 16.  */
	str	q0, [dstin]
	cmp	count, 160
	ccmp	valw, 0, 0, hs		/* Z == 1 iff count >= 160 && val == 0.  */
	b.ne	L(no_zva)

#ifndef SKIP_ZVA_CHECK
	mrs	zva_val, dczid_el0	/* Bits 3:0 = log2(ZVA words); bit 4 = ZVA disabled.  */
	and	zva_val, zva_val, 31
	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
	b.ne	L(no_zva)
#endif
	str	q0, [dst, 16]		/* Fill up to the first 64-aligned boundary...  */
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63		/* ...then align dst down to 64 for DC ZVA.  */
	sub	count, dstend, dst	/* Count is now 64 too large.  */
	sub	count, count, 128	/* Adjust count and bias for loop (tail done after).  */

	.p2align 4
L(zva_loop):
	add	dst, dst, 64
	dc	zva, dst		/* Zero one whole 64-byte cache line.  */
	subs	count, count, 64
	b.hi	L(zva_loop)
	stp	q0, q0, [dstend, -64]	/* Tail: last (possibly unaligned) 64 bytes.  */
	stp	q0, q0, [dstend, -32]
	ret

L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	sub	dst, dst, 16		/* Dst is biased by -32.  */
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
L(no_zva_loop):
	stp	q0, q0, [dst, 32]	/* Each iteration stores 64 bytes...  */
	stp	q0, q0, [dst, 64]!	/* ...advancing dst by 64 via pre-index writeback.  */
	subs	count, count, 64
	b.hi	L(no_zva_loop)
	stp	q0, q0, [dstend, -64]	/* Tail: last 64 bytes, addressed from the end.  */
	stp	q0, q0, [dstend, -32]
	ret

END (__memset_aarch64)
116 | |