/* Optimized memset for AmpereComputing emag processor.
   Copyright (C) 2018-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library. If not, see
   <https://www.gnu.org/licenses/>. */

#include <sysdep.h>
#include "memset-reg.h"

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

/* __memset_emag -- memset variant: store the low byte of the fill value
   into every byte of [dstin, dstin + count).

   The operand names dstin, val/valw, count, dst and dstend are not
   defined in this file; presumably they are register aliases from
   "memset-reg.h" -- confirm there.  As used below:
     dstin   start of the buffer; never modified by this routine
	     (NOTE(review): presumably so it can double as the return
	     value -- confirm against the alias definitions)
     valw    fill value; the code assumes it is the 32-bit view of val
     val     fill byte replicated to 64 bits
     count   length in bytes; reused as a biased loop counter for
	     lengths above 96
     dstend  dstin + count, one past the last byte to write
     dst     moving store pointer of the bulk loop

   Strategy by length:
     0..15   one or two stores of 8, 4, 2 or 1 bytes, chosen by testing
	     single bits of count;
     16..96  16-byte store pairs issued from both ends of the buffer;
     > 96    64 bytes per loop iteration starting from dstin rounded
	     down to a 16-byte boundary, then a tail anchored at dstend.

   Stores anchored at dstin and at dstend may overlap each other and may
   be unaligned; this relies on the file's stated assumption that
   unaligned accesses are supported.  No DC ZVA is issued on any path.  */
ENTRY (__memset_emag)

	/* NOTE(review): PTR_ARG/SIZE_ARG are defined outside this file
	   (presumably in sysdep.h, to normalize pointer argument 0 and
	   size argument 2) -- confirm.  */
	PTR_ARG (0)
	SIZE_ARG (2)

	/* Replicate the low byte of the fill value: 8 -> 16 -> 32 -> 64
	   bits.  Each bfi copies the low half into the half above it.  */
	bfi	valw, valw, 8, 8
	bfi	valw, valw, 16, 16
	bfi	val, val, 32, 32

	add	dstend, dstin, count

	/* Unsigned dispatch on length: > 96, 16..96, otherwise 0..15.  */
	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)

	/* Set 0..15 bytes.  count < 16 here, so bit 3 set means 8..15:
	   one 8-byte store from each end covers the whole range.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret

	.p2align 3
	/* count is 0..7.  Bit 2 set means 4..7: one 4-byte store from
	   each end.  */
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
	/* count is 0..3.  Nothing to do for 0; otherwise write the first
	   byte, and for 2 or 3 (bit 1 set) also the last two bytes.  */
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	.p2align 3
	/* Set 16..96 bytes.  The first 16 bytes are always written here;
	   bit 6 set means 64..96 and is handled by set96.  */
L(set_medium):
	stp	val, val, [dstin]
	tbnz	count, 6, L(set96)
	/* count is 16..63: write the last 16 bytes.  That already covers
	   16..31 (bit 5 clear).  For 32..63 add the second 16 bytes from
	   the start and the 16 bytes before the last 16.  */
	stp	val, val, [dstend, -16]
	tbz	count, 5, 1f
	stp	val, val, [dstin, 16]
	stp	val, val, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  The first 16 of the leading 64 were
	   already stored in set_medium.  */
L(set96):
	stp	val, val, [dstin, 16]
	stp	val, val, [dstin, 32]
	stp	val, val, [dstin, 48]
	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

	.p2align 4
	/* Set more than 96 bytes.  Write the first 16 bytes at the
	   (possibly unaligned) start, then continue from dstin rounded
	   down to a 16-byte boundary so the loop stores use 16-byte
	   aligned addresses.  */
L(set_long):
	stp	val, val, [dstin]
	bic	dst, dstin, 15
	/* Small-size or non-zero memset does not use DC ZVA. */
	/* count = bytes from the aligned-down base to dstend.  */
	sub	count, dstend, dst

	/*
	 * Adjust count and bias for loop. By subtracting extra 1 from count,
	 * it is easy to use tbz instruction to check whether loop tailing
	 * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
	 *
	 * The bias accounts for the 16 bytes below dst + 16 that are
	 * already written and for the 64 bytes of one iteration, so the
	 * first iteration runs unconditionally (count > 96 guarantees
	 * the biased value is non-negative on entry).
	 */
	sub	count, count, 64+16+1

	/* Each iteration writes 64 bytes at dst + 16 .. dst + 79; the
	   last stp uses pre-index writeback to advance dst by 64.  Loop
	   while the biased count did not borrow.  */
1:	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	stp	val, val, [dst, 64]!
	subs	count, count, 64
	b.hs	1b

	/* count is now in -64..-1 and 1..64 bytes remain from dst + 16.
	   Bit 5 set means more than 32 remain: write 32 more from dst + 16.
	   The final 32 bytes are always written relative to dstend.  */
	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

END (__memset_emag)

/* Source: glibc/sysdeps/aarch64/multiarch/memset_emag.S  */