1/*
2 * strlen - calculate the length of a string
3 *
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 */
8
9/* Assumptions:
10 *
11 * ARMv8-a, AArch64.
12 */
13
14#include "../asmdefs.h"
15
16/* Arguments and results. */
17#define srcin x0
18#define len x0
19
20/* Locals and temporaries. */
21#define src x1
22#define data1 x2
23#define data2 x3
24#define has_nul1 x4
25#define has_nul2 x5
26#define tmp1 x4
27#define tmp2 x5
28#define tmp3 x6
29#define tmp4 x7
30#define zeroones x8
31#define offset x9
32
33 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
34 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
35 can be done in parallel across the entire word. A faster check
36 (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
37 false hits for characters 129..255. */
38
39#define REP8_01 0x0101010101010101
40#define REP8_7f 0x7f7f7f7f7f7f7f7f
41
42 /* This implementation is compatible with Memory Tagging. All loads
43 are 16 bytes in size and 16 bytes aligned. This also avoids the
44 need for page boundary checks. This implementation is correct
45 even without Memory Tagging, but other implementations could be
46 more beneficial if Memory Tagging is not enabled.
47
48 First load is aligned down and can contain bytes that are located
49 before the string. This is handled by modifying the "zeroones"
50 mask. The bytes that need to be ignored are set to zero.
51 If the string is aligned in such a way that 8 or more bytes from
52 the first load should be ignored, there is a special case
53 (skip_first_8_bytes) which only compares the second 8 bytes.
54
55 If there is a NUL byte in the first load, we calculate the length
56 from the 2 8-byte words using conditional select to reduce branch
57 mispredictions.
58
59 If the string is longer than 16 bytes, we check 32 bytes per
60 iteration using the fast NUL check (main_loop). If we encounter
61 non-ASCII characters, we fallback to a second loop
62 (nonascii_loop) using the full NUL check. */
63
64ENTRY(__strlen_aarch64_mte)
65 bic src, srcin, 15 /* Align down to 16 bytes. */
66 mov zeroones, REP8_01
67 /* (offset & 63) holds number of bits to ignore in a register.*/
68 lsl offset, srcin, 3
69 ldp data1, data2, [src], -16
70 lsl tmp1, zeroones, offset /* Shift (offset & 63). */
71#ifdef __AARCH64EB__
72 /* For big-endian, carry propagation (if the final byte in the
73 string is 0x01) means we cannot use has_nul1/2 directly.
74 e.g. 0x0100 - 0x0101 = 0xffff, so 0x01 will be mistaken for NUL.
75 Since we expect strings to be small and early-exit,
76 byte-swap the data now so has_null1/2 will be correct. */
77 rev data1, data1
78 rev data2, data2
79#endif
80 tbnz srcin, 3, L(skip_first_8_bytes)
81 sub tmp1, data1, tmp1
82 orr tmp2, data1, REP8_7f
83 sub tmp3, data2, zeroones
84 orr tmp4, data2, REP8_7f
85 bics has_nul1, tmp1, tmp2
86 bic has_nul2, tmp3, tmp4
87 /* If comparison happens, C flag is always set. */
88 ccmp has_nul2, 0, 0, eq
89 beq L(main_loop)
90
91 /* Enter with C = has_nul1 == 0. */
92 csel has_nul1, has_nul1, has_nul2, cc
93 and tmp2, srcin, 7 /* Bytes to ignore. */
94 rev has_nul1, has_nul1
95 neg tmp2, tmp2
96 clz tmp1, has_nul1 /* Count bits before NUL. */
97 /* Add 8 if NUL byte is not in first register. */
98 add tmp3, tmp2, 8
99 csel len, tmp2, tmp3, cc
100 add len, len, tmp1, lsr 3
101 ret
102
103L(skip_first_8_bytes):
104 sub tmp1, data2, tmp1
105 orr tmp2, data2, REP8_7f
106 bics has_nul1, tmp1, tmp2
107 beq L(main_loop)
108
109 rev has_nul1, has_nul1
110 lsl tmp1, has_nul1, offset /* Ignore bytes before string. */
111 clz tmp1, tmp1 /* Count bits before NUL. */
112 lsr len, tmp1, 3
113 ret
114
115 /* The inner loop processes 32 bytes per iteration and uses the fast
116 NUL check. If we encounter non-ASCII characters, use a second
117 loop with the accurate NUL check. */
118 .p2align 4
119L(main_loop):
120 ldp data1, data2, [src, 32]!
121 sub tmp1, data1, zeroones
122 sub tmp3, data2, zeroones
123 orr tmp2, tmp1, tmp3
124 tst tmp2, zeroones, lsl 7
125 bne 1f
126 ldp data1, data2, [src, 16]
127 sub tmp1, data1, zeroones
128 sub tmp3, data2, zeroones
129 orr tmp2, tmp1, tmp3
130 tst tmp2, zeroones, lsl 7
131 beq L(main_loop)
132 add src, src, 16
1331:
134 /* The fast check failed, so do the slower, accurate NUL check. */
135 orr tmp2, data1, REP8_7f
136 orr tmp4, data2, REP8_7f
137 bics has_nul1, tmp1, tmp2
138 bic has_nul2, tmp3, tmp4
139 ccmp has_nul2, 0, 0, eq
140 beq L(nonascii_loop)
141
142 /* Enter with C = has_nul1 == 0. */
143L(tail):
144#ifdef __AARCH64EB__
145 /* For big-endian, carry propagation (if the final byte in the
146 string is 0x01) means we cannot use has_nul1/2 directly. The
147 easiest way to get the correct byte is to byte-swap the data
148 and calculate the syndrome a second time. */
149 csel data1, data1, data2, cc
150 rev data1, data1
151 sub tmp1, data1, zeroones
152 orr tmp2, data1, REP8_7f
153 bic has_nul1, tmp1, tmp2
154#else
155 csel has_nul1, has_nul1, has_nul2, cc
156#endif
157 sub len, src, srcin
158 rev has_nul1, has_nul1
159 add tmp2, len, 8
160 clz tmp1, has_nul1
161 csel len, len, tmp2, cc
162 add len, len, tmp1, lsr 3
163 ret
164
165L(nonascii_loop):
166 ldp data1, data2, [src, 16]!
167 sub tmp1, data1, zeroones
168 orr tmp2, data1, REP8_7f
169 sub tmp3, data2, zeroones
170 orr tmp4, data2, REP8_7f
171 bics has_nul1, tmp1, tmp2
172 bic has_nul2, tmp3, tmp4
173 ccmp has_nul2, 0, 0, eq
174 bne L(tail)
175 ldp data1, data2, [src, 16]!
176 sub tmp1, data1, zeroones
177 orr tmp2, data1, REP8_7f
178 sub tmp3, data2, zeroones
179 orr tmp4, data2, REP8_7f
180 bics has_nul1, tmp1, tmp2
181 bic has_nul2, tmp3, tmp4
182 ccmp has_nul2, 0, 0, eq
183 beq L(nonascii_loop)
184 b L(tail)
185
186END(__strlen_aarch64_mte)
187

source code of libc/AOR_v20.02/string/aarch64/strlen-mte.S