/* Copyright (C) 2013-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian, early bytes are at the MSB end; on little-endian,
   at the LSB end.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
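
/* For example, on little-endian LS_FW expands to lsr, so LS_FW x, x, #8
   discards the first (lowest-addressed) string byte; on big-endian the
   same macro expands to lsl with the identical string-order effect.  */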

	.text
	.p2align 6
	.rep 9
	nop	/* Pad so that the loop below fits a cache line.  */
	.endr
ENTRY_ALIGN (strncmp, 0)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
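	/* A rough C model of that trick (illustrative sketch only; the
	   helper name is ad hoc):

		static inline uint64_t
		has_nul_byte (uint64_t x)
		{
		  return (x - 0x0101010101010101ULL)
			 & ~(x | 0x7f7f7f7f7f7f7f7fULL);
		}

	   The top bit of each byte of the result is set if that byte of x
	   is zero; borrows can additionally mark bytes above a zero byte,
	   which is harmless on little-endian but matters on big-endian
	   (see L(not_limit) below).  */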
	/* Start of performance-critical section -- one 64B cache line.  */
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of performance-critical section -- one 64B cache line.  */
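
	/* The subs/csinv/bics/ccmp chain above implements, roughly
	   (illustrative C model, reusing has_nul_byte from the sketch
	   above; load64 is an ad hoc name for an 8-byte load):

		do {
		  d1 = load64 (p1); p1 += 8;
		  d2 = load64 (p2); p2 += 8;
		  more = limit > 8;  limit -= 8;
		} while (!has_nul_byte (d1) && d1 == d2 && more);

	   csinv forces endloop to all-ones once fewer than 9 limit bytes
	   remained before the subs, so the loop also exits on the last
	   dword.  */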

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
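
	/* Roughly, the decode above in C (illustrative only; bswap64 and
	   clz64 are ad hoc names, and syndrome is known to be non-zero
	   when this runs):

		uint64_t s = bswap64 (syndrome);
		unsigned pos = clz64 (s);
		unsigned c1 = (bswap64 (data1) << pos) >> 56;
		unsigned c2 = (bswap64 (data2) << pos) >> 56;
		return (pos >> 3) < limit ? (int) c1 - (int) c2 : 0;

	   The byte swaps put the first string byte at the top, so clz
	   counts the bits before the first marked byte.  Bits below the
	   first differing bit are shifted out of both bytes equally, so
	   comparing the extracted 8 bits still orders the strings
	   correctly.  */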
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
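	/* Concretely (illustrative example): if the last two string bytes
	   are 0x01 0x00, the 0x00 byte is flagged, but its borrow also
	   turns the preceding 0x01 byte into 0xff, falsely flagging it.
	   On big-endian that earlier byte is more significant, so clz
	   would report the wrong position.  */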
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
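	/* In C terms the masking is roughly (illustrative sketch; load64
	   and pre_mask are ad hoc names):

		w1 = load64 (src1 & ~7) | pre_mask;
		w2 = load64 (src2 & ~7) | pre_mask;

	   where pre_mask covers the count bytes that precede the string
	   start (the low-order bytes on little-endian).  The same 0xff
	   bytes appear in both words, so they compare equal and can never
	   look like a NUL terminator.  */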
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 6
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret
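
	/* The ccmp chain in L(byte_loop) is equivalent to (rough C model
	   with ad hoc names; limit is at least 1 on entry):

		do {
		  c1 = *s1++;
		  c2 = *s2++;
		} while (--limit > 0 && c1 != 0 && c1 == c2);

	   The first ccmp only tests c1 != 0 while the limit still holds,
	   and the second only compares the bytes when c1 is non-NUL.  */
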
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1 | a a a a a a a a | b b b c c c c c | . . .
	   src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	                STEP_A              STEP_B              STEP_C
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
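	/* In C, each step assembles the next eight SRC2 bytes from two
	   aligned registers, roughly (little-endian shown; illustrative
	   only, and offset is non-zero on this path):

		data2 = (tmp1 >> offset) | (tmp2 << (64 - offset));

	   with offset being the bit offset of the string start within the
	   aligned SRC2 block, modulo 64.  */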
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	   problem described above.  This way we can reuse the has_nul in the
	   next step and also use syndrome value trick at the end.  */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret

END (strncmp)
libc_hidden_builtin_def (strncmp)