1/*
2 * strncmp - compare two strings
3 *
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 */
8
9/* Assumptions:
10 *
11 * ARMv8-a, AArch64
12 */
13
14#include "../asmdefs.h"
15
16#define REP8_01 0x0101010101010101
17#define REP8_7f 0x7f7f7f7f7f7f7f7f
18#define REP8_80 0x8080808080808080
19
20/* Parameters and result. */
21#define src1 x0
22#define src2 x1
23#define limit x2
24#define result x0
25
26/* Internal variables. */
27#define data1 x3
28#define data1w w3
29#define data2 x4
30#define data2w w4
31#define has_nul x5
32#define diff x6
33#define syndrome x7
34#define tmp1 x8
35#define tmp2 x9
36#define tmp3 x10
37#define zeroones x11
38#define pos x12
39#define limit_wd x13
40#define mask x14
41#define endloop x15
42#define count mask
43
44 .text
45 .p2align 6
46 .rep 7
47 nop /* Pad so that the loop below fits a cache line. */
48 .endr
49ENTRY_ALIGN (__strncmp_aarch64, 0)
50 cbz limit, L(ret0)
51 eor tmp1, src1, src2
52 mov zeroones, #REP8_01
53 tst tmp1, #7
54 and count, src1, #7
55 b.ne L(misaligned8)
56 cbnz count, L(mutual_align)
57 /* Calculate the number of full and partial words -1. */
58 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
59 lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
60
61 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
62 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
63 can be done in parallel across the entire word. */
64 /* Start of performance-critical section -- one 64B cache line. */
65L(loop_aligned):
66 ldr data1, [src1], #8
67 ldr data2, [src2], #8
68L(start_realigned):
69 subs limit_wd, limit_wd, #1
70 sub tmp1, data1, zeroones
71 orr tmp2, data1, #REP8_7f
72 eor diff, data1, data2 /* Non-zero if differences found. */
73 csinv endloop, diff, xzr, pl /* Last Dword or differences. */
74 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
75 ccmp endloop, #0, #0, eq
76 b.eq L(loop_aligned)
77 /* End of performance-critical section -- one 64B cache line. */
78
79 /* Not reached the limit, must have found the end or a diff. */
80 tbz limit_wd, #63, L(not_limit)
81
82 /* Limit % 8 == 0 => all bytes significant. */
83 ands limit, limit, #7
84 b.eq L(not_limit)
85
86 lsl limit, limit, #3 /* Bits -> bytes. */
87 mov mask, #~0
88#ifdef __AARCH64EB__
89 lsr mask, mask, limit
90#else
91 lsl mask, mask, limit
92#endif
93 bic data1, data1, mask
94 bic data2, data2, mask
95
96 /* Make sure that the NUL byte is marked in the syndrome. */
97 orr has_nul, has_nul, mask
98
99L(not_limit):
100 orr syndrome, diff, has_nul
101
102#ifndef __AARCH64EB__
103 rev syndrome, syndrome
104 rev data1, data1
105 /* The MS-non-zero bit of the syndrome marks either the first bit
106 that is different, or the top bit of the first zero byte.
107 Shifting left now will bring the critical information into the
108 top bits. */
109 clz pos, syndrome
110 rev data2, data2
111 lsl data1, data1, pos
112 lsl data2, data2, pos
113 /* But we need to zero-extend (char is unsigned) the value and then
114 perform a signed 32-bit subtraction. */
115 lsr data1, data1, #56
116 sub result, data1, data2, lsr #56
117 ret
118#else
119 /* For big-endian we cannot use the trick with the syndrome value
120 as carry-propagation can corrupt the upper bits if the trailing
121 bytes in the string contain 0x01. */
122 /* However, if there is no NUL byte in the dword, we can generate
123 the result directly. We can't just subtract the bytes as the
124 MSB might be significant. */
125 cbnz has_nul, 1f
126 cmp data1, data2
127 cset result, ne
128 cneg result, result, lo
129 ret
1301:
131 /* Re-compute the NUL-byte detection, using a byte-reversed value. */
132 rev tmp3, data1
133 sub tmp1, tmp3, zeroones
134 orr tmp2, tmp3, #REP8_7f
135 bic has_nul, tmp1, tmp2
136 rev has_nul, has_nul
137 orr syndrome, diff, has_nul
138 clz pos, syndrome
139 /* The MS-non-zero bit of the syndrome marks either the first bit
140 that is different, or the top bit of the first zero byte.
141 Shifting left now will bring the critical information into the
142 top bits. */
143 lsl data1, data1, pos
144 lsl data2, data2, pos
145 /* But we need to zero-extend (char is unsigned) the value and then
146 perform a signed 32-bit subtraction. */
147 lsr data1, data1, #56
148 sub result, data1, data2, lsr #56
149 ret
150#endif
151
152L(mutual_align):
153 /* Sources are mutually aligned, but are not currently at an
154 alignment boundary. Round down the addresses and then mask off
155 the bytes that precede the start point.
156 We also need to adjust the limit calculations, but without
157 overflowing if the limit is near ULONG_MAX. */
158 bic src1, src1, #7
159 bic src2, src2, #7
160 ldr data1, [src1], #8
161 neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
162 ldr data2, [src2], #8
163 mov tmp2, #~0
164 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
165#ifdef __AARCH64EB__
166 /* Big-endian. Early bytes are at MSB. */
167 lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
168#else
169 /* Little-endian. Early bytes are at LSB. */
170 lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
171#endif
172 and tmp3, limit_wd, #7
173 lsr limit_wd, limit_wd, #3
174 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
175 add limit, limit, count
176 add tmp3, tmp3, count
177 orr data1, data1, tmp2
178 orr data2, data2, tmp2
179 add limit_wd, limit_wd, tmp3, lsr #3
180 b L(start_realigned)
181
182 .p2align 6
183 /* Don't bother with dwords for up to 16 bytes. */
184L(misaligned8):
185 cmp limit, #16
186 b.hs L(try_misaligned_words)
187
188L(byte_loop):
189 /* Perhaps we can do better than this. */
190 ldrb data1w, [src1], #1
191 ldrb data2w, [src2], #1
192 subs limit, limit, #1
193 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
194 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
195 b.eq L(byte_loop)
196L(done):
197 sub result, data1, data2
198 ret
199 /* Align the SRC1 to a dword by doing a bytewise compare and then do
200 the dword loop. */
201L(try_misaligned_words):
202 lsr limit_wd, limit, #3
203 cbz count, L(do_misaligned)
204
205 neg count, count
206 and count, count, #7
207 sub limit, limit, count
208 lsr limit_wd, limit, #3
209
210L(page_end_loop):
211 ldrb data1w, [src1], #1
212 ldrb data2w, [src2], #1
213 cmp data1w, #1
214 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
215 b.ne L(done)
216 subs count, count, #1
217 b.hi L(page_end_loop)
218
219L(do_misaligned):
220 /* Prepare ourselves for the next page crossing. Unlike the aligned
221 loop, we fetch 1 less dword because we risk crossing bounds on
222 SRC2. */
223 mov count, #8
224 subs limit_wd, limit_wd, #1
225 b.lo L(done_loop)
226L(loop_misaligned):
227 and tmp2, src2, #0xff8
228 eor tmp2, tmp2, #0xff8
229 cbz tmp2, L(page_end_loop)
230
231 ldr data1, [src1], #8
232 ldr data2, [src2], #8
233 sub tmp1, data1, zeroones
234 orr tmp2, data1, #REP8_7f
235 eor diff, data1, data2 /* Non-zero if differences found. */
236 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
237 ccmp diff, #0, #0, eq
238 b.ne L(not_limit)
239 subs limit_wd, limit_wd, #1
240 b.pl L(loop_misaligned)
241
242L(done_loop):
243 /* We found a difference or a NULL before the limit was reached. */
244 and limit, limit, #7
245 cbz limit, L(not_limit)
246 /* Read the last word. */
247 sub src1, src1, 8
248 sub src2, src2, 8
249 ldr data1, [src1, limit]
250 ldr data2, [src2, limit]
251 sub tmp1, data1, zeroones
252 orr tmp2, data1, #REP8_7f
253 eor diff, data1, data2 /* Non-zero if differences found. */
254 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
255 ccmp diff, #0, #0, eq
256 b.ne L(not_limit)
257
258L(ret0):
259 mov result, #0
260 ret
261
262END ( __strncmp_aarch64)
263

source code of libc/AOR_v20.02/string/aarch64/strncmp.S