/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2022, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#define L(label) .L ## label

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result. */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables. */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
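
/* Illustrative sketch (not assembled): discarding the first "count"
   bytes of a loaded dword is "LS_FW data, data, count*8". On
   little-endian that is data >> (count * 8), since early bytes sit at
   the LSB end; on big-endian the same move needs a left shift, and
   LS_BK is the opposite direction in each case. */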

SYM_FUNC_START(__pi_strncmp)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word. */
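	/* Worked example (illustrative, not part of the build): for
	   X = 0x4142004345464748,
		(X - REP8_01) & ~(X | REP8_7f) == 0x0000800000000000,
	   i.e. only the lane holding the zero byte yields 0x80. The only
	   false positives are 0x01 lanes that absorb a borrow from a zero
	   byte below them; the little-endian path tolerates those, the
	   big-endian path recomputes (see L(not_limit)). */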
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found. */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences. */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator. */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs. */
L(syndrome_check):
	/* Limit was reached. Check if the NUL byte or the difference
	   is before the limit. */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction. */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
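	/* Equivalent C sketch of the syndrome path above (illustrative;
	   rev64() stands for a byte reversal):
		pos = clz(rev64(diff | has_nul));
		c1  = (rev64(data1) << pos) >> 56;
		c2  = (rev64(data2) << pos) >> 56;
		return limit > (pos >> 3) ? (int)(c1 - c2) : 0;
	   Byte-reversing first makes clz count from the first string byte. */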
#else
	/* Not reached the limit, must have found the end or a diff. */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits. */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome. */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01. */
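	/* Example of the hazard (illustrative): if the dword ends in the
	   bytes 0x01 0x00, the borrow out of the 0x00 lane turns the 0x01
	   lane into 0xff, so the NUL test flags both lanes; clz on the
	   unreversed value would then report the NUL one byte early. */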
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly. We can't just subtract the bytes as the
	   MSB might be significant. */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
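	/* C sketch of the fast path above (illustrative): on big-endian
	   the dwords collate in string order, so
		return data1 == data2 ? 0 : (data1 < data2 ? -1 : 1);
	   A plain 64-bit subtraction could wrap, hence cmp/cset/cneg. */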
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits. */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction. */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary. Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX. */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63). */
	/* Adjust the limit and ensure it doesn't overflow. */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)
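	/* Masking sketch (little-endian C, illustrative): with count
	   leading bytes loaded from before the string start,
		mask = ~0ULL >> (64 - count * 8);	// early byte lanes
		data1 |= mask; data2 |= mask;
	   so the prefix compares equal and cannot look like a NUL, while
		limit = limit + count < limit ? ULONG_MAX : limit + count;
	   saturates instead of wrapping. */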

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this. */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000. */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000. */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret
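	/* C sketch of the byte loop (illustrative):
		do {
			c1 = *src1++; c2 = *src2++;
		} while (--limit && c1 != 0 && c1 == c2);
		return c1 - c2;
	   The two ccmps fold the limit, NUL and difference tests into a
	   single conditional branch. */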
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop. */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000. */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order. For little-endian, it is
	   reversed in the registers. The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1 | a a a a a a a a | b b b c c c c c | . . .
	   src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
		STEP_A			STEP_B			STEP_C
	   data1 a a a a a a a a	b b b c c c c c		b b b c c c c c
	   data2 a a a a a a a a	b b b 0 0 0 0 0		0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
	   time from SRC2. The comparison happens in 3 steps. After each step
	   the loop can exit, or read from SRC1 or SRC2. */
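	/* Reconstruction sketch (little-endian C, illustrative), with
	   off = (src2 & 7) * 8 bits into the 16-byte window:
		data2 = (tmp1 >> off) | (tmp2 << (64 - off));
	   which is STEP_A's LS_FW/LS_BK pair; STEP_B and STEP_C use one
	   shift each and mask off the invalid lanes instead. */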
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits. No
	   need to mask offset since shifts are ignoring upper bits. */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
	/* Skip the first compare if data in tmp1 is irrelevant. */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2. */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs. */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found. */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2. */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	   problem described above. This way we can reuse the has_nul in the
	   next step and also use syndrome value trick at the end. */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1. */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found. */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
SYM_FUNC_END(__pi_strncmp)
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)