/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2022, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#define L(label) .L ## label

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result x0

/* Internal variables. */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define has_nul x5
#define diff x6
#define syndrome x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define zeroones x11
#define pos x12
#define mask x13
#define endloop x14
#define count mask
#define offset pos
#define neg_offset x15
/* Define endian-dependent shift operations.
   On big-endian, early bytes are at the MSB end; on little-endian, at the LSB end.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
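
/*
 * Illustrative sketch of the convention, not part of the build: on
 * little-endian the first string byte sits in bits 7:0 of a loaded
 * dword, so extracting byte i in string order is a "forward" (lsr)
 * shift, e.g. in C:
 *
 *	byte_i = (dword >> (8 * i)) & 0xff;
 *
 * On big-endian the first byte sits in bits 63:56, so the same
 * extraction shifts left instead:
 *
 *	byte_i = (dword << (8 * i)) >> 56;
 */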

SYM_FUNC_START(__pi_strncmp)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word. */
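	/*
	 * Illustrative C sketch of the same test, not part of the build:
	 *
	 *	unsigned long has_zero(unsigned long x)
	 *	{
	 *		return (x - 0x0101010101010101UL) &
	 *		       ~x & 0x8080808080808080UL;
	 *	}
	 *
	 * The loop below uses the equivalent (X - 1) & ~(X | 0x7f) form
	 * (sub/orr/bics) so that the bics result also sets the flags
	 * consumed by the following ccmp.
	 */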
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found. */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences. */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator. */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs. */
L(syndrome_check):
	/* Limit was reached. Check if the NUL byte or the difference
	   is before the limit. */
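	/*
	 * Illustrative C sketch, not part of the build: on little-endian,
	 * byte-reversing the syndrome and counting leading zeros yields
	 * the bit offset of the first marked (NUL or differing) byte in
	 * string order, when syndrome is non-zero:
	 *
	 *	pos = __builtin_clzl(__builtin_bswap64(syndrome));
	 *	idx = pos >> 3;
	 *
	 * idx is the byte index compared against the rewound limit below;
	 * a marker at or beyond the limit is ignored and the result is 0.
	 */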
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 64-bit subtraction. */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff. */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits. */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome. */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01. */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly. We can't just subtract the bytes as the
	   MSB might be significant. */
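	/*
	 * Illustrative example, not part of the build: in a big-endian
	 * dword ending ... 01 01 00, subtracting REP8_01 borrows out of
	 * the 0x00 byte into the two more-significant 0x01 bytes, turning
	 * them into 0xff and setting their 0x80 markers as well, so clz
	 * on the syndrome would report the wrong "first" marked byte.
	 */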
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits. */
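	/*
	 * Worked example, illustrative only: if the first marked byte is
	 * at string index 2, pos is in the range 16..23, so data << pos
	 * moves that byte into bits 63:56, and (data << pos) >> 56 below
	 * extracts it zero-extended for the unsigned-char comparison.
	 */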
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 64-bit subtraction. */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary. Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX. */
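	/*
	 * Illustrative sketch, not part of the build: tmp2 below ends up
	 * with 0xff in each of the "count" byte positions that precede
	 * the true string start, so
	 *
	 *	data1 |= tmp2;
	 *	data2 |= tmp2;
	 *
	 * makes those bytes equal and non-NUL in both words; the limit is
	 * then increased by count to pay for the extra bytes compared,
	 * saturating to all-ones if the addition would overflow.
	 */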
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63). */
	/* Adjust the limit and ensure it doesn't overflow. */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this. */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000. */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000. */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret
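	/*
	 * Illustrative C equivalent of the ccmp chain above, not part of
	 * the build (s1, s2, n are hypothetical names):
	 *
	 *	do {
	 *		c1 = *s1++;
	 *		c2 = *s2++;
	 *	} while (--n && c1 != 0 && c1 == c2);
	 *	return c1 - c2;
	 *
	 * Each ccmp either performs its comparison or forces NZCV to
	 * 0b0000 (EQ false), so the single b.eq tests all three loop
	 * conditions at once.
	 */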
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop. */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000. */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order. For little-endian, it is
	   reversed in the registers. The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1 | a a a a a a a a | b b b c c c c c | . . .
	   src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
		STEP_A		STEP_B		STEP_C
	   data1 a a a a a a a a	b b b c c c c c	b b b c c c c c
	   data2 a a a a a a a a	b b b 0 0 0 0 0	0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
	   time from SRC2. The comparison happens in 3 steps. After each step
	   the loop can exit, or read from SRC1 or SRC2. */
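	/*
	 * Illustrative C sketch of STEP_A's combine, not part of the
	 * build (little-endian shown): 8 string-order bytes of SRC2 are
	 * assembled from the two aligned dwords tmp1/tmp2 loaded by ldp:
	 *
	 *	data2 = (tmp1 >> offset) | (tmp2 << (64 - offset));
	 *
	 * This is the LS_FW/LS_BK pair below; AArch64 variable shifts use
	 * only the low 6 bits of the shift register, so neg_offset, i.e.
	 * -offset, behaves as 64 - offset.
	 */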
L(src1_aligned):
	/* Calculate the offset from 8-byte alignment to the string start, in
	   bits. No need to mask offset since shifts ignore the upper bits. */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
	/* Skip the first compare if data in tmp1 is irrelevant. */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2. */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs. */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found. */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2. */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above. This way we can reuse has_nul in the
	   next step and also use the syndrome value trick at the end. */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1. */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found. */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
SYM_FUNC_END(__pi_strncmp)
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)