/* Copyright (C) 2013-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian, early bytes are stored at the MSB end; on little-endian,
   at the LSB end.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
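/* Illustrative note (not from the original): on little-endian the first
   string byte of a load lands in bits 7:0 of the register, so a shift
   towards the early bytes is a logical shift right; on big-endian the
   first byte lands in bits 63:56, so the same shift is a logical shift
   left.  */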
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

	.text
	.p2align 6
	.rep 9
	nop	/* Pad so that the loop below fits a cache line.  */
	.endr
ENTRY_ALIGN (strncmp, 0)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
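	/* Illustrative sketch (assumption, not part of this file): the
	   per-dword test above is roughly the classic C idiom, written as a
	   hypothetical helper:

	     #include <stdint.h>
	     uint64_t has_zero_byte (uint64_t x)
	     {
	       return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
	     }

	   For a zero byte its lane yields 0xff & 0xff & 0x80 = 0x80, while
	   any non-zero byte leaves bit 7 of its lane clear, apart from
	   borrows that can only set lanes above the first zero byte.  */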
	/* Start of performance-critical section  -- one 64B cache line.  */
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of performance-critical section  -- one 64B cache line.  */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
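	/* Illustrative note (not from the original): after rev, the first
	   (lowest-addressed) bytes sit at the most significant end, so
	   clz / 8 is the index of the first NUL or differing byte.  E.g. a
	   syndrome with only byte 2 set, 0x0000000000800000, reverses to
	   0x0000800000000000; clz gives 16 and 16 >> 3 = 2, which is then
	   compared against the remaining limit below.  */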
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
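	/* Illustrative note (not from the original): the borrow in
	   (X - 0x0101...) propagates from less significant lanes upwards,
	   so a string ending in ... 0x01 0x00 also sets the has_nul bit of
	   the 0x01 lane.  On big-endian that lane is an *earlier* byte, so
	   a clz over the raw syndrome would place the end of the string one
	   byte too early; the re-check on byte-reversed data below avoids
	   this.  */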
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
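	/* Worked example (illustrative, not from the original): if the
	   strings start 3 bytes past an 8-byte boundary, count = 3, the
	   rounded-down loads pull in 3 unwanted bytes, and tmp2 below ends
	   up with 0xff in exactly those 3 byte lanes.  ORing it into both
	   data words makes the unwanted bytes equal and non-zero, so they
	   can create neither a difference nor a NUL.  The limit is likewise
	   increased by 3 to cover the extra bytes, saturating to ULONG_MAX
	   if the addition overflows.  */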
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 6
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
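	/* Illustrative note (not from the original): the subs/ccmp chain
	   below leaves Z set only while bytes remain (limit was greater
	   than 1 before the decrement), data1w is not NUL (the compare
	   with #1 keeps C set only for data1w >= 1) and the two bytes are
	   equal; once any condition fails the flags are forced to 0b0000
	   and the loop falls through to L(done).  */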
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret

	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.

	   src1  | a a a a a a a a | b b b c c c c c | . . .
	   src2  | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	            STEP_A             STEP_B             STEP_C
	   data1    a a a a a a a a    b b b c c c c c    b b b c c c c c
	   data2    a a a a a a a a    b b b 0 0 0 0 0    0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each
	   step the loop can exit, or read from SRC1 or SRC2.  */
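	/* Illustrative sketch (assumption, not from the original): on
	   little-endian, STEP_A assembles the 8 SRC2 bytes that line up
	   with data1 from the two halves of the last 16-byte load roughly
	   like this, where off (1..7) is the byte misalignment of SRC2:

	     #include <stdint.h>
	     uint64_t combine (uint64_t tmp1, uint64_t tmp2, unsigned off)
	     {
	       return (tmp1 >> (8 * off)) | (tmp2 << (64 - 8 * off));
	     }

	   The shift amounts are precomputed in bits as offset and
	   neg_offset, and big-endian swaps the shift directions via
	   LS_FW/LS_BK.  */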
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts ignore the upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above.  This way we can reuse has_nul in the
	   next step and also use the syndrome value trick at the end.  */
	rev	tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret

END (strncmp)
libc_hidden_builtin_def (strncmp)
319 | |