1 | /* |
2 | * strcmp for ARMv7 |
3 | * |
4 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
5 | * See https://llvm.org/LICENSE.txt for license information. |
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
7 | */ |
8 | |
9 | #if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 |
10 | |
/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use the parallel byte-wise UADD8 and
   SEL instructions to optimize the compares.  */
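
/* The compares rely on the ARMv6 SIMD instructions UADD8 and SEL:
   adding 0xff to every byte of a word sets the per-byte GE flag for
   each non-zero byte, and SEL then merges the byte-wise EOR of the two
   data words with 0xff in the lanes whose GE flag is clear.  The
   resulting "syndrome" word is non-zero exactly when the words differ
   or the first string ends.  A byte-wise C model of one step, for
   illustration only (not part of the build):

     uint32_t syndrome = 0;
     for (int i = 0; i < 4; i++) {
       unsigned b1 = (data1 >> (8 * i)) & 0xff;
       unsigned b2 = (data2 >> (8 * i)) & 0xff;
       if (b1 == 0)
         syndrome |= 0xffu << (8 * i);      // SEL substitutes 0xff for a NUL byte
       else
         syndrome |= (b1 ^ b2) << (8 * i);  // otherwise keep the EOR difference
     }
     // syndrome != 0  <=>  mismatch or end of string in this word
*/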
15 | |
16 | #include "../asmdefs.h" |
17 | |
18 | /* Build Options: |
19 | STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first |
20 | byte in the string. If comparing completely random strings |
21 | the pre-check will save time, since there is a very high |
22 | probability of a mismatch in the first character: we save |
23 | significant overhead if this is the common case. However, |
   if strings are likely to be identical (e.g. because we're
25 | verifying a hit in a hash table), then this check is largely |
26 | redundant. */ |
27 | |
28 | #define STRCMP_NO_PRECHECK 0 |
29 | |
30 | /* This version uses Thumb-2 code. */ |
31 | .thumb |
32 | .syntax unified |
33 | |
34 | #ifdef __ARM_BIG_ENDIAN |
35 | #define S2LO lsl |
36 | #define S2LOEQ lsleq |
37 | #define S2HI lsr |
38 | #define MSB 0x000000ff |
39 | #define LSB 0xff000000 |
40 | #define BYTE0_OFFSET 24 |
41 | #define BYTE1_OFFSET 16 |
42 | #define BYTE2_OFFSET 8 |
43 | #define BYTE3_OFFSET 0 |
44 | #else /* not __ARM_BIG_ENDIAN */ |
45 | #define S2LO lsr |
46 | #define S2LOEQ lsreq |
47 | #define S2HI lsl |
48 | #define BYTE0_OFFSET 0 |
49 | #define BYTE1_OFFSET 8 |
50 | #define BYTE2_OFFSET 16 |
51 | #define BYTE3_OFFSET 24 |
52 | #define MSB 0xff000000 |
53 | #define LSB 0x000000ff |
54 | #endif /* not __ARM_BIG_ENDIAN */ |
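
/* Note that these macros are defined in terms of string order rather
   than numeric significance: LSB masks the first character of a loaded
   word and MSB the last, while S2LO shifts characters towards the start
   of the word (discarding leading characters) and S2HI towards the end
   (discarding trailing ones).  This lets the code below be
   endian-agnostic.  */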
55 | |
56 | /* Parameters and result. */ |
57 | #define src1 r0 |
58 | #define src2 r1 |
59 | #define result r0 /* Overlaps src1. */ |
60 | |
61 | /* Internal variables. */ |
62 | #define tmp1 r4 |
63 | #define tmp2 r5 |
64 | #define const_m1 r12 |
65 | |
66 | /* Additional internal variables for 64-bit aligned data. */ |
67 | #define data1a r2 |
68 | #define data1b r3 |
69 | #define data2a r6 |
70 | #define data2b r7 |
71 | #define syndrome_a tmp1 |
72 | #define syndrome_b tmp2 |
73 | |
74 | /* Additional internal variables for 32-bit aligned data. */ |
75 | #define data1 r2 |
76 | #define data2 r3 |
77 | #define syndrome tmp2 |
78 | |
79 | |
80 | /* Macro to compute and return the result value for word-aligned |
81 | cases. */ |
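/* \synd is the syndrome word, \d1 and \d2 the data words it was derived
   from, and \restore_r6 selects whether r6/r7 (clobbered only on the
   doubleword path) must be reloaded from the stack before returning.  */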
82 | .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 |
83 | #ifdef __ARM_BIG_ENDIAN |
84 | /* If data1 contains a zero byte, then syndrome will contain a 1 in |
85 | bit 7 of that byte. Otherwise, the highest set bit in the |
86 | syndrome will highlight the first different bit. It is therefore |
87 | sufficient to extract the eight bits starting with the syndrome |
88 | bit. */ |
89 | clz tmp1, \synd |
90 | lsl r1, \d2, tmp1 |
91 | .if \restore_r6 |
92 | ldrd r6, r7, [sp, #8] |
93 | .endif |
94 | .cfi_restore 6 |
95 | .cfi_restore 7 |
96 | lsl \d1, \d1, tmp1 |
97 | .cfi_remember_state |
98 | lsr result, \d1, #24 |
99 | ldrd r4, r5, [sp], #16 |
100 | .cfi_restore 4 |
101 | .cfi_restore 5 |
102 | sub result, result, r1, lsr #24 |
103 | bx lr |
104 | #else |
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
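	/* Reversing just the syndrome lets CLZ count the bits that precede
	   the first marked byte in string order; BIC #7 rounds the count
	   down to a byte boundary so the shifts below bring that byte of
	   each data word into bits 0-7.  */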
107 | rev \synd, \synd |
108 | clz tmp1, \synd |
109 | bic tmp1, tmp1, #7 |
110 | lsr r1, \d2, tmp1 |
111 | .cfi_remember_state |
112 | .if \restore_r6 |
113 | ldrd r6, r7, [sp, #8] |
114 | .endif |
115 | .cfi_restore 6 |
116 | .cfi_restore 7 |
117 | lsr \d1, \d1, tmp1 |
118 | and result, \d1, #255 |
119 | and r1, r1, #255 |
120 | ldrd r4, r5, [sp], #16 |
121 | .cfi_restore 4 |
122 | .cfi_restore 5 |
123 | sub result, result, r1 |
124 | |
125 | bx lr |
126 | #endif |
127 | .endm |
128 | |
129 | .text |
130 | .p2align 5 |
131 | L(strcmp_start_addr): |
132 | #if STRCMP_NO_PRECHECK == 0 |
133 | L(fastpath_exit): |
134 | sub r0, r2, r3 |
135 | bx lr |
136 | nop |
137 | #endif |
138 | ENTRY_ALIGN (__strcmp_arm, 0) |
139 | #if STRCMP_NO_PRECHECK == 0 |
140 | ldrb r2, [src1] |
141 | ldrb r3, [src2] |
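	/* CMP r2, #1 clears the carry flag only when the first byte of src1
	   is NUL, so the conditional compare below is skipped in that case
	   and the branch is taken on either a NUL or a mismatch; roughly,
	   in C: if (c1 == 0 || c1 != c2) return c1 - c2;  */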
142 | cmp r2, #1 |
143 | it cs |
144 | cmpcs r2, r3 |
145 | bne L(fastpath_exit) |
146 | #endif |
147 | strd r4, r5, [sp, #-16]! |
148 | .cfi_def_cfa_offset 16 |
149 | .cfi_offset 4, -16 |
150 | .cfi_offset 5, -12 |
151 | orr tmp1, src1, src2 |
152 | strd r6, r7, [sp, #8] |
153 | .cfi_offset 6, -8 |
154 | .cfi_offset 7, -4 |
155 | mvn const_m1, #0 |
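	/* Shifting the OR of the two pointers left by 29 keeps only their
	   low three bits: if the result is zero both strings are 8-byte
	   aligned and we can go straight to the doubleword loop.  */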
156 | lsl r2, tmp1, #29 |
157 | cbz r2, L(loop_aligned8) |
158 | |
159 | L(not_aligned): |
160 | eor tmp1, src1, src2 |
161 | tst tmp1, #7 |
162 | bne L(misaligned8) |
163 | |
164 | /* Deal with mutual misalignment by aligning downwards and then |
165 | masking off the unwanted loaded data to prevent a difference. */ |
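	/* Bytes that precede the start of the strings are forced to 0xff in
	   both inputs, so they can neither compare different nor look like a
	   NUL terminator.  If the strings start in the second word of the
	   doubleword (bit 2 of the offset set), the whole first word is
	   replaced with 0xffffffff and the partial mask is applied to the
	   second word instead.  */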
166 | and tmp1, src1, #7 |
167 | bic src1, src1, #7 |
168 | and tmp2, tmp1, #3 |
169 | bic src2, src2, #7 |
170 | lsl tmp2, tmp2, #3 /* Bytes -> bits. */ |
171 | ldrd data1a, data1b, [src1], #16 |
172 | tst tmp1, #4 |
173 | ldrd data2a, data2b, [src2], #16 |
174 | /* In thumb code we can't use MVN with a register shift, but |
175 | we do have ORN. */ |
176 | S2HI tmp1, const_m1, tmp2 |
177 | orn data1a, data1a, tmp1 |
178 | orn data2a, data2a, tmp1 |
179 | beq L(start_realigned8) |
180 | orn data1b, data1b, tmp1 |
181 | mov data1a, const_m1 |
182 | orn data2b, data2b, tmp1 |
183 | mov data2a, const_m1 |
184 | b L(start_realigned8) |
185 | |
	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
188 | .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ |
189 | .p2align 2 /* Always word aligned. */ |
190 | L(loop_aligned8): |
191 | ldrd data1a, data1b, [src1], #16 |
192 | ldrd data2a, data2b, [src2], #16 |
193 | L(start_realigned8): |
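	/* UADD8 adds 0xff to every byte of data1 and sets the per-byte GE
	   flag for each non-zero byte; the destination register is just a
	   scratch.  SEL then keeps the EOR difference in those byte lanes
	   and substitutes 0xff where data1 held a NUL, so a non-zero
	   syndrome means either a mismatch or the end of the string.  */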
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
195 | eor syndrome_a, data1a, data2a |
196 | sel syndrome_a, syndrome_a, const_m1 |
197 | cbnz syndrome_a, L(diff_in_a) |
198 | uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ |
199 | eor syndrome_b, data1b, data2b |
200 | sel syndrome_b, syndrome_b, const_m1 |
201 | cbnz syndrome_b, L(diff_in_b) |
202 | |
203 | ldrd data1a, data1b, [src1, #-8] |
204 | ldrd data2a, data2b, [src2, #-8] |
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
206 | eor syndrome_a, data1a, data2a |
207 | sel syndrome_a, syndrome_a, const_m1 |
208 | uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ |
209 | eor syndrome_b, data1b, data2b |
210 | sel syndrome_b, syndrome_b, const_m1 |
	/* Can't use CBZ for a backwards branch.  */
212 | orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ |
213 | beq L(loop_aligned8) |
214 | |
215 | L(diff_found): |
216 | cbnz syndrome_a, L(diff_in_a) |
217 | |
218 | L(diff_in_b): |
219 | strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 |
220 | |
221 | L(diff_in_a): |
222 | .cfi_restore_state |
223 | strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 |
224 | |
225 | .cfi_restore_state |
226 | L(misaligned8): |
227 | tst tmp1, #3 |
228 | bne L(misaligned4) |
229 | ands tmp1, src1, #3 |
230 | bne L(mutual_align4) |
231 | |
232 | /* Unrolled by a factor of 2, to reduce the number of post-increment |
233 | operations. */ |
234 | L(loop_aligned4): |
235 | ldr data1, [src1], #8 |
236 | ldr data2, [src2], #8 |
237 | L(start_realigned4): |
238 | uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ |
239 | eor syndrome, data1, data2 |
240 | sel syndrome, syndrome, const_m1 |
241 | cbnz syndrome, L(aligned4_done) |
242 | ldr data1, [src1, #-4] |
243 | ldr data2, [src2, #-4] |
244 | uadd8 syndrome, data1, const_m1 |
245 | eor syndrome, data1, data2 |
246 | sel syndrome, syndrome, const_m1 |
247 | cmp syndrome, #0 |
248 | beq L(loop_aligned4) |
249 | |
250 | L(aligned4_done): |
251 | strcmp_epilogue_aligned syndrome, data1, data2, 0 |
252 | |
253 | L(mutual_align4): |
254 | .cfi_restore_state |
255 | /* Deal with mutual misalignment by aligning downwards and then |
256 | masking off the unwanted loaded data to prevent a difference. */ |
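	/* Word-sized variant of the doubleword realignment above; only the
	   bytes before the start of the strings need masking here.  */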
257 | lsl tmp1, tmp1, #3 /* Bytes -> bits. */ |
258 | bic src1, src1, #3 |
259 | ldr data1, [src1], #8 |
260 | bic src2, src2, #3 |
261 | ldr data2, [src2], #8 |
262 | |
263 | /* In thumb code we can't use MVN with a register shift, but |
264 | we do have ORN. */ |
265 | S2HI tmp1, const_m1, tmp1 |
266 | orn data1, data1, tmp1 |
267 | orn data2, data2, tmp1 |
268 | b L(start_realigned4) |
269 | |
270 | L(misaligned4): |
271 | ands tmp1, src1, #3 |
272 | beq L(src1_aligned) |
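	/* src1 is 1, 2 or 3 bytes short of a word boundary.  Step src2 back
	   by the same amount so the two pointers advance in lock-step, then
	   compare the bytes up to src1's next word boundary one at a time,
	   extracting each character from data1 with a rotated UXTB.  LSLS
	   #31 puts bit 1 of the offset into the carry flag and sets Z when
	   bit 0 is clear, steering to the entry for an offset of 2 (BEQ) or
	   3 (BCS); an offset of 1 falls through.  */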
273 | sub src2, src2, tmp1 |
274 | bic src1, src1, #3 |
275 | lsls tmp1, tmp1, #31 |
276 | ldr data1, [src1], #4 |
277 | beq L(aligned_m2) |
278 | bcs L(aligned_m1) |
279 | |
280 | #if STRCMP_NO_PRECHECK == 1 |
281 | ldrb data2, [src2, #1] |
282 | uxtb tmp1, data1, ror #BYTE1_OFFSET |
283 | subs tmp1, tmp1, data2 |
284 | bne L(misaligned_exit) |
285 | cbz data2, L(misaligned_exit) |
286 | |
287 | L(aligned_m2): |
288 | ldrb data2, [src2, #2] |
289 | uxtb tmp1, data1, ror #BYTE2_OFFSET |
290 | subs tmp1, tmp1, data2 |
291 | bne L(misaligned_exit) |
292 | cbz data2, L(misaligned_exit) |
293 | |
294 | L(aligned_m1): |
295 | ldrb data2, [src2, #3] |
296 | uxtb tmp1, data1, ror #BYTE3_OFFSET |
297 | subs tmp1, tmp1, data2 |
298 | bne L(misaligned_exit) |
299 | add src2, src2, #4 |
300 | cbnz data2, L(src1_aligned) |
301 | #else /* STRCMP_NO_PRECHECK */ |
302 | /* If we've done the pre-check, then we don't need to check the |
303 | first byte again here. */ |
304 | ldrb data2, [src2, #2] |
305 | uxtb tmp1, data1, ror #BYTE2_OFFSET |
306 | subs tmp1, tmp1, data2 |
307 | bne L(misaligned_exit) |
308 | cbz data2, L(misaligned_exit) |
309 | |
310 | L(aligned_m2): |
311 | ldrb data2, [src2, #3] |
312 | uxtb tmp1, data1, ror #BYTE3_OFFSET |
313 | subs tmp1, tmp1, data2 |
314 | bne L(misaligned_exit) |
315 | cbnz data2, L(aligned_m1) |
316 | #endif |
317 | |
318 | L(misaligned_exit): |
319 | .cfi_remember_state |
320 | mov result, tmp1 |
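	/* Only r4 has been clobbered on this path, so reload just r4 while
	   discarding the rest of the 16-byte frame in a single access.  */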
321 | ldr r4, [sp], #16 |
322 | .cfi_restore 4 |
323 | bx lr |
324 | |
325 | #if STRCMP_NO_PRECHECK == 0 |
326 | L(aligned_m1): |
327 | add src2, src2, #4 |
328 | #endif |
329 | L(src1_aligned): |
330 | .cfi_restore_state |
331 | /* src1 is word aligned, but src2 has no common alignment |
332 | with it. */ |
333 | ldr data1, [src1], #4 |
334 | lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ |
335 | |
336 | bic src2, src2, #3 |
337 | ldr data2, [src2], #4 |
338 | bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */ |
339 | bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */ |
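	/* In the overlapN loops, N is the number of bytes of the current
	   src2 word that overlap the current word of src1.  Each iteration
	   compares one aligned word of src1 against bytes gathered from two
	   successive aligned words of src2, shifted into place with
	   S2LO/S2HI.  The numbered local labels handle the exits: 4 when
	   the overlapping bytes differ, 5 when they match but data1
	   contains a NUL, and 6 when the decision lies in the bytes taken
	   from the following src2 word; all of them funnel into
	   L(strcmp_tail), L(strcmp_done_equal) or a fast exit.  */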
340 | |
341 | /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ |
342 | L(overlap3): |
343 | bic tmp1, data1, #MSB |
344 | uadd8 syndrome, data1, const_m1 |
345 | eors syndrome, tmp1, data2, S2LO #8 |
346 | sel syndrome, syndrome, const_m1 |
347 | bne 4f |
348 | cbnz syndrome, 5f |
349 | ldr data2, [src2], #4 |
350 | eor tmp1, tmp1, data1 |
351 | cmp tmp1, data2, S2HI #24 |
352 | bne 6f |
353 | ldr data1, [src1], #4 |
354 | b L(overlap3) |
355 | 4: |
356 | S2LO data2, data2, #8 |
357 | b L(strcmp_tail) |
358 | |
359 | 5: |
360 | bics syndrome, syndrome, #MSB |
361 | bne L(strcmp_done_equal) |
362 | |
363 | /* We can only get here if the MSB of data1 contains 0, so |
364 | fast-path the exit. */ |
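	/* The three preceding characters matched, so the strings are equal
	   up to data1's NUL and the result is the negated value of the
	   corresponding character of src2, i.e. the first byte of the word
	   src2 now points at.  */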
365 | ldrb result, [src2] |
366 | .cfi_remember_state |
367 | ldrd r4, r5, [sp], #16 |
368 | .cfi_restore 4 |
369 | .cfi_restore 5 |
	/* R6/7 not used in this sequence.  */
371 | .cfi_restore 6 |
372 | .cfi_restore 7 |
373 | neg result, result |
374 | bx lr |
375 | |
376 | 6: |
377 | .cfi_restore_state |
378 | S2LO data1, data1, #24 |
379 | and data2, data2, #LSB |
380 | b L(strcmp_tail) |
381 | |
382 | .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ |
383 | L(overlap2): |
384 | and tmp1, data1, const_m1, S2LO #16 |
385 | uadd8 syndrome, data1, const_m1 |
386 | eors syndrome, tmp1, data2, S2LO #16 |
387 | sel syndrome, syndrome, const_m1 |
388 | bne 4f |
389 | cbnz syndrome, 5f |
390 | ldr data2, [src2], #4 |
391 | eor tmp1, tmp1, data1 |
392 | cmp tmp1, data2, S2HI #16 |
393 | bne 6f |
394 | ldr data1, [src1], #4 |
395 | b L(overlap2) |
396 | 4: |
397 | S2LO data2, data2, #16 |
398 | b L(strcmp_tail) |
399 | 5: |
400 | ands syndrome, syndrome, const_m1, S2LO #16 |
401 | bne L(strcmp_done_equal) |
402 | |
403 | ldrh data2, [src2] |
404 | S2LO data1, data1, #16 |
405 | #ifdef __ARM_BIG_ENDIAN |
406 | lsl data2, data2, #16 |
407 | #endif |
408 | b L(strcmp_tail) |
409 | |
410 | 6: |
411 | S2LO data1, data1, #16 |
412 | and data2, data2, const_m1, S2LO #16 |
413 | b L(strcmp_tail) |
414 | |
415 | .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ |
416 | L(overlap1): |
417 | and tmp1, data1, #LSB |
418 | uadd8 syndrome, data1, const_m1 |
419 | eors syndrome, tmp1, data2, S2LO #24 |
420 | sel syndrome, syndrome, const_m1 |
421 | bne 4f |
422 | cbnz syndrome, 5f |
423 | ldr data2, [src2], #4 |
424 | eor tmp1, tmp1, data1 |
425 | cmp tmp1, data2, S2HI #8 |
426 | bne 6f |
427 | ldr data1, [src1], #4 |
428 | b L(overlap1) |
429 | 4: |
430 | S2LO data2, data2, #24 |
431 | b L(strcmp_tail) |
432 | 5: |
433 | tst syndrome, #LSB |
434 | bne L(strcmp_done_equal) |
435 | ldr data2, [src2] |
436 | 6: |
437 | S2LO data1, data1, #8 |
438 | bic data2, data2, #MSB |
439 | b L(strcmp_tail) |
440 | |
441 | L(strcmp_done_equal): |
442 | mov result, #0 |
443 | .cfi_remember_state |
444 | ldrd r4, r5, [sp], #16 |
445 | .cfi_restore 4 |
446 | .cfi_restore 5 |
447 | /* R6/7 not used in this sequence. */ |
448 | .cfi_restore 6 |
449 | .cfi_restore 7 |
450 | bx lr |
451 | |
452 | L(strcmp_tail): |
453 | .cfi_restore_state |
454 | #ifndef __ARM_BIG_ENDIAN |
455 | rev data1, data1 |
456 | rev data2, data2 |
457 | /* Now everything looks big-endian... */ |
458 | #endif |
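	/* Both words are now big-endian, so the first character in string
	   order occupies the most significant byte.  Recompute the
	   syndrome, use CLZ to locate its most significant set bit, shift
	   both words up by that amount and subtract their top bytes to form
	   the result (see strcmp_epilogue_aligned for why eight bits
	   starting at the syndrome bit are sufficient).  */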
459 | uadd8 tmp1, data1, const_m1 |
460 | eor tmp1, data1, data2 |
461 | sel syndrome, tmp1, const_m1 |
462 | clz tmp1, syndrome |
463 | lsl data1, data1, tmp1 |
464 | lsl data2, data2, tmp1 |
465 | lsr result, data1, #24 |
466 | ldrd r4, r5, [sp], #16 |
467 | .cfi_restore 4 |
468 | .cfi_restore 5 |
469 | /* R6/7 not used in this sequence. */ |
470 | .cfi_restore 6 |
471 | .cfi_restore 7 |
472 | sub result, result, data2, lsr #24 |
473 | bx lr |
474 | |
475 | END (__strcmp_arm) |
476 | |
477 | #endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */ |
478 | |