/*
 * strcmp for ARMv7
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
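
/* The aligned inner loops below compare one word (or a pair of words) at a
   time and stop as soon as a word either differs between the two strings or
   contains the end of the first string.  Roughly, in illustrative C (the
   helper names are placeholders, not part of this file):

       for (;;)
         {
           uint32_t w1, w2;
           memcpy (&w1, s1, 4);                   // aligned word load
           memcpy (&w2, s2, 4);
           if (w1 != w2 || has_nul_byte (w1))     // difference, or s1 ends here
             return compare_final_word (w1, w2);  // locate the byte, subtract
           s1 += 4;
           s2 += 4;
         }

   has_nul_byte() corresponds to the UADD8/SEL syndrome test and
   compare_final_word() to the epilogue code further down.  */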

#include "../asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK      0
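
/* With the pre-check enabled (STRCMP_NO_PRECHECK == 0), the entry sequence
   below behaves roughly like this C fragment (illustrative only):

       unsigned char c1 = *(const unsigned char *) s1;
       unsigned char c2 = *(const unsigned char *) s2;
       if (c1 == 0 || c1 != c2)
         return c1 - c2;

   i.e. an empty first string or a first-byte mismatch returns before any
   registers are spilled and before the word-at-a-time code is entered.  */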

        /* This version uses Thumb-2 code.  */
        .thumb
        .syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO            lsl
#define S2LOEQ          lsleq
#define S2HI            lsr
#define MSB             0x000000ff
#define LSB             0xff000000
#define BYTE0_OFFSET    24
#define BYTE1_OFFSET    16
#define BYTE2_OFFSET    8
#define BYTE3_OFFSET    0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO            lsr
#define S2LOEQ          lsreq
#define S2HI            lsl
#define BYTE0_OFFSET    0
#define BYTE1_OFFSET    8
#define BYTE2_OFFSET    16
#define BYTE3_OFFSET    24
#define MSB             0xff000000
#define LSB             0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
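
/* In string order (increasing addresses), S2LO shifts a loaded word towards
   its lower-addressed bytes and S2HI towards its higher-addressed bytes,
   while LSB and MSB mask the first and last byte of the word in memory
   order.  Defining them per endianness lets the code below be written once
   for both byte orders.  */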

/* Parameters and result.  */
#define src1            r0
#define src2            r1
#define result          r0      /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1            r4
#define tmp2            r5
#define const_m1        r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a          r2
#define data1b          r3
#define data2a          r6
#define data2b          r7
#define syndrome_a      tmp1
#define syndrome_b      tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1           r2
#define data2           r3
#define syndrome        tmp2


        /* Macro to compute and return the result value for word-aligned
           cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsl     \d1, \d1, tmp1
        .cfi_remember_state
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words,
           but that's slower than this approach.  */
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .cfi_remember_state
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1

        bx      lr
#endif
        .endm
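
/* For the little-endian path the macro's result is roughly equivalent to the
   following C sketch (illustrative only):

       unsigned shift = __builtin_clz (__builtin_bswap32 (synd)) & ~7u;
       // shift == 8 * (memory-order index of the first marked byte)
       return ((d1 >> shift) & 0xff) - ((d2 >> shift) & 0xff);

   The big-endian path avoids the byte reverse: it shifts both data words
   left by clz(synd) and subtracts their top bytes, which is sufficient for
   the reasons given in the comment inside the macro.  */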

        .text
        .p2align 5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
        sub     r0, r2, r3
        bx      lr
        nop
#endif
ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     L(fastpath_exit)
#endif
        strd    r4, r5, [sp, #-16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 4, -16
        .cfi_offset 5, -12
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        .cfi_offset 6, -8
        .cfi_offset 7, -4
        mvn     const_m1, #0
        lsl     r2, tmp1, #29
        cbz     r2, L(loop_aligned8)

L(not_aligned):
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     L(misaligned8)

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
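        /* Concretely, if the strings start n bytes past the aligned load
           address, build (in string order) mask = ~0 << (8 * n), then OR the
           complement of the mask into the loaded words so that the n bytes
           preceding the strings read as 0xff in both operands and can never
           produce a syndrome.  Illustrative C for the little-endian case:

               uint32_t mask = 0xffffffffu << (8 * n);   // S2HI of const_m1
               data1 |= ~mask;                           // ORN data1, data1, mask
               data2 |= ~mask;
        */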
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp2
        orn     data1a, data1a, tmp1
        orn     data2a, data2a, tmp1
        beq     L(start_realigned8)
        orn     data1b, data1b, tmp1
        mov     data1a, const_m1
        orn     data2b, data2b, tmp1
        mov     data2a, const_m1
        b       L(start_realigned8)
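
        /* In the loop below, UADD8 of a data word with 0xffffffff sets the
           GE flag for every byte of that word that is non-zero (the per-byte
           addition carries out); SEL then builds the syndrome from the XOR
           of the two data words where GE is set and from 0xff where it is
           clear.  Per byte, in illustrative C:

               ge[i]       = (data1_byte[i] != 0);
               syndrome[i] = ge[i] ? (data1_byte[i] ^ data2_byte[i]) : 0xff;

           so the syndrome is zero exactly when all bytes match and none of
           them terminates the first string.  */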

        /* Unroll the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2      /* Always word aligned.  */
L(loop_aligned8):
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
L(start_realigned8):
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, L(diff_in_a)
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, L(diff_in_b)

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs    syndrome_b, syndrome_b, syndrome_a  /* Only needed if syndrome_a == 0.  */
        beq     L(loop_aligned8)

L(diff_found):
        cbnz    syndrome_a, L(diff_in_a)

L(diff_in_b):
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
        .cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        .cfi_restore_state
L(misaligned8):
        tst     tmp1, #3
        bne     L(misaligned4)
        ands    tmp1, src1, #3
        bne     L(mutual_align4)

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
L(loop_aligned4):
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
L(start_realigned4):
        uadd8   syndrome, data1, const_m1       /* Only need GE bits.  */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, L(aligned4_done)
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     L(loop_aligned4)

L(aligned4_done):
        strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
        .cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp1
        orn     data1, data1, tmp1
        orn     data2, data2, tmp1
        b       L(start_realigned4)

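        /* src1 and src2 have different offsets within a word.  Step src2
           back by src1's misalignment and align src1 down, then compare the
           remaining bytes of src1's first word (the pre-check, when enabled,
           has already covered the first byte) against the corresponding src2
           bytes, exiting early on a mismatch or end of string; otherwise
           fall into L(src1_aligned) below.  */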
L(misaligned4):
        ands    tmp1, src1, #3
        beq     L(src1_aligned)
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
        beq     L(aligned_m2)
        bcs     L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        cbz     data2, L(misaligned_exit)

L(aligned_m2):
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        cbz     data2, L(misaligned_exit)

L(aligned_m1):
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        add     src2, src2, #4
        cbnz    data2, L(src1_aligned)
#else /* STRCMP_NO_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        cbz     data2, L(misaligned_exit)

L(aligned_m2):
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        cbnz    data2, L(aligned_m1)
#endif

L(misaligned_exit):
        .cfi_remember_state
        mov     result, tmp1
        ldr     r4, [sp], #16
        .cfi_restore 4
        bx      lr

#if STRCMP_NO_PRECHECK == 0
L(aligned_m1):
        add     src2, src2, #4
#endif
L(src1_aligned):
        .cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0].  */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     L(overlap1)             /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs     L(overlap2)             /* C=1, Z=1 => src2[1:0] = 0b10.  */
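
        /* In the L(overlapN) blocks below, N is the number of bytes of each
           word-aligned src2 load that overlap the current src1 word
           (src2 & 3 == 1, 2 or 3 selects overlap3, overlap2 or overlap1).
           Each iteration therefore checks part of data1 against the tail of
           one src2 word and the rest against the head of the next.  */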

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       L(overlap3)
4:
        S2LO    data2, data2, #8
        b       L(strcmp_tail)

5:
        bics    syndrome, syndrome, #MSB
        bne     L(strcmp_done_equal)

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb    result, [src2]
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        neg     result, result
        bx      lr

6:
        .cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       L(strcmp_tail)

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
L(overlap2):
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       L(overlap2)
4:
        S2LO    data2, data2, #16
        b       L(strcmp_tail)
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     L(strcmp_done_equal)

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       L(strcmp_tail)

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       L(strcmp_tail)

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
L(overlap1):
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       L(overlap1)
4:
        S2LO    data2, data2, #24
        b       L(strcmp_tail)
5:
        tst     syndrome, #LSB
        bne     L(strcmp_done_equal)
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       L(strcmp_tail)

L(strcmp_done_equal):
        mov     result, #0
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        bx      lr

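        /* The partial-word paths above arrive here with data1 and data2
           shifted and masked so that the bytes still to be compared line up
           in string order.  On little-endian the REV below makes the
           earliest byte the most significant, after which the same
           clz-and-extract sequence as the aligned epilogue produces the
           result.  */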
L(strcmp_tail):
        .cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        sub     result, result, data2, lsr #24
        bx      lr

END (__strcmp_arm)

#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */

