1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | * Copyright (c) 2013-2021, Arm Limited. |
4 | * |
5 | * Adapted from the original at: |
6 | * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S |
7 | */ |
8 | |
9 | #include <linux/linkage.h> |
10 | #include <asm/assembler.h> |
11 | |
12 | /* Assumptions: |
13 | * |
14 | * ARMv8-a, AArch64, unaligned accesses. |
15 | */ |
16 | |
/* Build a file-local (.L-prefixed) label name so it is not emitted as a
   symbol in the object file.  */
#define L(label) .L ## label

/* Parameters and result (AAPCS64 argument/return registers).  */
#define src1		x0	/* 1st buffer; advanced as bytes are consumed */
#define src2		x1	/* 2nd buffer; advanced in lockstep with src1 */
#define limit		x2	/* byte count; reused as remaining-bytes counter */
#define result		w0	/* int return value: <0, 0 or >0 */

/* Internal variables.  */
#define data1		x3	/* low 8 bytes of a chunk loaded from src1 */
#define data1w		w3	/* 32-bit view of data1, for the short tails */
#define data1h		x4	/* high 8 bytes of a 16-byte chunk from src1 */
#define data2		x5	/* low 8 bytes of a chunk loaded from src2 */
#define data2w		w5	/* 32-bit view of data2, for the short tails */
#define data2h		x6	/* high 8 bytes of a 16-byte chunk from src2 */
#define tmp1		x7	/* src1 misalignment (src1 & 15) */
#define tmp2		x8	/* not referenced in this function */
34 | |
/*
 * int memcmp(const void *src1, const void *src2, size_t limit)
 *
 * Returns 0 if the first 'limit' bytes of the two buffers are equal,
 * a negative value if the first differing byte of src1 is lower than
 * the corresponding byte of src2, and a positive value otherwise.
 * Relies on unaligned access support (see assumptions above).
 */
SYM_FUNC_START(__pi_memcmp)
	subs	limit, limit, 8
	b.lo	L(less8)		/* total size < 8: byte/word tail path */

	/* Compare bytes 0-7.  */
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	cmp	data1, data2
	b.ne	L(return)

	subs	limit, limit, 8
	b.gt	L(more16)		/* total size > 16 */

	/* Total size 8-16: compare the final 8 bytes with a single pair of
	   loads ending exactly at the buffer ends (limit is in [-8..0]
	   here, so these loads may overlap the bytes already checked).  */
	ldr	data1, [src1, limit]
	ldr	data2, [src2, limit]
	b	L(return)

L(more16):
	/* Compare bytes 8-15.  */
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	cmp	data1, data2
	bne	L(return)

	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
	   strings.  */
	subs	limit, limit, 16
	b.ls	L(last_bytes)

	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
	   try to align, so limit it only to strings larger than 128 bytes.  */
	cmp	limit, 96
	b.ls	L(loop16)

	/* Align src1 and adjust src2 with bytes not yet done.  The bytes
	   re-covered by rewinding both pointers have already compared equal,
	   so re-comparing them is harmless.  */
	and	tmp1, src1, 15
	add	limit, limit, tmp1
	sub	src1, src1, tmp1
	sub	src2, src2, tmp1

	/* Loop performing 16 bytes per iteration using aligned src1.
	   Limit is pre-decremented by 16 and must be larger than zero.
	   Exit if <= 16 bytes left to do or if the data is not equal.
	   After 'subs', HI means more than 16 bytes remained before this
	   chunk; each 'ccmp' writes NZCV=0 (Z clear) when its condition
	   fails, so any mismatch or an exhausted count falls out of the
	   loop via the final 'b.eq'.  */
	.p2align 4
L(loop16):
	ldp	data1, data1h, [src1], 16
	ldp	data2, data2h, [src2], 16
	subs	limit, limit, 16
	ccmp	data1, data2, 0, hi
	ccmp	data1h, data2h, 0, eq
	b.eq	L(loop16)

	/* Loop exited: report a mismatch in either half if present;
	   otherwise fall through to compare the final bytes.  */
	cmp	data1, data2
	bne	L(return)
	mov	data1, data1h
	mov	data2, data2h
	cmp	data1, data2
	bne	L(return)

	/* Compare last 1-16 bytes using unaligned access.  limit is in
	   [-16..0], so src + limit addresses the 16 bytes ending exactly
	   at the end of the buffers (may overlap bytes already checked).  */
L(last_bytes):
	add	src1, src1, limit
	add	src2, src2, limit
	ldp	data1, data1h, [src1]
	ldp	data2, data2h, [src2]
	cmp	data1, data2
	bne	L(return)
	mov	data1, data1h
	mov	data2, data2h
	cmp	data1, data2

	/* Compare data bytes and set return value to 0, -1 or 1.  On
	   little-endian, 'rev' moves the byte that comes first in memory
	   into the most significant position, so an unsigned 64-bit
	   compare matches memcmp's byte-by-byte ordering.  */
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	/* ret_eq is entered from L(less4) with Z set (and C set from the
	   'adds'), so cset/cneg yield 0 there.  */
L(ret_eq):
	cset	result, ne		/* 1 if different, 0 if equal */
	cneg	result, result, lo	/* -1 if data1 < data2 (unsigned) */
	ret

	.p2align 4
	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
L(less8):
	adds	limit, limit, 4
	b.lo	L(less4)		/* total size < 4 */
	/* Total size 4-7: compare the first 4 bytes as a word.  */
	ldr	data1w, [src1], 4
	ldr	data2w, [src2], 4
	cmp	data1w, data2w
	b.ne	L(return)
	sub	limit, limit, 4
	/* Compare the remaining 0-3 bytes one at a time.  */
L(less4):
	adds	limit, limit, 4		/* limit = bytes left, 0-3 */
	beq	L(ret_eq)		/* nothing left: return 0 */
L(byte_loop):
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	subs	limit, limit, 1
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)		/* loop while equal and bytes remain */
	/* Byte difference: 0 when the count ran out on equal bytes,
	   otherwise correctly signed per the memcmp contract.  */
	sub	result, data1w, data2w
	ret
SYM_FUNC_END(__pi_memcmp)
SYM_FUNC_ALIAS_WEAK(memcmp, __pi_memcmp)
EXPORT_SYMBOL_NOKASAN(memcmp)
140 | |