1 | /* memchr - find a character in a memory zone using base integer registers |
2 | |
3 | Copyright (C) 2018-2024 Free Software Foundation, Inc. |
4 | |
5 | This file is part of the GNU C Library. |
6 | |
7 | The GNU C Library is free software; you can redistribute it and/or |
8 | modify it under the terms of the GNU Lesser General Public |
9 | License as published by the Free Software Foundation; either |
10 | version 2.1 of the License, or (at your option) any later version. |
11 | |
12 | The GNU C Library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | Lesser General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU Lesser General Public |
18 | License along with the GNU C Library. If not, see |
19 | <https://www.gnu.org/licenses/>. */ |
20 | |
21 | #include <sysdep.h> |
22 | |
23 | /* Assumptions: |
24 | * |
25 | * ARMv8-a, AArch64 |
26 | * Use base integer registers. |
27 | */ |
28 | |
29 | /* Arguments and results. */ |
30 | #define srcin x0 |
31 | #define chrin x1 |
32 | #define cntin x2 |
33 | |
34 | #define result x0 |
35 | |
36 | #define repchr x1 |
37 | |
38 | #define tmp1 x2 |
39 | #define tmp2 x3 |
40 | #define tmp3 x4 |
41 | #define tmp4 x5 |
42 | |
43 | #define src x6 |
44 | #define srcend x7 |
45 | #define srcend16 x8 |
46 | |
47 | #define anymore x9 |
48 | |
49 | #define zeroones x10 |
50 | |
51 | #define data1 x11 |
52 | #define data2 x12 |
53 | |
54 | #define has_chr1 x13 |
55 | #define has_chr2 x14 |
56 | |
57 | #define REP8_01 0x0101010101010101 |
58 | #define REP8_7f 0x7f7f7f7f7f7f7f7f |
59 | |
60 | |
61 | ENTRY (__memchr_nosimd) |
62 | |
63 | PTR_ARG (0) |
64 | SIZE_ARG (2) |
65 | |
66 | /* Do not dereference srcin if no bytes to compare. */ |
67 | cbz cntin, L(none_chr) |
68 | |
69 | /* Start address is 16-byte aligned or not? */ |
70 | tst srcin, 15 |
71 | bic src, srcin, 15 |
72 | |
73 | mov zeroones, REP8_01 |
74 | and repchr, chrin, 255 |
75 | /* Generate a qword integer as |c|c|c|c|c|c|c|c|. */ |
76 | mul repchr, repchr, zeroones |
77 | |
78 | add srcend, srcin, cntin |
79 | /* |
80 | * srcend16 is address of the block following the last block. |
81 | * |
82 | * [A block is 16-byte aligned and sized.] |
83 | */ |
84 | add srcend16, srcend, 15 |
85 | bic srcend16, srcend16, 15 |
86 | |
87 | b.eq L(loop) |
88 | |
89 | /* Load the first block containing start address. */ |
90 | ldp data1, data2, [src], 16 |
91 | |
92 | lsl tmp1, srcin, 3 |
93 | mov tmp2, ~0 |
94 | #ifdef __AARCH64EB__ |
95 | lsr tmp3, tmp2, tmp1 |
96 | #else |
97 | lsl tmp3, tmp2, tmp1 |
98 | #endif |
99 | /* Start address is in the first or the second qword? */ |
100 | tst srcin, 8 |
101 | |
102 | /* |
103 | * Transform any byte in the block to zero using XOR operation, |
104 | * if that byte equals the char to search. In this way, searching |
105 | * the char becomes detecting zero in the resulting two qwords. |
106 | */ |
107 | eor data1, data1, repchr |
108 | eor data2, data2, repchr |
109 | |
110 | /* |
111 | * Set those unused bytes(before start address) to 0xff, so |
112 | * that they will not hit any zero detection. |
113 | */ |
114 | orn tmp1, data1, tmp3 |
115 | orn tmp2, data2, tmp3 |
116 | |
117 | csinv data1, tmp1, xzr, eq |
118 | csel data2, data2, tmp2, eq |
119 | |
120 | /* |
121 | * When the first and last block are the same, there are two cases: |
122 | * o. Memory range to search is just in one block. |
123 | * ( start address - end address) < 0 |
124 | * |
125 | * o. Memory range is so large that end address wrap-around. |
126 | * ( start address - end address) > 0 |
127 | */ |
128 | cmp srcin, srcend |
129 | ccmp src, srcend16, 0, mi |
130 | csetm anymore, ne |
131 | b L(find_chr) |
132 | |
133 | .p2align 4 |
134 | L(loop): |
135 | ldp data1, data2, [src], 16 |
136 | |
137 | subs anymore, src, srcend16 |
138 | |
139 | /* |
140 | * Transform any byte in the block to zero using XOR operation, |
141 | * if that byte equals the char to search. |
142 | */ |
143 | eor data1, data1, repchr |
144 | eor data2, data2, repchr |
145 | |
146 | L(find_chr): |
147 | /* |
148 | * Use the following integer test to find out if any byte in a |
149 | * qword is zero. If do not contain zero-valued byte, test result |
150 | * is zero. |
151 | * |
152 | * (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080 |
153 | * = |
154 | * (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f) |
155 | * |
156 | */ |
157 | sub tmp1, data1, zeroones |
158 | sub tmp2, data2, zeroones |
159 | |
160 | orr tmp3, data1, REP8_7f |
161 | orr tmp4, data2, REP8_7f |
162 | |
163 | bic has_chr1, tmp1, tmp3 |
164 | bic has_chr2, tmp2, tmp4 |
165 | |
166 | orr tmp1, has_chr1, has_chr2 |
167 | ccmp tmp1, 0, 0, ne |
168 | |
169 | b.eq L(loop) |
170 | |
171 | cbz has_chr1, 1f |
172 | sub result, src, 16 |
173 | #ifdef __AARCH64EB__ |
174 | rev data1, data1 |
175 | #else |
176 | rev has_chr1, has_chr1 |
177 | #endif |
178 | b L(done) |
179 | |
180 | 1: cbz has_chr2, L(none_chr) |
181 | sub result, src, 8 |
182 | #ifdef __AARCH64EB__ |
183 | rev data1, data2 |
184 | #else |
185 | rev has_chr1, has_chr2 |
186 | #endif |
187 | |
188 | L(done): |
189 | #ifdef __AARCH64EB__ |
190 | /* |
191 | * For big-endian, can not directly use has_chr1/has_chr2 because |
192 | * two qwords has been reversed after loading from memory. |
193 | * Thus, have to perform char detection on two qwords again, which |
194 | * should be byte-swapped this time. |
195 | */ |
196 | sub tmp1, data1, zeroones |
197 | orr tmp3, data1, REP8_7f |
198 | bic has_chr1, tmp1, tmp3 |
199 | rev has_chr1, has_chr1 |
200 | #endif |
201 | |
202 | /* |
203 | * If the specified char is found in a qword, the corresponding |
204 | * byte of in has_chr has value of 1, while this is only true for |
205 | * the first occurrence, not other occurrences. |
206 | */ |
207 | cmp anymore, 0 |
208 | clz tmp1, has_chr1 |
209 | add result, result, tmp1, lsr 3 |
210 | ccmp result, srcend, 8, eq /* NZCV = 8000 */ |
211 | csel result, result, xzr, mi |
212 | ret |
213 | |
214 | L(none_chr): |
215 | mov result, 0 |
216 | ret |
217 | |
218 | END (__memchr_nosimd) |
219 | |