/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#include "../asmdefs.h"

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
   entry path. This option is not intended for production use. */

/* Arguments and results. */
#define dstin x0
#define srcin x1

/* Locals and temporaries. */
#define src x2
#define dst x3
#define data1 x4
#define data1w w4
#define data2 x5
#define data2w w5
#define has_nul1 x6
#define has_nul2 x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define tmp4 x11
#define zeroones x12
#define data1a x13
#define data2a x14
#define pos x15
#define len x16
#define to_align x17

#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
#else
#define STRCPY __strcpy_aarch64
#endif

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word. */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
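
	/* As a rough C reference (illustrative only, not part of the build;
	   has_zero is a hypothetical helper name):

	     uint64_t has_zero (uint64_t x)
	     {
	       return (x - REP8_01) & ~(x | REP8_7f);
	     }

	   This is non-zero exactly when some byte of x is zero. Note that a
	   zero byte can propagate a borrow into higher bytes of the result,
	   so only the lowest set 0x80 bit reliably marks the first NUL; the
	   big-endian tail code below recomputes the syndrome after a byte
	   swap for this reason. */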

	/* AArch64 systems have a minimum page size of 4k. We can do a quick
	   page size check for crossing this boundary on entry and if we
	   do not, then we can short-circuit much of the entry code. We
	   expect early page-crossing strings to be rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
	   predictable, even with random strings.

	   We don't bother checking for larger page sizes; the cost of setting
	   up the correct page size is just not worth the extra gain from
	   a small reduction in the cases taking the slow path. Note that
	   we only care about whether the first fetch, which may be
	   misaligned, crosses a page boundary - after that we move to aligned
	   fetches for the remainder of the string. */

#ifdef STRCPY_TEST_PAGE_CROSS
	/* Make everything that isn't Qword aligned look like a page cross. */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
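
	/* In rough C terms (illustrative only), the entry test below is:
	   the first, possibly misaligned, 16-byte fetch can cross a page
	   exactly when

	     (srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16

	   i.e. when fewer than 16 bytes remain before the next
	   MIN_PAGE_SIZE boundary. */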

ENTRY (STRCPY)
	/* For moderately short strings, the fastest way to do the copy is to
	   calculate the length of the string in the same way as strlen, then
	   essentially do a memcpy of the result. This avoids the need for
	   multiple byte copies and further means that by the time we
	   reach the bulk copy loop we know we can always use DWord
	   accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
	   with the same source string, so branch prediction is likely to
	   always be difficult - we mitigate against this by preferring
	   conditional select operations over branches whenever this is
	   feasible. */
	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
	mov	zeroones, #REP8_01
	and	to_align, srcin, #15
	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
	neg	tmp1, to_align
	/* The first fetch will straddle a (possible) page boundary iff
	   srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
	   aligned string will never fail the page align check, so will
	   always take the fast path. */
	b.gt	L(page_cross)
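
	/* Reference sketch of the strategy described above (illustrative C
	   only, assuming <string.h>; strcpy_ref is a hypothetical name):

	     char *strcpy_ref (char *dst, const char *src)
	     {
	       size_t len = strlen (src);
	       memcpy (dst, src, len + 1);	// copy includes the trailing NUL
	       return dst;			// stpcpy returns dst + len instead
	     }

	   with the length found 16 bytes at a time using the syndrome
	   rather than a call to strlen. */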

L(page_cross_ok):
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* Because we expect the end to be found within 16 characters
	   (profiling shows this is the most common case), it's worth
	   swapping the bytes now to save having to recalculate the
	   termination syndrome later. We preserve data1 and data2
	   so that we can re-use the values later on. */
	rev	tmp2, data1
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	rev	tmp4, data2
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bics	has_nul2, tmp3, tmp4
	b.eq	L(bulk_entry)

	/* The string is short (<=16 bytes). We don't know exactly how
	   short though, yet. Work out the exact length so that we can
	   quickly select the optimal copy strategy. */
L(fp_gt8):
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	mov	tmp2, #56
	add	dst, dstin, pos, lsr #3	/* Bits to bytes. */
	sub	pos, tmp2, pos
#ifdef __AARCH64EB__
	lsr	data2, data2, pos
#else
	lsl	data2, data2, pos
#endif
	str	data2, [dst, #1]
	str	data1, [dstin]
#ifdef BUILD_STPCPY
	add	dstin, dst, #8
#endif
	ret
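
	/* Illustrative C for the rev + clz pair above (nul_index is a
	   hypothetical helper; little-endian syndrome, syndrome != 0):

	     unsigned nul_index (uint64_t syndrome)
	     {
	       return __builtin_ctzll (syndrome) >> 3;	// same byte index as (rev; clz) / 8
	     }

	   The copy itself uses two 8-byte stores that may overlap: data1
	   covers dstin[0..7], and the shifted data2 covers the last eight
	   bytes, ending exactly at the NUL. */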

L(fp_le8):
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	dst, dstin, pos, lsr #3	/* Bits to bytes. */
	subs	tmp2, pos, #24	/* Pos in bits. */
	b.lt	L(fp_lt4)
#ifdef __AARCH64EB__
	mov	tmp2, #56
	sub	pos, tmp2, pos
	lsr	data2, data1, pos
	lsr	data1, data1, #32
#else
	lsr	data2, data1, tmp2
#endif
	/* 4->8 bytes to copy (including the trailing NUL). */
	str	data2w, [dst, #-3]
	str	data1w, [dstin]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
L(fp_lt4):
	cbz	pos, L(fp_lt2)
	/* 2->3 bytes to copy. */
#ifdef __AARCH64EB__
	lsr	data1, data1, #48
#endif
	strh	data1w, [dstin]
	/* Fall-through, one byte (max) to go. */
L(fp_lt2):
	/* Null-terminated string. Last character must be zero! */
	strb	wzr, [dst]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
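
	/* The three short-copy paths above avoid byte-by-byte loops by using
	   at most two possibly overlapping stores. Roughly, for n bytes
	   including the NUL (n = 1..8; illustrative C, little-endian view,
	   assuming <string.h>):

	     if (n >= 4) {			// L(fp_le8) main path
	       memcpy (dst, src, 4);
	       memcpy (dst + n - 4, src + n - 4, 4);	// may overlap the first store
	     } else if (n >= 2) {		// L(fp_lt4)
	       memcpy (dst, src, 2);
	       dst[n - 1] = 0;
	     } else {				// L(fp_lt2)
	       dst[0] = 0;
	     }
	   */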

	.p2align 6
	/* Aligning here ensures that the entry code and main loop both lie
	   within a single 64-byte cache line. */
L(bulk_entry):
	sub	to_align, to_align, #16
	stp	data1, data2, [dstin]
	sub	src, srcin, to_align
	sub	dst, dstin, to_align
	b	L(entry_no_page_cross)
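
	/* The stp above flushes the 16 bytes already examined; src and dst
	   are then advanced by 16 - (srcin & 15) bytes (in rough C terms,
	   illustrative only):

	     advance = 16 - ((uintptr_t) srcin & 15);	// 1..16
	     src = srcin + advance;			// now 16-byte aligned
	     dst = dstin + advance;

	   The loop below then always loads from an aligned address; any
	   bytes it stores again already hold the same data. */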

	/* The inner loop deals with two Dwords at a time. This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations. */
L(main_loop):
	stp	data1, data2, [dst], #16
L(entry_no_page_cross):
	ldp	data1, data2, [src], #16
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000 */
	b.eq	L(main_loop)
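
	/* The bic/bics/ccmp sequence folds both NUL tests into one branch; in
	   C terms the loop condition is simply (illustrative, has_zero as
	   sketched near the REP8 constants):

	     while (has_zero (data1) == 0 && has_zero (data2) == 0)

	   bics sets Z from the second syndrome, and the ccmp re-tests the
	   first syndrome only when Z is set, otherwise forcing "not equal"
	   (NZCV = 0000). */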

	/* Since we know we are copying at least 16 bytes, the fastest way
	   to deal with the tail is to determine the location of the
	   trailing NUL, then (re)copy the 16 bytes leading up to that. */
	cmp	has_nul1, #0
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly. The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time. */
	csel	data1, data1, data2, ne
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, ne
#endif
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	tmp1, pos, #72
	add	pos, pos, #8
	csel	pos, pos, tmp1, ne
	add	src, src, pos, lsr #3
	add	dst, dst, pos, lsr #3
	ldp	data1, data2, [src, #-32]
	stp	data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
	sub	dstin, dst, #1
#endif
	ret
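
	/* In rough C terms (illustrative; k and examined are hypothetical
	   names): let examined be the source address of the 16 bytes just
	   inspected and k the 1-based offset of the NUL within them. The
	   pos arithmetic above makes pos = 8 * k, so the final transfer is

	     memcpy (dst + k - 16, examined + k - 16, 16);	// last byte copied is the NUL

	   with dst as it stood before the two adds: a 16-byte copy whose
	   final byte is the terminator, rewriting up to 15 bytes that were
	   already stored. */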

L(page_cross):
	bic	src, srcin, #15
	/* Start by loading two words at [srcin & ~15], then forcing the
	   bytes that precede srcin to 0xff. This means they never look
	   like termination bytes. */
	ldp	data1, data2, [src]
	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits. */
	tst	to_align, #7
	csetm	tmp2, ne
#ifdef __AARCH64EB__
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63). */
#else
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63). */
#endif
	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2
	cmp	to_align, #8
	csinv	data1, data1, xzr, lt
	csel	data2, data2, data2a, lt
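
	/* The masking just performed, in rough C terms (little-endian view,
	   illustrative only; to_align = srcin & 15 is 1..15 on this path):

	     mask = (to_align & 7) ? ~0ULL >> (64 - 8 * (to_align & 7)) : 0;
	     if (to_align < 8)
	       data1 |= mask;		// bytes before srcin forced to 0xff
	     else {
	       data1 = ~0ULL;		// the whole first word precedes srcin
	       data2 |= mask;
	     }
	   */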
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000 */
	b.eq	L(page_cross_ok)
	/* We now need to make data1 and data2 look like they've been
	   loaded directly from srcin. Do a rotate on the 128-bit value. */
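	/* For little endian this amounts to (illustrative C, using a 128-bit
	   shift for clarity; only the bytes up to the NUL matter):

	     unsigned __int128 v = ((unsigned __int128) data2 << 64) | data1;
	     v >>= 8 * to_align;		// to_align is 1..15 here
	     data1 = (uint64_t) v;
	     data2 = (uint64_t) (v >> 64);

	   The big-endian arm below does the equivalent and then byte-swaps
	   before recomputing the syndromes. */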
	lsl	tmp1, to_align, #3	/* Bytes->bits. */
	neg	tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
	lsl	data1a, data1, tmp1
	lsr	tmp4, data2, tmp2
	lsl	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	rev	tmp2, data1
	rev	tmp4, data2
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	lsr	data1a, data1, tmp1
	lsl	tmp4, data2, tmp2
	lsr	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bic	has_nul1, tmp1, tmp2
	cbnz	has_nul1, L(fp_le8)
	bic	has_nul2, tmp3, tmp4
	b	L(fp_gt8)

END (STRCPY)
