/* Copyright (C) 2013-2024 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian, early bytes are stored at the MSB end; on little-endian,
   at the LSB end.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
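/* Illustrative note (not from the original): on little-endian the first
   string byte of a load lands in bits 7:0 of the register, so a shift
   towards the early bytes is a logical shift right; on big-endian the
   first byte lands in bits 63:56, so the same shift is a logical shift
   left.  */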
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

	.text
	.p2align 6
	.rep 9
	nop	/* Pad so that the loop below fits a cache line.  */
	.endr
ENTRY_ALIGN (strncmp, 0)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
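	/* Illustrative sketch (assumption, not part of this file): the
	   per-dword test above is roughly the classic C idiom, written as a
	   hypothetical helper:

	     #include <stdint.h>
	     uint64_t has_zero_byte (uint64_t x)
	     {
	       return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
	     }

	   For a zero byte its lane yields 0xff & 0xff & 0x80 = 0x80, while
	   any non-zero byte leaves bit 7 of its lane clear, apart from
	   borrows that can only set lanes above the first zero byte.  */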
	/* Start of performance-critical section  -- one 64B cache line.  */
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of performance-critical section  -- one 64B cache line.  */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
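	/* Illustrative note (not from the original): after rev, the first
	   (lowest-addressed) bytes sit at the most significant end, so
	   clz / 8 is the index of the first NUL or differing byte.  E.g. a
	   syndrome with only byte 2 set, 0x0000000000800000, reverses to
	   0x0000800000000000; clz gives 16 and 16 >> 3 = 2, which is then
	   compared against the remaining limit below.  */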
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
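	/* Illustrative note (not from the original): the borrow in
	   (X - 0x0101...) propagates from less significant lanes upwards,
	   so a string ending in ... 0x01 0x00 also sets the has_nul bit of
	   the 0x01 lane.  On big-endian that lane is an *earlier* byte, so
	   a clz over the raw syndrome would place the end of the string one
	   byte too early; the re-check on byte-reversed data below avoids
	   this.  */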
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
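	/* Worked example (illustrative, not from the original): if the
	   strings start 3 bytes past an 8-byte boundary, count = 3, the
	   rounded-down loads pull in 3 unwanted bytes, and tmp2 below ends
	   up with 0xff in exactly those 3 byte lanes.  ORing it into both
	   data words makes the unwanted bytes equal and non-zero, so they
	   can create neither a difference nor a NUL.  The limit is likewise
	   increased by 3 to cover the extra bytes, saturating to ULONG_MAX
	   if the addition overflows.  */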
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 6
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
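	/* Illustrative note (not from the original): the subs/ccmp chain
	   below leaves Z set only while bytes remain (limit was greater
	   than 1 before the decrement), data1w is not NUL (the compare
	   with #1 keeps C set only for data1w >= 1) and the two bytes are
	   equal; once any condition fails the flags are forced to 0b0000
	   and the loop falls through to L(done).  */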
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret

	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.

	   src1  | a a a a a a a a | b b b c c c c c | . . .
	   src2  | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	            STEP_A             STEP_B             STEP_C
	   data1    a a a a a a a a    b b b c c c c c    b b b c c c c c
	   data2    a a a a a a a a    b b b 0 0 0 0 0    0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each
	   step the loop can exit, or read from SRC1 or SRC2.  */
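	/* Illustrative sketch (assumption, not from the original): on
	   little-endian, STEP_A assembles the 8 SRC2 bytes that line up
	   with data1 from the two halves of the last 16-byte load roughly
	   like this, where off (1..7) is the byte misalignment of SRC2:

	     #include <stdint.h>
	     uint64_t combine (uint64_t tmp1, uint64_t tmp2, unsigned off)
	     {
	       return (tmp1 >> (8 * off)) | (tmp2 << (64 - 8 * off));
	     }

	   The shift amounts are precomputed in bits as offset and
	   neg_offset, and big-endian swaps the shift directions via
	   LS_FW/LS_BK.  */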
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts ignore the upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above.  This way we can reuse has_nul in the
	   next step and also use the syndrome value trick at the end.  */
	rev	tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret

END (strncmp)
libc_hidden_builtin_def (strncmp)
319 | |