/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2022, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#define L(label) .L ## label

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result x0

/* Internal variables. */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define has_nul x5
#define diff x6
#define syndrome x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define zeroones x11
#define pos x12
#define mask x13
#define endloop x14
#define count mask
#define offset pos
#define neg_offset x15
/* Define endian-dependent shift operations.
   On big-endian, early bytes are at the MSB end; on little-endian, at the LSB end.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
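
/*
 * Illustrative sketch of the convention, not part of the build: on
 * little-endian the first string byte sits in bits 7:0 of a loaded
 * dword, so extracting byte i in string order is a "forward" (lsr)
 * shift, e.g. in C:
 *
 *	byte_i = (dword >> (8 * i)) & 0xff;
 *
 * On big-endian the first byte sits in bits 63:56, so the same
 * extraction shifts left instead:
 *
 *	byte_i = (dword << (8 * i)) >> 56;
 */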

SYM_FUNC_START(__pi_strncmp)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word. */
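	/*
	 * Illustrative C sketch of the same test, not part of the build:
	 *
	 *	unsigned long has_zero(unsigned long x)
	 *	{
	 *		return (x - 0x0101010101010101UL) &
	 *		       ~x & 0x8080808080808080UL;
	 *	}
	 *
	 * The loop below uses the equivalent (X - 1) & ~(X | 0x7f) form
	 * (sub/orr/bics) so that the bics result also sets the flags
	 * consumed by the following ccmp.
	 */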
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found. */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences. */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator. */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs. */
L(syndrome_check):
	/* Limit was reached. Check if the NUL byte or the difference
	   is before the limit. */
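	/*
	 * Illustrative C sketch, not part of the build: on little-endian,
	 * byte-reversing the syndrome and counting leading zeros yields
	 * the bit offset of the first marked (NUL or differing) byte in
	 * string order, when syndrome is non-zero:
	 *
	 *	pos = __builtin_clzl(__builtin_bswap64(syndrome));
	 *	idx = pos >> 3;
	 *
	 * idx is the byte index compared against the rewound limit below;
	 * a marker at or beyond the limit is ignored and the result is 0.
	 */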
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 64-bit subtraction. */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff. */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits. */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome. */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01. */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly. We can't just subtract the bytes as the
	   MSB might be significant. */
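	/*
	 * Illustrative example, not part of the build: in a big-endian
	 * dword ending ... 01 01 00, subtracting REP8_01 borrows out of
	 * the 0x00 byte into the two more-significant 0x01 bytes, turning
	 * them into 0xff and setting their 0x80 markers as well, so clz
	 * on the syndrome would report the wrong "first" marked byte.
	 */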
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits. */
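	/*
	 * Worked example, illustrative only: if the first marked byte is
	 * at string index 2, pos is in the range 16..23, so data << pos
	 * moves that byte into bits 63:56, and (data << pos) >> 56 below
	 * extracts it zero-extended for the unsigned-char comparison.
	 */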
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 64-bit subtraction. */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary. Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX. */
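	/*
	 * Illustrative sketch, not part of the build: tmp2 below ends up
	 * with 0xff in each of the "count" byte positions that precede
	 * the true string start, so
	 *
	 *	data1 |= tmp2;
	 *	data2 |= tmp2;
	 *
	 * makes those bytes equal and non-NUL in both words; the limit is
	 * then increased by count to pay for the extra bytes compared,
	 * saturating to all-ones if the addition would overflow.
	 */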
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63). */
	/* Adjust the limit and ensure it doesn't overflow. */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this. */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000. */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000. */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret
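	/*
	 * Illustrative C equivalent of the ccmp chain above, not part of
	 * the build (s1, s2, n are hypothetical names):
	 *
	 *	do {
	 *		c1 = *s1++;
	 *		c2 = *s2++;
	 *	} while (--n && c1 != 0 && c1 == c2);
	 *	return c1 - c2;
	 *
	 * Each ccmp either performs its comparison or forces NZCV to
	 * 0b0000 (EQ false), so the single b.eq tests all three loop
	 * conditions at once.
	 */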
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop. */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000. */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order. For little-endian, it is
	   reversed in the registers. The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1 | a a a a a a a a | b b b c c c c c | . . .
	   src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
		STEP_A		STEP_B		STEP_C
	   data1 a a a a a a a a	b b b c c c c c	b b b c c c c c
	   data2 a a a a a a a a	b b b 0 0 0 0 0	0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
	   time from SRC2. The comparison happens in 3 steps. After each step
	   the loop can exit, or read from SRC1 or SRC2. */
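	/*
	 * Illustrative C sketch of STEP_A's combine, not part of the
	 * build (little-endian shown): 8 string-order bytes of SRC2 are
	 * assembled from the two aligned dwords tmp1/tmp2 loaded by ldp:
	 *
	 *	data2 = (tmp1 >> offset) | (tmp2 << (64 - offset));
	 *
	 * This is the LS_FW/LS_BK pair below; AArch64 variable shifts use
	 * only the low 6 bits of the shift register, so neg_offset, i.e.
	 * -offset, behaves as 64 - offset.
	 */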
L(src1_aligned):
	/* Calculate the offset from 8-byte alignment to the string start, in
	   bits. No need to mask offset since shifts ignore the upper bits. */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
	/* Skip the first compare if data in tmp1 is irrelevant. */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2. */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs. */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found. */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2. */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above. This way we can reuse has_nul in the
	   next step and also use the syndrome value trick at the end. */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1. */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found. */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
SYM_FUNC_END(__pi_strncmp)
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)