strnlen.S source code [libc/AOR_v20.02/string/aarch64/strnlen.S]

1	/*
2	* strnlen - calculate the length of a string with limit.
3	*
4	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5	* See https://llvm.org/LICENSE.txt for license information.
6	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7	*/
8
/* Assumptions:
 *
 * ARMv8-a, AArch64
 */
13
14	#include "../asmdefs.h"
15
/* Arguments and results.
   srcin and len are both x0: the incoming pointer is read for the last
   time by the same instruction that first writes the result
   (sub len, src, srcin), so the two names never hold live values at
   the same moment.  */
#define srcin		x0
#define len		x0
#define limit		x1

/* Locals and temporaries.
   All of these live in x2-x14, which are caller-saved under AAPCS64,
   so the routine needs no stack frame and no save/restore code.  */
#define src		x2	/* Qword-aligned read cursor (post-incremented).  */
#define data1		x3	/* First Dword of the current Qword.  */
#define data2		x4	/* Second Dword of the current Qword.  */
#define data2a		x5	/* data2 with leading bytes masked (misaligned entry).  */
#define has_nul1	x6	/* NUL syndrome for data1.  */
#define has_nul2	x7	/* NUL syndrome for data2.  */
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12	/* Holds REP8_01 for the whole routine.  */
#define pos		x13	/* Bit position of the first NUL marker.  */
#define limit_wd	x14	/* Remaining Qwords to examine, minus one.  */

/* Byte-replicated constants for the parallel NUL test.
   REP8_80 is defined for completeness; the code in this file only
   references REP8_01 and REP8_7f.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
39
40	.text
41	.p2align `6`
42	L(start):
43	/ Pre-pad to ensure critical loop begins an icache line. /
44	.rep `7`
45	nop
46	.endr
47	/ Put this code here to avoid wasting more space with pre-padding. /
48	L(hit_limit):
49	mov len, limit
50	ret
51
52	ENTRY_ALIGN (__strnlen_aarch64, `0`)
53	cbz limit, L(hit_limit)
54	mov zeroones, #REP8_01
55	bic src, srcin, #`15`
56	ands tmp1, srcin, #`15`
57	b.ne L(misaligned)
58	/ Calculate the number of full and partial words -1. /
59	sub limit_wd, limit, #`1` / Limit != 0, so no underflow. /
60	lsr limit_wd, limit_wd, #`4` / Convert to Qwords. /
61
62	/ NUL detection works on the principle that (X - 1) & (~X) & 0x80*
63	(=> (X - 1) & ~(X \| 0x7f)) is non-zero iff a byte is zero, and
64	can be done in parallel across the entire word. /*
65	/ The inner loop deals with two Dwords at a time. This has a*
66	slightly higher start-up cost, but we should win quite quickly,
67	especially on cores with a high number of issue slots per
68	cycle, as we get much better parallelism out of the operations. /*
69
70	/ Start of critical section -- keep to one 64Byte cache line. /
71	L(loop):
72	ldp data1, data2, [src], #`16`
73	L(realigned):
74	sub tmp1, data1, zeroones
75	orr tmp2, data1, #REP8_7f
76	sub tmp3, data2, zeroones
77	orr tmp4, data2, #REP8_7f
78	bic has_nul1, tmp1, tmp2
79	bic has_nul2, tmp3, tmp4
80	subs limit_wd, limit_wd, #`1`
81	orr tmp1, has_nul1, has_nul2
82	ccmp tmp1, #`0`, #`0`, pl / NZCV = 0000 /
83	b.eq L(loop)
84	/ End of critical section -- keep to one 64Byte cache line. /
85
86	orr tmp1, has_nul1, has_nul2
87	cbz tmp1, L(hit_limit) / No null in final Qword. /
88
89	/ We know there's a null in the final Qword. The easiest thing*
90	to do now is work out the length of the string and return
91	MIN (len, limit). /*
92
93	sub len, src, srcin
94	cbz has_nul1, L(nul_in_data2)
95	#ifdef __AARCH64EB__
96	mov data2, data1
97	#endif
98	sub len, len, #`8`
99	mov has_nul2, has_nul1
100	L(nul_in_data2):
101	#ifdef __AARCH64EB__
102	/ For big-endian, carry propagation (if the final byte in the*
103	string is 0x01) means we cannot use has_nul directly. The
104	easiest way to get the correct byte is to byte-swap the data
105	and calculate the syndrome a second time. /*
106	rev data2, data2
107	sub tmp1, data2, zeroones
108	orr tmp2, data2, #REP8_7f
109	bic has_nul2, tmp1, tmp2
110	#endif
111	sub len, len, #`8`
112	rev has_nul2, has_nul2
113	clz pos, has_nul2
114	add len, len, pos, lsr #`3` / Bits to bytes. /
115	cmp len, limit
116	csel len, len, limit, ls / Return the lower value. /
117	ret
118
119	L(misaligned):
120	/ Deal with a partial first word.*
121	We're doing two things in parallel here;
122	1) Calculate the number of words (but avoiding overflow if
123	limit is near ULONG_MAX) - to do this we need to work out
124	limit + tmp1 - 1 as a 65-bit value before shifting it;
125	2) Load and mask the initial data words - we force the bytes
126	before the ones we are interested in to 0xff - this ensures
127	early bytes will not hit any zero detection. /*
128	sub limit_wd, limit, #`1`
129	neg tmp4, tmp1
130	cmp tmp1, #`8`
131
132	and tmp3, limit_wd, #`15`
133	lsr limit_wd, limit_wd, #`4`
134	mov tmp2, #~`0`
135
136	ldp data1, data2, [src], #`16`
137	lsl tmp4, tmp4, #`3` / Bytes beyond alignment -> bits. /
138	add tmp3, tmp3, tmp1
139
140	#ifdef __AARCH64EB__
141	/ Big-endian. Early bytes are at MSB. /
142	lsl tmp2, tmp2, tmp4 / Shift (tmp1 & 63). /
143	#else
144	/ Little-endian. Early bytes are at LSB. /
145	lsr tmp2, tmp2, tmp4 / Shift (tmp1 & 63). /
146	#endif
147	add limit_wd, limit_wd, tmp3, lsr #`4`
148
149	orr data1, data1, tmp2
150	orr data2a, data2, tmp2
151
152	csinv data1, data1, xzr, le
153	csel data2, data2, data2a, le
154	b L(realigned)
155
156	END (__strnlen_aarch64)
157

source code of libc/AOR_v20.02/string/aarch64/strnlen.S