//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memset and memchr functions.
///
/// These implementations depend on unaligned access and floating-point support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//
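
// A rough C-level sketch of the expected interface (an assumption for
// documentation only: these routines mirror the standard memset/memchr
// contracts for use from SME streaming-compatible code, and the exact
// qualifiers follow the SME ACLE/ABI declarations):
//
//   void *__arm_sc_memset(void *dst, int c, size_t n);
//       // Fills n bytes at dst with (unsigned char)c, returns dst.
//   void *__arm_sc_memchr(const void *src, int c, size_t n);
//       // Returns a pointer to the first byte equal to (unsigned char)c in
//       // the first n bytes of src, or NULL if there is no match.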

#include "../assembly.h"

//
// __arm_sc_memset
//

#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend2 x4
#define zva_val x5

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
  mov z0.b, valw
#else
  bfi valw, valw, #8, #8
  bfi valw, valw, #16, #16
  bfi val, val, #32, #32
  fmov d0, val
  fmov v0.d[1], val
#endif
  add dstend2, dstin, count

  cmp count, 96
  b.hi 7f // set_long
  cmp count, 16
  b.hs 4f // set_medium
  mov val, v0.d[0]

  /* Set 0..15 bytes. */
  tbz count, 3, 1f
  str val, [dstin]
  str val, [dstend2, -8]
  ret
  nop
1: tbz count, 2, 2f
  str valw, [dstin]
  str valw, [dstend2, -4]
  ret
2: cbz count, 3f
  strb valw, [dstin]
  tbz count, 1, 3f
  strh valw, [dstend2, -2]
3: ret

  /* Set 16..96 bytes. */
4: // set_medium
  str q0, [dstin]
  tbnz count, 6, 6f // set96
  str q0, [dstend2, -16]
  tbz count, 5, 5f
  str q0, [dstin, 16]
  str q0, [dstend2, -32]
5: ret

  .p2align 4
  /* Set 64..96 bytes. Write 64 bytes from the start and
     32 bytes from the end. */
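  /* For example, with count == 80 these stores, together with the 16 bytes
     already stored at dstin in set_medium, cover [0, 64) from the start and
     [48, 80) from the end, overlapping by 16 bytes. */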
6: // set96
  str q0, [dstin, 16]
  stp q0, q0, [dstin, 32]
  stp q0, q0, [dstend2, -32]
  ret

  .p2align 4
7: // set_long
  and valw, valw, 255
  bic dst, dstin, 15
  str q0, [dstin]
  cmp count, 160
  ccmp valw, 0, 0, hs
  b.ne 9f // no_zva

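  /* DCZID_EL0 bits [3:0] hold log2 of the DC ZVA block size in 4-byte words
     (a value of 4 means 16 words, i.e. 64 bytes), and bit [4] (DZP), when set,
     means DC ZVA is prohibited. Masking with 31 and requiring exactly 4 below
     therefore accepts only a permitted 64-byte zeroing block. */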
#ifndef SKIP_ZVA_CHECK
  mrs zva_val, dczid_el0
  and zva_val, zva_val, 31
  cmp zva_val, 4 /* ZVA size is 64 bytes. */
  b.ne 9f // no_zva
#endif
  str q0, [dst, 16]
  stp q0, q0, [dst, 32]
  bic dst, dst, 63
  sub count, dstend2, dst /* Count is now 64 too large. */
  sub count, count, 128 /* Adjust count and bias for loop. */

  .p2align 4
8: // zva_loop
  add dst, dst, 64
  dc zva, dst
  subs count, count, 64
  b.hi 8b // zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret

9: // no_zva
  sub count, dstend2, dst /* Count is 16 too large. */
  sub dst, dst, 16 /* Dst is biased by -32. */
  sub count, count, 64 + 16 /* Adjust count and bias for loop. */
10: // no_zva_loop
  stp q0, q0, [dst, 32]
  stp q0, q0, [dst, 64]!
  subs count, count, 64
  b.hi 10b // no_zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)

//
// __arm_sc_memchr
//

#define srcin x0
#define chrin w1
#define cntin x2

#define result x0

#define src x3
#define tmp x4
#define wtmp2 w5
#define synd x6
#define soff x9
#define cntrem x10

#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask v5
#define vend v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (this is faster than using a
 * 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
 * in which bytes occur in the original string, counting trailing zeros
 * identifies exactly which byte has matched.
 */
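
/*
 * Worked example of that encoding: if the only match in a 32-byte chunk is
 * the byte at offset k, then after the two pairwise ADDP reductions below the
 * syndrome has exactly bit 2*k set. The tail code reverses the bits (rbit)
 * and counts leading zeros (clz), which together count the trailing zeros,
 * giving 2*k; the final "add result, src, synd, lsr #1" shifts right by one
 * to recover the byte offset k.
 */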

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memchr)
  /* Do not dereference srcin if no bytes to compare. */
  cbz cntin, 4f
  /*
   * Magic constant 0x40100401 allows us to identify which lane matches
   * the requested byte.
   */
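  /*
   * Replicated across the vector, 0x40100401 gives the repeating byte
   * pattern 0x01, 0x04, 0x10, 0x40, so after the AND below each matching
   * byte contributes a distinct bit and the two ADDP reductions can pack
   * four adjacent bytes into one syndrome byte without the bits colliding.
   */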
  mov wtmp2, #0x0401
  movk wtmp2, #0x4010, lsl #16
  dup vrepchr.16b, chrin
  /* Work with aligned 32-byte chunks */
  bic src, srcin, #31
  dup vrepmask.4s, wtmp2
  ands soff, srcin, #31
  and cntrem, cntin, #31
  b.eq 0f

  /*
   * Input string is not 32-byte aligned. We calculate the syndrome
   * value for the aligned 32-byte block containing the first bytes
   * and mask the irrelevant part.
   */
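  /*
   * The byte count is adjusted at the same time: adding (soff - 32) to cntin
   * subtracts the (32 - soff) valid bytes consumed from this first aligned
   * chunk, and the flags from that adds feed the later "b.ls" test of
   * whether the first block is also the last.
   */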

  ld1 {vdata1.16b, vdata2.16b}, [src], #32
  sub tmp, soff, #32
  adds cntin, cntin, tmp
  cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
  cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
  and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
  and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
  addp vend.16b, vend.16b, vend.16b /* 128->64 */
  mov synd, vend.d[0]
  /* Clear the soff*2 lower bits */
  lsl tmp, soff, #1
  lsr synd, synd, tmp
  lsl synd, synd, tmp
  /* The first block can also be the last */
  b.ls 2f
  /* Have we found something already? */
  cbnz synd, 3f

0: // loop
  ld1 {vdata1.16b, vdata2.16b}, [src], #32
  subs cntin, cntin, #32
  cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
  cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
  /* If we're out of data we finish regardless of the result */
  b.ls 1f
  /* Use a fast check for the termination condition */
  orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
  addp vend.2d, vend.2d, vend.2d
  mov synd, vend.d[0]
  /* We're not out of data, loop if we haven't found the character */
  cbz synd, 0b

1: // end
  /* Termination condition found, let's calculate the syndrome value */
  and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
  and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
  addp vend.16b, vend.16b, vend.16b /* 128->64 */
  mov synd, vend.d[0]
  /* Only do the clear for the last possible block */
  b.hi 3f

2: // masklast
  /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
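  /* tmp becomes negative below; "neg tmp, tmp, lsl #1" turns it into
     (32 - ((cntrem + soff) & 31)) * 2. Register shifts use only the low six
     bits of the amount, so the lsl/lsr pair clears the required number of
     upper syndrome bits (and none at all in the aligned case where the
     computed amount is 64). */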
  add tmp, cntrem, soff
  and tmp, tmp, #31
  sub tmp, tmp, #32
  neg tmp, tmp, lsl #1
  lsl synd, synd, tmp
  lsr synd, synd, tmp

3: // tail
  /* Count the trailing zeros using bit reversing */
  rbit synd, synd
  /* Compensate the last post-increment */
  sub src, src, #32
  /* Check that we have found a character */
  cmp synd, #0
  /* And count the leading zeros */
  clz synd, synd
  /* Compute the potential result */
  add result, src, synd, lsr #1
  /* Select result or NULL */
  csel result, xzr, result, eq
  ret

4: // zero_length
  mov result, #0
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memchr)