//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memset and memchr functions.
///
/// These implementations depend on unaligned access and floating-point support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//

#include "../assembly.h"

//
// __arm_sc_memset
//

#define dstin    x0
#define val      x1
#define valw     w1
#define count    x2
#define dst      x3
#define dstend2  x4
#define zva_val  x5

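/*
 * Strategy: counts of 0..15 bytes are handled with overlapping scalar
 * stores, 16..96 bytes with overlapping q-register stores, and larger
 * counts with an aligned stp loop.  When the fill value is zero, the
 * count is at least 160 and DCZID_EL0 reports a 64-byte zero block,
 * the bulk of the buffer is cleared with DC ZVA instead.
 */
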
DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
  mov z0.b, valw
#else
  bfi valw, valw, #8, #8
  bfi valw, valw, #16, #16
  bfi val, val, #32, #32
  fmov d0, val
  fmov v0.d[1], val
#endif
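  /* Either way, every byte lane of v0 (and z0 when SVE is available) now
     holds the fill byte.  Worked example without SVE, for valw == 0xab:
       bfi valw, valw, #8,  #8   ->  valw = 0x0000abab
       bfi valw, valw, #16, #16  ->  valw = 0xabababab
       bfi val,  val,  #32, #32  ->  val  = 0xabababababababab
     and the two fmovs copy that value into both halves of v0.  */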
  add dstend2, dstin, count

  cmp count, 96
  b.hi 7f // set_long
  cmp count, 16
  b.hs 4f // set_medium
  mov val, v0.D[0]

  /* Set 0..15 bytes. */
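  /* Bits 3, 2, 1 and 0 of count pick overlapping stores from both ends of
     the buffer, so no byte loop is needed.  For example, count == 13
     (0b1101) stores 8 bytes at [dstin] and 8 bytes at [dstend2 - 8],
     which together cover all 13 bytes.  */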
  tbz count, 3, 1f
  str val, [dstin]
  str val, [dstend2, -8]
  ret
  nop
1: tbz count, 2, 2f
  str valw, [dstin]
  str valw, [dstend2, -4]
  ret
2: cbz count, 3f
  strb valw, [dstin]
  tbz count, 1, 3f
  strh valw, [dstend2, -2]
3: ret

  /* Set 16..96 bytes. */
4: // set_medium
  str q0, [dstin]
  tbnz count, 6, 6f // set96
  str q0, [dstend2, -16]
  tbz count, 5, 5f
  str q0, [dstin, 16]
  str q0, [dstend2, -32]
5: ret

  .p2align 4
  /* Set 64..96 bytes. Write 64 bytes from the start and
     32 bytes from the end. */
6: // set96
  str q0, [dstin, 16]
  stp q0, q0, [dstin, 32]
  stp q0, q0, [dstend2, -32]
  ret

  .p2align 4
7: // set_long
  and valw, valw, 255
  bic dst, dstin, 15
  str q0, [dstin]
  cmp count, 160
  ccmp valw, 0, 0, hs
  b.ne 9f // no_zva

#ifndef SKIP_ZVA_CHECK
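  /* DCZID_EL0 bits [3:0] give log2 of the DC ZVA block size in words and
     bit 4 (DZP) is set when DC ZVA is prohibited.  Masking with 31 and
     requiring the value 4 therefore accepts only the case where zeroing
     is permitted and the block size is 2^4 * 4 = 64 bytes.  */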
  mrs zva_val, dczid_el0
  and zva_val, zva_val, 31
  cmp zva_val, 4 /* ZVA size is 64 bytes. */
  b.ne 9f // no_zva
#endif
  str q0, [dst, 16]
  stp q0, q0, [dst, 32]
  bic dst, dst, 63
  sub count, dstend2, dst /* Count is now 64 too large. */
  sub count, count, 128 /* Adjust count and bias for loop. */
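  /* The loop pre-increments dst, so the first DC ZVA clears the block at
     dst + 64.  Subtracting another 64 on top of the 64 noted above makes
     the loop exit with between 1 and 64 bytes left; the two trailing stp
     stores finish those, handle an unaligned end, and keep DC ZVA from
     ever writing past dstend2.  */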

  .p2align 4
8: // zva_loop
  add dst, dst, 64
  dc zva, dst
  subs count, count, 64
  b.hi 8b // zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret

9: // no_zva
  sub count, dstend2, dst /* Count is 16 too large. */
  sub dst, dst, 16 /* Dst is biased by -32. */
  sub count, count, 64 + 16 /* Adjust count and bias for loop. */
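  /* As in the ZVA path, the bias of 64 + 16 makes the loop exit with
     between 1 and 64 bytes left for the two trailing stp stores. */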
10: // no_zva_loop
  stp q0, q0, [dst, 32]
  stp q0, q0, [dst, 64]!
  subs count, count, 64
  b.hi 10b // no_zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)

//
// __arm_sc_memchr
//

#define srcin     x0
#define chrin     w1
#define cntin     x2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (this is faster than using a
 * 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
 * in which things occur in the original string, counting the trailing zeros
 * allows us to identify exactly which byte has matched.
 */
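
/*
 * Worked example: a match at byte 3 of a chunk leaves 0xff in that lane of
 * vhas_chr1 after cmeq; the AND with vrepmask (bytes 0x01, 0x04, 0x10, 0x40
 * repeating) reduces it to 0x40, and the two addp reductions fold the
 * 32 bytes into a 64-bit syndrome whose low byte is 0x40, i.e. bit 2 * 3 is
 * set.  rbit + clz then counts 6 trailing zeros, and "synd, lsr #1" turns
 * that into the byte offset 3.
 */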

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memchr)
  /* Do not dereference srcin if no bytes to compare. */
  cbz cntin, 4f
  /*
   * Magic constant 0x40100401 allows us to identify which lane matches
   * the requested byte.
   */
  mov wtmp2, #0x0401
  movk wtmp2, #0x4010, lsl #16
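  /* wtmp2 == 0x40100401; replicated into vrepmask below, this gives the
     repeating byte pattern 0x01, 0x04, 0x10, 0x40 that tags each lane
     with its position within a 4-byte group. */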
  dup vrepchr.16b, chrin
  /* Work with aligned 32-byte chunks */
  bic src, srcin, #31
  dup vrepmask.4s, wtmp2
  ands soff, srcin, #31
  and cntrem, cntin, #31
  b.eq 0f

  /*
   * Input string is not 32-byte aligned. We calculate the syndrome
   * value for the aligned 32-byte block containing the first bytes
   * and mask the irrelevant part.
   */

  ld1 {vdata1.16b, vdata2.16b}, [src], #32
  sub tmp, soff, #32
  adds cntin, cntin, tmp
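  /* cntin += soff - 32, i.e. the bytes still wanted beyond this first
     aligned chunk; the flags from adds make the b.ls below take the
     masklast path when the whole request fits in this chunk. */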
  cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
  cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
  and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
  and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
  addp vend.16b, vend.16b, vend.16b /* 128->64 */
  mov synd, vend.d[0]
  /* Clear the soff*2 lower bits */
  lsl tmp, soff, #1
  lsr synd, synd, tmp
  lsl synd, synd, tmp
  /* The first block can also be the last */
  b.ls 2f
  /* Have we found something already? */
  cbnz synd, 3f

0: // loop
  ld1 {vdata1.16b, vdata2.16b}, [src], #32
  subs cntin, cntin, #32
  cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
  cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
  /* If we're out of data we finish regardless of the result */
  b.ls 1f
  /* Use a fast check for the termination condition */
  orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
  addp vend.2d, vend.2d, vend.2d
  mov synd, vend.d[0]
  /* We're not out of data, loop if we haven't found the character */
  cbz synd, 0b

1: // end
  /* Termination condition found, let's calculate the syndrome value */
  and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
  and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
  addp vend.16b, vend.16b, vend.16b /* 128->64 */
  mov synd, vend.d[0]
  /* Only do the clear for the last possible block */
  b.hi 3f

2: // masklast
  /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
  add tmp, cntrem, soff
  and tmp, tmp, #31
  sub tmp, tmp, #32
  neg tmp, tmp, lsl #1
  lsl synd, synd, tmp
  lsr synd, synd, tmp
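  /* When (cntrem + soff) is a multiple of 32 the shift amount is 64, which
     the variable shifts reduce modulo 64 to 0: nothing is cleared, which is
     correct because the whole last chunk is then in range. */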

3: // tail
  /* Count the trailing zeros using bit reversing */
  rbit synd, synd
  /* Compensate the last post-increment */
  sub src, src, #32
  /* Check that we have found a character */
  cmp synd, #0
  /* And count the leading zeros */
  clz synd, synd
  /* Compute the potential result */
  add result, src, synd, lsr #1
  /* Select result or NULL */
  csel result, xzr, result, eq
  ret

4: // zero_length
  mov result, #0
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memchr)