//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains assembly-optimized implementations of Scalable Matrix
/// Extension (SME) compatible memset and memchr functions.
///
/// These implementations depend on unaligned access and floating-point support.
///
/// Routines taken from libc/AOR_v20.02/string/aarch64.
///
//===----------------------------------------------------------------------===//
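
// A rough C-level sketch of the expected interface (an assumption for
// documentation only: these routines mirror the standard memset/memchr
// contracts for use from SME streaming-compatible code, and the exact
// qualifiers follow the SME ACLE/ABI declarations):
//
//   void *__arm_sc_memset(void *dst, int c, size_t n);
//       // Fills n bytes at dst with (unsigned char)c, returns dst.
//   void *__arm_sc_memchr(const void *src, int c, size_t n);
//       // Returns a pointer to the first byte equal to (unsigned char)c in
//       // the first n bytes of src, or NULL if there is no match.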

#include "../assembly.h"

//
// __arm_sc_memset
//

#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend2 x4
#define zva_val x5

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
  mov z0.b, valw
#else
  bfi valw, valw, #8, #8
  bfi valw, valw, #16, #16
  bfi val, val, #32, #32
  fmov d0, val
  fmov v0.d[1], val
#endif
  add dstend2, dstin, count

  cmp count, 96
  b.hi 7f // set_long
  cmp count, 16
  b.hs 4f // set_medium
  mov val, v0.d[0]

  /* Set 0..15 bytes. */
  tbz count, 3, 1f
  str val, [dstin]
  str val, [dstend2, -8]
  ret
  nop
1: tbz count, 2, 2f
  str valw, [dstin]
  str valw, [dstend2, -4]
  ret
2: cbz count, 3f
  strb valw, [dstin]
  tbz count, 1, 3f
  strh valw, [dstend2, -2]
3: ret

  /* Set 16..96 bytes. */
4: // set_medium
  str q0, [dstin]
  tbnz count, 6, 6f // set96
  str q0, [dstend2, -16]
  tbz count, 5, 5f
  str q0, [dstin, 16]
  str q0, [dstend2, -32]
5: ret

  .p2align 4
  /* Set 64..96 bytes. Write 64 bytes from the start and
     32 bytes from the end. */
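  /* For example, with count == 80 these stores, together with the 16 bytes
     already stored at dstin in set_medium, cover [0, 64) from the start and
     [48, 80) from the end, overlapping by 16 bytes. */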
6: // set96
  str q0, [dstin, 16]
  stp q0, q0, [dstin, 32]
  stp q0, q0, [dstend2, -32]
  ret

  .p2align 4
7: // set_long
  and valw, valw, 255
  bic dst, dstin, 15
  str q0, [dstin]
  cmp count, 160
  ccmp valw, 0, 0, hs
  b.ne 9f // no_zva

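  /* DCZID_EL0 bits [3:0] hold log2 of the DC ZVA block size in 4-byte words
     (a value of 4 means 16 words, i.e. 64 bytes), and bit [4] (DZP), when set,
     means DC ZVA is prohibited. Masking with 31 and requiring exactly 4 below
     therefore accepts only a permitted 64-byte zeroing block. */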
#ifndef SKIP_ZVA_CHECK
  mrs zva_val, dczid_el0
  and zva_val, zva_val, 31
  cmp zva_val, 4 /* ZVA size is 64 bytes. */
  b.ne 9f // no_zva
#endif
  str q0, [dst, 16]
  stp q0, q0, [dst, 32]
  bic dst, dst, 63
  sub count, dstend2, dst /* Count is now 64 too large. */
  sub count, count, 128 /* Adjust count and bias for loop. */

  .p2align 4
8: // zva_loop
  add dst, dst, 64
  dc zva, dst
  subs count, count, 64
  b.hi 8b // zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret

9: // no_zva
  sub count, dstend2, dst /* Count is 16 too large. */
  sub dst, dst, 16 /* Dst is biased by -32. */
  sub count, count, 64 + 16 /* Adjust count and bias for loop. */
10: // no_zva_loop
  stp q0, q0, [dst, 32]
  stp q0, q0, [dst, 64]!
  subs count, count, 64
  b.hi 10b // no_zva_loop
  stp q0, q0, [dstend2, -64]
  stp q0, q0, [dstend2, -32]
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memset)

//
// __arm_sc_memchr
//

#define srcin x0
#define chrin w1
#define cntin x2

#define result x0

#define src x3
#define tmp x4
#define wtmp2 w5
#define synd x6
#define soff x9
#define cntrem x10

#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask v5
#define vend v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (this is faster than using a
 * 32-bit syndrome). Since the bits in the syndrome reflect exactly the order
 * in which bytes occur in the original string, counting trailing zeros
 * identifies exactly which byte has matched.
 */
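
/*
 * Worked example of that encoding: if the only match in a 32-byte chunk is
 * the byte at offset k, then after the two pairwise ADDP reductions below the
 * syndrome has exactly bit 2*k set. The tail code reverses the bits (rbit)
 * and counts leading zeros (clz), which together count the trailing zeros,
 * giving 2*k; the final "add result, src, synd, lsr #1" shifts right by one
 * to recover the byte offset k.
 */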

DEFINE_COMPILERRT_FUNCTION(__arm_sc_memchr)
  /* Do not dereference srcin if no bytes to compare. */
  cbz cntin, 4f
  /*
   * Magic constant 0x40100401 allows us to identify which lane matches
   * the requested byte.
   */
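  /*
   * Replicated across the vector, 0x40100401 gives the repeating byte
   * pattern 0x01, 0x04, 0x10, 0x40, so after the AND below each matching
   * byte contributes a distinct bit and the two ADDP reductions can pack
   * four adjacent bytes into one syndrome byte without the bits colliding.
   */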
  mov wtmp2, #0x0401
  movk wtmp2, #0x4010, lsl #16
  dup vrepchr.16b, chrin
  /* Work with aligned 32-byte chunks */
  bic src, srcin, #31
  dup vrepmask.4s, wtmp2
  ands soff, srcin, #31
  and cntrem, cntin, #31
  b.eq 0f

  /*
   * Input string is not 32-byte aligned. We calculate the syndrome
   * value for the aligned 32-byte block containing the first bytes
   * and mask the irrelevant part.
   */
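  /*
   * The byte count is adjusted at the same time: adding (soff - 32) to cntin
   * subtracts the (32 - soff) valid bytes consumed from this first aligned
   * chunk, and the flags from that adds feed the later "b.ls" test of
   * whether the first block is also the last.
   */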

  ld1 {vdata1.16b, vdata2.16b}, [src], #32
  sub tmp, soff, #32
  adds cntin, cntin, tmp
  cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
  cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
  and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
  and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
  addp vend.16b, vend.16b, vend.16b /* 128->64 */
  mov synd, vend.d[0]
  /* Clear the soff*2 lower bits */
  lsl tmp, soff, #1
  lsr synd, synd, tmp
  lsl synd, synd, tmp
  /* The first block can also be the last */
  b.ls 2f
  /* Have we found something already? */
  cbnz synd, 3f

0: // loop
  ld1 {vdata1.16b, vdata2.16b}, [src], #32
  subs cntin, cntin, #32
  cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
  cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
  /* If we're out of data we finish regardless of the result */
  b.ls 1f
  /* Use a fast check for the termination condition */
  orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
  addp vend.2d, vend.2d, vend.2d
  mov synd, vend.d[0]
  /* We're not out of data, loop if we haven't found the character */
  cbz synd, 0b

1: // end
  /* Termination condition found, let's calculate the syndrome value */
  and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
  and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
  addp vend.16b, vend.16b, vend.16b /* 128->64 */
  mov synd, vend.d[0]
  /* Only do the clear for the last possible block */
  b.hi 3f

2: // masklast
  /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
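  /* tmp becomes negative below; "neg tmp, tmp, lsl #1" turns it into
     (32 - ((cntrem + soff) & 31)) * 2. Register shifts use only the low six
     bits of the amount, so the lsl/lsr pair clears the required number of
     upper syndrome bits (and none at all in the aligned case where the
     computed amount is 64). */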
  add tmp, cntrem, soff
  and tmp, tmp, #31
  sub tmp, tmp, #32
  neg tmp, tmp, lsl #1
  lsl synd, synd, tmp
  lsr synd, synd, tmp

3: // tail
  /* Count the trailing zeros using bit reversing */
  rbit synd, synd
  /* Compensate the last post-increment */
  sub src, src, #32
  /* Check that we have found a character */
  cmp synd, #0
  /* And count the leading zeros */
  clz synd, synd
  /* Compute the potential result */
  add result, src, synd, lsr #1
  /* Select result or NULL */
  csel result, xzr, result, eq
  ret

4: // zero_length
  mov result, #0
  ret
END_COMPILERRT_FUNCTION(__arm_sc_memchr)