1 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
2 | // See https://llvm.org/LICENSE.txt for license information. |
3 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
4 | |
// This file implements the support routines for the SME ABI,
// described here:
// https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines
8 | |
9 | #include "../assembly.h" |
10 | |
// Bit positions tested in the 64-bit value loaded from
// __aarch64_cpu_features.
.set FEAT_SVE_BIT, 30
.set FEAT_SME_BIT, 42
.set FEAT_SME2_BIT, 57
// Mask form of FEAT_SME2_BIT for instructions that take a bitmask immediate
// (e.g. tst). Derived from the bit number so the two cannot drift apart.
.set FEAT_SME2_MASK, 1 << FEAT_SME2_BIT
// Bit position of PSTATE.SM in the value read from SVCR.
.set SVCR_PSTATE_SM_BIT, 0
16 | |
// CPU_FEATS_SYMBOL is the operand for the adrp that forms the page address of
// __aarch64_cpu_features; CPU_FEATS_SYMBOL_OFFSET is the matching low-part
// operand used as the offset of the following ldr.
#if defined(__APPLE__)
// MachO requires @page/@pageoff directives because the global is defined
// in a different file. Otherwise this file may fail to build.
#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)@page
#define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
#else
#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)
#define CPU_FEATS_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_cpu_features)
#endif

// The routines below use SME and SME2 instructions (smstart/smstop, ZA and
// ZT0 loads and stores), so let the assembler accept them.
.arch armv9-a+sme2
28 | |
// do_abort: private helper that terminates the program through abort().
//
// The helper is streaming-compatible, so it may be entered with PSTATE.SM
// set. It asks __arm_sme_state for the current state and, if bit 0 of the
// returned x0 is set, leaves streaming mode before calling abort().
// No caller state is preserved, because control never returns.
DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort)
  .cfi_startproc
  .variant_pcs FUNC_SYMBOL(SYMBOL_NAME(do_abort))
  BTI_C
  // 32-byte frame: x29/x30 at [sp], the current VG at [sp, #16].
  stp     x29, x30, [sp, #-32]!
  cntd    x0
  // The VG slot is the one described by ".cfi_offset 46" below.
  str     x0, [sp, #16]
  .cfi_def_cfa_offset 32
  .cfi_offset w30, -24
  .cfi_offset w29, -32
  .cfi_offset 46, -16
  bl      FUNC_SYMBOL(SYMBOL_NAME(__arm_sme_state))
  // x0 bit 0 clear: not in streaming mode, nothing to switch off.
  tbz     x0, #0, 0f
  smstop  sm
0:
  // This has to stay a real call rather than a tail-call: the unwinder
  // would otherwise need to restore the value of VG.
  bl      FUNC_SYMBOL(SYMBOL_NAME(abort))
  .cfi_endproc
END_COMPILERRT_FUNCTION(do_abort)
55 | |
// __arm_sme_state: report the SME state of the calling thread.
//
// The decision is based on the FEAT_SME bit of __aarch64_cpu_features.
//
// Results:
//   x0 - zero when the FEAT_SME bit is clear. Otherwise bits 63 and 62 are
//        set and bits 1:0 hold a copy of SVCR bits 1:0.
//   x1 - zero when the FEAT_SME bit is clear, otherwise TPIDR2_EL0.
// Scratch: x16.
DEFINE_COMPILERRT_FUNCTION(__arm_sme_state)
  .variant_pcs __arm_sme_state
  BTI_C
  // Results for the "no SME" case.
  mov     x1, xzr
  mov     x0, xzr

  adrp    x16, CPU_FEATS_SYMBOL
  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
  tbz     x16, #FEAT_SME_BIT, 0f

  // SME is available. x0 is still zero here, so loading the constant is the
  // same as OR-ing it in.
  mov     x0, #0xC000000000000000
  // Insert SVCR[1:0] into x0[1:0].
  mrs     x16, SVCR
  bfxil   x0, x16, #0, #2
  mrs     x1, TPIDR2_EL0
0:
  ret
END_COMPILERRT_FUNCTION(__arm_sme_state)
76 | |
// __arm_tpidr2_restore: reload ZA from the save buffer of a TPIDR2 block.
//
// In:      x0 = BLK, pointer to the 16-byte TPIDR2 block
//               (bytes 0-7 za_save_buffer, bytes 8-9 num_za_save_slices,
//               bytes 10-15 reserved).
// Scratch: x14 (slice count), x15 (slice index), x16 (buffer cursor), flags.
// Branches to do_abort when TPIDR2_EL0 is non-null or a reserved byte of BLK
// is non-zero.
DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
  .variant_pcs __arm_tpidr2_restore
  BTI_C
  // A non-null TPIDR2_EL0 on entry is an error.
  mrs     x14, TPIDR2_EL0
  cbnz    x14, 9f

  // Reserved bytes 10-11 and 12-15 of BLK must all be zero.
  ldrh    w14, [x0, #10]
  cbnz    w14, 9f
  ldr     w14, [x0, #12]
  cbnz    w14, 9f

  // Nothing to do without a save buffer ...
  ldr     x16, [x0]
  cbz     x16, 2f

  // ... or when no slices were saved. ldrh zero-extends, so testing w14 is
  // the same as testing x14.
  ldrh    w14, [x0, #8]
  cbz     w14, 2f

  // Load slices 0 .. x14-1; the buffer cursor advances by one streaming
  // vector length (addsvl #1) per slice.
  mov     x15, xzr
1:
  ldr     za[w15,0], [x16]
  add     x15, x15, #1
  addsvl  x16, x16, #1
  cmp     x14, x15
  b.ne    1b
2:
  ret
9:
  b       FUNC_SYMBOL(SYMBOL_NAME(do_abort))
END_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
112 | |
// __arm_tpidr2_save: store ZA into the save buffer of the TPIDR2 block that
// TPIDR2_EL0 currently points to.
//
// Takes no register arguments; the block is found through TPIDR2_EL0.
// Scratch: x14 (features, then slice count), x15 (slice index),
//          x16 (block pointer, then buffer cursor), flags.
// Branches to do_abort when a reserved byte of the block is non-zero.
DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_save)
  .variant_pcs __arm_tpidr2_save
  BTI_C
  // Without SME the thread has no access to TPIDR2_EL0: do nothing.
  adrp    x14, CPU_FEATS_SYMBOL
  ldr     x14, [x14, CPU_FEATS_SYMBOL_OFFSET]
  tbz     x14, #FEAT_SME_BIT, 2f

  // A null TPIDR2_EL0 means there is nothing to save.
  mrs     x16, TPIDR2_EL0
  cbz     x16, 2f

  // Reserved bytes 10-11 and 12-15 of the block must all be zero.
  ldrh    w14, [x16, #10]
  cbnz    w14, 9f
  ldr     w14, [x16, #12]
  cbnz    w14, 9f

  // num_za_save_slices == 0: do nothing. ldrh zero-extends, so testing w14
  // is the same as testing x14.
  ldrh    w14, [x16, #8]
  cbz     w14, 2f

  // za_save_buffer == NULL: do nothing. From here on x16 is the buffer
  // cursor rather than the block pointer.
  ldr     x16, [x16]
  cbz     x16, 2f

  // Store slices 0 .. x14-1; the cursor advances by one streaming vector
  // length (addsvl #1) per slice.
  mov     x15, xzr
1:
  str     za[w15,0], [x16]
  add     x15, x15, #1
  addsvl  x16, x16, #1
  cmp     x14, x15
  b.ne    1b
2:
  ret
9:
  b       FUNC_SYMBOL(SYMBOL_NAME(do_abort))
END_COMPILERRT_FUNCTION(__arm_tpidr2_save)
153 | |
// __arm_za_disable: commit any pending lazy save and switch ZA off.
//
// When the FEAT_SME bit of __aarch64_cpu_features is clear the routine
// returns immediately. Otherwise it calls __arm_tpidr2_save, writes zero to
// TPIDR2_EL0 and clears PSTATE.ZA, in that order.
// Scratch: x14 here, plus whatever __arm_tpidr2_save uses.
DEFINE_COMPILERRT_FUNCTION(__arm_za_disable)
  .cfi_startproc
  .variant_pcs __arm_za_disable
  BTI_C
  // No SME: nothing to disable.
  adrp    x14, CPU_FEATS_SYMBOL
  ldr     x14, [x14, CPU_FEATS_SYMBOL_OFFSET]
  tbz     x14, #FEAT_SME_BIT, 1f

  // Set up a frame record; the call below overwrites x30.
  stp     x29, x30, [sp, #-16]!
  .cfi_def_cfa_offset 16
  mov     x29, sp
  .cfi_def_cfa w29, 16
  .cfi_offset w30, -8
  .cfi_offset w29, -16

  // Step 1: save ZA if a lazy save is pending.
  bl      FUNC_SYMBOL(SYMBOL_NAME(__arm_tpidr2_save))

  // Step 2: TPIDR2_EL0 = null.
  msr     TPIDR2_EL0, xzr

  // Step 3: PSTATE.ZA = 0.
  smstop  za

  // Tear the frame record down again.
  .cfi_def_cfa wsp, 16
  ldp     x29, x30, [sp], #16
  .cfi_def_cfa_offset 0
  .cfi_restore w30
  .cfi_restore w29
1:
  ret
  .cfi_endproc
END_COMPILERRT_FUNCTION(__arm_za_disable)
189 | |
// __arm_get_current_vg: return the current vector granule count in x0.
//
// x0 = cntd when the FEAT_SVE bit of __aarch64_cpu_features is set, or when
// the FEAT_SME bit is set and PSTATE.SM (read from SVCR) is 1.
// x0 = 0 in every other case.
// Scratch: x17.
DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
  .variant_pcs __arm_get_current_vg
  BTI_C

  adrp    x17, CPU_FEATS_SYMBOL
  ldr     x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
  // With SVE, cntd can be executed regardless of the streaming mode.
  tbnz    x17, #FEAT_SVE_BIT, 0f
  // Neither SVE nor SME: report zero.
  tbz     x17, #FEAT_SME_BIT, 1f
  // SME only: cntd is used only while in streaming mode.
  mrs     x17, SVCR
  tbz     x17, #SVCR_PSTATE_SM_BIT, 1f
0:
  cntd    x0
  ret
1:
  mov     x0, xzr
  ret
END_COMPILERRT_FUNCTION(__arm_get_current_vg)
208 | |
209 | // The diagram below describes the layout used in the following routines: |
210 | // * __arm_sme_state_size |
211 | // * __arm_sme_save |
212 | // * __arm_sme_restore |
213 | // |
214 | // +---------------------------------+ |
215 | // | ... | |
216 | // | ZA buffer | |
217 | // | ... | |
218 | // +---------------------------------+ <- @96 |
219 | // | ZT0 contents | |
220 | // +---------------------------------+ <- @32 |
221 | // | byte 15-10: zero (reserved) | |
222 | // | byte 9-8: num_za_save_slices | TPIDR2 block |
223 | // | byte 7-0: za_save_buffer | |
224 | // +---------------------------------+ <- @16 |
225 | // | bit 127-1: zero (reserved) | Internal state for __arm_sme_save/restore |
226 | // | bit 0: VALID | |
227 | // +---------------------------------+ <- @0 |
228 | |
// __arm_sme_state_size: return in x0 the number of bytes that __arm_sme_save
// needs for its buffer.
//
// x0 = 16 when the FEAT_SME bit is clear, PSTATE.ZA is 0, or TPIDR2_EL0 is
//      non-null (only the internal state word is needed).
// x0 = (FEAT_SME2 ? 96 : 32) + SVLB * SVLB otherwise, where SVLB is the value
//      returned by "rdsvl #1".
// Scratch: x16, x17, flags.
DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
  .variant_pcs __arm_sme_state_size
  BTI_C

  // Is SME available and the ZA state 'active' (PSTATE.ZA = 1 and
  // TPIDR2_EL0 = null)? If not, take the minimal-size exit.
  adrp    x17, CPU_FEATS_SYMBOL
  ldr     x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
  tbz     x17, #FEAT_SME_BIT, 1f
  mrs     x16, SVCR
  tbz     x16, #1, 1f
  mrs     x16, TPIDR2_EL0
  cbnz    x16, 1f

  // Header size: 96 bytes with SME2 (room for ZT0), 32 bytes without.
  // The moves do not touch the flags set by tst.
  tst     x17, #FEAT_SME2_MASK
  mov     w16, #96
  mov     w17, #32
  csel    x16, x16, x17, ne

  // Total = header + SVLB * SVLB for the ZA buffer.
  rdsvl   x17, #1
  madd    x0, x17, x17, x16
  ret

1:
  // Minimal case: 16 bytes, enough for the VALID bit while staying a
  // multiple of 16 bytes.
  mov     w0, #16
  ret
END_COMPILERRT_FUNCTION(__arm_sme_state_size)
258 | |
// __arm_sme_save: save ZT0 (when SME2 is present) and set up a lazy save of
// ZA in the caller-provided buffer.
//
// In:  x0 = PTR, 16-byte aligned buffer laid out as
//           @0  internal state (bit 0 = VALID)
//           @16 TPIDR2 block
//           @32 ZT0 contents (only with SME2), followed by the ZA buffer
// The first 16 bytes are always cleared. If SME is unavailable, PSTATE.ZA is
// 0 or TPIDR2_EL0 is already non-null, nothing else happens. Otherwise VALID
// is set, ZT0 is stored when the FEAT_SME2 bit is set, the TPIDR2 block is
// filled in and TPIDR2_EL0 is pointed at it.
// Note: on the path that sets up the lazy save, x0 is advanced by 16.
// Scratch: x16, x17, flags. x18 is deliberately left alone because it is the
// platform register and is reserved on some targets.
// Branches to do_abort when PTR is not 16-byte aligned.
DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
  .variant_pcs __arm_sme_save
  BTI_C

  // If PTR is not 16-byte aligned, abort.
  tst     x0, #0xF
  b.ne    3f

  // Clear internal state bits
  stp     xzr, xzr, [x0]

  // If SME is not available, PSTATE.ZA = 0 or TPIDR2_EL0 != 0, return.
  adrp    x17, CPU_FEATS_SYMBOL
  ldr     x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
  tbz     x17, #FEAT_SME_BIT, 2f
  mrs     x16, SVCR
  tbz     x16, #1, 2f
  mrs     x16, TPIDR2_EL0
  cbnz    x16, 2f

  // ZA or ZT0 need saving, we can now set internal VALID bit to 1
  mov     w16, #1
  str     x16, [x0]

  // x16 is free again; use it as the pointer to the ZA save buffer. Without
  // SME2 the buffer starts right after the TPIDR2 block.
  add     x16, x0, #32
  tbz     x17, #FEAT_SME2_BIT, 1f

  // Store ZT0; the ZA save buffer then starts 64 bytes further on.
  str     zt0, [x16]
  add     x16, x16, #64

1:
  // Set up lazy-save (x16 = pointer to buffer). The pre-indexed store moves
  // x0 to the TPIDR2 block at PTR + 16.
  rdsvl   x17, #1
  str     x16, [x0, #16]!          // za_save_buffer
  strh    w17, [x0, #8]            // num_za_save_slices = rdsvl #1
  strh    wzr, [x0, #10]           // reserved bytes 10-11
  str     wzr, [x0, #12]           // reserved bytes 12-15
  msr     TPIDR2_EL0, x0

2:
  // Do nothing
  ret

3:
  b       FUNC_SYMBOL(SYMBOL_NAME(do_abort))
END_COMPILERRT_FUNCTION(__arm_sme_save)
306 | |
// __arm_sme_restore: undo __arm_sme_save using the same buffer.
//
// In:  x0 = PTR, 16-byte aligned buffer previously passed to __arm_sme_save
//           (@0 internal state, @16 TPIDR2 block, @32 ZT0 with SME2).
// If the internal state word at PTR is zero the routine returns without
// doing anything. Otherwise ZA is reloaded through __arm_tpidr2_restore when
// the lazy save was committed (TPIDR2_EL0 is null), ZA is enabled,
// TPIDR2_EL0 is cleared and ZT0 is reloaded when the FEAT_SME2 bit is set.
// Scratch: x16, x17, flags, plus whatever __arm_tpidr2_restore uses.
// Branches to do_abort when PTR is misaligned, when the state is valid but
// SME is unavailable, or when TPIDR2_EL0 is null while PSTATE.ZA is 1.
DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
  .cfi_startproc
  .variant_pcs __arm_sme_restore
  BTI_C

  stp     x29, x30, [sp, #-16]!
  .cfi_def_cfa_offset 16
  mov     x29, sp
  .cfi_def_cfa w29, 16
  .cfi_offset w30, -8
  .cfi_offset w29, -16

  // If PTR is not 16-byte aligned, abort.
  tst     x0, #0xF
  b.ne    3f

  // If the VALID bit is 0, return early.
  ldr     x16, [x0]
  cbz     x16, 2f

  // If SME is not available, abort.
  adrp    x17, CPU_FEATS_SYMBOL
  ldr     x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
  tbz     x17, #FEAT_SME_BIT, 3f

  // If TPIDR2_EL0 != nullptr, no lazy-save was committed, try to reload zt0.
  mrs     x16, TPIDR2_EL0
  cbnz    x16, 1f

  // If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
  // abort.
  mrs     x16, SVCR
  tbnz    x16, #1, 3f

  // Restore za. The TPIDR2 block lives at PTR + 16. The callee in this file
  // only uses x14-x16 as scratch, so x0 and the feature bits in x17 survive
  // the call.
  smstart za
  add     x0, x0, #16
  bl      FUNC_SYMBOL(SYMBOL_NAME(__arm_tpidr2_restore))
  sub     x0, x0, #16

1:
  smstart za
  msr     TPIDR2_EL0, xzr

  // Check if zt0 needs restoring.
  tbz     x17, #FEAT_SME2_BIT, 2f

  // Restore zt0.
  add     x16, x0, #32
  ldr     zt0, [x16]

2:
  // Common return path. Remember the unwind state of the live frame so it
  // can be reinstated for the abort path that follows the ret.
  .cfi_remember_state
  .cfi_def_cfa wsp, 16
  ldp     x29, x30, [sp], #16
  .cfi_def_cfa_offset 0
  .cfi_restore w30
  .cfi_restore w29
  ret

3:
  // Reached only from before the epilogue, so the frame record is still on
  // the stack: describe it as such for the unwinder.
  .cfi_restore_state
  b       FUNC_SYMBOL(SYMBOL_NAME(do_abort))
  .cfi_endproc
END_COMPILERRT_FUNCTION(__arm_sme_restore)
371 | |
// Macro from assembly.h; going by its name it marks this object as not
// requiring an executable stack (TODO confirm against assembly.h).
NO_EXEC_STACK_DIRECTIVE

// Emit the GNU property note for BTI and PAC.
GNU_PROPERTY_BTI_PAC
376 | |