/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/export.h>
#include <linux/stringify.h>
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm-offsets.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
#include <asm/frame.h>
#include <asm/nops.h>

	.section .text..__x86.indirect_thunk

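/*
 * POLINE builds the retpoline "ROP" gadget: the CALL pushes the address of
 * the INT3 onto the stack (and into the RSB), the MOV then overwrites that
 * return address with the indirect target held in \reg, so the RET emitted
 * by RETPOLINE architecturally transfers to the target while speculation of
 * that RET is steered into the harmless INT3 via the RSB entry.
 */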
.macro POLINE reg
	ANNOTATE_INTRA_FUNCTION_CALL
	call	.Ldo_rop_\@
	int3
.Ldo_rop_\@:
	mov	%\reg, (%_ASM_SP)
	UNWIND_HINT_FUNC
.endm

.macro RETPOLINE reg
	POLINE \reg
	RET
.endm

.macro THUNK reg

	.align RETPOLINE_THUNK_SIZE
SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
	UNWIND_HINT_UNDEFINED
	ANNOTATE_NOENDBR

	ALTERNATIVE_2 __stringify(RETPOLINE \reg), \
		      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE, \
		      __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), ALT_NOT(X86_FEATURE_RETPOLINE)

.endm
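/*
 * The default (first) alternative is the full retpoline above; CPUs with
 * X86_FEATURE_RETPOLINE_LFENCE get "lfence; jmp *%\reg", and with retpolines
 * disabled the thunk degenerates to a plain "jmp *%\reg".  Roughly (an
 * illustrative sketch, not literal assembler output), the rax thunk with
 * retpolines enabled looks like:
 *
 *	__x86_indirect_thunk_rax:
 *		call 1f
 *		int3
 *	1:	mov %rax, (%rsp)
 *		ret
 */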

/*
 * Despite being an assembler file we can't just use .irp here
 * because __KSYM_DEPS__ only uses the C preprocessor and would
 * only see one instance of "__x86_indirect_thunk_\reg" rather
 * than one per register with the correct names. So we do it
 * the simple and nasty way...
 *
 * Worse, you can only have a single EXPORT_SYMBOL per line,
 * and CPP can't insert newlines, so we have to repeat everything
 * at least twice.
 */

#define __EXPORT_THUNK(sym)	_ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)

	.align RETPOLINE_THUNK_SIZE
SYM_CODE_START(__x86_indirect_thunk_array)

#define GEN(reg) THUNK reg
#include <asm/GEN-for-each-reg.h>
#undef GEN
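/*
 * <asm/GEN-for-each-reg.h> invokes GEN() once per general-purpose register
 * (excluding the stack pointer), so the include above emits one aligned
 * THUNK per register inside __x86_indirect_thunk_array, and the include
 * below emits the matching export for each generated symbol.
 */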

	.align RETPOLINE_THUNK_SIZE
SYM_CODE_END(__x86_indirect_thunk_array)

#define GEN(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
#include <asm/GEN-for-each-reg.h>
#undef GEN

#ifdef CONFIG_CALL_DEPTH_TRACKING
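/*
 * The call thunks are the POLINE gadget with per-CPU call depth accounting
 * (CALL_DEPTH_ACCOUNT) in front, so that, when call depth tracking is active,
 * every indirect call through a thunk adjusts the depth counter consumed by
 * call_depth_return_thunk() below.
 */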
.macro CALL_THUNK reg
	.align RETPOLINE_THUNK_SIZE

SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL)
	UNWIND_HINT_UNDEFINED
	ANNOTATE_NOENDBR

	CALL_DEPTH_ACCOUNT
	POLINE \reg
	ANNOTATE_UNRET_SAFE
	ret
	int3
.endm

	.align RETPOLINE_THUNK_SIZE
SYM_CODE_START(__x86_indirect_call_thunk_array)

#define GEN(reg) CALL_THUNK reg
#include <asm/GEN-for-each-reg.h>
#undef GEN

	.align RETPOLINE_THUNK_SIZE
SYM_CODE_END(__x86_indirect_call_thunk_array)

#define GEN(reg) __EXPORT_THUNK(__x86_indirect_call_thunk_ ## reg)
#include <asm/GEN-for-each-reg.h>
#undef GEN

.macro JUMP_THUNK reg
	.align RETPOLINE_THUNK_SIZE

SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL)
	UNWIND_HINT_UNDEFINED
	ANNOTATE_NOENDBR
	POLINE \reg
	ANNOTATE_UNRET_SAFE
	ret
	int3
.endm

	.align RETPOLINE_THUNK_SIZE
SYM_CODE_START(__x86_indirect_jump_thunk_array)

#define GEN(reg) JUMP_THUNK reg
#include <asm/GEN-for-each-reg.h>
#undef GEN

	.align RETPOLINE_THUNK_SIZE
SYM_CODE_END(__x86_indirect_jump_thunk_array)

#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg)
#include <asm/GEN-for-each-reg.h>
#undef GEN
#endif /* CONFIG_CALL_DEPTH_TRACKING */

#ifdef CONFIG_RETHUNK

/*
 * Be careful here: that label cannot really be removed because in
 * some configurations and toolchains, the JMP __x86_return_thunk the
 * compiler issues is either a short one or the compiler doesn't use
 * relocations for same-section JMPs and that breaks the returns
 * detection logic in apply_returns() and in objtool.
 */
	.section .text..__x86.return_thunk

#ifdef CONFIG_CPU_SRSO

/*
 * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at
 * special addresses:
 *
 * - srso_alias_untrain_ret() is 2M aligned
 * - srso_alias_safe_ret() is also in the same 2M page but bits 2, 8, 14
 *   and 20 in its virtual address are set (while those bits in the
 *   srso_alias_untrain_ret() function are cleared).
 *
 * This guarantees that those two addresses will alias in the branch
 * target buffer of Zen3/4 generations, so that any potentially poisoned
 * entries at that BTB slot get evicted.
 *
 * As a result, srso_alias_safe_ret() becomes a safe return.
 */
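/*
 * Note: the address constraints above are not enforced in this file; the
 * .text..__x86.rethunk_untrain and .text..__x86.rethunk_safe sections used
 * below are expected to be laid out with the required bit pattern by the
 * x86 kernel linker script, outside of this translation unit.
 */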
	.pushsection .text..__x86.rethunk_untrain
SYM_CODE_START_NOALIGN(srso_alias_untrain_ret)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	ASM_NOP2
	lfence
	jmp srso_alias_return_thunk
SYM_FUNC_END(srso_alias_untrain_ret)
	.popsection

	.pushsection .text..__x86.rethunk_safe
SYM_CODE_START_NOALIGN(srso_alias_safe_ret)
	lea 8(%_ASM_SP), %_ASM_SP
	UNWIND_HINT_FUNC
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_FUNC_END(srso_alias_safe_ret)

SYM_CODE_START_NOALIGN(srso_alias_return_thunk)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	call srso_alias_safe_ret
	ud2
SYM_CODE_END(srso_alias_return_thunk)
	.popsection

/*
 * SRSO untraining sequence for Zen1/2, similar to retbleed_untrain_ret()
 * below. On kernel entry, srso_untrain_ret() is executed which is a
 *
 * movabs $0xccccc30824648d48,%rax
 *
 * and when the return thunk executes the inner label srso_safe_ret()
 * later, it is a stack manipulation and a RET which is mispredicted and
 * thus a "safe" one to use.
 */
	.align 64
	.skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc
SYM_CODE_START_LOCAL_NOALIGN(srso_untrain_ret)
	ANNOTATE_NOENDBR
	.byte 0x48, 0xb8
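/*
 * 0x48 0xb8 are the REX.W prefix and the MOV rax, imm64 opcode.  The eight
 * code bytes that follow (lea; ret; int3; int3) double as the immediate,
 * which is how the movabs constant quoted in the comment above comes about.
 */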

/*
 * This forces the function return instruction to speculate into a trap
 * (UD2 in srso_return_thunk() below). This RET will then mispredict
 * and execution will continue at the return site read from the top of
 * the stack.
 */
SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
	lea 8(%_ASM_SP), %_ASM_SP
	ret
	int3
	int3
	/* end of movabs */
	lfence
	call srso_safe_ret
	ud2
SYM_CODE_END(srso_safe_ret)
SYM_FUNC_END(srso_untrain_ret)

SYM_CODE_START(srso_return_thunk)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	call srso_safe_ret
	ud2
SYM_CODE_END(srso_return_thunk)

#define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret"
#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret"
#else /* !CONFIG_CPU_SRSO */
#define JMP_SRSO_UNTRAIN_RET "ud2"
#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2"
#endif /* CONFIG_CPU_SRSO */

#ifdef CONFIG_CPU_UNRET_ENTRY

/*
 * Some generic notes on the untraining sequences:
 *
 * They are interchangeable when it comes to flushing potentially wrong
 * RET predictions from the BTB.
 *
 * The SRSO Zen1/2 (MOVABS) untraining sequence is longer than the
 * Retbleed sequence because the return sequence done there
 * (srso_safe_ret()) is longer and the return sequence must fully nest
 * (end before) the untraining sequence. Therefore, the untraining
 * sequence must fully overlap the return sequence.
 *
 * Regarding alignment - the instructions which need to be untrained
 * must all start at a cacheline boundary for Zen1/2 generations. That
 * is, instruction sequences starting at srso_safe_ret() and
 * the respective instruction sequences at retbleed_return_thunk()
 * must start at a cacheline boundary.
 */

/*
 * Safety details here pertain to the AMD Zen{1,2} microarchitecture:
 * 1) The RET at retbleed_return_thunk must be on a 64 byte boundary, for
 *    alignment within the BTB.
 * 2) The instruction at retbleed_untrain_ret must contain, and not
 *    end with, the 0xc3 byte of the RET.
 * 3) STIBP must be enabled, or SMT disabled, to prevent the sibling thread
 *    from re-poisoning the BTB prediction.
 */
	.align 64
	.skip 64 - (retbleed_return_thunk - retbleed_untrain_ret), 0xcc
SYM_CODE_START_LOCAL_NOALIGN(retbleed_untrain_ret)
	ANNOTATE_NOENDBR
	/*
	 * As executed from retbleed_untrain_ret, this is:
	 *
	 *   TEST $0xcc, %bl
	 *   LFENCE
	 *   JMP retbleed_return_thunk
	 *
	 * Executing the TEST instruction has a side effect of evicting any BTB
	 * prediction (potentially attacker controlled) attached to the RET, as
	 * retbleed_return_thunk + 1 isn't an instruction boundary at the moment.
	 */
	.byte	0xf6
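	/*
	 * Byte-wise: 0xf6 (TEST r/m8, imm8 opcode) is followed by the 0xc3
	 * of the RET below as its ModRM byte and the 0xcc of the INT3 as its
	 * imm8, so entering at retbleed_untrain_ret decodes the three bytes
	 * as a single "test $0xcc, %bl" instruction.
	 */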

	/*
	 * As executed from retbleed_return_thunk, this is a plain RET.
	 *
	 * As part of the TEST above, RET is the ModRM byte, and INT3 the imm8.
	 *
	 * We subsequently jump backwards and architecturally execute the RET.
	 * This creates a correct BTB prediction (type=ret), but in the
	 * meantime we suffer Straight Line Speculation (because the type was
	 * no branch) which is halted by the INT3.
	 *
	 * With SMT enabled and STIBP active, a sibling thread cannot poison
	 * RET's prediction to a type of its choice, but can evict the
	 * prediction due to competitive sharing. If the prediction is
	 * evicted, retbleed_return_thunk will suffer Straight Line Speculation
	 * which will be contained safely by the INT3.
	 */
SYM_INNER_LABEL(retbleed_return_thunk, SYM_L_GLOBAL)
	ret
	int3
SYM_CODE_END(retbleed_return_thunk)

	/*
	 * Ensure the TEST decoding / BTB invalidation is complete.
	 */
	lfence

	/*
	 * Jump back and execute the RET in the middle of the TEST instruction.
	 * INT3 is for SLS protection.
	 */
	jmp retbleed_return_thunk
	int3
SYM_FUNC_END(retbleed_untrain_ret)

#define JMP_RETBLEED_UNTRAIN_RET "jmp retbleed_untrain_ret"
#else /* !CONFIG_CPU_UNRET_ENTRY */
#define JMP_RETBLEED_UNTRAIN_RET "ud2"
#endif /* CONFIG_CPU_UNRET_ENTRY */

#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)

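/*
 * Dispatch to the appropriate untraining sequence: the default is the
 * Retbleed sequence above; CPUs with X86_FEATURE_SRSO use the Zen1/2
 * MOVABS-overlap sequence, and CPUs with X86_FEATURE_SRSO_ALIAS use the
 * Zen3/4 BTB-aliasing sequence.
 */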
SYM_FUNC_START(entry_untrain_ret)
	ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET,				\
		      JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO,		\
		      JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS
SYM_FUNC_END(entry_untrain_ret)
__EXPORT_THUNK(entry_untrain_ret)

#endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */

#ifdef CONFIG_CALL_DEPTH_TRACKING

	.align 64
SYM_FUNC_START(call_depth_return_thunk)
	ANNOTATE_NOENDBR
	/*
	 * Keep the hotpath in a 16-byte I-fetch for the non-debug
	 * case.
	 */
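	/*
	 * The per-CPU counter at pcpu_hot + X86_call_depth is advanced by
	 * CALL_DEPTH_ACCOUNT on every accounted call; the left shift here
	 * consumes one call's worth of tracked depth on return.  Roughly:
	 * once enough returns have drained the counter to zero, the RSB may
	 * be about to underflow, so we fall through to the stuffing path
	 * below (see <asm/nospec-branch.h> for the exact counter encoding).
	 */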
	CALL_THUNKS_DEBUG_INC_RETS
	shlq	$5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
	jz	1f
	ANNOTATE_UNRET_SAFE
	ret
	int3
1:
	CALL_THUNKS_DEBUG_INC_STUFFS
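	/*
	 * Refill the RSB: each of the 16 intra-function calls below pushes a
	 * return address (into the RSB and onto the stack); the add afterwards
	 * drops those 16 words from the stack again, and CREDIT_CALL_DEPTH
	 * resets the tracked call depth.
	 */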
	.rept	16
	ANNOTATE_INTRA_FUNCTION_CALL
	call	2f
	int3
2:
	.endr
	add	$(8*16), %rsp

	CREDIT_CALL_DEPTH

	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_FUNC_END(call_depth_return_thunk)

#endif /* CONFIG_CALL_DEPTH_TRACKING */

/*
 * This function name is magical and is used by -mfunction-return=thunk-extern
 * for the compiler to generate JMPs to it.
 *
 * This code is only used during kernel boot or module init. All
 * 'JMP __x86_return_thunk' sites are changed to something else by
 * apply_returns().
 *
 * This should be converted eventually to call a warning function which
 * should scream loudly when the default return thunk is called after
 * alternatives have been applied.
 *
 * That warning function cannot BUG() because the bug splat cannot be
 * displayed in all possible configurations, leading to users not really
 * knowing why the machine froze.
 */
SYM_CODE_START(__x86_return_thunk)
	UNWIND_HINT_FUNC
	ANNOTATE_NOENDBR
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)

#endif /* CONFIG_RETHUNK */