1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * linux/arch/x86_64/entry.S |
4 | * |
5 | * Copyright (C) 1991, 1992 Linus Torvalds |
6 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs |
7 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> |
8 | * |
9 | * entry.S contains the system-call and fault low-level handling routines. |
10 | * |
11 | * Some of this is documented in Documentation/arch/x86/entry_64.rst |
12 | * |
13 | * A note on terminology: |
14 | * - iret frame: Architecture-defined interrupt frame from SS to RIP |
15 | * at the top of the kernel process stack. |
16 | * |
17 | * Some macro usage: |
18 | * - SYM_FUNC_START/END: Define functions in the symbol table. |
19 | * - idtentry: Define exception entry points. |
20 | */ |
21 | #include <linux/export.h> |
22 | #include <linux/linkage.h> |
23 | #include <asm/segment.h> |
24 | #include <asm/cache.h> |
25 | #include <asm/errno.h> |
26 | #include <asm/asm-offsets.h> |
27 | #include <asm/msr.h> |
28 | #include <asm/unistd.h> |
29 | #include <asm/thread_info.h> |
30 | #include <asm/hw_irq.h> |
31 | #include <asm/page_types.h> |
32 | #include <asm/irqflags.h> |
33 | #include <asm/paravirt.h> |
34 | #include <asm/percpu.h> |
35 | #include <asm/asm.h> |
36 | #include <asm/smap.h> |
37 | #include <asm/pgtable_types.h> |
38 | #include <asm/frame.h> |
39 | #include <asm/trapnr.h> |
40 | #include <asm/nospec-branch.h> |
41 | #include <asm/fsgsbase.h> |
42 | #include <linux/err.h> |
43 | |
44 | #include "calling.h" |
45 | |
46 | .code64 |
47 | .section .entry.text, "ax" |
48 | |
49 | /* |
50 | * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. |
51 | * |
52 | * This is the only entry point used for 64-bit system calls. The |
53 | * hardware interface is reasonably well designed and the register to |
54 | * argument mapping Linux uses fits well with the registers that are |
55 | * available when SYSCALL is used. |
56 | * |
57 | * SYSCALL instructions can be found inlined in libc implementations as |
58 | * well as some other programs and libraries. There are also a handful |
59 | * of SYSCALL instructions in the vDSO used, for example, as a |
60 | * clock_gettimeofday fallback. |
61 | * |
62 | * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, |
63 | * then loads new ss, cs, and rip from previously programmed MSRs. |
64 | * rflags gets masked by a value from another MSR (so CLD and CLAC |
65 | * are not needed). SYSCALL does not save anything on the stack |
66 | * and does not change rsp. |
67 | * |
68 | * Registers on entry: |
69 | * rax system call number |
70 | * rcx return address |
71 | * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) |
72 | * rdi arg0 |
73 | * rsi arg1 |
74 | * rdx arg2 |
75 | * r10 arg3 (needs to be moved to rcx to conform to C ABI) |
76 | * r8 arg4 |
77 | * r9 arg5 |
78 | * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) |
79 | * |
80 | * Only called from user space. |
81 | * |
82 | * When the user can change pt_regs->foo, always force IRET. That is because |
83 | * IRET deals with non-canonical addresses better. SYSRET has trouble |
84 | * with them due to bugs in both AMD and Intel CPUs. |
85 | */ |
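/*
 * Illustrative sketch only (not part of this file's build): the pt_regs
 * frame constructed below is handed to do_syscall_64() together with the
 * sign-extended syscall number, which dispatches roughly as
 *
 *	regs->ax = sys_call_table[nr](regs);
 *
 * with each syscall wrapper unpacking its arguments from regs->di,
 * regs->si, regs->dx, regs->r10, regs->r8 and regs->r9. The exact C code
 * lives in arch/x86/entry/ and may differ in detail.
 */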
86 | |
87 | SYM_CODE_START(entry_SYSCALL_64) |
88 | UNWIND_HINT_ENTRY |
89 | ENDBR |
90 | |
91 | swapgs |
92 | /* tss.sp2 is scratch space. */ |
93 | movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) |
94 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp |
95 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
96 | |
97 | SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) |
98 | ANNOTATE_NOENDBR |
99 | |
100 | /* Construct struct pt_regs on stack */ |
101 | pushq $__USER_DS /* pt_regs->ss */ |
102 | pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ |
103 | pushq %r11 /* pt_regs->flags */ |
104 | pushq $__USER_CS /* pt_regs->cs */ |
105 | pushq %rcx /* pt_regs->ip */ |
106 | SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) |
107 | pushq %rax /* pt_regs->orig_ax */ |
108 | |
109 | PUSH_AND_CLEAR_REGS rax=$-ENOSYS |
110 | |
111 | /* IRQs are off. */ |
112 | movq %rsp, %rdi |
113 | /* Sign extend the lower 32bit as syscall numbers are treated as int */ |
114 | movslq %eax, %rsi |
115 | |
116 | /* clobbers %rax, make sure it is after saving the syscall nr */ |
117 | IBRS_ENTER |
118 | UNTRAIN_RET |
119 | CLEAR_BRANCH_HISTORY |
120 | |
121 | call do_syscall_64 /* returns with IRQs disabled */ |
122 | |
123 | /* |
124 | * Try to use SYSRET instead of IRET if we're returning to |
125 | * a completely clean 64-bit userspace context. If we're not, |
126 | * go to the slow exit path. |
127 | * In the Xen PV case we must use iret anyway. |
128 | */ |
129 | |
130 | ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ |
131 | "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV |
132 | |
133 | /* |
134 | * We win! This label is here just for ease of understanding |
135 | * perf profiles. Nothing jumps here. |
136 | */ |
137 | syscall_return_via_sysret: |
138 | IBRS_EXIT |
139 | POP_REGS pop_rdi=0 |
140 | |
141 | /* |
142 | * Now all regs are restored except RSP and RDI. |
143 | * Save old stack pointer and switch to trampoline stack. |
144 | */ |
145 | movq %rsp, %rdi |
146 | movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp |
147 | UNWIND_HINT_END_OF_STACK |
148 | |
149 | pushq RSP-RDI(%rdi) /* RSP */ |
150 | pushq (%rdi) /* RDI */ |
151 | |
152 | /* |
153 | * We are on the trampoline stack. All regs except RDI are live. |
154 | * We can do future final exit work right here. |
155 | */ |
156 | STACKLEAK_ERASE_NOCLOBBER |
157 | |
158 | SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi |
159 | |
160 | popq %rdi |
161 | popq %rsp |
162 | SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) |
163 | ANNOTATE_NOENDBR |
164 | swapgs |
165 | CLEAR_CPU_BUFFERS |
166 | sysretq |
167 | SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) |
168 | ANNOTATE_NOENDBR |
169 | int3 |
170 | SYM_CODE_END(entry_SYSCALL_64) |
171 | |
172 | /* |
173 | * %rdi: prev task |
174 | * %rsi: next task |
175 | */ |
176 | .pushsection .text, "ax" |
177 | SYM_FUNC_START(__switch_to_asm) |
178 | ANNOTATE_NOENDBR |
179 | /* |
180 | * Save callee-saved registers |
181 | * This must match the order in inactive_task_frame |
182 | */ |
183 | pushq %rbp |
184 | pushq %rbx |
185 | pushq %r12 |
186 | pushq %r13 |
187 | pushq %r14 |
188 | pushq %r15 |
189 | |
190 | /* switch stack */ |
191 | movq %rsp, TASK_threadsp(%rdi) |
192 | movq TASK_threadsp(%rsi), %rsp |
193 | |
194 | #ifdef CONFIG_STACKPROTECTOR |
195 | movq TASK_stack_canary(%rsi), %rbx |
196 | movq %rbx, PER_CPU_VAR(__stack_chk_guard) |
197 | #endif |
198 | |
199 | /* |
200 | * When switching from a shallower to a deeper call stack |
201 | * the RSB may either underflow or use entries populated |
202 | * with userspace addresses. On CPUs where those concerns |
203 | * exist, overwrite the RSB with entries which capture |
204 | * speculative execution to prevent attack. |
205 | */ |
206 | FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW |
207 | |
208 | /* restore callee-saved registers */ |
209 | popq %r15 |
210 | popq %r14 |
211 | popq %r13 |
212 | popq %r12 |
213 | popq %rbx |
214 | popq %rbp |
215 | |
216 | jmp __switch_to |
217 | SYM_FUNC_END(__switch_to_asm) |
218 | .popsection |
219 | |
220 | /* |
221 | * A newly forked process directly context switches into this address. |
222 | * |
223 | * rax: prev task we switched from |
224 | * rbx: kernel thread func (NULL for user thread) |
225 | * r12: kernel thread arg |
226 | */ |
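/*
 * Illustrative only: the register moves below line up with a C helper of
 * roughly this shape (the real prototype lives in arch/x86/kernel/process.c
 * and may differ):
 *
 *	void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
 *			   int (*fn)(void *), void *fn_arg);
 */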
227 | .pushsection .text, "ax" |
228 | SYM_CODE_START(ret_from_fork_asm) |
229 | /* |
230 | * This is the start of the kernel stack; even though there's a |
231 | * register set at the top, the regset isn't necessarily coherent |
232 | * (consider kthreads) and one cannot unwind further. |
233 | * |
234 | * This ensures stack unwinds of kernel threads terminate in a known |
235 | * good state. |
236 | */ |
237 | UNWIND_HINT_END_OF_STACK |
238 | ANNOTATE_NOENDBR // copy_thread |
239 | CALL_DEPTH_ACCOUNT |
240 | |
241 | movq %rax, %rdi /* prev */ |
242 | movq %rsp, %rsi /* regs */ |
243 | movq %rbx, %rdx /* fn */ |
244 | movq %r12, %rcx /* fn_arg */ |
245 | call ret_from_fork |
246 | |
247 | /* |
248 | * Set the stack state to what is expected for the target function |
249 | * -- at this point the register set should be a valid user set |
250 | * and unwind should work normally. |
251 | */ |
252 | UNWIND_HINT_REGS |
253 | |
254 | #ifdef CONFIG_X86_FRED |
255 | ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ |
256 | "jmp asm_fred_exit_user", X86_FEATURE_FRED |
257 | #else |
258 | jmp swapgs_restore_regs_and_return_to_usermode |
259 | #endif |
260 | SYM_CODE_END(ret_from_fork_asm) |
261 | .popsection |
262 | |
263 | .macro DEBUG_ENTRY_ASSERT_IRQS_OFF |
264 | #ifdef CONFIG_DEBUG_ENTRY |
265 | pushq %rax |
266 | SAVE_FLAGS |
267 | testl $X86_EFLAGS_IF, %eax |
268 | jz .Lokay_\@ |
269 | ud2 |
270 | .Lokay_\@: |
271 | popq %rax |
272 | #endif |
273 | .endm |
274 | |
275 | SYM_CODE_START(xen_error_entry) |
276 | ANNOTATE_NOENDBR |
277 | UNWIND_HINT_FUNC |
278 | PUSH_AND_CLEAR_REGS save_ret=1 |
279 | ENCODE_FRAME_POINTER 8 |
280 | UNTRAIN_RET_FROM_CALL |
281 | RET |
282 | SYM_CODE_END(xen_error_entry) |
283 | |
284 | /** |
285 | * idtentry_body - Macro to emit code calling the C function |
286 | * @cfunc: C function to be called |
287 | * @has_error_code: Hardware pushed error code on stack |
288 | */ |
289 | .macro idtentry_body cfunc has_error_code:req |
290 | |
291 | /* |
292 | * Call error_entry() and switch to the task stack if from userspace. |
293 | * |
294 | * When in XENPV, it is already on the task stack, and it can't fault |
295 | * for native_iret() or native_load_gs_index() since XENPV uses its |
296 | * own pvops for IRET and load_gs_index(). And it doesn't need to |
297 | * switch the CR3. So it can skip invoking error_entry(). |
298 | */ |
299 | ALTERNATIVE "call error_entry; movq %rax, %rsp", \ |
300 | "call xen_error_entry", X86_FEATURE_XENPV |
301 | |
302 | ENCODE_FRAME_POINTER |
303 | UNWIND_HINT_REGS |
304 | |
305 | movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ |
306 | |
307 | .if \has_error_code == 1 |
308 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
309 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
310 | .endif |
311 | |
312 | /* For some configurations \cfunc ends up being a noreturn. */ |
313 | ANNOTATE_REACHABLE |
314 | call \cfunc |
315 | |
316 | jmp error_return |
317 | .endm |
318 | |
319 | /** |
320 | * idtentry - Macro to generate entry stubs for simple IDT entries |
321 | * @vector: Vector number |
322 | * @asmsym: ASM symbol for the entry point |
323 | * @cfunc: C function to be called |
324 | * @has_error_code: Hardware pushed error code on stack |
325 | * |
326 | * The macro emits code to set up the kernel context for straightforward |
327 | * and simple IDT entries. No IST stack, no paranoid entry checks. |
328 | */ |
329 | .macro idtentry vector asmsym cfunc has_error_code:req |
330 | SYM_CODE_START(\asmsym) |
331 | |
332 | .if \vector == X86_TRAP_BP |
333 | /* #BP advances %rip to the next instruction */ |
334 | UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0 |
335 | .else |
336 | UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 |
337 | .endif |
338 | |
339 | ENDBR |
340 | ASM_CLAC |
341 | cld |
342 | |
343 | .if \has_error_code == 0 |
344 | pushq $-1 /* ORIG_RAX: no syscall to restart */ |
345 | .endif |
346 | |
347 | .if \vector == X86_TRAP_BP |
348 | /* |
349 | * If coming from kernel space, create a 6-word gap to allow the |
350 | * int3 handler to emulate a call instruction. |
351 | */ |
352 | testb $3, CS-ORIG_RAX(%rsp) |
353 | jnz .Lfrom_usermode_no_gap_\@ |
354 | .rept 6 |
355 | pushq 5*8(%rsp) |
356 | .endr |
357 | UNWIND_HINT_IRET_REGS offset=8 |
358 | .Lfrom_usermode_no_gap_\@: |
359 | .endif |
360 | |
361 | idtentry_body \cfunc \has_error_code |
362 | |
363 | _ASM_NOKPROBE(\asmsym) |
364 | SYM_CODE_END(\asmsym) |
365 | .endm |
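/*
 * Typical expansions, reached via the DECLARE_IDTENTRY*() wrappers in
 * asm/idtentry.h (shown here for illustration only):
 *
 *	idtentry X86_TRAP_DE  asm_exc_divide_error       exc_divide_error       has_error_code=0
 *	idtentry X86_TRAP_GP  asm_exc_general_protection exc_general_protection has_error_code=1
 */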
366 | |
367 | /* |
368 | * Interrupt entry/exit. |
369 | * |
370 | * The interrupt stubs push (vector) onto the stack, which is the error_code |
371 | * position of idtentry exceptions, and jump to one of the two idtentry points |
372 | * (common/spurious). |
373 | * |
374 | * common_interrupt is a hotpath, align it to a cache line |
375 | */ |
376 | .macro idtentry_irq vector cfunc |
377 | .p2align CONFIG_X86_L1_CACHE_SHIFT |
378 | idtentry \vector asm_\cfunc \cfunc has_error_code=1 |
379 | .endm |
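/*
 * Example expansion via DECLARE_IDTENTRY_IRQ() in asm/idtentry.h
 * (illustration only):
 *
 *	idtentry_irq X86_TRAP_OTHER common_interrupt
 */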
380 | |
381 | /** |
382 | * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB |
383 | * @vector: Vector number |
384 | * @asmsym: ASM symbol for the entry point |
385 | * @cfunc: C function to be called |
386 | * |
387 | * The macro emits code to set up the kernel context for #MC and #DB |
388 | * |
389 | * If the entry comes from user space it uses the normal entry path |
390 | * including the return to user space work and preemption checks on |
391 | * exit. |
392 | * |
393 | * If it hits in kernel mode then it needs to go through the paranoid |
394 | * entry as the exception can hit any random state. No preemption |
395 | * check on exit to keep the paranoid path simple. |
396 | */ |
397 | .macro idtentry_mce_db vector asmsym cfunc |
398 | SYM_CODE_START(\asmsym) |
399 | UNWIND_HINT_IRET_ENTRY |
400 | ENDBR |
401 | ASM_CLAC |
402 | cld |
403 | |
404 | pushq $-1 /* ORIG_RAX: no syscall to restart */ |
405 | |
406 | /* |
407 | * If the entry is from userspace, switch stacks and treat it as |
408 | * a normal entry. |
409 | */ |
410 | testb $3, CS-ORIG_RAX(%rsp) |
411 | jnz .Lfrom_usermode_switch_stack_\@ |
412 | |
413 | /* paranoid_entry returns GS information for paranoid_exit in EBX. */ |
414 | call paranoid_entry |
415 | |
416 | UNWIND_HINT_REGS |
417 | |
418 | movq %rsp, %rdi /* pt_regs pointer */ |
419 | |
420 | call \cfunc |
421 | |
422 | jmp paranoid_exit |
423 | |
424 | /* Switch to the regular task stack and use the noist entry point */ |
425 | .Lfrom_usermode_switch_stack_\@: |
426 | idtentry_body noist_\cfunc, has_error_code=0 |
427 | |
428 | _ASM_NOKPROBE(\asmsym) |
429 | SYM_CODE_END(\asmsym) |
430 | .endm |
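/*
 * Used for exactly two vectors, via DECLARE_IDTENTRY_MCE()/_DEBUG() in
 * asm/idtentry.h (illustration only):
 *
 *	idtentry_mce_db X86_TRAP_DB asm_exc_debug         exc_debug
 *	idtentry_mce_db X86_TRAP_MC asm_exc_machine_check exc_machine_check
 *
 * Each \cfunc is therefore expected to have a matching noist_\cfunc
 * C handler for the user-mode path.
 */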
431 | |
432 | #ifdef CONFIG_AMD_MEM_ENCRYPT |
433 | /** |
434 | * idtentry_vc - Macro to generate entry stub for #VC |
435 | * @vector: Vector number |
436 | * @asmsym: ASM symbol for the entry point |
437 | * @cfunc: C function to be called |
438 | * |
439 | * The macro emits code to set up the kernel context for #VC. The #VC handler |
440 | * runs on an IST stack and needs to be able to cause nested #VC exceptions. |
441 | * |
442 | * To make this work the #VC entry code tries its best to pretend it doesn't use |
443 | * an IST stack by switching to the task stack if coming from user-space (which |
444 | * includes early SYSCALL entry path) or back to the stack in the IRET frame if |
445 | * entered from kernel-mode. |
446 | * |
447 | * If entered from kernel-mode the return stack is validated first, and if it is |
448 | * not safe to use (e.g. because it points to the entry stack) the #VC handler |
449 | * will switch to a fall-back stack (VC2) and call a special handler function. |
450 | * |
451 | * The macro is only used for one vector, but it is planned to be extended in |
452 | * the future for the #HV exception. |
453 | */ |
454 | .macro idtentry_vc vector asmsym cfunc |
455 | SYM_CODE_START(\asmsym) |
456 | UNWIND_HINT_IRET_ENTRY |
457 | ENDBR |
458 | ASM_CLAC |
459 | cld |
460 | |
461 | /* |
462 | * If the entry is from userspace, switch stacks and treat it as |
463 | * a normal entry. |
464 | */ |
465 | testb $3, CS-ORIG_RAX(%rsp) |
466 | jnz .Lfrom_usermode_switch_stack_\@ |
467 | |
468 | /* |
469 | * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. |
470 | * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS |
471 | */ |
472 | call paranoid_entry |
473 | |
474 | UNWIND_HINT_REGS |
475 | |
476 | /* |
477 | * Switch off the IST stack to make it free for nested exceptions. The |
478 | * vc_switch_off_ist() function will switch back to the interrupted |
479 | * stack if it is safe to do so. If not it switches to the VC fall-back |
480 | * stack. |
481 | */ |
482 | movq %rsp, %rdi /* pt_regs pointer */ |
483 | call vc_switch_off_ist |
484 | movq %rax, %rsp /* Switch to new stack */ |
485 | |
486 | ENCODE_FRAME_POINTER |
487 | UNWIND_HINT_REGS |
488 | |
489 | /* Update pt_regs */ |
490 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
491 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
492 | |
493 | movq %rsp, %rdi /* pt_regs pointer */ |
494 | |
495 | call kernel_\cfunc |
496 | |
497 | /* |
498 | * No need to switch back to the IST stack. The current stack is either |
499 | * identical to the stack in the IRET frame or the VC fall-back stack, |
500 | * so it is definitely mapped even with PTI enabled. |
501 | */ |
502 | jmp paranoid_exit |
503 | |
504 | /* Switch to the regular task stack */ |
505 | .Lfrom_usermode_switch_stack_\@: |
506 | idtentry_body user_\cfunc, has_error_code=1 |
507 | |
508 | _ASM_NOKPROBE(\asmsym) |
509 | SYM_CODE_END(\asmsym) |
510 | .endm |
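/*
 * Single user today, via DECLARE_IDTENTRY_VC() in asm/idtentry.h
 * (illustration only):
 *
 *	idtentry_vc X86_TRAP_VC asm_exc_vmm_communication exc_vmm_communication
 *
 * which expects kernel_exc_vmm_communication and user_exc_vmm_communication
 * C handlers for the two paths above.
 */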
511 | #endif |
512 | |
513 | /* |
514 | * Double fault entry. Straight paranoid. No checks from which context |
515 | * this comes because for the espfix induced #DF this would do the wrong |
516 | * thing. |
517 | */ |
518 | .macro idtentry_df vector asmsym cfunc |
519 | SYM_CODE_START(\asmsym) |
520 | UNWIND_HINT_IRET_ENTRY offset=8 |
521 | ENDBR |
522 | ASM_CLAC |
523 | cld |
524 | |
525 | /* paranoid_entry returns GS information for paranoid_exit in EBX. */ |
526 | call paranoid_entry |
527 | UNWIND_HINT_REGS |
528 | |
529 | movq %rsp, %rdi /* pt_regs pointer into first argument */ |
530 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
531 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
532 | |
533 | /* For some configurations \cfunc ends up being a noreturn. */ |
534 | ANNOTATE_REACHABLE |
535 | call \cfunc |
536 | |
537 | jmp paranoid_exit |
538 | |
539 | _ASM_NOKPROBE(\asmsym) |
540 | SYM_CODE_END(\asmsym) |
541 | .endm |
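/*
 * Only used for #DF, via DECLARE_IDTENTRY_DF() in asm/idtentry.h
 * (illustration only):
 *
 *	idtentry_df X86_TRAP_DF asm_exc_double_fault exc_double_fault
 */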
542 | |
543 | /* |
544 | * Include the defines which emit the idt entries which are shared |
545 | * between 32 and 64 bit and emit the __irqentry_text_* markers |
546 | * so the stacktrace boundary checks work. |
547 | */ |
548 | __ALIGN |
549 | .globl __irqentry_text_start |
550 | __irqentry_text_start: |
551 | |
552 | #include <asm/idtentry.h> |
553 | |
554 | __ALIGN |
555 | .globl __irqentry_text_end |
556 | __irqentry_text_end: |
557 | ANNOTATE_NOENDBR |
558 | |
559 | SYM_CODE_START_LOCAL(common_interrupt_return) |
560 | SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) |
561 | IBRS_EXIT |
562 | #ifdef CONFIG_XEN_PV |
563 | ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV |
564 | #endif |
565 | #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION |
566 | ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI |
567 | #endif |
568 | |
569 | STACKLEAK_ERASE |
570 | POP_REGS |
571 | add $8, %rsp /* orig_ax */ |
572 | UNWIND_HINT_IRET_REGS |
573 | |
574 | .Lswapgs_and_iret: |
575 | swapgs |
576 | CLEAR_CPU_BUFFERS |
577 | /* Assert that the IRET frame indicates user mode. */ |
578 | testb $3, 8(%rsp) |
579 | jnz .Lnative_iret |
580 | ud2 |
581 | |
582 | #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION |
583 | .Lpti_restore_regs_and_return_to_usermode: |
584 | POP_REGS pop_rdi=0 |
585 | |
586 | /* |
587 | * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. |
588 | * Save old stack pointer and switch to trampoline stack. |
589 | */ |
590 | movq %rsp, %rdi |
591 | movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp |
592 | UNWIND_HINT_END_OF_STACK |
593 | |
594 | /* Copy the IRET frame to the trampoline stack. */ |
595 | pushq 6*8(%rdi) /* SS */ |
596 | pushq 5*8(%rdi) /* RSP */ |
597 | pushq 4*8(%rdi) /* EFLAGS */ |
598 | pushq 3*8(%rdi) /* CS */ |
599 | pushq 2*8(%rdi) /* RIP */ |
600 | |
601 | /* Push user RDI on the trampoline stack. */ |
602 | pushq (%rdi) |
603 | |
604 | /* |
605 | * We are on the trampoline stack. All regs except RDI are live. |
606 | * We can do future final exit work right here. |
607 | */ |
608 | STACKLEAK_ERASE_NOCLOBBER |
609 | |
610 | push %rax |
611 | SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax |
612 | pop %rax |
613 | |
614 | /* Restore RDI. */ |
615 | popq %rdi |
616 | jmp .Lswapgs_and_iret |
617 | #endif |
618 | |
619 | SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) |
620 | #ifdef CONFIG_DEBUG_ENTRY |
621 | /* Assert that pt_regs indicates kernel mode. */ |
622 | testb $3, CS(%rsp) |
623 | jz 1f |
624 | ud2 |
625 | 1: |
626 | #endif |
627 | POP_REGS |
628 | addq $8, %rsp /* skip regs->orig_ax */ |
629 | /* |
630 | * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization |
631 | * when returning from IPI handler. |
632 | */ |
633 | #ifdef CONFIG_XEN_PV |
634 | SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL) |
635 | ANNOTATE_NOENDBR |
636 | .byte 0xe9 |
637 | .long .Lnative_iret - (. + 4) |
638 | #endif |
639 | |
640 | .Lnative_iret: |
641 | UNWIND_HINT_IRET_REGS |
642 | /* |
643 | * Are we returning to a stack segment from the LDT? Note: in |
644 | * 64-bit mode SS:RSP on the exception stack is always valid. |
645 | */ |
646 | #ifdef CONFIG_X86_ESPFIX64 |
647 | testb $4, (SS-RIP)(%rsp) |
648 | jnz native_irq_return_ldt |
649 | #endif |
650 | |
651 | SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL) |
652 | ANNOTATE_NOENDBR // exc_double_fault |
653 | /* |
654 | * This may fault. Non-paranoid faults on return to userspace are |
655 | * handled by fixup_bad_iret. These include #SS, #GP, and #NP. |
656 | * Double-faults due to espfix64 are handled in exc_double_fault. |
657 | * Other faults here are fatal. |
658 | */ |
659 | iretq |
660 | |
661 | #ifdef CONFIG_X86_ESPFIX64 |
662 | native_irq_return_ldt: |
663 | /* |
664 | * We are running with user GSBASE. All GPRs contain their user |
665 | * values. We have a percpu ESPFIX stack that is eight slots |
666 | * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom |
667 | * of the ESPFIX stack. |
668 | * |
669 | * We clobber RAX and RDI in this code. We stash RDI on the |
670 | * normal stack and RAX on the ESPFIX stack. |
671 | * |
672 | * The ESPFIX stack layout we set up looks like this: |
673 | * |
674 | * --- top of ESPFIX stack --- |
675 | * SS |
676 | * RSP |
677 | * RFLAGS |
678 | * CS |
679 | * RIP <-- RSP points here when we're done |
680 | * RAX <-- espfix_waddr points here |
681 | * --- bottom of ESPFIX stack --- |
682 | */ |
683 | |
684 | pushq %rdi /* Stash user RDI */ |
685 | swapgs /* to kernel GS */ |
686 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ |
687 | |
688 | movq PER_CPU_VAR(espfix_waddr), %rdi |
689 | movq %rax, (0*8)(%rdi) /* user RAX */ |
690 | movq (1*8)(%rsp), %rax /* user RIP */ |
691 | movq %rax, (1*8)(%rdi) |
692 | movq (2*8)(%rsp), %rax /* user CS */ |
693 | movq %rax, (2*8)(%rdi) |
694 | movq (3*8)(%rsp), %rax /* user RFLAGS */ |
695 | movq %rax, (3*8)(%rdi) |
696 | movq (5*8)(%rsp), %rax /* user SS */ |
697 | movq %rax, (5*8)(%rdi) |
698 | movq (4*8)(%rsp), %rax /* user RSP */ |
699 | movq %rax, (4*8)(%rdi) |
700 | /* Now RAX == RSP. */ |
701 | |
702 | andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ |
703 | |
704 | /* |
705 | * espfix_stack[31:16] == 0. The page tables are set up such that |
706 | * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of |
707 | * espfix_waddr for any X. That is, there are 65536 RO aliases of |
708 | * the same page. Set up RSP so that RSP[31:16] contains the |
709 | * respective 16 bits of the /userspace/ RSP and RSP nonetheless |
710 | * still points to an RO alias of the ESPFIX stack. |
711 | */ |
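/*
 * Worked example with illustrative numbers: if the user RSP was
 * 0x12345678, the andl above leaves RAX = 0x12340000 and the orq below
 * produces espfix_stack | 0x12340000, so bits 31:16 of the final RSP
 * are 0x1234 (matching the userspace RSP) while the address still lands
 * in the read-only ESPFIX alias region.
 */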
712 | orq PER_CPU_VAR(espfix_stack), %rax |
713 | |
714 | SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi |
715 | swapgs /* to user GS */ |
716 | popq %rdi /* Restore user RDI */ |
717 | |
718 | movq %rax, %rsp |
719 | UNWIND_HINT_IRET_REGS offset=8 |
720 | |
721 | /* |
722 | * At this point, we cannot write to the stack any more, but we can |
723 | * still read. |
724 | */ |
725 | popq %rax /* Restore user RAX */ |
726 | |
727 | CLEAR_CPU_BUFFERS |
728 | |
729 | /* |
730 | * RSP now points to an ordinary IRET frame, except that the page |
731 | * is read-only and RSP[31:16] are preloaded with the userspace |
732 | * values. We can now IRET back to userspace. |
733 | */ |
734 | jmp native_irq_return_iret |
735 | #endif |
736 | SYM_CODE_END(common_interrupt_return) |
737 | _ASM_NOKPROBE(common_interrupt_return) |
738 | |
739 | /* |
740 | * Reload gs selector with exception handling |
741 | * di: new selector |
742 | * |
743 | * Is in entry.text as it shouldn't be instrumented. |
744 | */ |
745 | SYM_FUNC_START(asm_load_gs_index) |
746 | ANNOTATE_NOENDBR |
747 | FRAME_BEGIN |
748 | swapgs |
749 | .Lgs_change: |
750 | ANNOTATE_NOENDBR // error_entry |
751 | movl %edi, %gs |
752 | 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE |
753 | swapgs |
754 | FRAME_END |
755 | RET |
756 | |
757 | /* running with kernelgs */ |
758 | .Lbad_gs: |
759 | swapgs /* switch back to user gs */ |
760 | .macro ZAP_GS |
761 | /* This can't be a string because the preprocessor needs to see it. */ |
762 | movl $__USER_DS, %eax |
763 | movl %eax, %gs |
764 | .endm |
765 | ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG |
766 | xorl %eax, %eax |
767 | movl %eax, %gs |
768 | jmp 2b |
769 | |
770 | _ASM_EXTABLE(.Lgs_change, .Lbad_gs) |
771 | |
772 | SYM_FUNC_END(asm_load_gs_index) |
773 | EXPORT_SYMBOL(asm_load_gs_index) |
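/*
 * Caller-side sketch (illustrative; the real wrapper is
 * native_load_gs_index() in asm/special_insns.h and may differ): the
 * call must be made with interrupts disabled so the swapgs pair above
 * cannot be split by an interrupt:
 *
 *	local_irq_save(flags);
 *	asm_load_gs_index(selector);
 *	local_irq_restore(flags);
 */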
774 | |
775 | #ifdef CONFIG_XEN_PV |
776 | /* |
777 | * A note on the "critical region" in our callback handler. |
778 | * We want to avoid stacking callback handlers due to events occurring |
779 | * during handling of the last event. To do this, we keep events disabled |
780 | * until we've done all processing. HOWEVER, we must enable events before |
781 | * popping the stack frame (can't be done atomically) and so it would still |
782 | * be possible to get enough handler activations to overflow the stack. |
783 | * Although unlikely, bugs of that kind are hard to track down, so we'd |
784 | * like to avoid the possibility. |
785 | * So, on entry to the handler we detect whether we interrupted an |
786 | * existing activation in its critical region -- if so, we pop the current |
787 | * activation and restart the handler using the previous one. |
788 | * |
789 | * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *) |
790 | */ |
791 | __FUNC_ALIGN |
792 | SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback) |
793 | |
794 | /* |
795 | * Since we don't modify %rdi, xen_pv_evtchn_do_upcall(struct pt_regs *) |
796 | * will see the correct pointer to the pt_regs. |
797 | */ |
798 | UNWIND_HINT_FUNC |
799 | movq %rdi, %rsp /* we don't return, adjust the stack frame */ |
800 | UNWIND_HINT_REGS |
801 | |
802 | call xen_pv_evtchn_do_upcall |
803 | |
804 | jmp error_return |
805 | SYM_CODE_END(exc_xen_hypervisor_callback) |
806 | |
807 | /* |
808 | * Hypervisor uses this for application faults while it executes. |
809 | * We get here for two reasons: |
810 | * 1. Fault while reloading DS, ES, FS or GS |
811 | * 2. Fault while executing IRET |
812 | * Category 1 we do not need to fix up as Xen has already reloaded all segment |
813 | * registers that could be reloaded and zeroed the others. |
814 | * Category 2 we fix up by killing the current process. We cannot use the |
815 | * normal Linux return path in this case because if we use the IRET hypercall |
816 | * to pop the stack frame we end up in an infinite loop of failsafe callbacks. |
817 | * We distinguish between categories by comparing each saved segment register |
818 | * with its current contents: any discrepancy means we are in category 1. |
819 | */ |
820 | __FUNC_ALIGN |
821 | SYM_CODE_START_NOALIGN(xen_failsafe_callback) |
822 | UNWIND_HINT_UNDEFINED |
823 | ENDBR |
824 | movl %ds, %ecx |
825 | cmpw %cx, 0x10(%rsp) |
826 | jne 1f |
827 | movl %es, %ecx |
828 | cmpw %cx, 0x18(%rsp) |
829 | jne 1f |
830 | movl %fs, %ecx |
831 | cmpw %cx, 0x20(%rsp) |
832 | jne 1f |
833 | movl %gs, %ecx |
834 | cmpw %cx, 0x28(%rsp) |
835 | jne 1f |
836 | /* All segments match their saved values => Category 2 (Bad IRET). */ |
837 | movq (%rsp), %rcx |
838 | movq 8(%rsp), %r11 |
839 | addq $0x30, %rsp |
840 | pushq $0 /* RIP */ |
841 | UNWIND_HINT_IRET_REGS offset=8 |
842 | jmp asm_exc_general_protection |
843 | 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ |
844 | movq (%rsp), %rcx |
845 | movq 8(%rsp), %r11 |
846 | addq $0x30, %rsp |
847 | UNWIND_HINT_IRET_REGS |
848 | pushq $-1 /* orig_ax = -1 => not a system call */ |
849 | PUSH_AND_CLEAR_REGS |
850 | ENCODE_FRAME_POINTER |
851 | jmp error_return |
852 | SYM_CODE_END(xen_failsafe_callback) |
853 | #endif /* CONFIG_XEN_PV */ |
854 | |
855 | /* |
856 | * Save all registers in pt_regs. Return GSBASE related information |
857 | * in EBX depending on the availability of the FSGSBASE instructions: |
858 | * |
859 | * FSGSBASE R/EBX |
860 | * N 0 -> SWAPGS on exit |
861 | * 1 -> no SWAPGS on exit |
862 | * |
863 | * Y GSBASE value at entry, must be restored in paranoid_exit |
864 | * |
865 | * R14 - old CR3 |
866 | * R15 - old SPEC_CTRL |
867 | */ |
868 | SYM_CODE_START(paranoid_entry) |
869 | ANNOTATE_NOENDBR |
870 | UNWIND_HINT_FUNC |
871 | PUSH_AND_CLEAR_REGS save_ret=1 |
872 | ENCODE_FRAME_POINTER 8 |
873 | |
874 | /* |
875 | * Always stash CR3 in %r14. This value will be restored, |
876 | * verbatim, at exit. Needed if paranoid_entry interrupted |
877 | * another entry that already switched to the user CR3 value |
878 | * but has not yet returned to userspace. |
879 | * |
880 | * This is also why CS (stashed in the "iret frame" by the |
881 | * hardware at entry) can not be used: this may be a return |
882 | * to kernel code, but with a user CR3 value. |
883 | * |
884 | * Switching CR3 does not depend on kernel GSBASE so it can |
885 | * be done before switching to the kernel GSBASE. This is |
886 | * required for FSGSBASE because the kernel GSBASE has to |
887 | * be retrieved from a kernel internal table. |
888 | */ |
889 | SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 |
890 | |
891 | /* |
892 | * Handling GSBASE depends on the availability of FSGSBASE. |
893 | * |
894 | * Without FSGSBASE the kernel enforces that negative GSBASE |
895 | * values indicate kernel GSBASE. With FSGSBASE no assumptions |
896 | * can be made about the GSBASE value when entering from user |
897 | * space. |
898 | */ |
899 | ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE |
900 | |
901 | /* |
902 | * Read the current GSBASE and store it in %rbx unconditionally, |
903 | * retrieve and set the current CPU's kernel GSBASE. The stored value |
904 | * has to be restored in paranoid_exit unconditionally. |
905 | * |
906 | * The unconditional write to GS base below ensures that no subsequent |
907 | * loads based on a mispredicted GS base can happen, therefore no LFENCE |
908 | * is needed here. |
909 | */ |
910 | SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx |
911 | jmp .Lparanoid_gsbase_done |
912 | |
913 | .Lparanoid_entry_checkgs: |
914 | /* EBX = 1 -> kernel GSBASE active, no restore required */ |
915 | movl $1, %ebx |
916 | |
917 | /* |
918 | * The kernel-enforced convention is a negative GSBASE indicates |
919 | * a kernel value. No SWAPGS needed on entry and exit. |
920 | */ |
921 | movl $MSR_GS_BASE, %ecx |
922 | rdmsr |
923 | testl %edx, %edx |
924 | js .Lparanoid_kernel_gsbase |
925 | |
926 | /* EBX = 0 -> SWAPGS required on exit */ |
927 | xorl %ebx, %ebx |
928 | swapgs |
929 | .Lparanoid_kernel_gsbase: |
930 | FENCE_SWAPGS_KERNEL_ENTRY |
931 | .Lparanoid_gsbase_done: |
932 | |
933 | /* |
934 | * Once we have CR3 and %GS set up, save and set SPEC_CTRL. Just like |
935 | * CR3 above, keep the old value in a callee saved register. |
936 | */ |
937 | IBRS_ENTER save_reg=%r15 |
938 | UNTRAIN_RET_FROM_CALL |
939 | |
940 | RET |
941 | SYM_CODE_END(paranoid_entry) |
942 | |
943 | /* |
944 | * "Paranoid" exit path from exception stack. This is invoked |
945 | * only on return from non-NMI IST interrupts that came |
946 | * from kernel space. |
947 | * |
948 | * We may be returning to very strange contexts (e.g. very early |
949 | * in syscall entry), so checking for preemption here would |
950 | * be complicated. Fortunately, there's no good reason to try |
951 | * to handle preemption here. |
952 | * |
953 | * R/EBX contains the GSBASE related information depending on the |
954 | * availability of the FSGSBASE instructions: |
955 | * |
956 | * FSGSBASE R/EBX |
957 | * N 0 -> SWAPGS on exit |
958 | * 1 -> no SWAPGS on exit |
959 | * |
960 | * Y User space GSBASE, must be restored unconditionally |
961 | * |
962 | * R14 - old CR3 |
963 | * R15 - old SPEC_CTRL |
964 | */ |
965 | SYM_CODE_START_LOCAL(paranoid_exit) |
966 | UNWIND_HINT_REGS |
967 | |
968 | /* |
969 | * Must restore IBRS state before both CR3 and %GS since we need access |
970 | * to the per-CPU x86_spec_ctrl_shadow variable. |
971 | */ |
972 | IBRS_EXIT save_reg=%r15 |
973 | |
974 | /* |
975 | * The order of operations is important. PARANOID_RESTORE_CR3 requires |
976 | * kernel GSBASE. |
977 | * |
978 | * NB to anyone trying to optimize this code: this code does |
979 | * not execute at all for exceptions from user mode. Those |
980 | * exceptions go through error_return instead. |
981 | */ |
982 | PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 |
983 | |
984 | /* Handle the three GSBASE cases */ |
985 | ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE |
986 | |
987 | /* With FSGSBASE enabled, unconditionally restore GSBASE */ |
988 | wrgsbase %rbx |
989 | jmp restore_regs_and_return_to_kernel |
990 | |
991 | .Lparanoid_exit_checkgs: |
992 | /* On non-FSGSBASE systems, conditionally do SWAPGS */ |
993 | testl %ebx, %ebx |
994 | jnz restore_regs_and_return_to_kernel |
995 | |
996 | /* We are returning to a context with user GSBASE */ |
997 | swapgs |
998 | jmp restore_regs_and_return_to_kernel |
999 | SYM_CODE_END(paranoid_exit) |
1000 | |
1001 | /* |
1002 | * Switch GS and CR3 if needed. |
1003 | */ |
1004 | SYM_CODE_START(error_entry) |
1005 | ANNOTATE_NOENDBR |
1006 | UNWIND_HINT_FUNC |
1007 | |
1008 | PUSH_AND_CLEAR_REGS save_ret=1 |
1009 | ENCODE_FRAME_POINTER 8 |
1010 | |
1011 | testb $3, CS+8(%rsp) |
1012 | jz .Lerror_kernelspace |
1013 | |
1014 | /* |
1015 | * We entered from user mode or we're pretending to have entered |
1016 | * from user mode due to an IRET fault. |
1017 | */ |
1018 | swapgs |
1019 | FENCE_SWAPGS_USER_ENTRY |
1020 | /* We have user CR3. Change to kernel CR3. */ |
1021 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rax |
1022 | IBRS_ENTER |
1023 | UNTRAIN_RET_FROM_CALL |
1024 | |
1025 | leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ |
1026 | /* Put us onto the real thread stack. */ |
1027 | jmp sync_regs |
1028 | |
1029 | /* |
1030 | * There are two places in the kernel that can potentially fault with |
1031 | * usergs. Handle them here. B stepping K8s sometimes report a |
1032 | * truncated RIP for IRET exceptions returning to compat mode. Check |
1033 | * for these here too. |
1034 | */ |
1035 | .Lerror_kernelspace: |
1036 | leaq native_irq_return_iret(%rip), %rcx |
1037 | cmpq %rcx, RIP+8(%rsp) |
1038 | je .Lerror_bad_iret |
1039 | movl %ecx, %eax /* zero extend */ |
1040 | cmpq %rax, RIP+8(%rsp) |
1041 | je .Lbstep_iret |
1042 | cmpq $.Lgs_change, RIP+8(%rsp) |
1043 | jne .Lerror_entry_done_lfence |
1044 | |
1045 | /* |
1046 | * hack: .Lgs_change can fail with user gsbase. If this happens, fix up |
1047 | * gsbase and proceed. We'll fix up the exception and land in |
1048 | * .Lgs_change's error handler with kernel gsbase. |
1049 | */ |
1050 | swapgs |
1051 | |
1052 | /* |
1053 | * Issue an LFENCE to prevent GS speculation, regardless of whether it is a |
1054 | * kernel or user gsbase. |
1055 | */ |
1056 | .Lerror_entry_done_lfence: |
1057 | FENCE_SWAPGS_KERNEL_ENTRY |
1058 | CALL_DEPTH_ACCOUNT |
1059 | leaq 8(%rsp), %rax /* return pt_regs pointer */ |
1060 | VALIDATE_UNRET_END |
1061 | RET |
1062 | |
1063 | .Lbstep_iret: |
1064 | /* Fix truncated RIP */ |
1065 | movq %rcx, RIP+8(%rsp) |
1066 | /* fall through */ |
1067 | |
1068 | .Lerror_bad_iret: |
1069 | /* |
1070 | * We came from an IRET to user mode, so we have user |
1071 | * gsbase and CR3. Switch to kernel gsbase and CR3: |
1072 | */ |
1073 | swapgs |
1074 | FENCE_SWAPGS_USER_ENTRY |
1075 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rax |
1076 | IBRS_ENTER |
1077 | UNTRAIN_RET_FROM_CALL |
1078 | |
1079 | /* |
1080 | * Pretend that the exception came from user mode: set up pt_regs |
1081 | * as if we faulted immediately after IRET. |
1082 | */ |
1083 | leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ |
1084 | call fixup_bad_iret |
1085 | mov %rax, %rdi |
1086 | jmp sync_regs |
1087 | SYM_CODE_END(error_entry) |
1088 | |
1089 | SYM_CODE_START_LOCAL(error_return) |
1090 | UNWIND_HINT_REGS |
1091 | DEBUG_ENTRY_ASSERT_IRQS_OFF |
1092 | testb $3, CS(%rsp) |
1093 | jz restore_regs_and_return_to_kernel |
1094 | jmp swapgs_restore_regs_and_return_to_usermode |
1095 | SYM_CODE_END(error_return) |
1096 | |
1097 | /* |
1098 | * Runs on exception stack. Xen PV does not go through this path at all, |
1099 | * so we can use real assembly here. |
1100 | * |
1101 | * Registers: |
1102 | * %r14: Used to save/restore the CR3 of the interrupted context |
1103 | * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber. |
1104 | */ |
1105 | SYM_CODE_START(asm_exc_nmi) |
1106 | UNWIND_HINT_IRET_ENTRY |
1107 | ENDBR |
1108 | |
1109 | /* |
1110 | * We allow breakpoints in NMIs. If a breakpoint occurs, then |
1111 | * the iretq it performs will take us out of NMI context. |
1112 | * This means that we can have nested NMIs where the next |
1113 | * NMI is using the top of the stack of the previous NMI. We |
1114 | * can't let it execute because the nested NMI will corrupt the |
1115 | * stack of the previous NMI. NMI handlers are not re-entrant |
1116 | * anyway. |
1117 | * |
1118 | * To handle this case we do the following: |
1119 | * Check a special location on the stack that contains a |
1120 | * variable that is set when NMIs are executing. |
1121 | * The interrupted task's stack is also checked to see if it |
1122 | * is an NMI stack. |
1123 | * If the variable is not set and the stack is not the NMI |
1124 | * stack then: |
1125 | * o Set the special variable on the stack |
1126 | * o Copy the interrupt frame into an "outermost" location on the |
1127 | * stack |
1128 | * o Copy the interrupt frame into an "iret" location on the stack |
1129 | * o Continue processing the NMI |
1130 | * If the variable is set or the previous stack is the NMI stack: |
1131 | * o Modify the "iret" location to jump to the repeat_nmi |
1132 | * o return back to the first NMI |
1133 | * |
1134 | * Now on exit of the first NMI, we first clear the stack variable |
1135 | * The NMI stack will tell any nested NMIs at that point that it is |
1136 | * nested. Then we pop the stack normally with iret, and if there was |
1137 | * a nested NMI that updated the copy interrupt stack frame, a |
1138 | * jump will be made to the repeat_nmi code that will handle the second |
1139 | * NMI. |
1140 | * |
1141 | * However, espfix prevents us from directly returning to userspace |
1142 | * with a single IRET instruction. Similarly, IRET to user mode |
1143 | * can fault. We therefore handle NMIs from user space like |
1144 | * other IST entries. |
1145 | */ |
1146 | |
1147 | ASM_CLAC |
1148 | cld |
1149 | |
1150 | /* Use %rdx as our temp variable throughout */ |
1151 | pushq %rdx |
1152 | |
1153 | testb $3, CS-RIP+8(%rsp) |
1154 | jz .Lnmi_from_kernel |
1155 | |
1156 | /* |
1157 | * NMI from user mode. We need to run on the thread stack, but we |
1158 | * can't go through the normal entry paths: NMIs are masked, and |
1159 | * we don't want to enable interrupts, because then we'll end |
1160 | * up in an awkward situation in which IRQs are on but NMIs |
1161 | * are off. |
1162 | * |
1163 | * We also must not push anything to the stack before switching |
1164 | * stacks lest we corrupt the "NMI executing" variable. |
1165 | */ |
1166 | |
1167 | swapgs |
1168 | FENCE_SWAPGS_USER_ENTRY |
1169 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx |
1170 | movq %rsp, %rdx |
1171 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
1172 | UNWIND_HINT_IRET_REGS base=%rdx offset=8 |
1173 | pushq 5*8(%rdx) /* pt_regs->ss */ |
1174 | pushq 4*8(%rdx) /* pt_regs->rsp */ |
1175 | pushq 3*8(%rdx) /* pt_regs->flags */ |
1176 | pushq 2*8(%rdx) /* pt_regs->cs */ |
1177 | pushq 1*8(%rdx) /* pt_regs->rip */ |
1178 | UNWIND_HINT_IRET_REGS |
1179 | pushq $-1 /* pt_regs->orig_ax */ |
1180 | PUSH_AND_CLEAR_REGS rdx=(%rdx) |
1181 | ENCODE_FRAME_POINTER |
1182 | |
1183 | IBRS_ENTER |
1184 | UNTRAIN_RET |
1185 | |
1186 | /* |
1187 | * At this point we no longer need to worry about stack damage |
1188 | * due to nesting -- we're on the normal thread stack and we're |
1189 | * done with the NMI stack. |
1190 | */ |
1191 | |
1192 | movq %rsp, %rdi |
1193 | call exc_nmi |
1194 | |
1195 | /* |
1196 | * Return back to user mode. We must *not* do the normal exit |
1197 | * work, because we don't want to enable interrupts. |
1198 | */ |
1199 | jmp swapgs_restore_regs_and_return_to_usermode |
1200 | |
1201 | .Lnmi_from_kernel: |
1202 | /* |
1203 | * Here's what our stack frame will look like: |
1204 | * +---------------------------------------------------------+ |
1205 | * | original SS | |
1206 | * | original Return RSP | |
1207 | * | original RFLAGS | |
1208 | * | original CS | |
1209 | * | original RIP | |
1210 | * +---------------------------------------------------------+ |
1211 | * | temp storage for rdx | |
1212 | * +---------------------------------------------------------+ |
1213 | * | "NMI executing" variable | |
1214 | * +---------------------------------------------------------+ |
1215 | * | iret SS } Copied from "outermost" frame | |
1216 | * | iret Return RSP } on each loop iteration; overwritten | |
1217 | * | iret RFLAGS } by a nested NMI to force another | |
1218 | * | iret CS } iteration if needed. | |
1219 | * | iret RIP } | |
1220 | * +---------------------------------------------------------+ |
1221 | * | outermost SS } initialized in first_nmi; | |
1222 | * | outermost Return RSP } will not be changed before | |
1223 | * | outermost RFLAGS } NMI processing is done. | |
1224 | * | outermost CS } Copied to "iret" frame on each | |
1225 | * | outermost RIP } iteration. | |
1226 | * +---------------------------------------------------------+ |
1227 | * | pt_regs | |
1228 | * +---------------------------------------------------------+ |
1229 | * |
1230 | * The "original" frame is used by hardware. Before re-enabling |
1231 | * NMIs, we need to be done with it, and we need to leave enough |
1232 | * space for the asm code here. |
1233 | * |
1234 | * We return by executing IRET while RSP points to the "iret" frame. |
1235 | * That will either return for real or it will loop back into NMI |
1236 | * processing. |
1237 | * |
1238 | * The "outermost" frame is copied to the "iret" frame on each |
1239 | * iteration of the loop, so each iteration starts with the "iret" |
1240 | * frame pointing to the final return target. |
1241 | */ |
1242 | |
1243 | /* |
1244 | * Determine whether we're a nested NMI. |
1245 | * |
1246 | * If we interrupted kernel code between repeat_nmi and |
1247 | * end_repeat_nmi, then we are a nested NMI. We must not |
1248 | * modify the "iret" frame because it's being written by |
1249 | * the outer NMI. That's okay; the outer NMI handler is |
1250 | * about to call exc_nmi() anyway, so we can just resume |
1251 | * the outer NMI. |
1252 | */ |
1253 | |
1254 | movq $repeat_nmi, %rdx |
1255 | cmpq 8(%rsp), %rdx |
1256 | ja 1f |
1257 | movq $end_repeat_nmi, %rdx |
1258 | cmpq 8(%rsp), %rdx |
1259 | ja nested_nmi_out |
1260 | 1: |
1261 | |
1262 | /* |
1263 | * Now check "NMI executing". If it's set, then we're nested. |
1264 | * This will not detect if we interrupted an outer NMI just |
1265 | * before IRET. |
1266 | */ |
1267 | cmpl $1, -8(%rsp) |
1268 | je nested_nmi |
1269 | |
1270 | /* |
1271 | * Now test if the previous stack was an NMI stack. This covers |
1272 | * the case where we interrupt an outer NMI after it clears |
1273 | * "NMI executing" but before IRET. We need to be careful, though: |
1274 | * there is one case in which RSP could point to the NMI stack |
1275 | * despite there being no NMI active: naughty userspace controls |
1276 | * RSP at the very beginning of the SYSCALL targets. We can |
1277 | * pull a fast one on naughty userspace, though: we program |
1278 | * SYSCALL to mask DF, so userspace cannot cause DF to be set |
1279 | * if it controls the kernel's RSP. We set DF before we clear |
1280 | * "NMI executing". |
1281 | */ |
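/*
 * Illustrative arithmetic for the range check below: %rdx is set to the
 * top of this NMI stack (the six quadwords above %rsp are the saved %rdx
 * slot plus the five-word hardware iret frame). The interrupted RSP
 * saved at 4*8(%rsp) is considered "within the NMI stack" iff
 * rdx - EXCEPTION_STKSZ <= saved RSP <= rdx.
 */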
1282 | lea 6*8(%rsp), %rdx |
1283 | /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ |
1284 | cmpq %rdx, 4*8(%rsp) |
1285 | /* If the stack pointer is above the NMI stack, this is a normal NMI */ |
1286 | ja first_nmi |
1287 | |
1288 | subq $EXCEPTION_STKSZ, %rdx |
1289 | cmpq %rdx, 4*8(%rsp) |
1290 | /* If it is below the NMI stack, it is a normal NMI */ |
1291 | jb first_nmi |
1292 | |
1293 | /* Ah, it is within the NMI stack. */ |
1294 | |
1295 | testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp) |
1296 | jz first_nmi /* RSP was user controlled. */ |
1297 | |
1298 | /* This is a nested NMI. */ |
1299 | |
1300 | nested_nmi: |
1301 | /* |
1302 | * Modify the "iret" frame to point to repeat_nmi, forcing another |
1303 | * iteration of NMI handling. |
1304 | */ |
1305 | subq $8, %rsp |
1306 | leaq -10*8(%rsp), %rdx |
1307 | pushq $__KERNEL_DS |
1308 | pushq %rdx |
1309 | pushfq |
1310 | pushq $__KERNEL_CS |
1311 | pushq $repeat_nmi |
1312 | |
1313 | /* Put stack back */ |
1314 | addq $(6*8), %rsp |
1315 | |
1316 | nested_nmi_out: |
1317 | popq %rdx |
1318 | |
1319 | /* We are returning to kernel mode, so this cannot result in a fault. */ |
1320 | iretq |
1321 | |
1322 | first_nmi: |
1323 | /* Restore rdx. */ |
1324 | movq (%rsp), %rdx |
1325 | |
1326 | /* Make room for "NMI executing". */ |
1327 | pushq $0 |
1328 | |
1329 | /* Leave room for the "iret" frame */ |
1330 | subq $(5*8), %rsp |
1331 | |
1332 | /* Copy the "original" frame to the "outermost" frame */ |
1333 | .rept 5 |
1334 | pushq 11*8(%rsp) |
1335 | .endr |
1336 | UNWIND_HINT_IRET_REGS |
1337 | |
1338 | /* Everything up to here is safe from nested NMIs */ |
1339 | |
1340 | #ifdef CONFIG_DEBUG_ENTRY |
1341 | /* |
1342 | * For ease of testing, unmask NMIs right away. Disabled by |
1343 | * default because IRET is very expensive. |
1344 | */ |
1345 | pushq $0 /* SS */ |
1346 | pushq %rsp /* RSP (minus 8 because of the previous push) */ |
1347 | addq $8, (%rsp) /* Fix up RSP */ |
1348 | pushfq /* RFLAGS */ |
1349 | pushq $__KERNEL_CS /* CS */ |
1350 | pushq $1f /* RIP */ |
1351 | iretq /* continues at repeat_nmi below */ |
1352 | UNWIND_HINT_IRET_REGS |
1353 | 1: |
1354 | #endif |
1355 | |
1356 | repeat_nmi: |
1357 | ANNOTATE_NOENDBR // this code |
1358 | /* |
1359 | * If there was a nested NMI, the first NMI's iret will return |
1360 | * here. But NMIs are still enabled and we can take another |
1361 | * nested NMI. The nested NMI checks the interrupted RIP to see |
1362 | * if it is between repeat_nmi and end_repeat_nmi, and if so |
1363 | * it will just return, as we are about to repeat an NMI anyway. |
1364 | * This makes it safe to copy to the stack frame that a nested |
1365 | * NMI will update. |
1366 | * |
1367 | * RSP is pointing to "outermost RIP". gsbase is unknown, but, if |
1368 | * we're repeating an NMI, gsbase has the same value that it had on |
1369 | * the first iteration. paranoid_entry will load the kernel |
1370 | * gsbase if needed before we call exc_nmi(). "NMI executing" |
1371 | * is zero. |
1372 | */ |
1373 | movq $1, 10*8(%rsp) /* Set "NMI executing". */ |
1374 | |
1375 | /* |
1376 | * Copy the "outermost" frame to the "iret" frame. NMIs that nest |
1377 | * here must not modify the "iret" frame while we're writing to |
1378 | * it or it will end up containing garbage. |
1379 | */ |
1380 | addq $(10*8), %rsp |
1381 | .rept 5 |
1382 | pushq -6*8(%rsp) |
1383 | .endr |
1384 | subq $(5*8), %rsp |
1385 | end_repeat_nmi: |
1386 | ANNOTATE_NOENDBR // this code |
1387 | |
1388 | /* |
1389 | * Everything below this point can be preempted by a nested NMI. |
1390 | * If this happens, then the inner NMI will change the "iret" |
1391 | * frame to point back to repeat_nmi. |
1392 | */ |
1393 | pushq $-1 /* ORIG_RAX: no syscall to restart */ |
1394 | |
1395 | /* |
1396 | * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
1397 | * as we should not be calling schedule in NMI context, even with |
1398 | * normal interrupts enabled. An NMI should not be |
1399 | * setting NEED_RESCHED or anything that normal interrupts and |
1400 | * exceptions might do. |
1401 | */ |
1402 | call paranoid_entry |
1403 | UNWIND_HINT_REGS |
1404 | |
1405 | movq %rsp, %rdi |
1406 | call exc_nmi |
1407 | |
1408 | /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ |
1409 | IBRS_EXIT save_reg=%r15 |
1410 | |
1411 | PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 |
1412 | |
1413 | /* |
1414 | * The above invocation of paranoid_entry stored the GSBASE |
1415 | * related information in R/EBX depending on the availability |
1416 | * of FSGSBASE. |
1417 | * |
1418 | * If FSGSBASE is enabled, restore the saved GSBASE value |
1419 | * unconditionally, otherwise take the conditional SWAPGS path. |
1420 | */ |
1421 | ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE |
1422 | |
1423 | wrgsbase %rbx |
1424 | jmp nmi_restore |
1425 | |
1426 | nmi_no_fsgsbase: |
1427 | /* EBX == 0 -> invoke SWAPGS */ |
1428 | testl %ebx, %ebx |
1429 | jnz nmi_restore |
1430 | |
1431 | nmi_swapgs: |
1432 | swapgs |
1433 | |
1434 | nmi_restore: |
1435 | POP_REGS |
1436 | |
1437 | /* |
1438 | * Skip orig_ax and the "outermost" frame to point RSP at the "iret" |
1439 | * at the "iret" frame. |
1440 | */ |
1441 | addq $6*8, %rsp |
1442 | |
1443 | /* |
1444 | * Clear "NMI executing". Set DF first so that we can easily |
1445 | * distinguish the remaining code between here and IRET from |
1446 | * the SYSCALL entry and exit paths. |
1447 | * |
1448 | * We arguably should just inspect RIP instead, but I (Andy) wrote |
1449 | * this code when I had the misapprehension that Xen PV supported |
1450 | * NMIs, and Xen PV would break that approach. |
1451 | */ |
1452 | std |
1453 | movq $0, 5*8(%rsp) /* clear "NMI executing" */ |
1454 | |
1455 | /* |
1456 | * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like |
1457 | * an NMI in the kernel after user state is restored. For an unprivileged user |
1458 | * these conditions are hard to meet. |
1459 | */ |
1460 | |
1461 | /* |
1462 | * iretq reads the "iret" frame and exits the NMI stack in a |
1463 | * single instruction. We are returning to kernel mode, so this |
1464 | * cannot result in a fault. Similarly, we don't need to worry |
1465 | * about espfix64 on the way back to kernel mode. |
1466 | */ |
1467 | iretq |
1468 | SYM_CODE_END(asm_exc_nmi) |
1469 | |
1470 | /* |
1471 | * This handles SYSCALL from 32-bit code. There is no way to program |
1472 | * MSRs to fully disable 32-bit SYSCALL. |
1473 | */ |
1474 | SYM_CODE_START(entry_SYSCALL32_ignore) |
1475 | UNWIND_HINT_END_OF_STACK |
1476 | ENDBR |
1477 | mov $-ENOSYS, %eax |
1478 | CLEAR_CPU_BUFFERS |
1479 | sysretl |
1480 | SYM_CODE_END(entry_SYSCALL32_ignore) |
1481 | |
1482 | .pushsection .text, "ax" |
1483 | __FUNC_ALIGN |
1484 | SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) |
1485 | UNWIND_HINT_FUNC |
1486 | /* Prevent any naive code from trying to unwind to our caller. */ |
1487 | xorl %ebp, %ebp |
1488 | |
1489 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rax |
1490 | leaq -PTREGS_SIZE(%rax), %rsp |
1491 | UNWIND_HINT_REGS |
1492 | |
1493 | call make_task_dead |
1494 | SYM_CODE_END(rewind_stack_and_make_dead) |
1495 | .popsection |
1496 | |
1497 | /* |
1498 | * This sequence executes branches in order to remove user branch information |
1499 | * from the branch history tracker in the Branch Predictor, therefore removing |
1500 | * user influence on subsequent BTB lookups. |
1501 | * |
1502 | * It should be used on parts prior to Alder Lake. Newer parts should use the |
1503 | * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being |
1504 | * virtualized on newer hardware the VMM should protect against BHI attacks by |
1505 | * setting BHI_DIS_S for the guests. |
1506 | * |
1507 | * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging |
1508 | * and not clearing the branch history. The call tree looks like: |
1509 | * |
1510 | * call 1 |
1511 | * call 2 |
1512 | * call 2 |
1513 | * call 2 |
1514 | * call 2 |
1515 | * call 2 |
1516 | * ret |
1517 | * ret |
1518 | * ret |
1519 | * ret |
1520 | * ret |
1521 | * ret |
1522 | * |
1523 | * This means that the stack is non-constant and ORC can't unwind it with %rsp |
1524 | * alone. Therefore we unconditionally set up the frame pointer, which allows |
1525 | * ORC to unwind properly. |
1526 | * |
1527 | * The alignment is for performance and not for safety, and may be safely |
1528 | * refactored in the future if needed. The .skips are for safety, to ensure |
1529 | * that all RETs are in the second half of a cacheline to mitigate Indirect |
1530 | * Target Selection, rather than taking the slowpath via its_return_thunk. |
1531 | */ |
1532 | SYM_FUNC_START(clear_bhb_loop) |
1533 | ANNOTATE_NOENDBR |
1534 | push %rbp |
1535 | mov %rsp, %rbp |
1536 | movl $5, %ecx |
1537 | ANNOTATE_INTRA_FUNCTION_CALL |
1538 | call 1f |
1539 | jmp 5f |
1540 | .align 64, 0xcc |
1541 | /* |
1542 | * Shift instructions so that the RET is in the upper half of the |
1543 | * cacheline and doesn't take the slowpath to its_return_thunk. |
1544 | */ |
1545 | .skip 32 - (.Lret1 - 1f), 0xcc |
1546 | ANNOTATE_INTRA_FUNCTION_CALL |
1547 | 1: call 2f |
1548 | .Lret1: RET |
1549 | .align 64, 0xcc |
1550 | /* |
1551 | * As above shift instructions for RET at .Lret2 as well. |
1552 | * |
1553 | * This should ideally be: .skip 32 - (.Lret2 - 2f), 0xcc |
1554 | * but some Clang versions (e.g. 18) don't like this. |
1555 | */ |
1556 | .skip 32 - 18, 0xcc |
1557 | 2: movl $5, %eax |
1558 | 3: jmp 4f |
1559 | nop |
1560 | 4: sub $1, %eax |
1561 | jnz 3b |
1562 | sub $1, %ecx |
1563 | jnz 1b |
1564 | .Lret2: RET |
1565 | 5: lfence |
1566 | pop %rbp |
1567 | RET |
1568 | SYM_FUNC_END(clear_bhb_loop) |
1569 | EXPORT_SYMBOL_GPL(clear_bhb_loop) |
1570 | STACK_FRAME_NON_STANDARD(clear_bhb_loop) |
1571 |