1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * linux/arch/x86_64/entry.S |
4 | * |
5 | * Copyright (C) 1991, 1992 Linus Torvalds |
6 | * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs |
7 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> |
8 | * |
9 | * entry.S contains the system-call and fault low-level handling routines. |
10 | * |
11 | * Some of this is documented in Documentation/arch/x86/entry_64.rst |
12 | * |
13 | * A note on terminology: |
14 | * - iret frame: Architecture-defined interrupt frame from SS to RIP |
15 | * at the top of the kernel process stack. |
16 | * |
17 | * Some macro usage: |
18 | * - SYM_FUNC_START/END: Define functions in the symbol table. |
19 | * - idtentry: Define exception entry points. |
20 | */ |
21 | #include <linux/export.h> |
22 | #include <linux/linkage.h> |
23 | #include <asm/segment.h> |
24 | #include <asm/cache.h> |
25 | #include <asm/errno.h> |
26 | #include <asm/asm-offsets.h> |
27 | #include <asm/msr.h> |
28 | #include <asm/unistd.h> |
29 | #include <asm/thread_info.h> |
30 | #include <asm/hw_irq.h> |
31 | #include <asm/page_types.h> |
32 | #include <asm/irqflags.h> |
33 | #include <asm/paravirt.h> |
34 | #include <asm/percpu.h> |
35 | #include <asm/asm.h> |
36 | #include <asm/smap.h> |
37 | #include <asm/pgtable_types.h> |
38 | #include <asm/frame.h> |
39 | #include <asm/trapnr.h> |
40 | #include <asm/nospec-branch.h> |
41 | #include <asm/fsgsbase.h> |
42 | #include <linux/err.h> |
43 | |
44 | #include "calling.h" |
45 | |
46 | .code64 |
47 | .section .entry.text, "ax" |
48 | |
49 | /* |
50 | * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. |
51 | * |
52 | * This is the only entry point used for 64-bit system calls. The |
53 | * hardware interface is reasonably well designed and the register to |
54 | * argument mapping Linux uses fits well with the registers that are |
55 | * available when SYSCALL is used. |
56 | * |
57 | * SYSCALL instructions can be found inlined in libc implementations as |
58 | * well as some other programs and libraries. There are also a handful |
59 | * of SYSCALL instructions in the vDSO used, for example, as a |
60 | * clock_gettimeofday fallback. |
61 | * |
62 | * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, |
63 | * then loads new ss, cs, and rip from previously programmed MSRs. |
64 | * rflags gets masked by a value from another MSR (so CLD and CLAC |
65 | * are not needed). SYSCALL does not save anything on the stack |
66 | * and does not change rsp. |
67 | * |
68 | * Registers on entry: |
69 | * rax system call number |
70 | * rcx return address |
71 | * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) |
72 | * rdi arg0 |
73 | * rsi arg1 |
74 | * rdx arg2 |
75 | * r10 arg3 (needs to be moved to rcx to conform to C ABI) |
76 | * r8 arg4 |
77 | * r9 arg5 |
78 | * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) |
79 | * |
80 | * Only called from user space. |
81 | * |
82 | * When the user can change pt_regs->foo, always force IRET. That is because |
83 | * IRET deals with non-canonical addresses better. SYSRET has trouble |
84 | * with them due to bugs in both AMD and Intel CPUs. |
85 | */ |
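/*
 * Illustrative sketch only (not part of this file's build): the pt_regs
 * frame constructed below is handed to do_syscall_64() together with the
 * sign-extended syscall number, which dispatches roughly as
 *
 *	regs->ax = sys_call_table[nr](regs);
 *
 * with each syscall wrapper unpacking its arguments from regs->di,
 * regs->si, regs->dx, regs->r10, regs->r8 and regs->r9. The exact C code
 * lives in arch/x86/entry/ and may differ in detail.
 */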
86 | |
87 | SYM_CODE_START(entry_SYSCALL_64) |
88 | UNWIND_HINT_ENTRY |
89 | ENDBR |
90 | |
91 | swapgs |
92 | /* tss.sp2 is scratch space. */ |
93 | movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) |
94 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp |
95 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
96 | |
97 | SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) |
98 | ANNOTATE_NOENDBR |
99 | |
100 | /* Construct struct pt_regs on stack */ |
101 | pushq $__USER_DS /* pt_regs->ss */ |
102 | pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ |
103 | pushq %r11 /* pt_regs->flags */ |
104 | pushq $__USER_CS /* pt_regs->cs */ |
105 | pushq %rcx /* pt_regs->ip */ |
106 | SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) |
107 | pushq %rax /* pt_regs->orig_ax */ |
108 | |
109 | PUSH_AND_CLEAR_REGS rax=$-ENOSYS |
110 | |
111 | /* IRQs are off. */ |
112 | movq %rsp, %rdi |
113 | /* Sign extend the lower 32bit as syscall numbers are treated as int */ |
114 | movslq %eax, %rsi |
115 | |
116 | /* clobbers %rax, make sure it is after saving the syscall nr */ |
117 | IBRS_ENTER |
118 | UNTRAIN_RET |
119 | CLEAR_BRANCH_HISTORY |
120 | |
121 | call do_syscall_64 /* returns with IRQs disabled */ |
122 | |
123 | /* |
124 | * Try to use SYSRET instead of IRET if we're returning to |
125 | * a completely clean 64-bit userspace context. If we're not, |
126 | * go to the slow exit path. |
127 | * In the Xen PV case we must use iret anyway. |
128 | */ |
129 | |
130 | ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ |
131 | "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV |
132 | |
133 | /* |
134 | * We win! This label is here just for ease of understanding |
135 | * perf profiles. Nothing jumps here. |
136 | */ |
137 | syscall_return_via_sysret: |
138 | IBRS_EXIT |
139 | POP_REGS pop_rdi=0 |
140 | |
141 | /* |
142 | * Now all regs are restored except RSP and RDI. |
143 | * Save old stack pointer and switch to trampoline stack. |
144 | */ |
145 | movq %rsp, %rdi |
146 | movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp |
147 | UNWIND_HINT_END_OF_STACK |
148 | |
149 | pushq RSP-RDI(%rdi) /* RSP */ |
150 | pushq (%rdi) /* RDI */ |
151 | |
152 | /* |
153 | * We are on the trampoline stack. All regs except RDI are live. |
154 | * We can do future final exit work right here. |
155 | */ |
156 | STACKLEAK_ERASE_NOCLOBBER |
157 | |
158 | SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi |
159 | |
160 | popq %rdi |
161 | popq %rsp |
162 | SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) |
163 | ANNOTATE_NOENDBR |
164 | swapgs |
165 | CLEAR_CPU_BUFFERS |
166 | sysretq |
167 | SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) |
168 | ANNOTATE_NOENDBR |
169 | int3 |
170 | SYM_CODE_END(entry_SYSCALL_64) |
171 | |
172 | /* |
173 | * %rdi: prev task |
174 | * %rsi: next task |
175 | */ |
176 | .pushsection .text, "ax" |
177 | SYM_FUNC_START(__switch_to_asm) |
178 | ANNOTATE_NOENDBR |
179 | /* |
180 | * Save callee-saved registers |
181 | * This must match the order in inactive_task_frame |
182 | */ |
183 | pushq %rbp |
184 | pushq %rbx |
185 | pushq %r12 |
186 | pushq %r13 |
187 | pushq %r14 |
188 | pushq %r15 |
189 | |
190 | /* switch stack */ |
191 | movq %rsp, TASK_threadsp(%rdi) |
192 | movq TASK_threadsp(%rsi), %rsp |
193 | |
194 | #ifdef CONFIG_STACKPROTECTOR |
195 | movq TASK_stack_canary(%rsi), %rbx |
196 | movq %rbx, PER_CPU_VAR(__stack_chk_guard) |
197 | #endif |
198 | |
199 | /* |
200 | * When switching from a shallower to a deeper call stack |
201 | * the RSB may either underflow or use entries populated |
202 | * with userspace addresses. On CPUs where those concerns |
203 | * exist, overwrite the RSB with entries which capture |
204 | * speculative execution to prevent attack. |
205 | */ |
206 | FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW |
207 | |
208 | /* restore callee-saved registers */ |
209 | popq %r15 |
210 | popq %r14 |
211 | popq %r13 |
212 | popq %r12 |
213 | popq %rbx |
214 | popq %rbp |
215 | |
216 | jmp __switch_to |
217 | SYM_FUNC_END(__switch_to_asm) |
218 | .popsection |
219 | |
220 | /* |
221 | * A newly forked process directly context switches into this address. |
222 | * |
223 | * rax: prev task we switched from |
224 | * rbx: kernel thread func (NULL for user thread) |
225 | * r12: kernel thread arg |
226 | */ |
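/*
 * Illustrative only: the register moves below line up with a C helper of
 * roughly this shape (the real prototype lives in arch/x86/kernel/process.c
 * and may differ):
 *
 *	void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
 *			   int (*fn)(void *), void *fn_arg);
 */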
227 | .pushsection .text, "ax" |
228 | SYM_CODE_START(ret_from_fork_asm) |
229 | /* |
230 | * This is the start of the kernel stack; even though there's a |
231 | * register set at the top, the regset isn't necessarily coherent |
232 | * (consider kthreads) and one cannot unwind further. |
233 | * |
234 | * This ensures stack unwinds of kernel threads terminate in a known |
235 | * good state. |
236 | */ |
237 | UNWIND_HINT_END_OF_STACK |
238 | ANNOTATE_NOENDBR // copy_thread |
239 | CALL_DEPTH_ACCOUNT |
240 | |
241 | movq %rax, %rdi /* prev */ |
242 | movq %rsp, %rsi /* regs */ |
243 | movq %rbx, %rdx /* fn */ |
244 | movq %r12, %rcx /* fn_arg */ |
245 | call ret_from_fork |
246 | |
247 | /* |
248 | * Set the stack state to what is expected for the target function |
249 | * -- at this point the register set should be a valid user set |
250 | * and unwind should work normally. |
251 | */ |
252 | UNWIND_HINT_REGS |
253 | |
254 | #ifdef CONFIG_X86_FRED |
255 | ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ |
256 | "jmp asm_fred_exit_user", X86_FEATURE_FRED |
257 | #else |
258 | jmp swapgs_restore_regs_and_return_to_usermode |
259 | #endif |
260 | SYM_CODE_END(ret_from_fork_asm) |
261 | .popsection |
262 | |
263 | .macro DEBUG_ENTRY_ASSERT_IRQS_OFF |
264 | #ifdef CONFIG_DEBUG_ENTRY |
265 | pushq %rax |
266 | SAVE_FLAGS |
267 | testl $X86_EFLAGS_IF, %eax |
268 | jz .Lokay_\@ |
269 | ud2 |
270 | .Lokay_\@: |
271 | popq %rax |
272 | #endif |
273 | .endm |
274 | |
275 | SYM_CODE_START(xen_error_entry) |
276 | ANNOTATE_NOENDBR |
277 | UNWIND_HINT_FUNC |
278 | PUSH_AND_CLEAR_REGS save_ret=1 |
279 | ENCODE_FRAME_POINTER 8 |
280 | UNTRAIN_RET_FROM_CALL |
281 | RET |
282 | SYM_CODE_END(xen_error_entry) |
283 | |
284 | /** |
285 | * idtentry_body - Macro to emit code calling the C function |
286 | * @cfunc: C function to be called |
287 | * @has_error_code: Hardware pushed error code on stack |
288 | */ |
289 | .macro idtentry_body cfunc has_error_code:req |
290 | |
291 | /* |
292 | * Call error_entry() and switch to the task stack if from userspace. |
293 | * |
294 | * When in XENPV, it is already on the task stack, and it can't fault |
295 | * for native_iret() or native_load_gs_index() since XENPV uses its |
296 | * own pvops for IRET and load_gs_index(). And it doesn't need to |
297 | * switch the CR3. So it can skip invoking error_entry(). |
298 | */ |
299 | ALTERNATIVE "call error_entry; movq %rax, %rsp", \ |
300 | "call xen_error_entry", X86_FEATURE_XENPV |
301 | |
302 | ENCODE_FRAME_POINTER |
303 | UNWIND_HINT_REGS |
304 | |
305 | movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ |
306 | |
307 | .if \has_error_code == 1 |
308 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
309 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
310 | .endif |
311 | |
312 | /* For some configurations \cfunc ends up being a noreturn. */ |
313 | ANNOTATE_REACHABLE |
314 | call \cfunc |
315 | |
316 | jmp error_return |
317 | .endm |
318 | |
319 | /** |
320 | * idtentry - Macro to generate entry stubs for simple IDT entries |
321 | * @vector: Vector number |
322 | * @asmsym: ASM symbol for the entry point |
323 | * @cfunc: C function to be called |
324 | * @has_error_code: Hardware pushed error code on stack |
325 | * |
326 | * The macro emits code to set up the kernel context for straightforward |
327 | * and simple IDT entries. No IST stack, no paranoid entry checks. |
328 | */ |
329 | .macro idtentry vector asmsym cfunc has_error_code:req |
330 | SYM_CODE_START(\asmsym) |
331 | |
332 | .if \vector == X86_TRAP_BP |
333 | /* #BP advances %rip to the next instruction */ |
334 | UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0 |
335 | .else |
336 | UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 |
337 | .endif |
338 | |
339 | ENDBR |
340 | ASM_CLAC |
341 | cld |
342 | |
343 | .if \has_error_code == 0 |
344 | pushq $-1 /* ORIG_RAX: no syscall to restart */ |
345 | .endif |
346 | |
347 | .if \vector == X86_TRAP_BP |
348 | /* |
349 | * If coming from kernel space, create a 6-word gap to allow the |
350 | * int3 handler to emulate a call instruction. |
351 | */ |
352 | testb $3, CS-ORIG_RAX(%rsp) |
353 | jnz .Lfrom_usermode_no_gap_\@ |
354 | .rept 6 |
355 | pushq 5*8(%rsp) |
356 | .endr |
357 | UNWIND_HINT_IRET_REGS offset=8 |
358 | .Lfrom_usermode_no_gap_\@: |
359 | .endif |
360 | |
361 | idtentry_body \cfunc \has_error_code |
362 | |
363 | _ASM_NOKPROBE(\asmsym) |
364 | SYM_CODE_END(\asmsym) |
365 | .endm |
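/*
 * Typical expansions, reached via the DECLARE_IDTENTRY*() wrappers in
 * asm/idtentry.h (shown here for illustration only):
 *
 *	idtentry X86_TRAP_DE  asm_exc_divide_error       exc_divide_error       has_error_code=0
 *	idtentry X86_TRAP_GP  asm_exc_general_protection exc_general_protection has_error_code=1
 */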
366 | |
367 | /* |
368 | * Interrupt entry/exit. |
369 | * |
370 | * The interrupt stubs push (vector) onto the stack, which is the error_code |
371 | * position of idtentry exceptions, and jump to one of the two idtentry points |
372 | * (common/spurious). |
373 | * |
374 | * common_interrupt is a hotpath, align it to a cache line |
375 | */ |
376 | .macro idtentry_irq vector cfunc |
377 | .p2align CONFIG_X86_L1_CACHE_SHIFT |
378 | idtentry \vector asm_\cfunc \cfunc has_error_code=1 |
379 | .endm |
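/*
 * Example expansion via DECLARE_IDTENTRY_IRQ() in asm/idtentry.h
 * (illustration only):
 *
 *	idtentry_irq X86_TRAP_OTHER common_interrupt
 */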
380 | |
381 | /** |
382 | * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB |
383 | * @vector: Vector number |
384 | * @asmsym: ASM symbol for the entry point |
385 | * @cfunc: C function to be called |
386 | * |
387 | * The macro emits code to set up the kernel context for #MC and #DB |
388 | * |
389 | * If the entry comes from user space it uses the normal entry path |
390 | * including the return to user space work and preemption checks on |
391 | * exit. |
392 | * |
393 | * If it hits in kernel mode then it needs to go through the paranoid |
394 | * entry as the exception can hit any random state. No preemption |
395 | * check on exit to keep the paranoid path simple. |
396 | */ |
397 | .macro idtentry_mce_db vector asmsym cfunc |
398 | SYM_CODE_START(\asmsym) |
399 | UNWIND_HINT_IRET_ENTRY |
400 | ENDBR |
401 | ASM_CLAC |
402 | cld |
403 | |
404 | pushq $-1 /* ORIG_RAX: no syscall to restart */ |
405 | |
406 | /* |
407 | * If the entry is from userspace, switch stacks and treat it as |
408 | * a normal entry. |
409 | */ |
410 | testb $3, CS-ORIG_RAX(%rsp) |
411 | jnz .Lfrom_usermode_switch_stack_\@ |
412 | |
413 | /* paranoid_entry returns GS information for paranoid_exit in EBX. */ |
414 | call paranoid_entry |
415 | |
416 | UNWIND_HINT_REGS |
417 | |
418 | movq %rsp, %rdi /* pt_regs pointer */ |
419 | |
420 | call \cfunc |
421 | |
422 | jmp paranoid_exit |
423 | |
424 | /* Switch to the regular task stack and use the noist entry point */ |
425 | .Lfrom_usermode_switch_stack_\@: |
426 | idtentry_body noist_\cfunc, has_error_code=0 |
427 | |
428 | _ASM_NOKPROBE(\asmsym) |
429 | SYM_CODE_END(\asmsym) |
430 | .endm |
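/*
 * Used for exactly two vectors, via DECLARE_IDTENTRY_MCE()/_DEBUG() in
 * asm/idtentry.h (illustration only):
 *
 *	idtentry_mce_db X86_TRAP_DB asm_exc_debug         exc_debug
 *	idtentry_mce_db X86_TRAP_MC asm_exc_machine_check exc_machine_check
 *
 * Each \cfunc is therefore expected to have a matching noist_\cfunc
 * C handler for the user-mode path.
 */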
431 | |
432 | #ifdef CONFIG_AMD_MEM_ENCRYPT |
433 | /** |
434 | * idtentry_vc - Macro to generate entry stub for #VC |
435 | * @vector: Vector number |
436 | * @asmsym: ASM symbol for the entry point |
437 | * @cfunc: C function to be called |
438 | * |
439 | * The macro emits code to set up the kernel context for #VC. The #VC handler |
440 | * runs on an IST stack and needs to be able to cause nested #VC exceptions. |
441 | * |
442 | * To make this work the #VC entry code tries its best to pretend it doesn't use |
443 | * an IST stack by switching to the task stack if coming from user-space (which |
444 | * includes early SYSCALL entry path) or back to the stack in the IRET frame if |
445 | * entered from kernel-mode. |
446 | * |
447 | * If entered from kernel-mode the return stack is validated first, and if it is |
448 | * not safe to use (e.g. because it points to the entry stack) the #VC handler |
449 | * will switch to a fall-back stack (VC2) and call a special handler function. |
450 | * |
451 | * The macro is only used for one vector, but it is planned to be extended in |
452 | * the future for the #HV exception. |
453 | */ |
454 | .macro idtentry_vc vector asmsym cfunc |
455 | SYM_CODE_START(\asmsym) |
456 | UNWIND_HINT_IRET_ENTRY |
457 | ENDBR |
458 | ASM_CLAC |
459 | cld |
460 | |
461 | /* |
462 | * If the entry is from userspace, switch stacks and treat it as |
463 | * a normal entry. |
464 | */ |
465 | testb $3, CS-ORIG_RAX(%rsp) |
466 | jnz .Lfrom_usermode_switch_stack_\@ |
467 | |
468 | /* |
469 | * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. |
470 | * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS |
471 | */ |
472 | call paranoid_entry |
473 | |
474 | UNWIND_HINT_REGS |
475 | |
476 | /* |
477 | * Switch off the IST stack to make it free for nested exceptions. The |
478 | * vc_switch_off_ist() function will switch back to the interrupted |
479 | * stack if it is safe to do so. If not it switches to the VC fall-back |
480 | * stack. |
481 | */ |
482 | movq %rsp, %rdi /* pt_regs pointer */ |
483 | call vc_switch_off_ist |
484 | movq %rax, %rsp /* Switch to new stack */ |
485 | |
486 | ENCODE_FRAME_POINTER |
487 | UNWIND_HINT_REGS |
488 | |
489 | /* Update pt_regs */ |
490 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
491 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
492 | |
493 | movq %rsp, %rdi /* pt_regs pointer */ |
494 | |
495 | call kernel_\cfunc |
496 | |
497 | /* |
498 | * No need to switch back to the IST stack. The current stack is either |
499 | * identical to the stack in the IRET frame or the VC fall-back stack, |
500 | * so it is definitely mapped even with PTI enabled. |
501 | */ |
502 | jmp paranoid_exit |
503 | |
504 | /* Switch to the regular task stack */ |
505 | .Lfrom_usermode_switch_stack_\@: |
506 | idtentry_body user_\cfunc, has_error_code=1 |
507 | |
508 | _ASM_NOKPROBE(\asmsym) |
509 | SYM_CODE_END(\asmsym) |
510 | .endm |
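/*
 * Single user today, via DECLARE_IDTENTRY_VC() in asm/idtentry.h
 * (illustration only):
 *
 *	idtentry_vc X86_TRAP_VC asm_exc_vmm_communication exc_vmm_communication
 *
 * which expects kernel_exc_vmm_communication and user_exc_vmm_communication
 * C handlers for the two paths above.
 */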
511 | #endif |
512 | |
513 | /* |
514 | * Double fault entry. Straight paranoid. No checks from which context |
515 | * this comes because for the espfix induced #DF this would do the wrong |
516 | * thing. |
517 | */ |
518 | .macro idtentry_df vector asmsym cfunc |
519 | SYM_CODE_START(\asmsym) |
520 | UNWIND_HINT_IRET_ENTRY offset=8 |
521 | ENDBR |
522 | ASM_CLAC |
523 | cld |
524 | |
525 | /* paranoid_entry returns GS information for paranoid_exit in EBX. */ |
526 | call paranoid_entry |
527 | UNWIND_HINT_REGS |
528 | |
529 | movq %rsp, %rdi /* pt_regs pointer into first argument */ |
530 | movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ |
531 | movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ |
532 | |
533 | /* For some configurations \cfunc ends up being a noreturn. */ |
534 | ANNOTATE_REACHABLE |
535 | call \cfunc |
536 | |
537 | jmp paranoid_exit |
538 | |
539 | _ASM_NOKPROBE(\asmsym) |
540 | SYM_CODE_END(\asmsym) |
541 | .endm |
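/*
 * Only used for #DF, via DECLARE_IDTENTRY_DF() in asm/idtentry.h
 * (illustration only):
 *
 *	idtentry_df X86_TRAP_DF asm_exc_double_fault exc_double_fault
 */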
542 | |
543 | /* |
544 | * Include the defines which emit the idt entries which are shared |
545 | * between 32 and 64 bit and emit the __irqentry_text_* markers |
546 | * so the stacktrace boundary checks work. |
547 | */ |
548 | __ALIGN |
549 | .globl __irqentry_text_start |
550 | __irqentry_text_start: |
551 | |
552 | #include <asm/idtentry.h> |
553 | |
554 | __ALIGN |
555 | .globl __irqentry_text_end |
556 | __irqentry_text_end: |
557 | ANNOTATE_NOENDBR |
558 | |
559 | SYM_CODE_START_LOCAL(common_interrupt_return) |
560 | SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) |
561 | IBRS_EXIT |
562 | #ifdef CONFIG_XEN_PV |
563 | ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV |
564 | #endif |
565 | #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION |
566 | ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI |
567 | #endif |
568 | |
569 | STACKLEAK_ERASE |
570 | POP_REGS |
571 | add $8, %rsp /* orig_ax */ |
572 | UNWIND_HINT_IRET_REGS |
573 | |
574 | .Lswapgs_and_iret: |
575 | swapgs |
576 | CLEAR_CPU_BUFFERS |
577 | /* Assert that the IRET frame indicates user mode. */ |
578 | testb $3, 8(%rsp) |
579 | jnz .Lnative_iret |
580 | ud2 |
581 | |
582 | #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION |
583 | .Lpti_restore_regs_and_return_to_usermode: |
584 | POP_REGS pop_rdi=0 |
585 | |
586 | /* |
587 | * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. |
588 | * Save old stack pointer and switch to trampoline stack. |
589 | */ |
590 | movq %rsp, %rdi |
591 | movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp |
592 | UNWIND_HINT_END_OF_STACK |
593 | |
594 | /* Copy the IRET frame to the trampoline stack. */ |
595 | pushq 6*8(%rdi) /* SS */ |
596 | pushq 5*8(%rdi) /* RSP */ |
597 | pushq 4*8(%rdi) /* EFLAGS */ |
598 | pushq 3*8(%rdi) /* CS */ |
599 | pushq 2*8(%rdi) /* RIP */ |
600 | |
601 | /* Push user RDI on the trampoline stack. */ |
602 | pushq (%rdi) |
603 | |
604 | /* |
605 | * We are on the trampoline stack. All regs except RDI are live. |
606 | * We can do future final exit work right here. |
607 | */ |
608 | STACKLEAK_ERASE_NOCLOBBER |
609 | |
610 | push %rax |
611 | SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax |
612 | pop %rax |
613 | |
614 | /* Restore RDI. */ |
615 | popq %rdi |
616 | jmp .Lswapgs_and_iret |
617 | #endif |
618 | |
619 | SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) |
620 | #ifdef CONFIG_DEBUG_ENTRY |
621 | /* Assert that pt_regs indicates kernel mode. */ |
622 | testb $3, CS(%rsp) |
623 | jz 1f |
624 | ud2 |
625 | 1: |
626 | #endif |
627 | POP_REGS |
628 | addq $8, %rsp /* skip regs->orig_ax */ |
629 | /* |
630 | * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization |
631 | * when returning from IPI handler. |
632 | */ |
633 | #ifdef CONFIG_XEN_PV |
634 | SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL) |
635 | ANNOTATE_NOENDBR |
636 | .byte 0xe9 |
637 | .long .Lnative_iret - (. + 4) |
638 | #endif |
639 | |
640 | .Lnative_iret: |
641 | UNWIND_HINT_IRET_REGS |
642 | /* |
643 | * Are we returning to a stack segment from the LDT? Note: in |
644 | * 64-bit mode SS:RSP on the exception stack is always valid. |
645 | */ |
646 | #ifdef CONFIG_X86_ESPFIX64 |
647 | testb $4, (SS-RIP)(%rsp) |
648 | jnz native_irq_return_ldt |
649 | #endif |
650 | |
651 | SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL) |
652 | ANNOTATE_NOENDBR // exc_double_fault |
653 | /* |
654 | * This may fault. Non-paranoid faults on return to userspace are |
655 | * handled by fixup_bad_iret. These include #SS, #GP, and #NP. |
656 | * Double-faults due to espfix64 are handled in exc_double_fault. |
657 | * Other faults here are fatal. |
658 | */ |
659 | iretq |
660 | |
661 | #ifdef CONFIG_X86_ESPFIX64 |
662 | native_irq_return_ldt: |
663 | /* |
664 | * We are running with user GSBASE. All GPRs contain their user |
665 | * values. We have a percpu ESPFIX stack that is eight slots |
666 | * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom |
667 | * of the ESPFIX stack. |
668 | * |
669 | * We clobber RAX and RDI in this code. We stash RDI on the |
670 | * normal stack and RAX on the ESPFIX stack. |
671 | * |
672 | * The ESPFIX stack layout we set up looks like this: |
673 | * |
674 | * --- top of ESPFIX stack --- |
675 | * SS |
676 | * RSP |
677 | * RFLAGS |
678 | * CS |
679 | * RIP <-- RSP points here when we're done |
680 | * RAX <-- espfix_waddr points here |
681 | * --- bottom of ESPFIX stack --- |
682 | */ |
683 | |
684 | pushq %rdi /* Stash user RDI */ |
685 | swapgs /* to kernel GS */ |
686 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ |
687 | |
688 | movq PER_CPU_VAR(espfix_waddr), %rdi |
689 | movq %rax, (0*8)(%rdi) /* user RAX */ |
690 | movq (1*8)(%rsp), %rax /* user RIP */ |
691 | movq %rax, (1*8)(%rdi) |
692 | movq (2*8)(%rsp), %rax /* user CS */ |
693 | movq %rax, (2*8)(%rdi) |
694 | movq (3*8)(%rsp), %rax /* user RFLAGS */ |
695 | movq %rax, (3*8)(%rdi) |
696 | movq (5*8)(%rsp), %rax /* user SS */ |
697 | movq %rax, (5*8)(%rdi) |
698 | movq (4*8)(%rsp), %rax /* user RSP */ |
699 | movq %rax, (4*8)(%rdi) |
700 | /* Now RAX == RSP. */ |
701 | |
702 | andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ |
703 | |
704 | /* |
705 | * espfix_stack[31:16] == 0. The page tables are set up such that |
706 | * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of |
707 | * espfix_waddr for any X. That is, there are 65536 RO aliases of |
708 | * the same page. Set up RSP so that RSP[31:16] contains the |
709 | * respective 16 bits of the /userspace/ RSP and RSP nonetheless |
710 | * still points to an RO alias of the ESPFIX stack. |
711 | */ |
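/*
 * Worked example with illustrative numbers: if the user RSP was
 * 0x12345678, the andl above leaves RAX = 0x12340000 and the orq below
 * produces espfix_stack | 0x12340000, so bits 31:16 of the final RSP
 * are 0x1234 (matching the userspace RSP) while the address still lands
 * in the read-only ESPFIX alias region.
 */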
712 | orq PER_CPU_VAR(espfix_stack), %rax |
713 | |
714 | SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi |
715 | swapgs /* to user GS */ |
716 | popq %rdi /* Restore user RDI */ |
717 | |
718 | movq %rax, %rsp |
719 | UNWIND_HINT_IRET_REGS offset=8 |
720 | |
721 | /* |
722 | * At this point, we cannot write to the stack any more, but we can |
723 | * still read. |
724 | */ |
725 | popq %rax /* Restore user RAX */ |
726 | |
727 | CLEAR_CPU_BUFFERS |
728 | |
729 | /* |
730 | * RSP now points to an ordinary IRET frame, except that the page |
731 | * is read-only and RSP[31:16] are preloaded with the userspace |
732 | * values. We can now IRET back to userspace. |
733 | */ |
734 | jmp native_irq_return_iret |
735 | #endif |
736 | SYM_CODE_END(common_interrupt_return) |
737 | _ASM_NOKPROBE(common_interrupt_return) |
738 | |
739 | /* |
740 | * Reload gs selector with exception handling |
741 | * di: new selector |
742 | * |
743 | * Is in entry.text as it shouldn't be instrumented. |
744 | */ |
745 | SYM_FUNC_START(asm_load_gs_index) |
746 | ANNOTATE_NOENDBR |
747 | FRAME_BEGIN |
748 | swapgs |
749 | .Lgs_change: |
750 | ANNOTATE_NOENDBR // error_entry |
751 | movl %edi, %gs |
752 | 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE |
753 | swapgs |
754 | FRAME_END |
755 | RET |
756 | |
757 | /* running with kernelgs */ |
758 | .Lbad_gs: |
759 | swapgs /* switch back to user gs */ |
760 | .macro ZAP_GS |
761 | /* This can't be a string because the preprocessor needs to see it. */ |
762 | movl $__USER_DS, %eax |
763 | movl %eax, %gs |
764 | .endm |
765 | ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG |
766 | xorl %eax, %eax |
767 | movl %eax, %gs |
768 | jmp 2b |
769 | |
770 | _ASM_EXTABLE(.Lgs_change, .Lbad_gs) |
771 | |
772 | SYM_FUNC_END(asm_load_gs_index) |
773 | EXPORT_SYMBOL(asm_load_gs_index) |
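/*
 * Caller-side sketch (illustrative; the real wrapper is
 * native_load_gs_index() in asm/special_insns.h and may differ): the
 * call must be made with interrupts disabled so the swapgs pair above
 * cannot be split by an interrupt:
 *
 *	local_irq_save(flags);
 *	asm_load_gs_index(selector);
 *	local_irq_restore(flags);
 */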
774 | |
775 | #ifdef CONFIG_XEN_PV |
776 | /* |
777 | * A note on the "critical region" in our callback handler. |
778 | * We want to avoid stacking callback handlers due to events occurring |
779 | * during handling of the last event. To do this, we keep events disabled |
780 | * until we've done all processing. HOWEVER, we must enable events before |
781 | * popping the stack frame (can't be done atomically) and so it would still |
782 | * be possible to get enough handler activations to overflow the stack. |
783 | * Although unlikely, bugs of that kind are hard to track down, so we'd |
784 | * like to avoid the possibility. |
785 | * So, on entry to the handler we detect whether we interrupted an |
786 | * existing activation in its critical region -- if so, we pop the current |
787 | * activation and restart the handler using the previous one. |
788 | * |
789 | * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *) |
790 | */ |
791 | __FUNC_ALIGN |
792 | SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback) |
793 | |
794 | /* |
795 | * Since we don't modify %rdi, xen_pv_evtchn_do_upcall(struct pt_regs *) |
796 | * will see the correct pointer to the pt_regs. |
797 | */ |
798 | UNWIND_HINT_FUNC |
799 | movq %rdi, %rsp /* we don't return, adjust the stack frame */ |
800 | UNWIND_HINT_REGS |
801 | |
802 | call xen_pv_evtchn_do_upcall |
803 | |
804 | jmp error_return |
805 | SYM_CODE_END(exc_xen_hypervisor_callback) |
806 | |
807 | /* |
808 | * Hypervisor uses this for application faults while it executes. |
809 | * We get here for two reasons: |
810 | * 1. Fault while reloading DS, ES, FS or GS |
811 | * 2. Fault while executing IRET |
812 | * Category 1 we do not need to fix up as Xen has already reloaded all segment |
813 | * registers that could be reloaded and zeroed the others. |
814 | * Category 2 we fix up by killing the current process. We cannot use the |
815 | * normal Linux return path in this case because if we use the IRET hypercall |
816 | * to pop the stack frame we end up in an infinite loop of failsafe callbacks. |
817 | * We distinguish between categories by comparing each saved segment register |
818 | * with its current contents: any discrepancy means we are in category 1. |
819 | */ |
820 | __FUNC_ALIGN |
821 | SYM_CODE_START_NOALIGN(xen_failsafe_callback) |
822 | UNWIND_HINT_UNDEFINED |
823 | ENDBR |
824 | movl %ds, %ecx |
825 | cmpw %cx, 0x10(%rsp) |
826 | jne 1f |
827 | movl %es, %ecx |
828 | cmpw %cx, 0x18(%rsp) |
829 | jne 1f |
830 | movl %fs, %ecx |
831 | cmpw %cx, 0x20(%rsp) |
832 | jne 1f |
833 | movl %gs, %ecx |
834 | cmpw %cx, 0x28(%rsp) |
835 | jne 1f |
836 | /* All segments match their saved values => Category 2 (Bad IRET). */ |
837 | movq (%rsp), %rcx |
838 | movq 8(%rsp), %r11 |
839 | addq $0x30, %rsp |
840 | pushq $0 /* RIP */ |
841 | UNWIND_HINT_IRET_REGS offset=8 |
842 | jmp asm_exc_general_protection |
843 | 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ |
844 | movq (%rsp), %rcx |
845 | movq 8(%rsp), %r11 |
846 | addq $0x30, %rsp |
847 | UNWIND_HINT_IRET_REGS |
848 | pushq $-1 /* orig_ax = -1 => not a system call */ |
849 | PUSH_AND_CLEAR_REGS |
850 | ENCODE_FRAME_POINTER |
851 | jmp error_return |
852 | SYM_CODE_END(xen_failsafe_callback) |
853 | #endif /* CONFIG_XEN_PV */ |
854 | |
855 | /* |
856 | * Save all registers in pt_regs. Return GSBASE related information |
857 | * in EBX depending on the availability of the FSGSBASE instructions: |
858 | * |
859 | * FSGSBASE R/EBX |
860 | * N 0 -> SWAPGS on exit |
861 | * 1 -> no SWAPGS on exit |
862 | * |
863 | * Y GSBASE value at entry, must be restored in paranoid_exit |
864 | * |
865 | * R14 - old CR3 |
866 | * R15 - old SPEC_CTRL |
867 | */ |
868 | SYM_CODE_START(paranoid_entry) |
869 | ANNOTATE_NOENDBR |
870 | UNWIND_HINT_FUNC |
871 | PUSH_AND_CLEAR_REGS save_ret=1 |
872 | ENCODE_FRAME_POINTER 8 |
873 | |
874 | /* |
875 | * Always stash CR3 in %r14. This value will be restored, |
876 | * verbatim, at exit. Needed if paranoid_entry interrupted |
877 | * another entry that already switched to the user CR3 value |
878 | * but has not yet returned to userspace. |
879 | * |
880 | * This is also why CS (stashed in the "iret frame" by the |
881 | * hardware at entry) can not be used: this may be a return |
882 | * to kernel code, but with a user CR3 value. |
883 | * |
884 | * Switching CR3 does not depend on kernel GSBASE so it can |
885 | * be done before switching to the kernel GSBASE. This is |
886 | * required for FSGSBASE because the kernel GSBASE has to |
887 | * be retrieved from a kernel internal table. |
888 | */ |
889 | SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 |
890 | |
891 | /* |
892 | * Handling GSBASE depends on the availability of FSGSBASE. |
893 | * |
894 | * Without FSGSBASE the kernel enforces that negative GSBASE |
895 | * values indicate kernel GSBASE. With FSGSBASE no assumptions |
896 | * can be made about the GSBASE value when entering from user |
897 | * space. |
898 | */ |
899 | ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE |
900 | |
901 | /* |
902 | * Read the current GSBASE and store it in %rbx unconditionally, |
903 | * retrieve and set the current CPU's kernel GSBASE. The stored value |
904 | * has to be restored in paranoid_exit unconditionally. |
905 | * |
906 | * The unconditional write to GS base below ensures that no subsequent |
907 | * loads based on a mispredicted GS base can happen, therefore no LFENCE |
908 | * is needed here. |
909 | */ |
910 | SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx |
911 | jmp .Lparanoid_gsbase_done |
912 | |
913 | .Lparanoid_entry_checkgs: |
914 | /* EBX = 1 -> kernel GSBASE active, no restore required */ |
915 | movl $1, %ebx |
916 | |
917 | /* |
918 | * The kernel-enforced convention is a negative GSBASE indicates |
919 | * a kernel value. No SWAPGS needed on entry and exit. |
920 | */ |
921 | movl $MSR_GS_BASE, %ecx |
922 | rdmsr |
923 | testl %edx, %edx |
924 | js .Lparanoid_kernel_gsbase |
925 | |
926 | /* EBX = 0 -> SWAPGS required on exit */ |
927 | xorl %ebx, %ebx |
928 | swapgs |
929 | .Lparanoid_kernel_gsbase: |
930 | FENCE_SWAPGS_KERNEL_ENTRY |
931 | .Lparanoid_gsbase_done: |
932 | |
933 | /* |
934 | * Once we have CR3 and %GS set up, save and set SPEC_CTRL. Just like |
935 | * CR3 above, keep the old value in a callee saved register. |
936 | */ |
937 | IBRS_ENTER save_reg=%r15 |
938 | UNTRAIN_RET_FROM_CALL |
939 | |
940 | RET |
941 | SYM_CODE_END(paranoid_entry) |
942 | |
943 | /* |
944 | * "Paranoid" exit path from exception stack. This is invoked |
945 | * only on return from non-NMI IST interrupts that came |
946 | * from kernel space. |
947 | * |
948 | * We may be returning to very strange contexts (e.g. very early |
949 | * in syscall entry), so checking for preemption here would |
950 | * be complicated. Fortunately, there's no good reason to try |
951 | * to handle preemption here. |
952 | * |
953 | * R/EBX contains the GSBASE related information depending on the |
954 | * availability of the FSGSBASE instructions: |
955 | * |
956 | * FSGSBASE R/EBX |
957 | * N 0 -> SWAPGS on exit |
958 | * 1 -> no SWAPGS on exit |
959 | * |
960 | * Y User space GSBASE, must be restored unconditionally |
961 | * |
962 | * R14 - old CR3 |
963 | * R15 - old SPEC_CTRL |
964 | */ |
965 | SYM_CODE_START_LOCAL(paranoid_exit) |
966 | UNWIND_HINT_REGS |
967 | |
968 | /* |
969 | * Must restore IBRS state before both CR3 and %GS since we need access |
970 | * to the per-CPU x86_spec_ctrl_shadow variable. |
971 | */ |
972 | IBRS_EXIT save_reg=%r15 |
973 | |
974 | /* |
975 | * The order of operations is important. PARANOID_RESTORE_CR3 requires |
976 | * kernel GSBASE. |
977 | * |
978 | * NB to anyone trying to optimize this code: this code does |
979 | * not execute at all for exceptions from user mode. Those |
980 | * exceptions go through error_return instead. |
981 | */ |
982 | PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 |
983 | |
984 | /* Handle the three GSBASE cases */ |
985 | ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE |
986 | |
987 | /* With FSGSBASE enabled, unconditionally restore GSBASE */ |
988 | wrgsbase %rbx |
989 | jmp restore_regs_and_return_to_kernel |
990 | |
991 | .Lparanoid_exit_checkgs: |
992 | /* On non-FSGSBASE systems, conditionally do SWAPGS */ |
993 | testl %ebx, %ebx |
994 | jnz restore_regs_and_return_to_kernel |
995 | |
996 | /* We are returning to a context with user GSBASE */ |
997 | swapgs |
998 | jmp restore_regs_and_return_to_kernel |
999 | SYM_CODE_END(paranoid_exit) |
1000 | |
1001 | /* |
1002 | * Switch GS and CR3 if needed. |
1003 | */ |
1004 | SYM_CODE_START(error_entry) |
1005 | ANNOTATE_NOENDBR |
1006 | UNWIND_HINT_FUNC |
1007 | |
1008 | PUSH_AND_CLEAR_REGS save_ret=1 |
1009 | ENCODE_FRAME_POINTER 8 |
1010 | |
1011 | testb $3, CS+8(%rsp) |
1012 | jz .Lerror_kernelspace |
1013 | |
1014 | /* |
1015 | * We entered from user mode or we're pretending to have entered |
1016 | * from user mode due to an IRET fault. |
1017 | */ |
1018 | swapgs |
1019 | FENCE_SWAPGS_USER_ENTRY |
1020 | /* We have user CR3. Change to kernel CR3. */ |
1021 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rax |
1022 | IBRS_ENTER |
1023 | UNTRAIN_RET_FROM_CALL |
1024 | |
1025 | leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ |
1026 | /* Put us onto the real thread stack. */ |
1027 | jmp sync_regs |
1028 | |
1029 | /* |
1030 | * There are two places in the kernel that can potentially fault with |
1031 | * usergs. Handle them here. B stepping K8s sometimes report a |
1032 | * truncated RIP for IRET exceptions returning to compat mode. Check |
1033 | * for these here too. |
1034 | */ |
1035 | .Lerror_kernelspace: |
1036 | leaq native_irq_return_iret(%rip), %rcx |
1037 | cmpq %rcx, RIP+8(%rsp) |
1038 | je .Lerror_bad_iret |
1039 | movl %ecx, %eax /* zero extend */ |
1040 | cmpq %rax, RIP+8(%rsp) |
1041 | je .Lbstep_iret |
1042 | cmpq $.Lgs_change, RIP+8(%rsp) |
1043 | jne .Lerror_entry_done_lfence |
1044 | |
1045 | /* |
1046 | * hack: .Lgs_change can fail with user gsbase. If this happens, fix up |
1047 | * gsbase and proceed. We'll fix up the exception and land in |
1048 | * .Lgs_change's error handler with kernel gsbase. |
1049 | */ |
1050 | swapgs |
1051 | |
1052 | /* |
1053 | * Issue an LFENCE to prevent GS speculation, regardless of whether it is a |
1054 | * kernel or user gsbase. |
1055 | */ |
1056 | .Lerror_entry_done_lfence: |
1057 | FENCE_SWAPGS_KERNEL_ENTRY |
1058 | CALL_DEPTH_ACCOUNT |
1059 | leaq 8(%rsp), %rax /* return pt_regs pointer */ |
1060 | VALIDATE_UNRET_END |
1061 | RET |
1062 | |
1063 | .Lbstep_iret: |
1064 | /* Fix truncated RIP */ |
1065 | movq %rcx, RIP+8(%rsp) |
1066 | /* fall through */ |
1067 | |
1068 | .Lerror_bad_iret: |
1069 | /* |
1070 | * We came from an IRET to user mode, so we have user |
1071 | * gsbase and CR3. Switch to kernel gsbase and CR3: |
1072 | */ |
1073 | swapgs |
1074 | FENCE_SWAPGS_USER_ENTRY |
1075 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rax |
1076 | IBRS_ENTER |
1077 | UNTRAIN_RET_FROM_CALL |
1078 | |
1079 | /* |
1080 | * Pretend that the exception came from user mode: set up pt_regs |
1081 | * as if we faulted immediately after IRET. |
1082 | */ |
1083 | leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */ |
1084 | call fixup_bad_iret |
1085 | mov %rax, %rdi |
1086 | jmp sync_regs |
1087 | SYM_CODE_END(error_entry) |
1088 | |
1089 | SYM_CODE_START_LOCAL(error_return) |
1090 | UNWIND_HINT_REGS |
1091 | DEBUG_ENTRY_ASSERT_IRQS_OFF |
1092 | testb $3, CS(%rsp) |
1093 | jz restore_regs_and_return_to_kernel |
1094 | jmp swapgs_restore_regs_and_return_to_usermode |
1095 | SYM_CODE_END(error_return) |
1096 | |
1097 | /* |
1098 | * Runs on exception stack. Xen PV does not go through this path at all, |
1099 | * so we can use real assembly here. |
1100 | * |
1101 | * Registers: |
1102 | * %r14: Used to save/restore the CR3 of the interrupted context |
1103 | * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber. |
1104 | */ |
1105 | SYM_CODE_START(asm_exc_nmi) |
1106 | UNWIND_HINT_IRET_ENTRY |
1107 | ENDBR |
1108 | |
1109 | /* |
1110 | * We allow breakpoints in NMIs. If a breakpoint occurs, then |
1111 | * the iretq it performs will take us out of NMI context. |
1112 | * This means that we can have nested NMIs where the next |
1113 | * NMI is using the top of the stack of the previous NMI. We |
1114 | * can't let it execute because the nested NMI will corrupt the |
1115 | * stack of the previous NMI. NMI handlers are not re-entrant |
1116 | * anyway. |
1117 | * |
1118 | * To handle this case we do the following: |
1119 | * Check a special location on the stack that contains a |
1120 | * variable that is set when NMIs are executing. |
1121 | * The interrupted task's stack is also checked to see if it |
1122 | * is an NMI stack. |
1123 | * If the variable is not set and the stack is not the NMI |
1124 | * stack then: |
1125 | * o Set the special variable on the stack |
1126 | * o Copy the interrupt frame into an "outermost" location on the |
1127 | * stack |
1128 | * o Copy the interrupt frame into an "iret" location on the stack |
1129 | * o Continue processing the NMI |
1130 | * If the variable is set or the previous stack is the NMI stack: |
1131 | * o Modify the "iret" location to jump to the repeat_nmi |
1132 | * o return back to the first NMI |
1133 | * |
1134 | * Now on exit of the first NMI, we first clear the stack variable |
1135 | * The NMI stack will tell any nested NMIs at that point that it is |
1136 | * nested. Then we pop the stack normally with iret, and if there was |
1137 | * a nested NMI that updated the copy interrupt stack frame, a |
1138 | * jump will be made to the repeat_nmi code that will handle the second |
1139 | * NMI. |
1140 | * |
1141 | * However, espfix prevents us from directly returning to userspace |
1142 | * with a single IRET instruction. Similarly, IRET to user mode |
1143 | * can fault. We therefore handle NMIs from user space like |
1144 | * other IST entries. |
1145 | */ |
1146 | |
1147 | ASM_CLAC |
1148 | cld |
1149 | |
1150 | /* Use %rdx as our temp variable throughout */ |
1151 | pushq %rdx |
1152 | |
1153 | testb $3, CS-RIP+8(%rsp) |
1154 | jz .Lnmi_from_kernel |
1155 | |
1156 | /* |
1157 | * NMI from user mode. We need to run on the thread stack, but we |
1158 | * can't go through the normal entry paths: NMIs are masked, and |
1159 | * we don't want to enable interrupts, because then we'll end |
1160 | * up in an awkward situation in which IRQs are on but NMIs |
1161 | * are off. |
1162 | * |
1163 | * We also must not push anything to the stack before switching |
1164 | * stacks lest we corrupt the "NMI executing" variable. |
1165 | */ |
1166 | |
1167 | swapgs |
1168 | FENCE_SWAPGS_USER_ENTRY |
1169 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx |
1170 | movq %rsp, %rdx |
1171 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
1172 | UNWIND_HINT_IRET_REGS base=%rdx offset=8 |
1173 | pushq 5*8(%rdx) /* pt_regs->ss */ |
1174 | pushq 4*8(%rdx) /* pt_regs->rsp */ |
1175 | pushq 3*8(%rdx) /* pt_regs->flags */ |
1176 | pushq 2*8(%rdx) /* pt_regs->cs */ |
1177 | pushq 1*8(%rdx) /* pt_regs->rip */ |
1178 | UNWIND_HINT_IRET_REGS |
1179 | pushq $-1 /* pt_regs->orig_ax */ |
1180 | PUSH_AND_CLEAR_REGS rdx=(%rdx) |
1181 | ENCODE_FRAME_POINTER |
1182 | |
1183 | IBRS_ENTER |
1184 | UNTRAIN_RET |
1185 | |
1186 | /* |
1187 | * At this point we no longer need to worry about stack damage |
1188 | * due to nesting -- we're on the normal thread stack and we're |
1189 | * done with the NMI stack. |
1190 | */ |
1191 | |
1192 | movq %rsp, %rdi |
1193 | call exc_nmi |
1194 | |
1195 | /* |
1196 | * Return back to user mode. We must *not* do the normal exit |
1197 | * work, because we don't want to enable interrupts. |
1198 | */ |
1199 | jmp swapgs_restore_regs_and_return_to_usermode |
1200 | |
1201 | .Lnmi_from_kernel: |
1202 | /* |
1203 | * Here's what our stack frame will look like: |
1204 | * +---------------------------------------------------------+ |
1205 | * | original SS | |
1206 | * | original Return RSP | |
1207 | * | original RFLAGS | |
1208 | * | original CS | |
1209 | * | original RIP | |
1210 | * +---------------------------------------------------------+ |
1211 | * | temp storage for rdx | |
1212 | * +---------------------------------------------------------+ |
1213 | * | "NMI executing" variable | |
1214 | * +---------------------------------------------------------+ |
1215 | * | iret SS } Copied from "outermost" frame | |
1216 | * | iret Return RSP } on each loop iteration; overwritten | |
1217 | * | iret RFLAGS } by a nested NMI to force another | |
1218 | * | iret CS } iteration if needed. | |
1219 | * | iret RIP } | |
1220 | * +---------------------------------------------------------+ |
1221 | * | outermost SS } initialized in first_nmi; | |
1222 | * | outermost Return RSP } will not be changed before | |
1223 | * | outermost RFLAGS } NMI processing is done. | |
1224 | * | outermost CS } Copied to "iret" frame on each | |
1225 | * | outermost RIP } iteration. | |
1226 | * +---------------------------------------------------------+ |
1227 | * | pt_regs | |
1228 | * +---------------------------------------------------------+ |
1229 | * |
1230 | * The "original" frame is used by hardware. Before re-enabling |
1231 | * NMIs, we need to be done with it, and we need to leave enough |
1232 | * space for the asm code here. |
1233 | * |
1234 | * We return by executing IRET while RSP points to the "iret" frame. |
1235 | * That will either return for real or it will loop back into NMI |
1236 | * processing. |
1237 | * |
1238 | * The "outermost" frame is copied to the "iret" frame on each |
1239 | * iteration of the loop, so each iteration starts with the "iret" |
1240 | * frame pointing to the final return target. |
1241 | */ |
1242 | |
1243 | /* |
1244 | * Determine whether we're a nested NMI. |
1245 | * |
1246 | * If we interrupted kernel code between repeat_nmi and |
1247 | * end_repeat_nmi, then we are a nested NMI. We must not |
1248 | * modify the "iret" frame because it's being written by |
1249 | * the outer NMI. That's okay; the outer NMI handler is |
1250 | * about to call exc_nmi() anyway, so we can just resume |
1251 | * the outer NMI. |
1252 | */ |
1253 | |
1254 | movq $repeat_nmi, %rdx |
1255 | cmpq 8(%rsp), %rdx |
1256 | ja 1f |
1257 | movq $end_repeat_nmi, %rdx |
1258 | cmpq 8(%rsp), %rdx |
1259 | ja nested_nmi_out |
1260 | 1: |
1261 | |
1262 | /* |
1263 | * Now check "NMI executing". If it's set, then we're nested. |
1264 | * This will not detect if we interrupted an outer NMI just |
1265 | * before IRET. |
1266 | */ |
1267 | cmpl $1, -8(%rsp) |
1268 | je nested_nmi |
1269 | |
1270 | /* |
1271 | * Now test if the previous stack was an NMI stack. This covers |
1272 | * the case where we interrupt an outer NMI after it clears |
1273 | * "NMI executing" but before IRET. We need to be careful, though: |
1274 | * there is one case in which RSP could point to the NMI stack |
1275 | * despite there being no NMI active: naughty userspace controls |
1276 | * RSP at the very beginning of the SYSCALL targets. We can |
1277 | * pull a fast one on naughty userspace, though: we program |
1278 | * SYSCALL to mask DF, so userspace cannot cause DF to be set |
1279 | * if it controls the kernel's RSP. We set DF before we clear |
1280 | * "NMI executing". |
1281 | */ |
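/*
 * Illustrative arithmetic for the range check below: %rdx is set to the
 * top of this NMI stack (the six quadwords above %rsp are the saved %rdx
 * slot plus the five-word hardware iret frame). The interrupted RSP
 * saved at 4*8(%rsp) is considered "within the NMI stack" iff
 * rdx - EXCEPTION_STKSZ <= saved RSP <= rdx.
 */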
1282 | lea 6*8(%rsp), %rdx |
1283 | /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ |
1284 | cmpq %rdx, 4*8(%rsp) |
1285 | /* If the stack pointer is above the NMI stack, this is a normal NMI */ |
1286 | ja first_nmi |
1287 | |
1288 | subq $EXCEPTION_STKSZ, %rdx |
1289 | cmpq %rdx, 4*8(%rsp) |
1290 | /* If it is below the NMI stack, it is a normal NMI */ |
1291 | jb first_nmi |
1292 | |
1293 | /* Ah, it is within the NMI stack. */ |
1294 | |
1295 | testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp) |
1296 | jz first_nmi /* RSP was user controlled. */ |
1297 | |
1298 | /* This is a nested NMI. */ |
1299 | |
1300 | nested_nmi: |
1301 | /* |
1302 | * Modify the "iret" frame to point to repeat_nmi, forcing another |
1303 | * iteration of NMI handling. |
1304 | */ |
1305 | subq $8, %rsp |
1306 | leaq -10*8(%rsp), %rdx |
1307 | pushq $__KERNEL_DS |
1308 | pushq %rdx |
1309 | pushfq |
1310 | pushq $__KERNEL_CS |
1311 | pushq $repeat_nmi |
1312 | |
1313 | /* Put stack back */ |
1314 | addq $(6*8), %rsp |
1315 | |
1316 | nested_nmi_out: |
1317 | popq %rdx |
1318 | |
1319 | /* We are returning to kernel mode, so this cannot result in a fault. */ |
1320 | iretq |
1321 | |
1322 | first_nmi: |
1323 | /* Restore rdx. */ |
1324 | movq (%rsp), %rdx |
1325 | |
1326 | /* Make room for "NMI executing". */ |
1327 | pushq $0 |
1328 | |
1329 | /* Leave room for the "iret" frame */ |
1330 | subq $(5*8), %rsp |
1331 | |
1332 | /* Copy the "original" frame to the "outermost" frame */ |
1333 | .rept 5 |
1334 | pushq 11*8(%rsp) |
1335 | .endr |
1336 | UNWIND_HINT_IRET_REGS |
1337 | |
1338 | /* Everything up to here is safe from nested NMIs */ |
1339 | |
1340 | #ifdef CONFIG_DEBUG_ENTRY |
1341 | /* |
1342 | * For ease of testing, unmask NMIs right away. Disabled by |
1343 | * default because IRET is very expensive. |
1344 | */ |
1345 | pushq $0 /* SS */ |
1346 | pushq %rsp /* RSP (minus 8 because of the previous push) */ |
1347 | addq $8, (%rsp) /* Fix up RSP */ |
1348 | pushfq /* RFLAGS */ |
1349 | pushq $__KERNEL_CS /* CS */ |
1350 | pushq $1f /* RIP */ |
1351 | iretq /* continues at repeat_nmi below */ |
1352 | UNWIND_HINT_IRET_REGS |
1353 | 1: |
1354 | #endif |
1355 | |
1356 | repeat_nmi: |
1357 | ANNOTATE_NOENDBR // this code |
1358 | /* |
1359 | * If there was a nested NMI, the first NMI's iret will return |
1360 | * here. But NMIs are still enabled and we can take another |
1361 | * nested NMI. The nested NMI checks the interrupted RIP to see |
1362 | * if it is between repeat_nmi and end_repeat_nmi, and if so |
1363 | * it will just return, as we are about to repeat an NMI anyway. |
1364 | * This makes it safe to copy to the stack frame that a nested |
1365 | * NMI will update. |
1366 | * |
1367 | * RSP is pointing to "outermost RIP". gsbase is unknown, but, if |
1368 | * we're repeating an NMI, gsbase has the same value that it had on |
1369 | * the first iteration. paranoid_entry will load the kernel |
1370 | * gsbase if needed before we call exc_nmi(). "NMI executing" |
1371 | * is zero. |
1372 | */ |
1373 | movq $1, 10*8(%rsp) /* Set "NMI executing". */ |
1374 | |
1375 | /* |
1376 | * Copy the "outermost" frame to the "iret" frame. NMIs that nest |
1377 | * here must not modify the "iret" frame while we're writing to |
1378 | * it or it will end up containing garbage. |
1379 | */ |
1380 | addq $(10*8), %rsp |
1381 | .rept 5 |
1382 | pushq -6*8(%rsp) |
1383 | .endr |
1384 | subq $(5*8), %rsp |
1385 | end_repeat_nmi: |
1386 | ANNOTATE_NOENDBR // this code |
1387 | |
1388 | /* |
1389 | * Everything below this point can be preempted by a nested NMI. |
1390 | * If this happens, then the inner NMI will change the "iret" |
1391 | * frame to point back to repeat_nmi. |
1392 | */ |
1393 | pushq $-1 /* ORIG_RAX: no syscall to restart */ |
1394 | |
1395 | /* |
1396 | * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
1397 | * as we should not be calling schedule in NMI context, even with |
1398 | * normal interrupts enabled. An NMI should not be |
1399 | * setting NEED_RESCHED or anything that normal interrupts and |
1400 | * exceptions might do. |
1401 | */ |
1402 | call paranoid_entry |
1403 | UNWIND_HINT_REGS |
1404 | |
1405 | movq %rsp, %rdi |
1406 | call exc_nmi |
1407 | |
1408 | /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ |
1409 | IBRS_EXIT save_reg=%r15 |
1410 | |
1411 | PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 |
1412 | |
1413 | /* |
1414 | * The above invocation of paranoid_entry stored the GSBASE |
1415 | * related information in R/EBX depending on the availability |
1416 | * of FSGSBASE. |
1417 | * |
1418 | * If FSGSBASE is enabled, restore the saved GSBASE value |
1419 | * unconditionally, otherwise take the conditional SWAPGS path. |
1420 | */ |
1421 | ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE |
1422 | |
1423 | wrgsbase %rbx |
1424 | jmp nmi_restore |
1425 | |
1426 | nmi_no_fsgsbase: |
1427 | /* EBX == 0 -> invoke SWAPGS */ |
1428 | testl %ebx, %ebx |
1429 | jnz nmi_restore |
1430 | |
1431 | nmi_swapgs: |
1432 | swapgs |
1433 | |
1434 | nmi_restore: |
1435 | POP_REGS |
1436 | |
1437 | /* |
1438 | * Skip orig_ax and the "outermost" frame to point RSP at the "iret" |
1439 | * at the "iret" frame. |
1440 | */ |
1441 | addq $6*8, %rsp |
1442 | |
1443 | /* |
1444 | * Clear "NMI executing". Set DF first so that we can easily |
1445 | * distinguish the remaining code between here and IRET from |
1446 | * the SYSCALL entry and exit paths. |
1447 | * |
1448 | * We arguably should just inspect RIP instead, but I (Andy) wrote |
1449 | * this code when I had the misapprehension that Xen PV supported |
1450 | * NMIs, and Xen PV would break that approach. |
1451 | */ |
1452 | std |
1453 | movq $0, 5*8(%rsp) /* clear "NMI executing" */ |
1454 | |
1455 | /* |
1456 | * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like |
1457 | * an NMI in the kernel after user state is restored. For an unprivileged user |
1458 | * these conditions are hard to meet. |
1459 | */ |
1460 | |
1461 | /* |
1462 | * iretq reads the "iret" frame and exits the NMI stack in a |
1463 | * single instruction. We are returning to kernel mode, so this |
1464 | * cannot result in a fault. Similarly, we don't need to worry |
1465 | * about espfix64 on the way back to kernel mode. |
1466 | */ |
1467 | iretq |
1468 | SYM_CODE_END(asm_exc_nmi) |
1469 | |
1470 | /* |
1471 | * This handles SYSCALL from 32-bit code. There is no way to program |
1472 | * MSRs to fully disable 32-bit SYSCALL. |
1473 | */ |
1474 | SYM_CODE_START(entry_SYSCALL32_ignore) |
1475 | UNWIND_HINT_END_OF_STACK |
1476 | ENDBR |
1477 | mov $-ENOSYS, %eax |
1478 | CLEAR_CPU_BUFFERS |
1479 | sysretl |
1480 | SYM_CODE_END(entry_SYSCALL32_ignore) |
1481 | |
1482 | .pushsection .text, "ax" |
1483 | __FUNC_ALIGN |
1484 | SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) |
1485 | UNWIND_HINT_FUNC |
1486 | /* Prevent any naive code from trying to unwind to our caller. */ |
1487 | xorl %ebp, %ebp |
1488 | |
1489 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rax |
1490 | leaq -PTREGS_SIZE(%rax), %rsp |
1491 | UNWIND_HINT_REGS |
1492 | |
1493 | call make_task_dead |
1494 | SYM_CODE_END(rewind_stack_and_make_dead) |
1495 | .popsection |
1496 | |
1497 | /* |
1498 | * This sequence executes branches in order to remove user branch information |
1499 | * from the branch history tracker in the Branch Predictor, therefore removing |
1500 | * user influence on subsequent BTB lookups. |
1501 | * |
1502 | * It should be used on parts prior to Alder Lake. Newer parts should use the |
1503 | * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being |
1504 | * virtualized on newer hardware the VMM should protect against BHI attacks by |
1505 | * setting BHI_DIS_S for the guests. |
1506 | * |
1507 | * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging |
1508 | * and not clearing the branch history. The call tree looks like: |
1509 | * |
1510 | * call 1 |
1511 | * call 2 |
1512 | * call 2 |
1513 | * call 2 |
1514 | * call 2 |
1515 | * call 2 |
1516 | * ret |
1517 | * ret |
1518 | * ret |
1519 | * ret |
1520 | * ret |
1521 | * ret |
1522 | * |
1523 | * This means that the stack is non-constant and ORC can't unwind it with %rsp |
1524 | * alone. Therefore we unconditionally set up the frame pointer, which allows |
1525 | * ORC to unwind properly. |
1526 | * |
1527 | * The alignment is for performance and not for safety, and may be safely |
1528 | * refactored in the future if needed. The .skips are for safety, to ensure |
1529 | * that all RETs are in the second half of a cacheline to mitigate Indirect |
1530 | * Target Selection, rather than taking the slowpath via its_return_thunk. |
1531 | */ |
1532 | SYM_FUNC_START(clear_bhb_loop) |
1533 | ANNOTATE_NOENDBR |
1534 | push %rbp |
1535 | mov %rsp, %rbp |
1536 | movl $5, %ecx |
1537 | ANNOTATE_INTRA_FUNCTION_CALL |
1538 | call 1f |
1539 | jmp 5f |
1540 | .align 64, 0xcc |
1541 | /* |
1542 | * Shift instructions so that the RET is in the upper half of the |
1543 | * cacheline and doesn't take the slowpath to its_return_thunk. |
1544 | */ |
1545 | .skip 32 - (.Lret1 - 1f), 0xcc |
1546 | ANNOTATE_INTRA_FUNCTION_CALL |
1547 | 1: call 2f |
1548 | .Lret1: RET |
1549 | .align 64, 0xcc |
1550 | /* |
1551 | * As above shift instructions for RET at .Lret2 as well. |
1552 | * |
1553 | * This should ideally be: .skip 32 - (.Lret2 - 2f), 0xcc |
1554 | * but some Clang versions (e.g. 18) don't like this. |
1555 | */ |
1556 | .skip 32 - 18, 0xcc |
1557 | 2: movl $5, %eax |
1558 | 3: jmp 4f |
1559 | nop |
1560 | 4: sub $1, %eax |
1561 | jnz 3b |
1562 | sub $1, %ecx |
1563 | jnz 1b |
1564 | .Lret2: RET |
1565 | 5: lfence |
1566 | pop %rbp |
1567 | RET |
1568 | SYM_FUNC_END(clear_bhb_loop) |
1569 | EXPORT_SYMBOL_GPL(clear_bhb_loop) |
1570 | STACK_FRAME_NON_STANDARD(clear_bhb_loop) |
1571 |