1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * Compatibility mode system call entry point for x86-64. |
4 | * |
5 | * Copyright 2000-2002 Andi Kleen, SuSE Labs. |
6 | */ |
7 | #include <asm/asm-offsets.h> |
8 | #include <asm/current.h> |
9 | #include <asm/errno.h> |
10 | #include <asm/ia32_unistd.h> |
11 | #include <asm/thread_info.h> |
12 | #include <asm/segment.h> |
13 | #include <asm/irqflags.h> |
14 | #include <asm/asm.h> |
15 | #include <asm/smap.h> |
16 | #include <asm/nospec-branch.h> |
17 | #include <linux/linkage.h> |
18 | #include <linux/err.h> |
19 | |
20 | #include "calling.h" |
21 | |
22 | .section .entry.text, "ax" |
23 | |
24 | /* |
25 | * 32-bit SYSENTER entry. |
26 | * |
27 | * 32-bit system calls through the vDSO's __kernel_vsyscall enter here |
28 | * on 64-bit kernels running on Intel CPUs. |
29 | * |
30 | * The SYSENTER instruction, in principle, should *only* occur in the |
31 | * vDSO. In practice, a small number of Android devices were shipped |
32 | * with a copy of Bionic that inlined a SYSENTER instruction. This |
33 | * never happened in any of Google's Bionic versions -- it only happened |
34 | * in a narrow range of Intel-provided versions. |
35 | * |
36 | * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. |
37 | * IF and VM in RFLAGS are cleared (IOW: interrupts are off). |
38 | * SYSENTER does not save anything on the stack, |
39 | * and does not save old RIP (!!!), RSP, or RFLAGS. |
40 | * |
41 | * Arguments: |
42 | * eax system call number |
43 | * ebx arg1 |
44 | * ecx arg2 |
45 | * edx arg3 |
46 | * esi arg4 |
47 | * edi arg5 |
48 | * ebp user stack |
49 | * 0(%ebp) arg6 |
50 | */ |
51 | SYM_CODE_START(entry_SYSENTER_compat) |
52 | UNWIND_HINT_ENTRY |
53 | ENDBR |
54 | /* Interrupts are off on entry; switch from the user to the kernel GS base. */ |
55 | swapgs |
56 | |
57 | pushq %rax /* preserve the syscall number: %rax is the CR3-switch scratch reg */ |
58 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rax |
59 | popq %rax |
60 | |
61 | movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* switch to the kernel stack */ |
62 | |
63 | /* Construct struct pt_regs on stack */ |
64 | pushq $__USER_DS /* pt_regs->ss */ |
65 | pushq $0 /* pt_regs->sp = 0 (placeholder) */ |
66 | |
67 | /* |
68 | * Push flags. This is nasty. First, interrupts are currently |
69 | * off, but we need pt_regs->flags to have IF set. Second, if TF |
70 | * was set in usermode, it's still set, and we're singlestepping |
71 | * through this code. do_SYSENTER_32() will fix up IF. |
72 | */ |
73 | pushfq /* pt_regs->flags (except IF = 0) */ |
74 | pushq $__USER32_CS /* pt_regs->cs */ |
75 | pushq $0 /* pt_regs->ip = 0 (placeholder) */ |
76 | SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) |
77 | |
78 | /* |
79 | * User tracing code (ptrace or signal handlers) might assume that |
80 | * the saved RAX contains a 32-bit number when we're invoking a 32-bit |
81 | * syscall. Just in case the high bits are nonzero, zero-extend |
82 | * the syscall number. (This could almost certainly be deleted |
83 | * with no ill effects.) |
84 | */ |
85 | movl %eax, %eax |
86 | pushq %rax /* pt_regs->orig_ax */ |
87 | PUSH_AND_CLEAR_REGS rax=$-ENOSYS |
88 | UNWIND_HINT_REGS |
89 | |
90 | cld /* SYSENTER doesn't filter flags, so DF is whatever userspace left */ |
91 | |
92 | IBRS_ENTER |
93 | UNTRAIN_RET |
94 | CLEAR_BRANCH_HISTORY |
95 | |
96 | /* |
97 | * SYSENTER doesn't filter flags, so we need to clear NT and AC |
98 | * ourselves. To save a few cycles, we can check whether |
99 | * either was set instead of doing an unconditional popfq. |
100 | * This needs to happen before enabling interrupts so that |
101 | * we don't get preempted with NT set. |
102 | * |
103 | * If TF is set, we will single-step all the way to here -- do_debug |
104 | * will ignore all the traps. (Yes, this is slow, but so is |
105 | * single-stepping in general. This allows us to avoid having |
106 | * a more complicated code to handle the case where a user program |
107 | * forces us to single-step through the SYSENTER entry code.) |
108 | * |
109 | * NB.: .Lsysenter_fix_flags is a label with the code under it moved |
110 | * out-of-line as an optimization: NT, AC and TF are unlikely to be set |
111 | * in the majority of the cases and instead of polluting the I$ |
112 | * unnecessarily, we're keeping that code behind a branch which will |
113 | * predict as not-taken and therefore its instructions won't be fetched. |
114 | */ |
115 | testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) |
116 | jnz .Lsysenter_fix_flags |
117 | .Lsysenter_flags_fixed: |
118 | |
119 | movq %rsp, %rdi /* arg1 = pointer to the pt_regs frame built above */ |
120 | call do_SYSENTER_32 |
121 | jmp sysret32_from_system_call /* exit path shared with entry_SYSCALL_compat */ |
122 | |
123 | .Lsysenter_fix_flags: |
124 | pushq $X86_EFLAGS_FIXED /* reload RFLAGS with only the fixed bit: clears NT, AC and TF */ |
125 | popfq |
126 | jmp .Lsysenter_flags_fixed |
127 | SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL) |
128 | ANNOTATE_NOENDBR /* range-end marker, never a branch target; used by is_sysenter_singlestep() (outside this file) */ |
129 | SYM_CODE_END(entry_SYSENTER_compat) |
130 | |
131 | /* |
132 | * 32-bit SYSCALL entry. |
133 | * |
134 | * 32-bit system calls through the vDSO's __kernel_vsyscall enter here |
135 | * on 64-bit kernels running on AMD CPUs. |
136 | * |
137 | * The SYSCALL instruction, in principle, should *only* occur in the |
138 | * vDSO. In practice, it appears that this really is the case. |
139 | * As evidence: |
140 | * |
141 | * - The calling convention for SYSCALL has changed several times without |
142 | * anyone noticing. |
143 | * |
144 | * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, any |
145 | * user task that did SYSCALL without immediately reloading SS |
146 | * would randomly crash. |
147 | * |
148 | * - Most programmers do not directly target AMD CPUs, and the 32-bit |
149 | * SYSCALL instruction does not exist on Intel CPUs. Even on AMD |
150 | * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels |
151 | * because the SYSCALL instruction in legacy/native 32-bit mode (as |
152 | * opposed to compat mode) is sufficiently poorly designed as to be |
153 | * essentially unusable. |
154 | * |
155 | * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves |
156 | * RFLAGS to R11, then loads new SS, CS, and RIP from previously |
157 | * programmed MSRs. RFLAGS gets masked by a value from another MSR |
158 | * (so CLD and CLAC are not needed). SYSCALL does not save anything on |
159 | * the stack and does not change RSP. |
160 | * |
161 | * Note: RFLAGS saving+masking-with-MSR happens only in Long mode |
162 | * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). |
163 | * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit |
164 | * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes |
165 | * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). |
166 | * |
167 | * Arguments: |
168 | * eax system call number |
169 | * ecx return address |
170 | * ebx arg1 |
171 | * ebp arg2 (note: not saved in the stack frame, should not be touched) |
172 | * edx arg3 |
173 | * esi arg4 |
174 | * edi arg5 |
175 | * esp user stack |
176 | * 0(%esp) arg6 |
177 | */ |
178 | SYM_CODE_START(entry_SYSCALL_compat) |
179 | UNWIND_HINT_ENTRY |
180 | ENDBR |
181 | /* Interrupts are off on entry; %rsp still holds the user stack pointer. */ |
182 | swapgs |
183 | |
184 | /* Stash user ESP (the 32-bit move zero-extends it into %r8) */ |
185 | movl %esp, %r8d |
186 | |
187 | /* Use %rsp as scratch reg. User ESP is stashed in r8 */ |
188 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp |
189 | |
190 | /* Switch to the kernel stack */ |
191 | movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp |
192 | |
193 | SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) |
194 | ANNOTATE_NOENDBR |
195 | |
196 | /* Construct struct pt_regs on stack */ |
197 | pushq $__USER_DS /* pt_regs->ss */ |
198 | pushq %r8 /* pt_regs->sp (user ESP stashed above) */ |
199 | pushq %r11 /* pt_regs->flags (user RFLAGS, saved in R11 by SYSCALL) */ |
200 | pushq $__USER32_CS /* pt_regs->cs */ |
201 | pushq %rcx /* pt_regs->ip (return address, saved in RCX by SYSCALL) */ |
202 | SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) |
203 | movl %eax, %eax /* discard orig_ax high bits */ |
204 | pushq %rax /* pt_regs->orig_ax */ |
205 | PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS /* NOTE(review): presumably stores %rbp (arg2) in the cx slot and -ENOSYS in the ax slot; confirm in calling.h */ |
206 | UNWIND_HINT_REGS |
207 | |
208 | IBRS_ENTER |
209 | UNTRAIN_RET |
210 | CLEAR_BRANCH_HISTORY |
211 | |
212 | movq %rsp, %rdi /* arg1 = pointer to the pt_regs frame built above */ |
213 | call do_fast_syscall_32 |
214 | |
215 | sysret32_from_system_call: |
216 | /* %al == 0 from the C handler takes the IRET path; XEN PV guests always use IRET path */ |
217 | ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode" , \ |
218 | "jmp swapgs_restore_regs_and_return_to_usermode" , X86_FEATURE_XENPV |
219 | |
220 | /* |
221 | * Opportunistic SYSRET |
222 | * |
223 | * We are not going to return to userspace from the trampoline |
224 | * stack. So let's erase the thread stack right now. |
225 | */ |
226 | STACKLEAK_ERASE |
227 | |
228 | IBRS_EXIT |
229 | |
230 | movq RBX(%rsp), %rbx /* pt_regs->rbx */ |
231 | movq RBP(%rsp), %rbp /* pt_regs->rbp */ |
232 | movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ |
233 | movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */ |
234 | addq $RAX, %rsp /* Skip r8-r15 */ |
235 | popq %rax /* pt_regs->rax */ |
236 | popq %rdx /* Skip pt_regs->cx (overwritten by the next pop) */ |
237 | popq %rdx /* pt_regs->dx */ |
238 | popq %rsi /* pt_regs->si */ |
239 | popq %rdi /* pt_regs->di */ |
240 | |
241 | /* |
242 | * The swapgs + sysretl sequence at the end of this path does: |
243 | * GSBASE = user's GS base |
244 | * EIP = ECX |
245 | * RFLAGS = R11 |
246 | * CS = __USER32_CS |
247 | * SS = __USER_DS |
248 | * |
249 | * ECX will not match pt_regs->cx, but we're returning to a vDSO |
250 | * trampoline that will fix up RCX, so this is okay. |
251 | * |
252 | * R12-R15 are callee-saved, so they contain whatever was in them |
253 | * when the system call started, which is already known to user |
254 | * code. We zero R8-R10 to avoid info leaks. |
255 | */ |
256 | movq RSP-ORIG_RAX(%rsp), %rsp /* load the user stack pointer from pt_regs->sp */ |
257 | SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) |
258 | ANNOTATE_NOENDBR |
259 | |
260 | /* |
261 | * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored |
262 | * on the process stack which is not mapped to userspace and |
263 | * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 |
264 | * switch until after the last reference to the process |
265 | * stack. |
266 | * |
267 | * %r8/%r9 are zeroed before the sysret, thus safe to clobber. |
268 | */ |
269 | SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 |
270 | |
271 | xorl %r8d, %r8d |
272 | xorl %r9d, %r9d |
273 | xorl %r10d, %r10d |
274 | swapgs /* restore the user's GS base */ |
275 | CLEAR_CPU_BUFFERS |
276 | sysretl |
277 | SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) |
278 | ANNOTATE_NOENDBR |
279 | int3 /* not reached architecturally: sysretl transfers control to user mode */ |
280 | SYM_CODE_END(entry_SYSCALL_compat) |
281 | |
282 | /* |
283 | * int 0x80 is used by 32-bit mode as a system call entry. Normally IDT entries |
284 | * point to C routines, however since this is a system call interface the branch |
285 | * history needs to be scrubbed to protect against BHI attacks, and that |
286 | * scrubbing needs to take place in assembly code prior to entering any C |
287 | * routines. |
288 | */ |
289 | SYM_CODE_START(int80_emulation) |
290 | ANNOTATE_NOENDBR |
291 | UNWIND_HINT_FUNC |
292 | CLEAR_BRANCH_HISTORY /* scrub branch history before any C code runs */ |
293 | jmp do_int80_emulation /* tail-jump (no call): control does not come back to this stub */ |
294 | SYM_CODE_END(int80_emulation) |
295 | |