// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */
9 | |
10 | #include <linux/kernel.h> |
11 | #include <linux/sched.h> |
12 | #include <linux/sched/task_stack.h> |
13 | #include <linux/entry-common.h> |
14 | #include <linux/mm.h> |
15 | #include <linux/smp.h> |
16 | #include <linux/errno.h> |
17 | #include <linux/ptrace.h> |
18 | #include <linux/export.h> |
19 | #include <linux/nospec.h> |
20 | #include <linux/syscalls.h> |
21 | #include <linux/uaccess.h> |
22 | #include <linux/init.h> |
23 | |
24 | #ifdef CONFIG_XEN_PV |
25 | #include <xen/xen-ops.h> |
26 | #include <xen/events.h> |
27 | #endif |
28 | |
29 | #include <asm/apic.h> |
30 | #include <asm/desc.h> |
31 | #include <asm/traps.h> |
32 | #include <asm/vdso.h> |
33 | #include <asm/cpufeature.h> |
34 | #include <asm/fpu/api.h> |
35 | #include <asm/nospec-branch.h> |
36 | #include <asm/io_bitmap.h> |
37 | #include <asm/syscall.h> |
38 | #include <asm/irq_stack.h> |
39 | |
40 | #ifdef CONFIG_X86_64 |
41 | |
42 | static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) |
43 | { |
44 | /* |
45 | * Convert negative numbers to very high and thus out of range |
46 | * numbers for comparisons. |
47 | */ |
48 | unsigned int unr = nr; |
49 | |
50 | if (likely(unr < NR_syscalls)) { |
51 | unr = array_index_nospec(unr, NR_syscalls); |
52 | regs->ax = x64_sys_call(regs, nr: unr); |
53 | return true; |
54 | } |
55 | return false; |
56 | } |
57 | |
58 | static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) |
59 | { |
60 | /* |
61 | * Adjust the starting offset of the table, and convert numbers |
62 | * < __X32_SYSCALL_BIT to very high and thus out of range |
63 | * numbers for comparisons. |
64 | */ |
65 | unsigned int xnr = nr - __X32_SYSCALL_BIT; |
66 | |
67 | if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { |
68 | xnr = array_index_nospec(xnr, X32_NR_syscalls); |
69 | regs->ax = x32_sys_call(regs, nr: xnr); |
70 | return true; |
71 | } |
72 | return false; |
73 | } |
74 | |
75 | /* Returns true to return using SYSRET, or false to use IRET */ |
76 | __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) |
77 | { |
78 | add_random_kstack_offset(); |
79 | nr = syscall_enter_from_user_mode(regs, syscall: nr); |
80 | |
81 | instrumentation_begin(); |
82 | |
83 | if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { |
84 | /* Invalid system call, but still a system call. */ |
85 | regs->ax = __x64_sys_ni_syscall(regs); |
86 | } |
87 | |
88 | instrumentation_end(); |
89 | syscall_exit_to_user_mode(regs); |
90 | |
91 | /* |
92 | * Check that the register state is valid for using SYSRET to exit |
93 | * to userspace. Otherwise use the slower but fully capable IRET |
94 | * exit path. |
95 | */ |
96 | |
97 | /* XEN PV guests always use the IRET path */ |
98 | if (cpu_feature_enabled(X86_FEATURE_XENPV)) |
99 | return false; |
100 | |
101 | /* SYSRET requires RCX == RIP and R11 == EFLAGS */ |
102 | if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) |
103 | return false; |
104 | |
105 | /* CS and SS must match the values set in MSR_STAR */ |
106 | if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) |
107 | return false; |
108 | |
109 | /* |
110 | * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP |
111 | * in kernel space. This essentially lets the user take over |
112 | * the kernel, since userspace controls RSP. |
113 | * |
114 | * TASK_SIZE_MAX covers all user-accessible addresses other than |
115 | * the deprecated vsyscall page. |
116 | */ |
117 | if (unlikely(regs->ip >= TASK_SIZE_MAX)) |
118 | return false; |
119 | |
120 | /* |
121 | * SYSRET cannot restore RF. It can restore TF, but unlike IRET, |
122 | * restoring TF results in a trap from userspace immediately after |
123 | * SYSRET. |
124 | */ |
125 | if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) |
126 | return false; |
127 | |
128 | /* Use SYSRET to exit to userspace */ |
129 | return true; |
130 | } |
131 | #endif |
132 | |
133 | #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) |
134 | static __always_inline int syscall_32_enter(struct pt_regs *regs) |
135 | { |
136 | if (IS_ENABLED(CONFIG_IA32_EMULATION)) |
137 | current_thread_info()->status |= TS_COMPAT; |
138 | |
139 | return (int)regs->orig_ax; |
140 | } |
141 | |
142 | #ifdef CONFIG_IA32_EMULATION |
143 | bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); |
144 | |
145 | static int ia32_emulation_override_cmdline(char *arg) |
146 | { |
147 | return kstrtobool(s: arg, res: &__ia32_enabled); |
148 | } |
149 | early_param("ia32_emulation" , ia32_emulation_override_cmdline); |
150 | #endif |
151 | |
152 | /* |
153 | * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. |
154 | */ |
155 | static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) |
156 | { |
157 | /* |
158 | * Convert negative numbers to very high and thus out of range |
159 | * numbers for comparisons. |
160 | */ |
161 | unsigned int unr = nr; |
162 | |
163 | if (likely(unr < IA32_NR_syscalls)) { |
164 | unr = array_index_nospec(unr, IA32_NR_syscalls); |
165 | regs->ax = ia32_sys_call(regs, nr: unr); |
166 | } else if (nr != -1) { |
167 | regs->ax = __ia32_sys_ni_syscall(regs); |
168 | } |
169 | } |
170 | |
171 | #ifdef CONFIG_IA32_EMULATION |
172 | static __always_inline bool int80_is_external(void) |
173 | { |
174 | const unsigned int offs = (0x80 / 32) * 0x10; |
175 | const u32 bit = BIT(0x80 % 32); |
176 | |
177 | /* The local APIC on XENPV guests is fake */ |
178 | if (cpu_feature_enabled(X86_FEATURE_XENPV)) |
179 | return false; |
180 | |
181 | /* |
182 | * If vector 0x80 is set in the APIC ISR then this is an external |
183 | * interrupt. Either from broken hardware or injected by a VMM. |
184 | * |
185 | * Note: In guest mode this is only valid for secure guests where |
186 | * the secure module fully controls the vAPIC exposed to the guest. |
187 | */ |
188 | return apic_read(APIC_ISR + offs) & bit; |
189 | } |
190 | |
191 | /** |
192 | * do_int80_emulation - 32-bit legacy syscall C entry from asm |
193 | * |
194 | * This entry point can be used by 32-bit and 64-bit programs to perform |
195 | * 32-bit system calls. Instances of INT $0x80 can be found inline in |
196 | * various programs and libraries. It is also used by the vDSO's |
197 | * __kernel_vsyscall fallback for hardware that doesn't support a faster |
198 | * entry method. Restarted 32-bit system calls also fall back to INT |
199 | * $0x80 regardless of what instruction was originally used to do the |
200 | * system call. |
201 | * |
202 | * This is considered a slow path. It is not used by most libc |
203 | * implementations on modern hardware except during process startup. |
204 | * |
205 | * The arguments for the INT $0x80 based syscall are on stack in the |
206 | * pt_regs structure: |
207 | * eax: system call number |
208 | * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 |
209 | */ |
210 | __visible noinstr void do_int80_emulation(struct pt_regs *regs) |
211 | { |
212 | int nr; |
213 | |
214 | /* Kernel does not use INT $0x80! */ |
215 | if (unlikely(!user_mode(regs))) { |
216 | irqentry_enter(regs); |
217 | instrumentation_begin(); |
218 | panic(fmt: "Unexpected external interrupt 0x80\n" ); |
219 | } |
220 | |
221 | /* |
222 | * Establish kernel context for instrumentation, including for |
223 | * int80_is_external() below which calls into the APIC driver. |
224 | * Identical for soft and external interrupts. |
225 | */ |
226 | enter_from_user_mode(regs); |
227 | |
228 | instrumentation_begin(); |
229 | add_random_kstack_offset(); |
230 | |
231 | /* Validate that this is a soft interrupt to the extent possible */ |
232 | if (unlikely(int80_is_external())) |
233 | panic(fmt: "Unexpected external interrupt 0x80\n" ); |
234 | |
235 | /* |
236 | * The low level idtentry code pushed -1 into regs::orig_ax |
237 | * and regs::ax contains the syscall number. |
238 | * |
239 | * User tracing code (ptrace or signal handlers) might assume |
240 | * that the regs::orig_ax contains a 32-bit number on invoking |
241 | * a 32-bit syscall. |
242 | * |
243 | * Establish the syscall convention by saving the 32bit truncated |
244 | * syscall number in regs::orig_ax and by invalidating regs::ax. |
245 | */ |
246 | regs->orig_ax = regs->ax & GENMASK(31, 0); |
247 | regs->ax = -ENOSYS; |
248 | |
249 | nr = syscall_32_enter(regs); |
250 | |
251 | local_irq_enable(); |
252 | nr = syscall_enter_from_user_mode_work(regs, syscall: nr); |
253 | do_syscall_32_irqs_on(regs, nr); |
254 | |
255 | instrumentation_end(); |
256 | syscall_exit_to_user_mode(regs); |
257 | } |
258 | |
259 | #ifdef CONFIG_X86_FRED |
260 | /* |
261 | * A FRED-specific INT80 handler is warranted for the follwing reasons: |
262 | * |
263 | * 1) As INT instructions and hardware interrupts are separate event |
264 | * types, FRED does not preclude the use of vector 0x80 for external |
265 | * interrupts. As a result, the FRED setup code does not reserve |
266 | * vector 0x80 and calling int80_is_external() is not merely |
267 | * suboptimal but actively incorrect: it could cause a system call |
268 | * to be incorrectly ignored. |
269 | * |
270 | * 2) It is called only for handling vector 0x80 of event type |
271 | * EVENT_TYPE_SWINT and will never be called to handle any external |
272 | * interrupt (event type EVENT_TYPE_EXTINT). |
273 | * |
274 | * 3) FRED has separate entry flows depending on if the event came from |
275 | * user space or kernel space, and because the kernel does not use |
276 | * INT insns, the FRED kernel entry handler fred_entry_from_kernel() |
277 | * falls through to fred_bad_type() if the event type is |
278 | * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling |
279 | * an INT insn, it can only be from a user level. |
280 | * |
281 | * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will |
282 | * likely take a different approach if it is ever needed: it |
283 | * probably belongs in either fred_intx()/ fred_other() or |
284 | * asm_fred_entrypoint_user(), depending on if this ought to be done |
285 | * for all entries from userspace or only system |
286 | * calls. |
287 | * |
288 | * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. |
289 | */ |
290 | DEFINE_FREDENTRY_RAW(int80_emulation) |
291 | { |
292 | int nr; |
293 | |
294 | enter_from_user_mode(regs); |
295 | |
296 | instrumentation_begin(); |
297 | add_random_kstack_offset(); |
298 | |
299 | /* |
300 | * FRED pushed 0 into regs::orig_ax and regs::ax contains the |
301 | * syscall number. |
302 | * |
303 | * User tracing code (ptrace or signal handlers) might assume |
304 | * that the regs::orig_ax contains a 32-bit number on invoking |
305 | * a 32-bit syscall. |
306 | * |
307 | * Establish the syscall convention by saving the 32bit truncated |
308 | * syscall number in regs::orig_ax and by invalidating regs::ax. |
309 | */ |
310 | regs->orig_ax = regs->ax & GENMASK(31, 0); |
311 | regs->ax = -ENOSYS; |
312 | |
313 | nr = syscall_32_enter(regs); |
314 | |
315 | local_irq_enable(); |
316 | nr = syscall_enter_from_user_mode_work(regs, syscall: nr); |
317 | do_syscall_32_irqs_on(regs, nr); |
318 | |
319 | instrumentation_end(); |
320 | syscall_exit_to_user_mode(regs); |
321 | } |
322 | #endif |
323 | #else /* CONFIG_IA32_EMULATION */ |
324 | |
325 | /* Handles int $0x80 on a 32bit kernel */ |
326 | __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) |
327 | { |
328 | int nr = syscall_32_enter(regs); |
329 | |
330 | add_random_kstack_offset(); |
331 | /* |
332 | * Subtlety here: if ptrace pokes something larger than 2^31-1 into |
333 | * orig_ax, the int return value truncates it. This matches |
334 | * the semantics of syscall_get_nr(). |
335 | */ |
336 | nr = syscall_enter_from_user_mode(regs, nr); |
337 | instrumentation_begin(); |
338 | |
339 | do_syscall_32_irqs_on(regs, nr); |
340 | |
341 | instrumentation_end(); |
342 | syscall_exit_to_user_mode(regs); |
343 | } |
344 | #endif /* !CONFIG_IA32_EMULATION */ |
345 | |
346 | static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) |
347 | { |
348 | int nr = syscall_32_enter(regs); |
349 | int res; |
350 | |
351 | add_random_kstack_offset(); |
352 | /* |
353 | * This cannot use syscall_enter_from_user_mode() as it has to |
354 | * fetch EBP before invoking any of the syscall entry work |
355 | * functions. |
356 | */ |
357 | syscall_enter_from_user_mode_prepare(regs); |
358 | |
359 | instrumentation_begin(); |
360 | /* Fetch EBP from where the vDSO stashed it. */ |
361 | if (IS_ENABLED(CONFIG_X86_64)) { |
362 | /* |
363 | * Micro-optimization: the pointer we're following is |
364 | * explicitly 32 bits, so it can't be out of range. |
365 | */ |
366 | res = __get_user(*(u32 *)®s->bp, |
367 | (u32 __user __force *)(unsigned long)(u32)regs->sp); |
368 | } else { |
369 | res = get_user(*(u32 *)®s->bp, |
370 | (u32 __user __force *)(unsigned long)(u32)regs->sp); |
371 | } |
372 | |
373 | if (res) { |
374 | /* User code screwed up. */ |
375 | regs->ax = -EFAULT; |
376 | |
377 | local_irq_disable(); |
378 | instrumentation_end(); |
379 | irqentry_exit_to_user_mode(regs); |
380 | return false; |
381 | } |
382 | |
383 | nr = syscall_enter_from_user_mode_work(regs, syscall: nr); |
384 | |
385 | /* Now this is just like a normal syscall. */ |
386 | do_syscall_32_irqs_on(regs, nr); |
387 | |
388 | instrumentation_end(); |
389 | syscall_exit_to_user_mode(regs); |
390 | return true; |
391 | } |
392 | |
393 | /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ |
394 | __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) |
395 | { |
396 | /* |
397 | * Called using the internal vDSO SYSENTER/SYSCALL32 calling |
398 | * convention. Adjust regs so it looks like we entered using int80. |
399 | */ |
400 | unsigned long landing_pad = (unsigned long)current->mm->context.vdso + |
401 | vdso_image_32.sym_int80_landing_pad; |
402 | |
403 | /* |
404 | * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward |
405 | * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. |
406 | * Fix it up. |
407 | */ |
408 | regs->ip = landing_pad; |
409 | |
410 | /* Invoke the syscall. If it failed, keep it simple: use IRET. */ |
411 | if (!__do_fast_syscall_32(regs)) |
412 | return false; |
413 | |
414 | /* |
415 | * Check that the register state is valid for using SYSRETL/SYSEXIT |
416 | * to exit to userspace. Otherwise use the slower but fully capable |
417 | * IRET exit path. |
418 | */ |
419 | |
420 | /* XEN PV guests always use the IRET path */ |
421 | if (cpu_feature_enabled(X86_FEATURE_XENPV)) |
422 | return false; |
423 | |
424 | /* EIP must point to the VDSO landing pad */ |
425 | if (unlikely(regs->ip != landing_pad)) |
426 | return false; |
427 | |
428 | /* CS and SS must match the values set in MSR_STAR */ |
429 | if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) |
430 | return false; |
431 | |
432 | /* If the TF, RF, or VM flags are set, use IRET */ |
433 | if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) |
434 | return false; |
435 | |
436 | /* Use SYSRETL/SYSEXIT to exit to userspace */ |
437 | return true; |
438 | } |
439 | |
440 | /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ |
441 | __visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) |
442 | { |
443 | /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ |
444 | regs->sp = regs->bp; |
445 | |
446 | /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ |
447 | regs->flags |= X86_EFLAGS_IF; |
448 | |
449 | return do_fast_syscall_32(regs); |
450 | } |
451 | #endif |
452 | |
453 | SYSCALL_DEFINE0(ni_syscall) |
454 | { |
455 | return -ENOSYS; |
456 | } |
457 | |
458 | #ifdef CONFIG_XEN_PV |
459 | #ifndef CONFIG_PREEMPTION |
460 | /* |
461 | * Some hypercalls issued by the toolstack can take many 10s of |
462 | * seconds. Allow tasks running hypercalls via the privcmd driver to |
463 | * be voluntarily preempted even if full kernel preemption is |
464 | * disabled. |
465 | * |
466 | * Such preemptible hypercalls are bracketed by |
467 | * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() |
468 | * calls. |
469 | */ |
470 | DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); |
471 | EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); |
472 | |
473 | /* |
474 | * In case of scheduling the flag must be cleared and restored after |
475 | * returning from schedule as the task might move to a different CPU. |
476 | */ |
477 | static __always_inline bool get_and_clear_inhcall(void) |
478 | { |
479 | bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); |
480 | |
481 | __this_cpu_write(xen_in_preemptible_hcall, false); |
482 | return inhcall; |
483 | } |
484 | |
485 | static __always_inline void restore_inhcall(bool inhcall) |
486 | { |
487 | __this_cpu_write(xen_in_preemptible_hcall, inhcall); |
488 | } |
489 | #else |
490 | static __always_inline bool get_and_clear_inhcall(void) { return false; } |
491 | static __always_inline void restore_inhcall(bool inhcall) { } |
492 | #endif |
493 | |
494 | static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) |
495 | { |
496 | struct pt_regs *old_regs = set_irq_regs(regs); |
497 | |
498 | inc_irq_stat(irq_hv_callback_count); |
499 | |
500 | xen_evtchn_do_upcall(); |
501 | |
502 | set_irq_regs(old_regs); |
503 | } |
504 | |
505 | __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) |
506 | { |
507 | irqentry_state_t state = irqentry_enter(regs); |
508 | bool inhcall; |
509 | |
510 | instrumentation_begin(); |
511 | run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); |
512 | |
513 | inhcall = get_and_clear_inhcall(); |
514 | if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { |
515 | irqentry_exit_cond_resched(); |
516 | instrumentation_end(); |
517 | restore_inhcall(inhcall); |
518 | } else { |
519 | instrumentation_end(); |
520 | irqentry_exit(regs, state); |
521 | } |
522 | } |
523 | #endif /* CONFIG_XEN_PV */ |
524 | |