| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* 64-bit system call dispatch */ |
| 3 | |
| 4 | #include <linux/linkage.h> |
| 5 | #include <linux/sys.h> |
| 6 | #include <linux/cache.h> |
| 7 | #include <linux/syscalls.h> |
| 8 | #include <linux/entry-common.h> |
| 9 | #include <linux/nospec.h> |
| 10 | #include <asm/syscall.h> |
| 11 | |
| 12 | #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); |
| 13 | #define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *); |
| 14 | #include <asm/syscalls_64.h> |
| 15 | #ifdef CONFIG_X86_X32_ABI |
| 16 | #include <asm/syscalls_x32.h> |
| 17 | #endif |
| 18 | #undef __SYSCALL |
| 19 | |
| 20 | #undef __SYSCALL_NORETURN |
| 21 | #define __SYSCALL_NORETURN __SYSCALL |
| 22 | |
| 23 | /* |
| 24 | * The sys_call_table[] is no longer used for system calls, but |
| 25 | * kernel/trace/trace_syscalls.c still wants to know the system |
| 26 | * call address. |
| 27 | */ |
| 28 | #define __SYSCALL(nr, sym) __x64_##sym, |
| 29 | const sys_call_ptr_t sys_call_table[] = { |
| 30 | #include <asm/syscalls_64.h> |
| 31 | }; |
| 32 | #undef __SYSCALL |
| 33 | |
| 34 | #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); |
| 35 | long x64_sys_call(const struct pt_regs *regs, unsigned int nr) |
| 36 | { |
| 37 | switch (nr) { |
| 38 | #include <asm/syscalls_64.h> |
| 39 | default: return __x64_sys_ni_syscall(regs); |
| 40 | } |
| 41 | } |
| 42 | |
#ifdef CONFIG_X86_X32_ABI
/*
 * x32 counterpart of x64_sys_call(): call the handler selected by @nr
 * from the x32 syscall list and return its result.  Numbers without a
 * case fall back to __x64_sys_ni_syscall().
 */
long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
#include <asm/syscalls_x32.h>
	default:
		break;
	}

	/* No case matched: not an implemented system call. */
	return __x64_sys_ni_syscall(regs);
}
#endif
| 52 | |
| 53 | static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) |
| 54 | { |
| 55 | /* |
| 56 | * Convert negative numbers to very high and thus out of range |
| 57 | * numbers for comparisons. |
| 58 | */ |
| 59 | unsigned int unr = nr; |
| 60 | |
| 61 | if (likely(unr < NR_syscalls)) { |
| 62 | unr = array_index_nospec(unr, NR_syscalls); |
| 63 | regs->ax = x64_sys_call(regs, nr: unr); |
| 64 | return true; |
| 65 | } |
| 66 | return false; |
| 67 | } |
| 68 | |
| 69 | static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) |
| 70 | { |
| 71 | /* |
| 72 | * Adjust the starting offset of the table, and convert numbers |
| 73 | * < __X32_SYSCALL_BIT to very high and thus out of range |
| 74 | * numbers for comparisons. |
| 75 | */ |
| 76 | unsigned int xnr = nr - __X32_SYSCALL_BIT; |
| 77 | |
| 78 | if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { |
| 79 | xnr = array_index_nospec(xnr, X32_NR_syscalls); |
| 80 | regs->ax = x32_sys_call(regs, nr: xnr); |
| 81 | return true; |
| 82 | } |
| 83 | return false; |
| 84 | } |
| 85 | |
| 86 | /* Returns true to return using SYSRET, or false to use IRET */ |
| 87 | __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) |
| 88 | { |
| 89 | add_random_kstack_offset(); |
| 90 | nr = syscall_enter_from_user_mode(regs, syscall: nr); |
| 91 | |
| 92 | instrumentation_begin(); |
| 93 | |
| 94 | if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { |
| 95 | /* Invalid system call, but still a system call. */ |
| 96 | regs->ax = __x64_sys_ni_syscall(regs); |
| 97 | } |
| 98 | |
| 99 | instrumentation_end(); |
| 100 | syscall_exit_to_user_mode(regs); |
| 101 | |
| 102 | /* |
| 103 | * Check that the register state is valid for using SYSRET to exit |
| 104 | * to userspace. Otherwise use the slower but fully capable IRET |
| 105 | * exit path. |
| 106 | */ |
| 107 | |
| 108 | /* XEN PV guests always use the IRET path */ |
| 109 | if (cpu_feature_enabled(X86_FEATURE_XENPV)) |
| 110 | return false; |
| 111 | |
| 112 | /* SYSRET requires RCX == RIP and R11 == EFLAGS */ |
| 113 | if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) |
| 114 | return false; |
| 115 | |
| 116 | /* CS and SS must match the values set in MSR_STAR */ |
| 117 | if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) |
| 118 | return false; |
| 119 | |
| 120 | /* |
| 121 | * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP |
| 122 | * in kernel space. This essentially lets the user take over |
| 123 | * the kernel, since userspace controls RSP. |
| 124 | * |
| 125 | * TASK_SIZE_MAX covers all user-accessible addresses other than |
| 126 | * the deprecated vsyscall page. |
| 127 | */ |
| 128 | if (unlikely(regs->ip >= TASK_SIZE_MAX)) |
| 129 | return false; |
| 130 | |
| 131 | /* |
| 132 | * SYSRET cannot restore RF. It can restore TF, but unlike IRET, |
| 133 | * restoring TF results in a trap from userspace immediately after |
| 134 | * SYSRET. |
| 135 | */ |
| 136 | if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) |
| 137 | return false; |
| 138 | |
| 139 | /* Use SYSRET to exit to userspace */ |
| 140 | return true; |
| 141 | } |
| 142 | |