// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

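/*
 * Feed the syscall number and arguments to the audit subsystem, but only
 * when an audit context is active for the current task.
 */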
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}

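/*
 * Run the syscall entry work: syscall user dispatch, ptrace, seccomp,
 * tracepoints and audit. Returns the possibly rewritten syscall number,
 * or -1 when the syscall has to be skipped.
 */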
long syscall_trace_enter(struct pt_regs *regs, long syscall,
                         unsigned long work)
{
        long ret = 0;

        /*
         * Handle Syscall User Dispatch. This must come first, since
         * the ABI here can be something that doesn't make sense for
         * other syscall_work features.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (syscall_user_dispatch(regs))
                        return -1L;
        }

        /* Handle ptrace */
        if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
                ret = ptrace_report_syscall_entry(regs);
                if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (work & SYSCALL_WORK_SECCOMP) {
                ret = __secure_computing(NULL);
                if (ret == -1L)
                        return ret;
        }

        /* Either of the above might have changed the syscall number */
        syscall = syscall_get_nr(current, regs);

        if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
                trace_sys_enter(regs, syscall);
                /*
                 * Probes or BPF hooks in the tracepoint may have changed the
                 * system call number as well.
                 */
                syscall = syscall_get_nr(current, regs);
        }

        syscall_enter_audit(regs, syscall);

        return ret ? : syscall;
}

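/*
 * Establish kernel state on syscall entry (context tracking, lockdep) and
 * re-enable interrupts so that syscall_trace_enter() and friends can run
 * with IRQs on.
 */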
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
        enter_from_user_mode(regs);
        instrumentation_begin();
        local_irq_enable();
        instrumentation_end();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

/**
 * exit_to_user_mode_loop - do any pending work before leaving to user space
 * @regs: Pointer to pt_regs on entry stack
 * @ti_work: TIF work flags as read by the caller
 */
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                                     unsigned long ti_work)
{
        /*
         * Before returning to user space ensure that all pending work
         * items have been completed.
         */
        while (ti_work & EXIT_TO_USER_MODE_WORK) {

                local_irq_enable_exit_to_user(ti_work);

                if (ti_work & _TIF_NEED_RESCHED)
                        schedule();

                if (ti_work & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (ti_work & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
                        arch_do_signal_or_restart(regs);

                if (ti_work & _TIF_NOTIFY_RESUME)
                        resume_user_mode_work(regs);

                /* Architecture specific TIF work */
                arch_exit_to_user_mode_work(regs, ti_work);

                /*
                 * Disable interrupts and reevaluate the work flags as they
                 * might have changed while interrupts and preemption were
                 * enabled above.
                 */
                local_irq_disable_exit_to_user();

                /* Check if any of the above work has queued a deferred wakeup */
                tick_nohz_user_enter_prepare();

                ti_work = read_thread_flags();
        }

        /* Return the latest work state for arch_exit_to_user_mode() */
        return ti_work;
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
        if (work & SYSCALL_WORK_SYSCALL_EMU)
                return false;

        return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

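/*
 * One-time syscall exit work: audit, the sys_exit tracepoint and ptrace
 * reporting. Skipped entirely when the syscall was blocked by syscall
 * user dispatch, for the same ABI reasons as on the entry side.
 */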
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
        bool step;

        /*
         * If the syscall was rolled back due to syscall user dispatching,
         * then the tracers below are not invoked for the same reason as
         * the entry side was not invoked in syscall_trace_enter(): The ABI
         * of these syscalls is unknown.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (unlikely(current->syscall_dispatch.on_dispatch)) {
                        current->syscall_dispatch.on_dispatch = false;
                        return;
                }
        }

        audit_syscall_exit(regs);

        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(work);
        if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
                ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_syscall(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(work & SYSCALL_WORK_EXIT))
                syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        syscall_exit_to_user_mode_prepare(regs);
        local_irq_disable_exit_to_user();
        exit_to_user_mode_prepare(regs);
}

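/*
 * Same as syscall_exit_to_user_mode() but without the final
 * exit_to_user_mode() transition. The caller must invoke
 * exit_to_user_mode() itself, with interrupts staying disabled in
 * between.
 */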
void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        __syscall_exit_to_user_mode_work(regs);
}

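/*
 * Sketch of the typical call sequence from architecture syscall entry
 * code (details vary per architecture, see <linux/entry-common.h>):
 *
 *	nr = syscall_enter_from_user_mode(regs, nr);
 *	...invoke the system call...
 *	syscall_exit_to_user_mode(regs);
 */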
__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        __syscall_exit_to_user_mode_work(regs);
        instrumentation_end();
        exit_to_user_mode();
}

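/*
 * Establish state when an interrupt or exception hits user mode code.
 * Counterpart of irqentry_exit_to_user_mode() below.
 */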
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
        enter_from_user_mode(regs);
}

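/*
 * Handle all pending exit work and perform the final transition back to
 * user mode.
 */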
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        exit_to_user_mode();
}

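/*
 * Sketch of the typical usage from an architecture interrupt handler
 * (see <linux/entry-common.h> for the full calling convention):
 *
 *	irqentry_state_t state = irqentry_enter(regs);
 *	instrumentation_begin();
 *	...handle the interrupt...
 *	instrumentation_end();
 *	irqentry_exit(regs, state);
 */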
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        irqentry_state_t ret = {
                .exit_rcu = false,
        };

        if (user_mode(regs)) {
                irqentry_enter_from_user_mode(regs);
                return ret;
        }

        /*
         * If this entry hit the idle task invoke ct_irq_enter() whether
         * RCU is watching or not.
         *
         * Interrupts can nest when the first interrupt invokes softirq
         * processing on return which enables interrupts.
         *
         * Scheduler ticks in the idle task can mark quiescent state and
         * terminate a grace period, if and only if the timer interrupt is
         * not nested into another interrupt.
         *
         * Checking for rcu_is_watching() here would prevent the nesting
         * interrupt from invoking ct_irq_enter(). If that nested interrupt
         * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
         * assume that it is the first interrupt and eventually claim
         * quiescent state and end grace periods prematurely.
         *
         * Unconditionally invoke ct_irq_enter() so RCU state stays
         * consistent.
         *
         * TINY_RCU does not support EQS, so let the compiler eliminate
         * this part when enabled.
         */
        if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
                /*
                 * If RCU is not watching then the same careful
                 * sequence vs. lockdep and tracing is required
                 * as in irqentry_enter_from_user_mode().
                 */
                lockdep_hardirqs_off(CALLER_ADDR0);
                ct_irq_enter();
                instrumentation_begin();
                kmsan_unpoison_entry_regs(regs);
                trace_hardirqs_off_finish();
                instrumentation_end();

                ret.exit_rcu = true;
                return ret;
        }

        /*
         * If RCU is watching then RCU only wants to check whether it needs
         * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
         * already contains a warning when RCU is not watching, so no point
         * in having another one here.
         */
        lockdep_hardirqs_off(CALLER_ADDR0);
        instrumentation_begin();
        kmsan_unpoison_entry_regs(regs);
        rcu_irq_enter_check_tick();
        trace_hardirqs_off_finish();
        instrumentation_end();

        return ret;
}

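/*
 * Reschedule on return from interrupt if possible: only when the preempt
 * count is zero and, after the sanity checks below, a reschedule is
 * actually pending.
 */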
void raw_irqentry_exit_cond_resched(void)
{
        if (!preempt_count()) {
                /* Sanity check RCU and thread stack */
                rcu_irq_exit_check_preempt();
                if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                        WARN_ON_ONCE(!on_thread_stack());
                if (need_resched())
                        preempt_schedule_irq();
        }
}
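
/*
 * With CONFIG_PREEMPT_DYNAMIC this preemption point can be switched at
 * boot time (preempt=): via a static call where the architecture
 * supports it, otherwise via a static key.
 */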
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
        if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
                return;
        raw_irqentry_exit_cond_resched();
}
#endif
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        lockdep_assert_irqs_disabled();

        /* Check whether this returns to user mode */
        if (user_mode(regs)) {
                irqentry_exit_to_user_mode(regs);
        } else if (!regs_irqs_disabled(regs)) {
                /*
                 * If RCU was not watching on entry this needs to be done
                 * carefully and needs the same ordering of lockdep/tracing
                 * and RCU as the return to user mode path.
                 */
                if (state.exit_rcu) {
                        instrumentation_begin();
                        /* Tell the tracer that IRET will enable interrupts */
                        trace_hardirqs_on_prepare();
                        lockdep_hardirqs_on_prepare();
                        instrumentation_end();
                        ct_irq_exit();
                        lockdep_hardirqs_on(CALLER_ADDR0);
                        return;
                }

                instrumentation_begin();
                if (IS_ENABLED(CONFIG_PREEMPTION))
                        irqentry_exit_cond_resched();

                /* Covers both tracing and lockdep */
                trace_hardirqs_on();
                instrumentation_end();
        } else {
                /*
                 * IRQ flags state is correct already. Just tell RCU if it
                 * was not watching on entry.
                 */
                if (state.exit_rcu)
                        ct_irq_exit();
        }
}

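/*
 * Establish state for an NMI-like exception. Must be paired with a
 * matching irqentry_nmi_exit() consuming the returned state, since NMIs
 * can hit in any context, including RCU-idle.
 */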
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
        irqentry_state_t irq_state;

        irq_state.lockdep = lockdep_hardirqs_enabled();

        __nmi_enter();
        lockdep_hardirqs_off(CALLER_ADDR0);
        lockdep_hardirq_enter();
        ct_nmi_enter();

        instrumentation_begin();
        kmsan_unpoison_entry_regs(regs);
        trace_hardirqs_off_finish();
        ftrace_nmi_enter();
        instrumentation_end();

        return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
        instrumentation_begin();
        ftrace_nmi_exit();
        if (irq_state.lockdep) {
                trace_hardirqs_on_prepare();
                lockdep_hardirqs_on_prepare();
        }
        instrumentation_end();

        ct_nmi_exit();
        lockdep_hardirq_exit();
        if (irq_state.lockdep)
                lockdep_hardirqs_on(CALLER_ADDR0);
        __nmi_exit();
}