// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
 *
 */
#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
#include <linux/security.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/init.h>

#include <asm/setup.h>

#include "trace.h"

#define STACK_TRACE_ENTRIES 500
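
/*
 * The arrays below record the deepest stack seen so far. On a new
 * maximum, stack_dump_trace[] is filled with the return addresses of
 * the functions found on that stack, and stack_trace_index[] records
 * each entry's offset from the top of the stack, which is later used
 * to derive per-function frame sizes.
 */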
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
static unsigned stack_trace_index[STACK_TRACE_ENTRIES];

static unsigned int stack_trace_nr_entries;
static unsigned long stack_trace_max_size;
static arch_spinlock_t stack_trace_max_lock =
	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;

DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);

int stack_tracer_enabled;

static void print_max_stack(void)
{
	long i;
	int size;

	pr_emerg("        Depth    Size   Location    (%d entries)\n"
		 "        -----    ----   --------\n",
		 stack_trace_nr_entries);

	for (i = 0; i < stack_trace_nr_entries; i++) {
		if (i + 1 == stack_trace_nr_entries)
			size = stack_trace_index[i];
		else
			size = stack_trace_index[i] - stack_trace_index[i+1];

		pr_emerg("%3ld) %8d   %5d   %pS\n", i, stack_trace_index[i],
			 size, (void *)stack_dump_trace[i]);
	}
}

/*
 * The stack tracer looks for a maximum stack at each call from a function. It
 * registers a callback with ftrace, and in that callback it examines the stack
 * size. It determines the stack size from the variable passed in, which is the
 * address of a local variable in the stack_trace_call() callback function.
 * The stack size is calculated as the distance from the address of that local
 * variable to the top of the current stack. If that size is smaller than the
 * currently saved max stack size, nothing more is done.
 *
 * If the size of the stack is greater than the maximum recorded size, then the
 * following algorithm takes place.
 *
 * For architectures (like x86) that store the function's return address before
 * saving the function's local variables, the stack will look something like
 * this:
 *
 *	[ top of stack ]
 *	 0: sys call entry frame
 *	10: return addr to entry code
 *	11: start of sys_foo frame
 *	20: return addr to sys_foo
 *	21: start of kernel_func_bar frame
 *	30: return addr to kernel_func_bar
 *	31: [ do trace stack here ]
 *
 * stack_trace_save() is called, returning all the functions it finds in the
 * current stack, which would be (from the bottom of the stack to the top):
 *
 *	return addr to kernel_func_bar
 *	return addr to sys_foo
 *	return addr to entry code
 *
 * Now, to figure out the size of each function's local variables, a search of
 * the stack is made for these return-address values. When a match is found,
 * the return address is added to the stack_dump_trace[] array and its offset
 * into the stack is saved in the stack_trace_index[] array. The above example
 * would show:
 *
 *	       stack_dump_trace[]       |   stack_trace_index[]
 *	       ------------------       +   -------------------
 *	  return addr to kernel_func_bar|	   30
 *	  return addr to sys_foo	|	   20
 *	  return addr to entry		|	   10
 *
 * The print_max_stack() function above uses these values to print the size of
 * each function's portion of the stack.
 *
 *  for (i = 0; i < nr_entries; i++) {
 *     size = i == nr_entries - 1 ? stack_trace_index[i] :
 *                   stack_trace_index[i] - stack_trace_index[i+1]
 *     print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]);
 *  }
 *
 * The above shows
 *
 *     depth size location
 *     ----- ---- --------
 *  0    30   10  kernel_func_bar
 *  1    20   10  sys_foo
 *  2    10   10  entry code
 *
 * Now for architectures that might save the return address after the
 * function's local variables (saving the link register before calling nested
 * functions), this will cause the stack to look a little different:
 *
 *	[ top of stack ]
 *	 0: sys call entry frame
 *	10: start of sys_foo frame
 *	19: return addr to entry code << lr saved before calling kernel_func_bar
 *	20: start of kernel_func_bar frame
 *	29: return addr to sys_foo << lr saved before calling next function
 *	30: [ do trace stack here ]
 *
 * Although the functions returned by stack_trace_save() may be the same, the
 * placement in the stack will be different. Using the same algorithm as above
 * would yield:
 *
 *	       stack_dump_trace[]       |   stack_trace_index[]
 *	       ------------------       +   -------------------
 *	  return addr to kernel_func_bar|	   30
 *	  return addr to sys_foo	|	   29
 *	  return addr to entry		|	   19
 *
 * Where the mapping is off by one:
 *
 *   kernel_func_bar stack frame size is 29 - 19 not 30 - 29!
 *
 * To fix this, if the architecture defines ARCH_FTRACE_SHIFT_STACK_TRACER
 * (because its return addresses are stored after the local variables), the
 * values in stack_trace_index[] are shifted by one and the number of stack
 * trace entries is decremented by one.
 *
 *	       stack_dump_trace[]       |   stack_trace_index[]
 *	       ------------------       +   -------------------
 *	  return addr to kernel_func_bar|	   29
 *	  return addr to sys_foo	|	   19
 *
 * Although the entry function is not displayed, the first function (sys_foo)
 * will still include its stack size in its own entry.
 */
static void check_stack(unsigned long ip, unsigned long *stack)
{
	unsigned long this_size, flags;
	unsigned long *p, *top, *start;
	static int tracer_frame;
	int frame_size = READ_ONCE(tracer_frame);
	int i, x;

	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
	this_size = THREAD_SIZE - this_size;
	/* Remove the frame of the tracer */
	this_size -= frame_size;

	if (this_size <= stack_trace_max_size)
		return;

	/* we do not handle interrupt stacks yet */
	if (!object_is_on_stack(stack))
		return;

	/* Can't do this from NMI context (can cause deadlocks) */
	if (in_nmi())
		return;

	local_irq_save(flags);
	arch_spin_lock(&stack_trace_max_lock);

	/* In case another CPU set the tracer_frame on us */
	if (unlikely(!frame_size))
		this_size -= tracer_frame;

	/* a race could have already updated it */
	if (this_size <= stack_trace_max_size)
		goto out;

	stack_trace_max_size = this_size;

	stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
					       ARRAY_SIZE(stack_dump_trace) - 1,
					       0);

	/* Skip over the overhead of the stack tracer itself */
	for (i = 0; i < stack_trace_nr_entries; i++) {
		if (stack_dump_trace[i] == ip)
			break;
	}

	/*
	 * Some archs may not have the passed in ip in the dump.
	 * If that happens, we need to show everything.
	 */
	if (i == stack_trace_nr_entries)
		i = 0;

	/*
	 * Now find where in the stack these are.
	 */
	x = 0;
	start = stack;
	top = (unsigned long *)
		(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);

	/*
	 * Loop through all the entries. Some entries may for some
	 * reason be missing from the stack, so we have to account
	 * for that. If they are all there, this loop will only
	 * happen once. This code only runs on a new max, so it is
	 * far from a fast path.
	 */
	while (i < stack_trace_nr_entries) {
		int found = 0;

		stack_trace_index[x] = this_size;
		p = start;

		for (; p < top && i < stack_trace_nr_entries; p++) {
			/*
			 * The READ_ONCE_NOCHECK is used to let KASAN know that
			 * this is not a stack-out-of-bounds error.
			 */
			if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) {
				stack_dump_trace[x] = stack_dump_trace[i++];
				this_size = stack_trace_index[x++] =
					(top - p) * sizeof(unsigned long);
				found = 1;
				/* Start the search from here */
				start = p + 1;
				/*
				 * We do not want to show the overhead
				 * of the stack tracer stack in the
				 * max stack. If we haven't figured
				 * out what that is, then figure it out
				 * now.
				 */
				if (unlikely(!tracer_frame)) {
					tracer_frame = (p - stack) *
						sizeof(unsigned long);
					stack_trace_max_size -= tracer_frame;
				}
			}
		}

		if (!found)
			i++;
	}

#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER
	/*
	 * Some archs will store the link register before calling
	 * nested functions. This means the saved return address
	 * comes after the local storage, and we need to shift
	 * for that.
	 */
	if (x > 1) {
		memmove(&stack_trace_index[0], &stack_trace_index[1],
			sizeof(stack_trace_index[0]) * (x - 1));
		x--;
	}
#endif

	stack_trace_nr_entries = x;

	if (task_stack_end_corrupted(current)) {
		print_max_stack();
		BUG();
	}

 out:
	arch_spin_unlock(&stack_trace_max_lock);
	local_irq_restore(flags);
}

/* Some archs may not define MCOUNT_INSN_SIZE */
#ifndef MCOUNT_INSN_SIZE
# define MCOUNT_INSN_SIZE 0
#endif
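
/*
 * The ftrace callback, invoked at (almost) every traced function entry
 * while the stack tracer is enabled. The per-cpu disable_stack_tracer
 * counter guards against recursion: anything this function calls may
 * itself be traced, so only the outermost invocation on a CPU does any
 * work. The address of the local variable 'stack' marks how deep the
 * stack is here, and MCOUNT_INSN_SIZE is added to ip so that it matches
 * the return address that stack_trace_save() records for this call site.
 */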
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
		 struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	unsigned long stack;

	preempt_disable_notrace();

	/* no atomic needed, we only modify this variable on this cpu */
	__this_cpu_inc(disable_stack_tracer);
	if (__this_cpu_read(disable_stack_tracer) != 1)
		goto out;

	/* If rcu is not watching, then saving the stack trace can fail */
	if (!rcu_is_watching())
		goto out;

	ip += MCOUNT_INSN_SIZE;

	check_stack(ip, &stack);

 out:
	__this_cpu_dec(disable_stack_tracer);
	/* prevent recursion in schedule */
	preempt_enable_notrace();
}

static struct ftrace_ops trace_ops __read_mostly =
{
	.func = stack_trace_call,
};
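
/*
 * tracefs "stack_max_size" file: reading shows the largest stack depth
 * recorded so far (in bytes); writing a value (typically 0) resets it,
 * taking the same lock as check_stack() so an update cannot race in.
 */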
static ssize_t
stack_max_size_read(struct file *filp, char __user *ubuf,
		    size_t count, loff_t *ppos)
{
	unsigned long *ptr = filp->private_data;
	char buf[64];
	int r;

	r = snprintf(buf, sizeof(buf), "%ld\n", *ptr);
	if (r > sizeof(buf))
		r = sizeof(buf);
	return simple_read_from_buffer(ubuf, count, ppos, buf, r);
}

static ssize_t
stack_max_size_write(struct file *filp, const char __user *ubuf,
		     size_t count, loff_t *ppos)
{
	long *ptr = filp->private_data;
	unsigned long val, flags;
	int ret;

	ret = kstrtoul_from_user(ubuf, count, 10, &val);
	if (ret)
		return ret;

	local_irq_save(flags);

	/*
	 * In case we trace inside arch_spin_lock() or after (NMI),
	 * we will cause a circular lock, so we also need to increase
	 * the percpu disable_stack_tracer here.
	 */
	__this_cpu_inc(disable_stack_tracer);

	arch_spin_lock(&stack_trace_max_lock);
	*ptr = val;
	arch_spin_unlock(&stack_trace_max_lock);

	__this_cpu_dec(disable_stack_tracer);
	local_irq_restore(flags);

	return count;
}

static const struct file_operations stack_max_size_fops = {
	.open		= tracing_open_generic,
	.read		= stack_max_size_read,
	.write		= stack_max_size_write,
	.llseek		= default_llseek,
};
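
/*
 * seq_file iterator for the tracefs "stack_trace" file. t_start() takes
 * stack_trace_max_lock (with irqs off and the per-cpu tracer disabled)
 * so the max-stack snapshot cannot change while it is being printed;
 * t_stop() releases it. The iterator position is the index into
 * stack_dump_trace[], stashed in m->private.
 */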
static void *
__next(struct seq_file *m, loff_t *pos)
{
	long n = *pos - 1;

	if (n >= stack_trace_nr_entries)
		return NULL;

	m->private = (void *)n;
	return &m->private;
}

static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return __next(m, pos);
}

static void *t_start(struct seq_file *m, loff_t *pos)
{
	local_irq_disable();

	__this_cpu_inc(disable_stack_tracer);

	arch_spin_lock(&stack_trace_max_lock);

	if (*pos == 0)
		return SEQ_START_TOKEN;

	return __next(m, pos);
}

static void t_stop(struct seq_file *m, void *p)
{
	arch_spin_unlock(&stack_trace_max_lock);

	__this_cpu_dec(disable_stack_tracer);

	local_irq_enable();
}

static void trace_lookup_stack(struct seq_file *m, long i)
{
	unsigned long addr = stack_dump_trace[i];

	seq_printf(m, "%pS\n", (void *)addr);
}

static void print_disabled(struct seq_file *m)
{
	seq_puts(m, "#\n"
		 "# Stack tracer disabled\n"
		 "#\n"
		 "# To enable the stack tracer, either add 'stacktrace' to the\n"
		 "# kernel command line\n"
		 "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
		 "#\n");
}

static int t_show(struct seq_file *m, void *v)
{
	long i;
	int size;

	if (v == SEQ_START_TOKEN) {
		seq_printf(m, "        Depth    Size   Location"
			   "    (%d entries)\n"
			   "        -----    ----   --------\n",
			   stack_trace_nr_entries);

		if (!stack_tracer_enabled && !stack_trace_max_size)
			print_disabled(m);

		return 0;
	}

	i = *(long *)v;

	if (i >= stack_trace_nr_entries)
		return 0;

	if (i + 1 == stack_trace_nr_entries)
		size = stack_trace_index[i];
	else
		size = stack_trace_index[i] - stack_trace_index[i+1];

	seq_printf(m, "%3ld) %8d   %5d   ", i, stack_trace_index[i], size);

	trace_lookup_stack(m, i);

	return 0;
}

static const struct seq_operations stack_trace_seq_ops = {
	.start		= t_start,
	.next		= t_next,
	.stop		= t_stop,
	.show		= t_show,
};

static int stack_trace_open(struct inode *inode, struct file *file)
{
	int ret;

	ret = security_locked_down(LOCKDOWN_TRACEFS);
	if (ret)
		return ret;

	return seq_open(file, &stack_trace_seq_ops);
}

static const struct file_operations stack_trace_fops = {
	.open		= stack_trace_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
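
/*
 * Example usage, assuming tracefs is mounted at /sys/kernel/tracing:
 *
 *   # echo 1 > /proc/sys/kernel/stack_tracer_enabled
 *   # cat /sys/kernel/tracing/stack_max_size
 *   # cat /sys/kernel/tracing/stack_trace
 *
 * The first command enables the tracer, the second shows the largest
 * stack depth seen so far, and the third dumps the per-function
 * breakdown in the Depth/Size/Location format produced by t_show().
 */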

#ifdef CONFIG_DYNAMIC_FTRACE

static int
stack_trace_filter_open(struct inode *inode, struct file *file)
{
	struct ftrace_ops *ops = inode->i_private;

	/* Checks for tracefs lockdown */
	return ftrace_regex_open(ops, FTRACE_ITER_FILTER,
				 inode, file);
}

static const struct file_operations stack_trace_filter_fops = {
	.open = stack_trace_filter_open,
	.read = seq_read,
	.write = ftrace_filter_write,
	.llseek = tracing_lseek,
	.release = ftrace_regex_release,
};

#endif /* CONFIG_DYNAMIC_FTRACE */
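
/*
 * Handler for the kernel.stack_tracer_enabled sysctl. On a 0 -> 1
 * transition the ftrace callback is registered (turning the tracer on);
 * on 1 -> 0 it is unregistered. stack_sysctl_mutex serializes
 * concurrent writers.
 */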
int
stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
		   size_t *lenp, loff_t *ppos)
{
	int was_enabled;
	int ret;

	mutex_lock(&stack_sysctl_mutex);
	was_enabled = !!stack_tracer_enabled;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (ret || !write || (was_enabled == !!stack_tracer_enabled))
		goto out;

	if (stack_tracer_enabled)
		register_ftrace_function(&trace_ops);
	else
		unregister_ftrace_function(&trace_ops);
 out:
	mutex_unlock(&stack_sysctl_mutex);
	return ret;
}
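
/*
 * Boot-time setup: passing "stacktrace" on the kernel command line
 * enables the tracer from early boot. Because __setup() matches the
 * "stacktrace" prefix, a parameter of "stacktrace_filter=<funcs>" also
 * lands here with str pointing at "_filter=<funcs>"; the function list
 * is saved and applied as an ftrace filter in stack_trace_init().
 */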
static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;

static __init int enable_stacktrace(char *str)
{
	int len;

	if ((len = str_has_prefix(str, "_filter=")))
		strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);

	stack_tracer_enabled = 1;
	return 1;
}
__setup("stacktrace", enable_stacktrace);
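
/*
 * Late init: create the tracefs control files, apply any boot-time
 * filter, and register the ftrace callback if "stacktrace" was given
 * on the command line.
 */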
static __init int stack_trace_init(void)
{
	int ret;

	ret = tracing_init_dentry();
	if (ret)
		return 0;

	trace_create_file("stack_max_size", TRACE_MODE_WRITE, NULL,
			  &stack_trace_max_size, &stack_max_size_fops);

	trace_create_file("stack_trace", TRACE_MODE_READ, NULL,
			  NULL, &stack_trace_fops);

#ifdef CONFIG_DYNAMIC_FTRACE
	trace_create_file("stack_trace_filter", TRACE_MODE_WRITE, NULL,
			  &trace_ops, &stack_trace_filter_fops);
#endif

	if (stack_trace_filter_buf[0])
		ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);

	if (stack_tracer_enabled)
		register_ftrace_function(&trace_ops);

	return 0;
}

device_initcall(stack_trace_init);