// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stack depot - a stack trace storage that avoids duplication.
 *
 * Internally, stack depot maintains a hash table of unique stacktraces. The
 * stack traces themselves are stored contiguously one after another in a set
 * of separate page allocations.
 *
 * Author: Alexander Potapenko <glider@google.com>
 * Copyright (C) 2016 Google, Inc.
 *
 * Based on the code by Dmitry Chernenkov.
 */
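/*
 * Typical usage, as a rough sketch (capture a trace, deduplicate it into the
 * depot, retrieve it later by handle; the buffer size below is illustrative):
 *
 *	unsigned long entries[64];
 *	unsigned long *stored;
 *	unsigned int nr, nr_stored;
 *	depot_stack_handle_t handle;
 *
 *	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
 *	handle = stack_depot_save(entries, nr, GFP_KERNEL);
 *	...
 *	nr_stored = stack_depot_fetch(handle, &stored);
 *
 * A handle of 0 means the trace could not be stored (for example, stack depot
 * is disabled or pool space ran out).
 */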

#define pr_fmt(fmt) "stackdepot: " fmt

#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/kmsan.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>

#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_VALID_BITS 1
#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
#define DEPOT_STACK_ALIGN 4
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \
			       DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS)
#define DEPOT_POOLS_CAP 8192
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
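/*
 * For illustration, assuming 4 KB pages (PAGE_SHIFT == 12): each pool spans
 * DEPOT_POOL_SIZE == 1 << (12 + 2) == 16 KB, stack records inside it are
 * aligned to 1 << DEPOT_STACK_ALIGN == 16 bytes, so an in-pool offset needs
 * DEPOT_OFFSET_BITS == 2 + 12 - 4 == 10 bits of the 32-bit handle; the
 * remaining bits, minus the valid and extra bits, index the pool.
 */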

/* Compact structure that stores a reference to a stack. */
union handle_parts {
	depot_stack_handle_t handle;
	struct {
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;
		u32 offset	: DEPOT_OFFSET_BITS;
		u32 valid	: DEPOT_VALID_BITS;
		u32 extra	: STACK_DEPOT_EXTRA_BITS;
	};
};

struct stack_record {
	struct stack_record *next;	/* Link in the hash table */
	u32 hash;			/* Hash in the hash table */
	u32 size;			/* Number of stored frames */
	union handle_parts handle;
	unsigned long entries[];	/* Variable-sized array of frames */
};

static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c

/* Hash table of pointers to stored stack traces. */
static struct stack_record **stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack traces. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Currently used pool in stack_pools. */
static int pool_index;
/* Offset to the unused space in the currently used pool. */
static size_t pool_offset;
/* Lock that protects the variables above. */
static DEFINE_RAW_SPINLOCK(pool_lock);
/*
 * Stack depot tries to keep an extra pool allocated even before it runs out
 * of space in the currently used pool.
 * This flag marks that this next extra pool needs to be allocated and
 * initialized. It has the value 0 when either the next pool is not yet
 * initialized or the limit on the number of pools is reached.
 */
static int next_pool_required = 1;

static int __init disable_stack_depot(char *str)
{
	int ret;

	ret = kstrtobool(str, &stack_depot_disabled);
	if (!ret && stack_depot_disabled) {
		pr_info("disabled\n");
		stack_table = NULL;
	}
	return 0;
}
early_param("stack_depot_disable", disable_stack_depot);

void __init stack_depot_request_early_init(void)
{
	/* Too late to request early init now. */
	WARN_ON(__stack_depot_early_init_passed);

	__stack_depot_early_init_requested = true;
}

/* Allocates a hash table via memblock. Can only be used during early boot. */
int __init stack_depot_early_init(void)
{
	unsigned long entries = 0;

	/* This function must be called only once, from mm_init(). */
	if (WARN_ON(__stack_depot_early_init_passed))
		return 0;
	__stack_depot_early_init_passed = true;

	/*
	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
	 * in fuzzing scenarios, which leads to a large number of different
	 * stack traces being stored in stack depot.
	 */
	if (kasan_enabled() && !stack_bucket_number_order)
		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;

	if (!__stack_depot_early_init_requested || stack_depot_disabled)
		return 0;

	/*
	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
	 * on the automatic calculations performed by alloc_large_system_hash.
	 */
	if (stack_bucket_number_order)
		entries = 1UL << stack_bucket_number_order;
	pr_info("allocating hash table via alloc_large_system_hash\n");
	stack_table = alloc_large_system_hash("stackdepot",
						sizeof(struct stack_record *),
						entries,
						STACK_HASH_TABLE_SCALE,
						HASH_EARLY | HASH_ZERO,
						NULL,
						&stack_hash_mask,
						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		return -ENOMEM;
	}

	return 0;
}

/* Allocates a hash table via kvcalloc. Can be used after boot. */
int stack_depot_init(void)
{
	static DEFINE_MUTEX(stack_depot_init_mutex);
	unsigned long entries;
	int ret = 0;

	mutex_lock(&stack_depot_init_mutex);

	if (stack_depot_disabled || stack_table)
		goto out_unlock;

	/*
	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
	 * if assigned, and rely on automatic scaling otherwise.
	 */
	if (stack_bucket_number_order) {
		entries = 1UL << stack_bucket_number_order;
	} else {
		int scale = STACK_HASH_TABLE_SCALE;

		entries = nr_free_buffer_pages();
		entries = roundup_pow_of_two(entries);

		if (scale > PAGE_SHIFT)
			entries >>= (scale - PAGE_SHIFT);
		else
			entries <<= (PAGE_SHIFT - scale);
	}

	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;

	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
	stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		ret = -ENOMEM;
		goto out_unlock;
	}
	stack_hash_mask = entries - 1;

out_unlock:
	mutex_unlock(&stack_depot_init_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);

/* Uses preallocated memory to initialize a new stack depot pool. */
static void depot_init_pool(void **prealloc)
{
	/*
	 * If the next pool is already initialized or the maximum number of
	 * pools is reached, do not use the preallocated memory.
	 * smp_load_acquire() here pairs with smp_store_release() below and
	 * in depot_alloc_stack().
	 */
	if (!smp_load_acquire(&next_pool_required))
		return;

	/* Check if the current pool is not yet allocated. */
	if (stack_pools[pool_index] == NULL) {
		/* Use the preallocated memory for the current pool. */
		stack_pools[pool_index] = *prealloc;
		*prealloc = NULL;
	} else {
		/*
		 * Otherwise, use the preallocated memory for the next pool
		 * as long as we do not exceed the maximum number of pools.
		 */
		if (pool_index + 1 < DEPOT_MAX_POOLS) {
			stack_pools[pool_index + 1] = *prealloc;
			*prealloc = NULL;
		}
		/*
		 * At this point, either the next pool is initialized or the
		 * maximum number of pools is reached. In either case, take
		 * note that initializing another pool is not required.
		 * This smp_store_release pairs with smp_load_acquire() above
		 * and in stack_depot_save().
		 */
		smp_store_release(&next_pool_required, 0);
	}
}

/* Allocates a new stack in a stack depot pool. */
static struct stack_record *
depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
{
	struct stack_record *stack;
	size_t required_size = struct_size(stack, entries, size);

	required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN);

	/* Check if there is not enough space in the current pool. */
	if (unlikely(pool_offset + required_size > DEPOT_POOL_SIZE)) {
		/* Bail out if we reached the pool limit. */
		if (unlikely(pool_index + 1 >= DEPOT_MAX_POOLS)) {
			WARN_ONCE(1, "Stack depot reached limit capacity");
			return NULL;
		}

		/*
		 * Move on to the next pool.
		 * WRITE_ONCE pairs with potential concurrent read in
		 * stack_depot_fetch().
		 */
		WRITE_ONCE(pool_index, pool_index + 1);
		pool_offset = 0;
		/*
		 * If the maximum number of pools is not reached, take note
		 * that the next pool needs to be initialized.
		 * smp_store_release() here pairs with smp_load_acquire() in
		 * stack_depot_save() and depot_init_pool().
		 */
		if (pool_index + 1 < DEPOT_MAX_POOLS)
			smp_store_release(&next_pool_required, 1);
	}

	/* Assign the preallocated memory to a pool if required. */
	if (*prealloc)
		depot_init_pool(prealloc);

	/* Check if we have a pool to save the stack trace. */
	if (stack_pools[pool_index] == NULL)
		return NULL;

	/* Save the stack trace. */
	stack = stack_pools[pool_index] + pool_offset;
	stack->hash = hash;
	stack->size = size;
	stack->handle.pool_index = pool_index;
	stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
	stack->handle.valid = 1;
	stack->handle.extra = 0;
	memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
	pool_offset += required_size;
	/*
	 * Let KMSAN know the stored stack record is initialized. This shall
	 * prevent false positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(stack, required_size);

	return stack;
}

/* Calculates the hash for a stack. */
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
{
	return jhash2((u32 *)entries,
		      array_size(size, sizeof(*entries)) / sizeof(u32),
		      STACK_HASH_SEED);
}

/*
 * Non-instrumented version of memcmp().
 * Does not check the lexicographical order, only the equality.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	for ( ; n-- ; u1++, u2++) {
		if (*u1 != *u2)
			return 1;
	}
	return 0;
}

/* Finds a stack in a bucket of the hash table. */
static inline struct stack_record *find_stack(struct stack_record *bucket,
					      unsigned long *entries, int size,
					      u32 hash)
{
	struct stack_record *found;

	for (found = bucket; found; found = found->next) {
		if (found->hash == hash &&
		    found->size == size &&
		    !stackdepot_memcmp(entries, found->entries, size))
			return found;
	}
	return NULL;
}

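/*
 * Saves a stack trace into the depot after deduplicating it against the
 * already stored traces. When can_alloc is true, a spare pool may be
 * allocated with alloc_flags if one is needed; when it is false, only
 * already allocated pool space is used. Returns the handle of the saved
 * stack trace, or 0 on failure.
 */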
depot_stack_handle_t __stack_depot_save(unsigned long *entries,
					unsigned int nr_entries,
					gfp_t alloc_flags, bool can_alloc)
{
	struct stack_record *found = NULL, **bucket;
	union handle_parts retval = { .handle = 0 };
	struct page *page = NULL;
	void *prealloc = NULL;
	unsigned long flags;
	u32 hash;

	/*
	 * If this stack trace is from an interrupt, including anything before
	 * interrupt entry usually leads to unbounded stack depot growth.
	 *
	 * Since use of filter_irq_stacks() is a requirement to ensure stack
	 * depot can efficiently deduplicate interrupt stacks, always call
	 * filter_irq_stacks() here to simplify all callers' use of stack depot.
	 */
	nr_entries = filter_irq_stacks(entries, nr_entries);

	if (unlikely(nr_entries == 0) || stack_depot_disabled)
		goto fast_exit;

	hash = hash_stack(entries, nr_entries);
	bucket = &stack_table[hash & stack_hash_mask];

	/*
	 * Fast path: look the stack trace up without locking.
	 * The smp_load_acquire() here pairs with smp_store_release() to
	 * |bucket| below.
	 */
	found = find_stack(smp_load_acquire(bucket), entries, nr_entries, hash);
	if (found)
		goto exit;

	/*
	 * Check if another stack pool needs to be initialized. If so, allocate
	 * the memory now - we won't be able to do that under the lock.
	 *
	 * The smp_load_acquire() here pairs with smp_store_release() to
	 * |next_pool_required| in depot_alloc_stack() and depot_init_pool().
	 */
	if (unlikely(can_alloc && smp_load_acquire(&next_pool_required))) {
		/*
		 * Zero out zone modifiers, as we don't have specific zone
		 * requirements. Keep the flags related to allocation in atomic
		 * contexts and I/O.
		 */
		alloc_flags &= ~GFP_ZONEMASK;
		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
		alloc_flags |= __GFP_NOWARN;
		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
		if (page)
			prealloc = page_address(page);
	}

	raw_spin_lock_irqsave(&pool_lock, flags);

	found = find_stack(*bucket, entries, nr_entries, hash);
	if (!found) {
		struct stack_record *new =
			depot_alloc_stack(entries, nr_entries, hash, &prealloc);

		if (new) {
			new->next = *bucket;
			/*
			 * This smp_store_release() pairs with
			 * smp_load_acquire() from |bucket| above.
			 */
			smp_store_release(bucket, new);
			found = new;
		}
	} else if (prealloc) {
		/*
		 * Stack depot already contains this stack trace, but let's
		 * keep the preallocated memory for the future.
		 */
		depot_init_pool(&prealloc);
	}

	raw_spin_unlock_irqrestore(&pool_lock, flags);
exit:
	if (prealloc) {
		/* Stack depot didn't use this memory, free it. */
		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
	}
	if (found)
		retval.handle = found->handle.handle;
fast_exit:
	return retval.handle;
}
EXPORT_SYMBOL_GPL(__stack_depot_save);

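/*
 * Convenience wrapper around __stack_depot_save() with can_alloc set to true,
 * for callers that are allowed to allocate memory with alloc_flags.
 */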
depot_stack_handle_t stack_depot_save(unsigned long *entries,
				      unsigned int nr_entries,
				      gfp_t alloc_flags)
{
	return __stack_depot_save(entries, nr_entries, alloc_flags, true);
}
EXPORT_SYMBOL_GPL(stack_depot_save);

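/*
 * Looks up the stack trace referenced by a handle, points *entries at the
 * stored frames, and returns their number. Returns 0, with *entries set to
 * NULL, for an empty or invalid handle.
 */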
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
			       unsigned long **entries)
{
	union handle_parts parts = { .handle = handle };
	/*
	 * READ_ONCE pairs with potential concurrent write in
	 * depot_alloc_stack.
	 */
	int pool_index_cached = READ_ONCE(pool_index);
	void *pool;
	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
	struct stack_record *stack;

	*entries = NULL;
	/*
	 * Let KMSAN know *entries is initialized. This shall prevent false
	 * positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(entries, sizeof(*entries));

	if (!handle)
		return 0;

	if (parts.pool_index > pool_index_cached) {
		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
			parts.pool_index, pool_index_cached, handle);
		return 0;
	}
	pool = stack_pools[parts.pool_index];
	if (!pool)
		return 0;
	stack = pool + offset;

	*entries = stack->entries;
	return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);

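/* Prints the stack trace referenced by the given handle to the kernel log. */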
void stack_depot_print(depot_stack_handle_t stack)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(stack, &entries);
	if (nr_entries > 0)
		stack_trace_print(entries, nr_entries, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);

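/*
 * Prints the stack trace referenced by the given handle into the provided
 * buffer, indenting each frame by the requested number of spaces. Returns
 * the number of characters written, or 0 for an empty or invalid handle.
 */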
int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
			int spaces)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(handle, &entries);
	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
						spaces) : 0;
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);

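/*
 * Returns a copy of the handle with its extra bits set to the given value.
 * An empty handle is returned unchanged.
 */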
depot_stack_handle_t __must_check stack_depot_set_extra_bits(
			depot_stack_handle_t handle, unsigned int extra_bits)
{
	union handle_parts parts = { .handle = handle };

	/* Don't set extra bits on empty handles. */
	if (!handle)
		return 0;

	parts.extra = extra_bits;
	return parts.handle;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);

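/* Returns the extra bits stored in the handle (0 if none were set). */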
unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };

	return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);