1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Debugging module statistics. |
4 | * |
5 | * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> |
6 | */ |
7 | |
8 | #include <linux/module.h> |
9 | #include <uapi/linux/module.h> |
10 | #include <linux/string.h> |
11 | #include <linux/printk.h> |
12 | #include <linux/slab.h> |
13 | #include <linux/list.h> |
14 | #include <linux/debugfs.h> |
15 | #include <linux/rculist.h> |
16 | #include <linux/math.h> |
17 | |
18 | #include "internal.h" |
19 | |
20 | /** |
21 | * DOC: module debugging statistics overview |
22 | * |
23 | * Enabling CONFIG_MODULE_STATS enables module debugging statistics which |
24 | * are useful to monitor and root cause memory pressure issues with module |
25 | * loading. These statistics are useful to allow us to improve production |
26 | * workloads. |
27 | * |
28 | * The current module debugging statistics supported help keep track of module |
29 | * loading failures to enable improvements either for kernel module auto-loading |
30 | * usage (request_module()) or interactions with userspace. Statistics are |
31 | * provided to track all possible failures in the finit_module() path and memory |
32 | * wasted in this process space. Each of the failure counters are associated |
33 | * to a type of module loading failure which is known to incur a certain amount |
34 | * of memory allocation loss. In the worst case loading a module will fail after |
35 | * a 3 step memory allocation process: |
36 | * |
37 | * a) memory allocated with kernel_read_file_from_fd() |
38 | * b) module decompression processes the file read from |
39 | * kernel_read_file_from_fd(), and vmap() is used to map |
40 | * the decompressed module to a new local buffer which represents |
41 | * a copy of the decompressed module passed from userspace. The buffer |
42 | * from kernel_read_file_from_fd() is freed right away. |
43 | * c) layout_and_allocate() allocates space for the final resting |
44 | * place where we would keep the module if it were to be processed |
45 | * successfully. |
46 | * |
47 | * If a failure occurs after these three different allocations only one |
48 | * counter will be incremented with the summation of the allocated bytes freed |
49 | * incurred during this failure. Likewise, if module loading failed only after |
50 | * step b) a separate counter is used and incremented for the bytes freed and |
51 | * not used during both of those allocations. |
52 | * |
53 | * Virtual memory space can be limited, for example on x86 virtual memory size |
54 | * defaults to 128 MiB. We should strive to limit and avoid wasting virtual |
55 | * memory allocations when possible. These module debugging statistics help |
56 | * to evaluate how much memory is being wasted on bootup due to module loading |
57 | * failures. |
58 | * |
59 | * All counters are designed to be incremental. Atomic counters are used so to |
60 | * remain simple and avoid delays and deadlocks. |
61 | */ |
62 | |
63 | /** |
64 | * DOC: dup_failed_modules - tracks duplicate failed modules |
65 | * |
66 | * Linked list of modules which failed to be loaded because an already existing |
67 | * module with the same name was already being processed or already loaded. |
68 | * The finit_module() system call incurs heavy virtual memory allocations. In |
69 | * the worst case an finit_module() system call can end up allocating virtual |
70 | * memory 3 times: |
71 | * |
72 | * 1) kernel_read_file_from_fd() call uses vmalloc() |
73 | * 2) optional module decompression uses vmap() |
 * 3) layout_and_allocate() can use vzalloc() or an arch specific variation of
75 | * vmalloc to deal with ELF sections requiring special permissions |
76 | * |
77 | * In practice on a typical boot today most finit_module() calls fail due to |
78 | * the module with the same name already being loaded or about to be processed. |
79 | * All virtual memory allocated to these failed modules will be freed with |
80 | * no functional use. |
81 | * |
82 | * To help with this the dup_failed_modules allows us to track modules which |
83 | * failed to load due to the fact that a module was already loaded or being |
84 | * processed. There are only two points at which we can fail such calls, |
85 | * we list them below along with the number of virtual memory allocation |
86 | * calls: |
87 | * |
88 | * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before |
89 | * layout_and_allocate(). |
90 | * - with module decompression: 2 virtual memory allocation calls |
91 | * - without module decompression: 1 virtual memory allocation calls |
92 | * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module() |
93 | * - with module decompression 3 virtual memory allocation calls |
94 | * - without module decompression 2 virtual memory allocation calls |
95 | * |
96 | * We should strive to get this list to be as small as possible. If this list |
97 | * is not empty it is a reflection of possible work or optimizations possible |
98 | * either in-kernel or in userspace. |
99 | */ |
100 | static LIST_HEAD(dup_failed_modules); |
101 | |
102 | /** |
103 | * DOC: module statistics debugfs counters |
104 | * |
105 | * The total amount of wasted virtual memory allocation space during module |
106 | * loading can be computed by adding the total from the summation: |
107 | * |
108 | * * @invalid_kread_bytes + |
109 | * @invalid_decompress_bytes + |
110 | * @invalid_becoming_bytes + |
111 | * @invalid_mod_bytes |
112 | * |
113 | * The following debugfs counters are available to inspect module loading |
114 | * failures: |
115 | * |
116 | * * total_mod_size: total bytes ever used by all modules we've dealt with on |
117 | * this system |
118 | * * total_text_size: total bytes of the .text and .init.text ELF section |
119 | * sizes we've dealt with on this system |
120 | * * invalid_kread_bytes: bytes allocated and then freed on failures which |
121 | * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd() |
122 | * uses vmalloc(). These should typically not happen unless your system is |
123 | * under memory pressure. |
124 | * * invalid_decompress_bytes: number of bytes allocated and freed due to |
125 | * memory allocations in the module decompression path that use vmap(). |
126 | * These typically should not happen unless your system is under memory |
127 | * pressure. |
128 | * * invalid_becoming_bytes: total number of bytes allocated and freed used |
129 | * to read the kernel module userspace wants us to read before we |
130 | * promote it to be processed to be added to our @modules linked list. These |
131 | * failures can happen if we had a check in between a successful kernel_read_file_from_fd() |
 *   call and right before we allocate our private memory for the module
133 | * which would be kept if the module is successfully loaded. The most common |
134 | * reason for this failure is when userspace is racing to load a module |
135 | * which it does not yet see loaded. The first module to succeed in |
136 | * add_unformed_module() will add a module to our &modules list and |
137 | * subsequent loads of modules with the same name will error out at the |
138 | * end of early_mod_check(). The check for module_patient_check_exists() |
139 | * at the end of early_mod_check() prevents duplicate allocations |
140 | * on layout_and_allocate() for modules already being processed. These |
141 | * duplicate failed modules are non-fatal, however they typically are |
142 | * indicative of userspace not seeing a module in userspace loaded yet and |
143 | * unnecessarily trying to load a module before the kernel even has a chance |
144 | * to begin to process prior requests. Although duplicate failures can be |
145 | * non-fatal, we should try to reduce vmalloc() pressure proactively, so |
 *   ideally after boot this will be as close to 0 as possible. If module
147 | * decompression was used we also add to this counter the cost of the |
148 | * initial kernel_read_file_from_fd() of the compressed module. If module |
149 | * decompression was not used the value represents the total allocated and |
150 | * freed bytes in kernel_read_file_from_fd() calls for these type of |
151 | * failures. These failures can occur because: |
152 | * |
153 | * * module_sig_check() - module signature checks |
154 | * * elf_validity_cache_copy() - some ELF validation issue |
155 | * * early_mod_check(): |
156 | * |
157 | * * blacklisting |
158 | * * failed to rewrite section headers |
159 | * * version magic |
160 | * * live patch requirements didn't check out |
161 | * * the module was detected as being already present |
162 | * |
163 | * * invalid_mod_bytes: these are the total number of bytes allocated and |
164 | * freed due to failures after we did all the sanity checks of the module |
165 | * which userspace passed to us and after our first check that the module |
166 | * is unique. A module can still fail to load if we detect the module is |
167 | * loaded after we allocate space for it with layout_and_allocate(), we do |
168 | * this check right before processing the module as live and run its |
 *   initialization routines. Note that if you have a failure of this type it
170 | * also means the respective kernel_read_file_from_fd() memory space was |
171 | * also freed and not used, and so we increment this counter with twice |
172 | * the size of the module. Additionally if you used module decompression |
173 | * the size of the compressed module is also added to this counter. |
174 | * |
175 | * * modcount: how many modules we've loaded in our kernel life time |
176 | * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd() |
177 | * * failed_decompress: how many failed module decompression attempts we've had. |
178 | * These really should not happen unless your compression / decompression |
179 | * might be broken. |
180 | * * failed_becoming: how many modules failed after we kernel_read_file_from_fd() |
181 | * it and before we allocate memory for it with layout_and_allocate(). This |
182 | * counter is never incremented if you manage to validate the module and |
183 | * call layout_and_allocate() for it. |
184 | * * failed_load_modules: how many modules failed once we've allocated our |
185 | * private space for our module using layout_and_allocate(). These failures |
186 | * should hopefully mostly be dealt with already. Races in theory could |
187 | * still exist here, but it would just mean the kernel had started processing |
188 | * two threads concurrently up to early_mod_check() and one thread won. |
189 | * These failures are good signs the kernel or userspace is doing something |
190 | * seriously stupid or that could be improved. We should strive to fix these, |
191 | * but it is perhaps not easy to fix them. A recent example are the modules |
192 | * requests incurred for frequency modules, a separate module request was |
193 | * being issued for each CPU on a system. |
194 | */ |
195 | |
196 | atomic_long_t total_mod_size; |
197 | atomic_long_t total_text_size; |
198 | atomic_long_t invalid_kread_bytes; |
199 | atomic_long_t invalid_decompress_bytes; |
200 | static atomic_long_t invalid_becoming_bytes; |
201 | static atomic_long_t invalid_mod_bytes; |
202 | atomic_t modcount; |
203 | atomic_t failed_kreads; |
204 | atomic_t failed_decompress; |
205 | static atomic_t failed_becoming; |
206 | static atomic_t failed_load_modules; |
207 | |
208 | static const char *mod_fail_to_str(struct mod_fail_load *mod_fail) |
209 | { |
210 | if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) && |
211 | test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) |
212 | return "Becoming & Load" ; |
213 | if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask)) |
214 | return "Becoming" ; |
215 | if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) |
216 | return "Load" ; |
217 | return "Bug-on-stats" ; |
218 | } |
219 | |
220 | void mod_stat_bump_invalid(struct load_info *info, int flags) |
221 | { |
222 | atomic_long_add(i: info->len * 2, v: &invalid_mod_bytes); |
223 | atomic_inc(v: &failed_load_modules); |
224 | #if defined(CONFIG_MODULE_DECOMPRESS) |
225 | if (flags & MODULE_INIT_COMPRESSED_FILE) |
226 | atomic_long_add(info->compressed_len, &invalid_mod_bytes); |
227 | #endif |
228 | } |
229 | |
230 | void mod_stat_bump_becoming(struct load_info *info, int flags) |
231 | { |
232 | atomic_inc(v: &failed_becoming); |
233 | atomic_long_add(i: info->len, v: &invalid_becoming_bytes); |
234 | #if defined(CONFIG_MODULE_DECOMPRESS) |
235 | if (flags & MODULE_INIT_COMPRESSED_FILE) |
236 | atomic_long_add(info->compressed_len, &invalid_becoming_bytes); |
237 | #endif |
238 | } |
239 | |
240 | int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason) |
241 | { |
242 | struct mod_fail_load *mod_fail; |
243 | |
244 | list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list, |
245 | lockdep_is_held(&module_mutex)) { |
246 | if (!strcmp(mod_fail->name, name)) { |
247 | atomic_long_inc(v: &mod_fail->count); |
248 | __set_bit(reason, &mod_fail->dup_fail_mask); |
249 | goto out; |
250 | } |
251 | } |
252 | |
253 | mod_fail = kzalloc(size: sizeof(*mod_fail), GFP_KERNEL); |
254 | if (!mod_fail) |
255 | return -ENOMEM; |
256 | memcpy(mod_fail->name, name, strlen(name)); |
257 | __set_bit(reason, &mod_fail->dup_fail_mask); |
258 | atomic_long_inc(v: &mod_fail->count); |
259 | list_add_rcu(new: &mod_fail->list, head: &dup_failed_modules); |
260 | out: |
261 | return 0; |
262 | } |
263 | |
264 | /* |
265 | * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the |
266 | * 112 module prints within 8k. |
267 | * |
268 | * 1024 + (64*112) = 8k |
269 | */ |
270 | #define MAX_PREAMBLE 1024 |
271 | #define MAX_FAILED_MOD_PRINT 112 |
272 | #define MAX_BYTES_PER_MOD 64 |
273 | static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf, |
274 | size_t count, loff_t *ppos) |
275 | { |
276 | struct mod_fail_load *mod_fail; |
277 | unsigned int len, size, count_failed = 0; |
278 | char *buf; |
279 | int ret; |
280 | u32 live_mod_count, fkreads, fdecompress, fbecoming, floads; |
281 | unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes, |
282 | idecompress_bytes, imod_bytes, total_virtual_lost; |
283 | |
284 | live_mod_count = atomic_read(v: &modcount); |
285 | fkreads = atomic_read(v: &failed_kreads); |
286 | fdecompress = atomic_read(v: &failed_decompress); |
287 | fbecoming = atomic_read(v: &failed_becoming); |
288 | floads = atomic_read(v: &failed_load_modules); |
289 | |
290 | total_size = atomic_long_read(v: &total_mod_size); |
291 | text_size = atomic_long_read(v: &total_text_size); |
292 | ikread_bytes = atomic_long_read(v: &invalid_kread_bytes); |
293 | idecompress_bytes = atomic_long_read(v: &invalid_decompress_bytes); |
294 | ibecoming_bytes = atomic_long_read(v: &invalid_becoming_bytes); |
295 | imod_bytes = atomic_long_read(v: &invalid_mod_bytes); |
296 | |
297 | total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes; |
298 | |
299 | size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming), |
300 | (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD; |
301 | buf = kzalloc(size, GFP_KERNEL); |
302 | if (buf == NULL) |
303 | return -ENOMEM; |
304 | |
305 | /* The beginning of our debug preamble */ |
306 | len = scnprintf(buf, size, fmt: "%25s\t%u\n" , "Mods ever loaded" , live_mod_count); |
307 | |
308 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%u\n" , "Mods failed on kread" , fkreads); |
309 | |
310 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%u\n" , "Mods failed on decompress" , |
311 | fdecompress); |
312 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%u\n" , "Mods failed on becoming" , fbecoming); |
313 | |
314 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%u\n" , "Mods failed on load" , floads); |
315 | |
316 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Total module size" , total_size); |
317 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Total mod text size" , text_size); |
318 | |
319 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Failed kread bytes" , ikread_bytes); |
320 | |
321 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Failed decompress bytes" , |
322 | idecompress_bytes); |
323 | |
324 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Failed becoming bytes" , ibecoming_bytes); |
325 | |
326 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Failed kmod bytes" , imod_bytes); |
327 | |
328 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Virtual mem wasted bytes" , total_virtual_lost); |
329 | |
330 | if (live_mod_count && total_size) { |
331 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Average mod size" , |
332 | DIV_ROUND_UP(total_size, live_mod_count)); |
333 | } |
334 | |
335 | if (live_mod_count && text_size) { |
336 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Average mod text size" , |
337 | DIV_ROUND_UP(text_size, live_mod_count)); |
338 | } |
339 | |
340 | /* |
341 | * We use WARN_ON_ONCE() for the counters to ensure we always have parity |
342 | * for keeping tabs on a type of failure with one type of byte counter. |
343 | * The counters for imod_bytes does not increase for fkreads failures |
344 | * for example, and so on. |
345 | */ |
346 | |
347 | WARN_ON_ONCE(ikread_bytes && !fkreads); |
348 | if (fkreads && ikread_bytes) { |
349 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Avg fail kread bytes" , |
350 | DIV_ROUND_UP(ikread_bytes, fkreads)); |
351 | } |
352 | |
353 | WARN_ON_ONCE(ibecoming_bytes && !fbecoming); |
354 | if (fbecoming && ibecoming_bytes) { |
355 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Avg fail becoming bytes" , |
356 | DIV_ROUND_UP(ibecoming_bytes, fbecoming)); |
357 | } |
358 | |
359 | WARN_ON_ONCE(idecompress_bytes && !fdecompress); |
360 | if (fdecompress && idecompress_bytes) { |
361 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Avg fail decomp bytes" , |
362 | DIV_ROUND_UP(idecompress_bytes, fdecompress)); |
363 | } |
364 | |
365 | WARN_ON_ONCE(imod_bytes && !floads); |
366 | if (floads && imod_bytes) { |
367 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%lu\n" , "Average fail load bytes" , |
368 | DIV_ROUND_UP(imod_bytes, floads)); |
369 | } |
370 | |
371 | /* End of our debug preamble header. */ |
372 | |
373 | /* Catch when we've gone beyond our expected preamble */ |
374 | WARN_ON_ONCE(len >= MAX_PREAMBLE); |
375 | |
376 | if (list_empty(head: &dup_failed_modules)) |
377 | goto out; |
378 | |
379 | len += scnprintf(buf: buf + len, size: size - len, fmt: "Duplicate failed modules:\n" ); |
380 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%15s\t%25s\n" , |
381 | "Module-name" , "How-many-times" , "Reason" ); |
382 | mutex_lock(&module_mutex); |
383 | |
384 | |
385 | list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) { |
386 | if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT)) |
387 | goto out_unlock; |
388 | len += scnprintf(buf: buf + len, size: size - len, fmt: "%25s\t%15lu\t%25s\n" , mod_fail->name, |
389 | atomic_long_read(v: &mod_fail->count), mod_fail_to_str(mod_fail)); |
390 | } |
391 | out_unlock: |
392 | mutex_unlock(lock: &module_mutex); |
393 | out: |
394 | ret = simple_read_from_buffer(to: user_buf, count, ppos, from: buf, available: len); |
395 | kfree(objp: buf); |
396 | return ret; |
397 | } |
398 | #undef MAX_PREAMBLE |
399 | #undef MAX_FAILED_MOD_PRINT |
400 | #undef MAX_BYTES_PER_MOD |
401 | |
402 | static const struct file_operations fops_mod_stats = { |
403 | .read = read_file_mod_stats, |
404 | .open = simple_open, |
405 | .owner = THIS_MODULE, |
406 | .llseek = default_llseek, |
407 | }; |
408 | |
409 | #define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter) |
410 | #define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name) |
411 | static int __init module_stats_init(void) |
412 | { |
413 | mod_debug_add_ulong(total_mod_size); |
414 | mod_debug_add_ulong(total_text_size); |
415 | mod_debug_add_ulong(invalid_kread_bytes); |
416 | mod_debug_add_ulong(invalid_decompress_bytes); |
417 | mod_debug_add_ulong(invalid_becoming_bytes); |
418 | mod_debug_add_ulong(invalid_mod_bytes); |
419 | |
420 | mod_debug_add_atomic(modcount); |
421 | mod_debug_add_atomic(failed_kreads); |
422 | mod_debug_add_atomic(failed_decompress); |
423 | mod_debug_add_atomic(failed_becoming); |
424 | mod_debug_add_atomic(failed_load_modules); |
425 | |
426 | debugfs_create_file(name: "stats" , mode: 0400, parent: mod_debugfs_root, data: mod_debugfs_root, fops: &fops_mod_stats); |
427 | |
428 | return 0; |
429 | } |
430 | #undef mod_debug_add_ulong |
431 | #undef mod_debug_add_atomic |
432 | module_init(module_stats_init); |
433 | |