// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/profile.c
 *  Simple profiling. Manages a direct-mapped profile hit count buffer,
 *  with configurable resolution, support for restricting the cpus on
 *  which profiling is done, and switching between cpu time and
 *  schedule() calls via kernel command line parameters passed at boot.
 *
 *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
 *	Red Hat, July 2004
 *  Consolidation of architecture support code for profiling,
 *	Nadia Yvette Chambers, Oracle, July 2004
 *  Amortized hit count accounting via per-cpu open-addressed hashtables
 *	to resolve timer interrupt livelocks, Nadia Yvette Chambers,
 *	Oracle, 2004
 */

#include <linux/export.h>
#include <linux/profile.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched/stat.h>

#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>

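/*
 * Pending hits are buffered in per-cpu open-addressed hashtables of
 * struct profile_hit. Each table occupies one page: NR_PROFILE_HIT
 * entries, divided into NR_PROFILE_GRP probe groups of PROFILE_GRPSZ
 * (8) entries each.
 */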
struct profile_hit {
	u32 pc, hits;
};
#define PROFILE_GRPSHIFT 3
#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)

static atomic_t *prof_buffer;
static unsigned long prof_len;
static unsigned short int prof_shift;

int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);

static cpumask_var_t prof_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP && CONFIG_PROC_FS */

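/*
 * Parse the "profile=" boot parameter. Accepted forms (as derived from
 * the parsing below):
 *
 *	profile=<shift>			CPU-time profiling, e.g. "profile=2"
 *	profile=schedule[,<shift>]	profile schedule() call points
 *	profile=sleep[,<shift>]		profile sleep times (needs CONFIG_SCHEDSTATS)
 *	profile=kvm[,<shift>]		profile VM exits
 *
 * <shift> sets the buffer granularity: each slot covers 2^shift bytes
 * of kernel text.
 */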
int profile_setup(char *str)
{
	static const char schedstr[] = "schedule";
	static const char sleepstr[] = "sleep";
	static const char kvmstr[] = "kvm";
	const char *select = NULL;
	int par;

	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
		force_schedstat_enabled();
		prof_on = SLEEP_PROFILING;
		select = sleepstr;
#else
		pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
		prof_on = SCHED_PROFILING;
		select = schedstr;
	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
		prof_on = KVM_PROFILING;
		select = kvmstr;
	} else if (get_option(&str, &par)) {
		prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
		prof_on = CPU_PROFILING;
		pr_info("kernel profiling enabled (shift: %u)\n",
			prof_shift);
	}

	if (select) {
		if (str[strlen(select)] == ',')
			str += strlen(select) + 1;
		if (get_option(&str, &par))
			prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
		pr_info("kernel %s profiling enabled (shift: %u)\n",
			select, prof_shift);
	}

	return 1;
}
__setup("profile=", profile_setup);

int __ref profile_init(void)
{
	int buffer_bytes;

	if (!prof_on)
		return 0;

	/* only text is profiled */
	prof_len = (_etext - _stext) >> prof_shift;

	if (!prof_len) {
		pr_warn("profiling shift: %u too large\n", prof_shift);
		prof_on = 0;
		return -EINVAL;
	}

	buffer_bytes = prof_len * sizeof(atomic_t);

	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(prof_cpu_mask, cpu_possible_mask);

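	/*
	 * The buffer may be too large for kmalloc: fall back from
	 * kzalloc() to alloc_pages_exact() and finally to vzalloc(),
	 * which only needs virtually contiguous memory.
	 */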
	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
	if (prof_buffer)
		return 0;

	prof_buffer = alloc_pages_exact(buffer_bytes,
					GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
	if (prof_buffer)
		return 0;

	prof_buffer = vzalloc(buffer_bytes);
	if (prof_buffer)
		return 0;

	free_cpumask_var(prof_cpu_mask);
	return -ENOMEM;
}

#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
 * Each cpu has a pair of open-addressed hashtables for pending
 * profile hits. read_profile() IPIs all cpus to request that they
 * flip buffers, and flushes their contents to prof_buffer itself.
 * Flip requests are serialized by the profile_flip_mutex. The sole
 * purpose of keeping a second hashtable is to avoid the cacheline
 * contention that would otherwise happen during the flushes of
 * pending profile hits required for the accuracy of reported profile
 * hits, which would resurrect the interrupt livelock issue.
 *
 * The open-addressed hashtables are indexed by profile buffer slot,
 * and each entry holds the number of pending hits to that profile
 * buffer slot on a cpu. When the hashtable overflows, all pending
 * hits are accounted to their corresponding profile buffer slots
 * with atomic_add() and the hashtable is emptied. As numerous pending
 * hits may be accounted to a profile buffer slot in a single
 * hashtable entry, this amortizes a number of atomic profile buffer
 * increments likely to be far larger than the number of entries in
 * the hashtable, particularly given that the number of distinct
 * profile buffer positions to which hits are accounted during short
 * intervals (e.g. several seconds) is usually very small. Exclusion
 * from buffer flipping is provided by disabling interrupts (note that
 * for SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called
 * from process context).
 *
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions, but uses a full hashtable of finite
 * collision chains, not just pairs of them.
 *
 * -- nyc
 */
static void __profile_flip_buffers(void *unused)
{
	int cpu = smp_processor_id();

	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
}

static void profile_flip_buffers(void)
{
	int i, j, cpu;

	mutex_lock(&profile_flip_mutex);
	j = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
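	/*
	 * After the flip, every cpu accumulates new hits in the other
	 * half of its pair, so hashtable j is quiescent and can be
	 * drained into prof_buffer without contending with writers.
	 */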
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
		for (i = 0; i < NR_PROFILE_HIT; ++i) {
			if (!hits[i].hits) {
				if (hits[i].pc)
					hits[i].pc = 0;
				continue;
			}
			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
			hits[i].hits = hits[i].pc = 0;
		}
	}
	mutex_unlock(&profile_flip_mutex);
}

static void profile_discard_flip_buffers(void)
{
	int i, cpu;

	mutex_lock(&profile_flip_mutex);
	i = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	}
	mutex_unlock(&profile_flip_mutex);
}

static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	int i, j, cpu;
	struct profile_hit *hits;

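	/*
	 * Hash pc into the table: the low bits of the slot number pick
	 * the primary probe group of PROFILE_GRPSZ entries; the
	 * secondary stride is an odd multiple of the group size, so
	 * repeated probing visits every group before wrapping back to
	 * the primary one.
	 */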
	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	cpu = get_cpu();
	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	if (!hits) {
		put_cpu();
		return;
	}
	/*
	 * We buffer the global profiler buffer into a per-CPU
	 * queue and thus reduce the number of global (and possibly
	 * NUMA-alien) accesses. The write-queue is self-coalescing:
	 */
	local_irq_save(flags);
	do {
		for (j = 0; j < PROFILE_GRPSZ; ++j) {
			if (hits[i + j].pc == pc) {
				hits[i + j].hits += nr_hits;
				goto out;
			} else if (!hits[i + j].hits) {
				hits[i + j].pc = pc;
				hits[i + j].hits = nr_hits;
				goto out;
			}
		}
		i = (i + secondary) & (NR_PROFILE_HIT - 1);
	} while (i != primary);

	/*
	 * Add the current hit(s) and flush the write-queue out
	 * to the global buffer:
	 */
	atomic_add(nr_hits, &prof_buffer[pc]);
	for (i = 0; i < NR_PROFILE_HIT; ++i) {
		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
		hits[i].pc = hits[i].hits = 0;
	}
out:
	local_irq_restore(flags);
	put_cpu();
}

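/*
 * CPU hotplug callbacks. profile_dead_cpu() doubles as the
 * CPUHP_PROFILE_PREPARE teardown handler and as the error-unwind
 * path of profile_prepare_cpu().
 */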
static int profile_dead_cpu(unsigned int cpu)
{
	struct page *page;
	int i;

	if (cpumask_available(prof_cpu_mask))
		cpumask_clear_cpu(cpu, prof_cpu_mask);

	for (i = 0; i < 2; i++) {
		if (per_cpu(cpu_profile_hits, cpu)[i]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
			per_cpu(cpu_profile_hits, cpu)[i] = NULL;
			__free_page(page);
		}
	}
	return 0;
}

static int profile_prepare_cpu(unsigned int cpu)
{
	int i, node = cpu_to_mem(cpu);
	struct page *page;

	per_cpu(cpu_profile_flip, cpu) = 0;

	for (i = 0; i < 2; i++) {
		if (per_cpu(cpu_profile_hits, cpu)[i])
			continue;

		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
		if (!page) {
			profile_dead_cpu(cpu);
			return -ENOMEM;
		}
		per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
	}
	return 0;
}

static int profile_online_cpu(unsigned int cpu)
{
	if (cpumask_available(prof_cpu_mask))
		cpumask_set_cpu(cpu, prof_cpu_mask);

	return 0;
}

#else /* !(CONFIG_SMP && CONFIG_PROC_FS) */
#define profile_flip_buffers() do { } while (0)
#define profile_discard_flip_buffers() do { } while (0)

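/*
 * Without per-cpu hashtables (UP, or no procfs reader to flush them),
 * hits are accounted directly to prof_buffer with a single
 * atomic_add().
 */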
static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long pc;

	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !(CONFIG_SMP && CONFIG_PROC_FS) */

void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	if (prof_on != type || !prof_buffer)
		return;
	do_profile_hits(type, __pc, nr_hits);
}
EXPORT_SYMBOL_GPL(profile_hits);

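/*
 * Called from the timer interrupt path: account a hit for the
 * interrupted pc, provided we did not interrupt user mode and this
 * cpu is included in prof_cpu_mask.
 */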
void profile_tick(int type)
{
	struct pt_regs *regs = get_irq_regs();

	if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
	    cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
		profile_hit(type, (void *)profile_pc(regs));
}

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>

static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
	return 0;
}

static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, prof_cpu_mask_proc_show, NULL);
}

static ssize_t prof_cpu_mask_proc_write(struct file *file,
	const char __user *buffer, size_t count, loff_t *pos)
{
	cpumask_var_t new_value;
	int err;

	if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
		return -ENOMEM;

	err = cpumask_parse_user(buffer, count, new_value);
	if (!err) {
		cpumask_copy(prof_cpu_mask, new_value);
		err = count;
	}
	free_cpumask_var(new_value);
	return err;
}

static const struct proc_ops prof_cpu_mask_proc_ops = {
	.proc_open	= prof_cpu_mask_proc_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
	.proc_write	= prof_cpu_mask_proc_write,
};

void create_prof_cpu_mask(void)
{
	/* create /proc/irq/prof_cpu_mask */
	proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
}

/*
 * This function accesses profiling information. The returned data is
 * binary: the first sizeof(unsigned int) bytes hold the sampling step
 * (1 << prof_shift, i.e. the number of text bytes per buffer slot),
 * followed by the raw contents of the profile buffer. Use of the
 * readprofile program is recommended in order to get meaningful info
 * out of these data.
 */
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long p = *ppos;
	ssize_t read;
	char *pnt;
	unsigned long sample_step = 1UL << prof_shift;

	profile_flip_buffers();
	if (p >= (prof_len+1)*sizeof(unsigned int))
		return 0;
	if (count > (prof_len+1)*sizeof(unsigned int) - p)
		count = (prof_len+1)*sizeof(unsigned int) - p;
	read = 0;

	while (p < sizeof(unsigned int) && count > 0) {
		if (put_user(*((char *)(&sample_step)+p), buf))
			return -EFAULT;
		buf++; p++; count--; read++;
	}
	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	if (copy_to_user(buf, (void *)pnt, count))
		return -EFAULT;
	read += count;
	*ppos += read;
	return read;
}


/* default is to not implement this call */
int __weak setup_profiling_timer(unsigned int mult)
{
	return -EINVAL;
}

/*
 * Writing to /proc/profile resets the counters.
 *
 * Writing a 'profiling multiplier' value into it also resets the
 * profiling interrupt frequency, on architectures that support this.
 */
static ssize_t write_profile(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
	if (count == sizeof(int)) {
		unsigned int multiplier;

		if (copy_from_user(&multiplier, buf, sizeof(int)))
			return -EFAULT;

		if (setup_profiling_timer(multiplier))
			return -EINVAL;
	}
#endif
	profile_discard_flip_buffers();
	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	return count;
}

static const struct proc_ops profile_proc_ops = {
	.proc_read	= read_profile,
	.proc_write	= write_profile,
	.proc_lseek	= default_llseek,
};

int __ref create_proc_profile(void)
{
	struct proc_dir_entry *entry;
#ifdef CONFIG_SMP
	enum cpuhp_state online_state;
#endif

	int err = 0;

	if (!prof_on)
		return 0;
#ifdef CONFIG_SMP
	err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
				profile_prepare_cpu, profile_dead_cpu);
	if (err)
		return err;

	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
				profile_online_cpu, NULL);
	if (err < 0)
		goto err_state_prep;
	online_state = err;
	err = 0;
#endif
	entry = proc_create("profile", S_IWUSR | S_IRUGO,
			    NULL, &profile_proc_ops);
	if (!entry)
		goto err_state_onl;
	proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));

	return err;
err_state_onl:
#ifdef CONFIG_SMP
	cpuhp_remove_state(online_state);
err_state_prep:
	cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
#endif
	return err;
}
subsys_initcall(create_proc_profile);
#endif /* CONFIG_PROC_FS */