// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Perf interface to expose Dispatch Trace Log counters.
 *
 * Copyright (C) 2024 Kajol Jain, IBM Corporation
 */

#ifdef CONFIG_PPC_SPLPAR
#define pr_fmt(fmt) "vpa_dtl: " fmt

#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <asm/dtl.h>
#include <asm/plpar_wrappers.h>

#define EVENT(_name, _code) enum { _name = _code }
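/* For example, EVENT(DTL_CEDE, 0x1) expands to "enum { DTL_CEDE = 0x1 }" */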

/*
 * Based on the Power Architecture Platform Reference (PAPR) documentation,
 * Table 14.14 "Per Virtual Processor Area", below is the Dispatch Trace
 * Log (DTL) Enable Mask used to select which virtual processor dispatch
 * and preempt events are traced:
 * DTL_CEDE(0x1): Trace voluntary (OS initiated) virtual
 * processor waits
 * DTL_PREEMPT(0x2): Trace time slice preempts
 * DTL_FAULT(0x4): Trace virtual partition memory page
 * faults.
 * DTL_ALL(0x7): Trace all (DTL_CEDE | DTL_PREEMPT | DTL_FAULT)
 *
 * Event codes are based on the Dispatch Trace Log Enable Mask.
 */
EVENT(DTL_CEDE, 0x1);
EVENT(DTL_PREEMPT, 0x2);
EVENT(DTL_FAULT, 0x4);
EVENT(DTL_ALL, 0x7);
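
/*
 * Example usage (a sketch, assuming this PMU registers as "vpa_dtl" and
 * the events appear under /sys/bus/event_source/devices/vpa_dtl/events):
 *
 *	perf record -e vpa_dtl/dtl_all/ -C 0 -- sleep 1
 *	perf script
 */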

GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE);
GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT);
GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT);
GENERIC_EVENT_ATTR(dtl_all, DTL_ALL);

PMU_FORMAT_ATTR(event, "config:0-7");

static struct attribute *events_attr[] = {
	GENERIC_EVENT_PTR(DTL_CEDE),
	GENERIC_EVENT_PTR(DTL_PREEMPT),
	GENERIC_EVENT_PTR(DTL_FAULT),
	GENERIC_EVENT_PTR(DTL_ALL),
	NULL
};

static struct attribute_group event_group = {
	.name = "events",
	.attrs = events_attr,
};

static struct attribute *format_attrs[] = {
	&format_attr_event.attr,
	NULL,
};

static const struct attribute_group format_group = {
	.name = "format",
	.attrs = format_attrs,
};

static const struct attribute_group *attr_groups[] = {
	&format_group,
	&event_group,
	NULL,
};

struct vpa_dtl {
	struct dtl_entry *buf;
	u64 last_idx;
};

struct vpa_pmu_ctx {
	struct perf_output_handle handle;
};

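/*
 * Per-event AUX buffer state: note that "head" is an offset in units
 * of struct dtl_entry from "base", while "head_size" counts the bytes
 * already committed via perf_aux_output_end().
 */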
struct vpa_pmu_buf {
	int nr_pages;
	bool snapshot;
	u64 *base;
	u64 size;
	u64 head;
	u64 head_size;
	/* boot timebase and frequency need to be saved only once */
	int boottb_freq_saved;
	u64 threshold;
	bool full;
};

/*
 * To correlate each DTL entry with other events across CPUs,
 * we need to map the timebase from "struct dtl_entry", which phyp
 * provides, to the boot timebase. This also needs the timebase frequency.
 * The formula is: ((timebase from DTL entry - boot timebase) / frequency)
 *
 * The structure is padded with 24 bytes to match the size of
 * "struct dtl_entry", which eases post processing.
 */
struct boottb_freq {
	u64 boot_tb;
	u64 tb_freq;
	u64 timebase;
	u64 padded[3];
};
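
/*
 * A minimal post-processing sketch (not part of the driver): with the
 * leading struct boottb_freq record from the AUX buffer, a raw DTL
 * timebase converts to seconds since boot roughly as
 *
 *	u64 ticks = be64_to_cpu(dte->timebase) - hdr->boot_tb;
 *	u64 secs  = ticks / hdr->tb_freq;
 *
 * where "dte" and "hdr" are hypothetical pointers into the captured data.
 */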

static DEFINE_PER_CPU(struct vpa_pmu_ctx, vpa_pmu_ctx);
static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu);

/* variable to capture reference count for the active dtl threads */
static int dtl_global_refc;
static DEFINE_SPINLOCK(dtl_global_lock);

/*
 * Capture DTL data in AUX buffer
 */
static void vpa_dtl_capture_aux(long *n_entries, struct vpa_pmu_buf *buf,
				struct vpa_dtl *dtl, int index)
{
	struct dtl_entry *aux_copy_buf = (struct dtl_entry *)buf->base;

	/*
	 * Check whether there is enough space to contain the new
	 * DTL data. If not, copy only as many entries as fit and
	 * mark the buffer full.
	 */
	if (buf->head + *n_entries >= buf->threshold) {
		*n_entries = buf->threshold - buf->head;
		buf->full = true;
	}

	/*
	 * Copy to AUX buffer from per-thread address
	 */
	memcpy(aux_copy_buf + buf->head, &dtl->buf[index], *n_entries * sizeof(struct dtl_entry));

	if (buf->full) {
		/*
		 * Set head of private aux to zero when buffer is full
		 * so that next data will be copied to beginning of the
		 * buffer
		 */
		buf->head = 0;
		return;
	}

	buf->head += *n_entries;
}

/*
 * Function to dump the dispatch trace log buffer data to the
 * perf data.
 *
 * perf_aux_output_begin: This function is called before writing
 * to the AUX area. It returns the pointer to the AUX area private
 * structure, i.e. "struct vpa_pmu_buf" here, which is set up in the
 * setup_aux() function. It also obtains the output handle (used by
 * perf_aux_output_end). When the capture completes in
 * vpa_dtl_capture_aux(), perf_aux_output_end() is called to commit
 * the recorded data.
 *
 * perf_aux_output_end: This function commits data by adjusting the
 * aux_head of "struct perf_buffer". The aux_tail is moved on the perf
 * tools side when writing the data from the AUX buffer to the
 * perf.data file on disk.
 *
 * In the private AUX structure, we maintain a head to know where to
 * copy data next time in the PMU driver. vpa_pmu_buf->head is moved to
 * maintain the AUX head for the PMU driver. It is the responsibility
 * of the PMU driver to make sure data is copied between
 * perf_aux_output_begin and perf_aux_output_end.
 *
 * After data is copied in vpa_dtl_capture_aux(), perf_aux_output_end()
 * is called to move the aux->head of "struct perf_buffer" to indicate
 * the size of data in the AUX buffer. This posts a PERF_RECORD_AUX
 * record into the perf buffer. Data is written to disk only when the
 * allocated buffer is full.
 *
 * With this approach, all the DTL data is present as-is in the
 * perf.data file. The data is pre-processed on the perf tools side
 * when running perf report/perf script, which avoids the time taken
 * to create samples in kernel space.
 */
static void vpa_dtl_dump_sample_data(struct perf_event *event)
{
	u64 cur_idx, last_idx, i;
	u64 boot_tb;
	struct boottb_freq boottb_freq;
	/* actual number of entries read */
	long n_read = 0, read_size = 0;
	/* number of entries added to the dtl buffer */
	long n_req;
	struct vpa_pmu_ctx *vpa_ctx = this_cpu_ptr(&vpa_pmu_ctx);
	struct vpa_pmu_buf *aux_buf;
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
	u64 size;

	cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx);
	last_idx = dtl->last_idx;

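	/*
	 * The hypervisor-side ring holds N_DISPATCH_LOG entries; if it
	 * has wrapped past our last read position, the oldest entries
	 * are lost, so skip ahead to the oldest entry still available.
	 */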
	if (last_idx + N_DISPATCH_LOG <= cur_idx)
		last_idx = cur_idx - N_DISPATCH_LOG + 1;

	n_req = cur_idx - last_idx;

	/* no new entry added to the buffer, return */
	if (n_req <= 0)
		return;

	dtl->last_idx = last_idx + n_req;
	boot_tb = get_boot_tb();

	i = last_idx % N_DISPATCH_LOG;

	aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event);
	if (!aux_buf) {
		pr_debug("returning, no aux buffer\n");
		return;
	}

	if (!aux_buf->boottb_freq_saved) {
		pr_debug("Copying boot tb to aux buffer: %lld\n", boot_tb);
		/* Save boot_tb to convert raw timebase values to time relative to boot */
		boottb_freq.boot_tb = boot_tb;
		/* Save tb_ticks_per_sec to convert timebase ticks to seconds */
		boottb_freq.tb_freq = tb_ticks_per_sec;
		boottb_freq.timebase = 0;
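		/*
		 * The header is padded to sizeof(struct dtl_entry), so it
		 * occupies exactly one entry slot; head and n_read below
		 * account for it.
		 */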
		memcpy(aux_buf->base, &boottb_freq, sizeof(boottb_freq));
		aux_buf->head += 1;
		aux_buf->boottb_freq_saved = 1;
		n_read += 1;
	}

	/* read the tail of the buffer if we've wrapped */
	if (i + n_req > N_DISPATCH_LOG) {
		read_size = N_DISPATCH_LOG - i;
		vpa_dtl_capture_aux(&read_size, aux_buf, dtl, i);
		n_req -= read_size;
		n_read += read_size;
		i = 0;
		if (aux_buf->full) {
			size = (n_read * sizeof(struct dtl_entry));
			if ((size + aux_buf->head_size) > aux_buf->size) {
				size = aux_buf->size - aux_buf->head_size;
				perf_aux_output_end(&vpa_ctx->handle, size);
				aux_buf->head = 0;
				aux_buf->head_size = 0;
			} else {
				aux_buf->head_size += (n_read * sizeof(struct dtl_entry));
				perf_aux_output_end(&vpa_ctx->handle, n_read * sizeof(struct dtl_entry));
			}
			goto out;
		}
	}

	/* .. and now the head */
	vpa_dtl_capture_aux(&n_req, aux_buf, dtl, i);

	size = ((n_req + n_read) * sizeof(struct dtl_entry));
	if ((size + aux_buf->head_size) > aux_buf->size) {
		size = aux_buf->size - aux_buf->head_size;
		perf_aux_output_end(&vpa_ctx->handle, size);
		aux_buf->head = 0;
		aux_buf->head_size = 0;
	} else {
		aux_buf->head_size += ((n_req + n_read) * sizeof(struct dtl_entry));
		/* Move the aux->head to indicate size of data in aux buffer */
		perf_aux_output_end(&vpa_ctx->handle, (n_req + n_read) * sizeof(struct dtl_entry));
	}
out:
	aux_buf->full = false;
}

/*
 * The VPA Dispatch Trace Log counters do not interrupt on overflow.
 * Therefore, the kernel polls the counters with an hrtimer to avoid
 * missing an overflow. The timer interval is based on the sample_period
 * count provided by the user, with a minimum interval of 1 millisecond.
 */
static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct perf_event *event;
	u64 period;

	event = container_of(hrtimer, struct perf_event, hw.hrtimer);

	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return HRTIMER_NORESTART;

	vpa_dtl_dump_sample_data(event);
	period = max_t(u64, NSEC_PER_MSEC, event->hw.sample_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return HRTIMER_RESTART;
}

static void vpa_dtl_start_hrtimer(struct perf_event *event)
{
	u64 period;
	struct hw_perf_event *hwc = &event->hw;

	period = max_t(u64, NSEC_PER_MSEC, hwc->sample_period);
	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED);
}

static void vpa_dtl_stop_hrtimer(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	hrtimer_cancel(&hwc->hrtimer);
}

static void vpa_dtl_reset_global_refc(struct perf_event *event)
{
	spin_lock(&dtl_global_lock);
	dtl_global_refc--;
	if (dtl_global_refc <= 0) {
		dtl_global_refc = 0;
		up_write(&dtl_access_lock);
	}
	spin_unlock(&dtl_global_lock);
}

static int vpa_dtl_mem_alloc(int cpu)
{
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, cpu);
	struct dtl_entry *buf = NULL;

	/* Check for dispatch trace log buffer cache */
	if (!dtl_cache)
		return -ENOMEM;

	/* called under dtl_global_lock, so the allocation must not sleep */
	buf = kmem_cache_alloc_node(dtl_cache, GFP_ATOMIC, cpu_to_node(cpu));
	if (!buf) {
		pr_warn("buffer allocation failed for cpu %d\n", cpu);
		return -ENOMEM;
	}
	dtl->buf = buf;
	return 0;
}

static int vpa_dtl_event_init(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	/* test the event attr type for PMU enumeration */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	if (!perfmon_capable())
		return -EACCES;

	/* Return if this is a counting event */
	if (!is_sampling_event(event))
		return -EOPNOTSUPP;

	/* no branch sampling */
	if (has_branch_stack(event))
		return -EOPNOTSUPP;

	/* Reject invalid event codes */
	switch (event->attr.config) {
	case DTL_LOG_CEDE:
	case DTL_LOG_PREEMPT:
	case DTL_LOG_FAULT:
	case DTL_LOG_ALL:
		break;
	default:
		return -EINVAL;
	}

	spin_lock(&dtl_global_lock);

	/*
	 * To ensure there are no other conflicting dtl users
	 * (example: /proc/powerpc/vcpudispatch_stats or debugfs dtl),
	 * the code below tries to take the dtl_access_lock.
	 * The dtl_access_lock is a rw semaphore declared in dtl.h, which
	 * is used to ensure there are no conflicting dtl users.
	 * The vpa_dtl pmu tries to take the write access lock and also
	 * checks dtl_global_refc, to make sure that the dtl_access_lock
	 * is held by the vpa_dtl pmu interface.
	 */
	if (dtl_global_refc == 0 && !down_write_trylock(&dtl_access_lock)) {
		spin_unlock(&dtl_global_lock);
		return -EBUSY;
	}

	/* Allocate dtl buffer memory */
	if (vpa_dtl_mem_alloc(event->cpu)) {
		spin_unlock(&dtl_global_lock);
		return -ENOMEM;
	}

	/*
	 * Increment the number of active vpa_dtl pmu threads. The
	 * dtl_global_refc keeps count of the cpu threads that are
	 * currently capturing dtl data using the vpa_dtl pmu interface.
	 */
	dtl_global_refc++;

	spin_unlock(&dtl_global_lock);

	hrtimer_setup(&hwc->hrtimer, vpa_dtl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

	/*
	 * Since hrtimers have a fixed rate, we can do a static freq->period
	 * mapping and avoid the whole period adjust feedback stuff.
	 */
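	/*
	 * For example, attr.sample_freq = 1000 maps to a fixed period of
	 * NSEC_PER_SEC / 1000 = 1000000 ns (1 ms).
	 */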
	if (event->attr.freq) {
		long freq = event->attr.sample_freq;

		event->attr.sample_period = NSEC_PER_SEC / freq;
		hwc->sample_period = event->attr.sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
		hwc->last_period = hwc->sample_period;
		event->attr.freq = 0;
	}

	event->destroy = vpa_dtl_reset_global_refc;
	return 0;
}

static int vpa_dtl_event_add(struct perf_event *event, int flags)
{
	int ret, hwcpu;
	unsigned long addr;
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);

	/*
	 * Register our dtl buffer with the hypervisor. The
	 * HV expects the buffer size to be passed in the second
	 * word of the buffer. Refer to section '14.11.3.2. H_REGISTER_VPA'
	 * of PAPR for more information.
	 */
	((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
	dtl->last_idx = 0;

	hwcpu = get_hard_smp_processor_id(event->cpu);
	addr = __pa(dtl->buf);

	ret = register_dtl(hwcpu, addr);
	if (ret) {
		pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n",
			event->cpu, hwcpu, ret);
		return ret;
	}

	/* set our initial buffer indices */
	lppaca_of(event->cpu).dtl_idx = 0;

	/*
	 * Ensure that our updates to the lppaca fields have
	 * occurred before we actually enable the logging
	 */
	smp_wmb();

	/* enable event logging */
	lppaca_of(event->cpu).dtl_enable_mask = event->attr.config;

	vpa_dtl_start_hrtimer(event);

	return 0;
}

static void vpa_dtl_event_del(struct perf_event *event, int flags)
{
	int hwcpu = get_hard_smp_processor_id(event->cpu);
	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);

	vpa_dtl_stop_hrtimer(event);
	unregister_dtl(hwcpu);
	kmem_cache_free(dtl_cache, dtl->buf);
	dtl->buf = NULL;
	lppaca_of(event->cpu).dtl_enable_mask = 0x0;
}

/*
 * This function definition is intentionally empty, as
 * vpa_dtl_dump_sample_data() is what parses and dumps the dispatch
 * trace log data to the perf data file.
 */
static void vpa_dtl_event_read(struct perf_event *event)
{
}

/*
 * Set up pmu-private data structures for an AUX area.
 * **pages contains the AUX buffer allocated for this event
 * on the corresponding cpu. rb_alloc_aux() uses "alloc_pages_node"
 * and returns the address of each page. Map these pages to
 * contiguous space using vmap and use that as the base address.
 *
 * The AUX private data structure, i.e. "struct vpa_pmu_buf",
 * mainly saves:
 * - buf->base: AUX buffer base address
 * - buf->head: offset from the base address where data will be written next
 * - buf->size: size of the allocated memory
 */
static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages,
			       int nr_pages, bool snapshot)
{
	int i, cpu = event->cpu;
	struct vpa_pmu_buf *buf __free(kfree) = NULL;
	struct page **pglist __free(kfree) = NULL;

	/* We need at least one page for this to work. */
	if (!nr_pages)
		return NULL;

	if (cpu == -1)
		cpu = raw_smp_processor_id();

	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
	if (!buf)
		return NULL;

	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
	if (!pglist)
		return NULL;

	for (i = 0; i < nr_pages; ++i)
		pglist[i] = virt_to_page(pages[i]);

	buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!buf->base)
		return NULL;

	buf->nr_pages = nr_pages;
	buf->snapshot = false;

	buf->size = nr_pages << PAGE_SHIFT;
	buf->head = 0;
	buf->head_size = 0;
	buf->boottb_freq_saved = 0;
	buf->threshold = ((buf->size - 32) / sizeof(struct dtl_entry));
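	/* e.g. 16 AUX pages (64 KiB) with 48-byte entries: (65536 - 32) / 48 = 1364 */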
	return no_free_ptr(buf);
}

/*
 * free pmu-private AUX data structures
 */
static void vpa_dtl_free_aux(void *aux)
{
	struct vpa_pmu_buf *buf = aux;

	vunmap(buf->base);
	kfree(buf);
}

static struct pmu vpa_dtl_pmu = {
	.task_ctx_nr = perf_invalid_context,

	.name = "vpa_dtl",
	.attr_groups = attr_groups,
	.event_init = vpa_dtl_event_init,
	.add = vpa_dtl_event_add,
	.del = vpa_dtl_event_del,
	.read = vpa_dtl_event_read,
	.setup_aux = vpa_dtl_setup_aux,
	.free_aux = vpa_dtl_free_aux,
	.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE,
};

static int vpa_dtl_init(void)
{
	int r;

	if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
		pr_debug("not a shared virtualized system, not enabling\n");
		return -ENODEV;
	}

	/* This driver is intended only for the L1 host. */
	if (is_kvm_guest()) {
		pr_debug("Only supported on an L1 host system\n");
		return -ENODEV;
	}

	r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1);
	if (r)
		return r;

	return 0;
}

device_initcall(vpa_dtl_init);
#endif /* CONFIG_PPC_SPLPAR */
| 597 | |