// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
	/* Instruction-Accurate PDIR (PDIR++) */
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
	{}
};

/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
	{}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However, AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This is normally used by the RDPMC instruction.
 *      For instance, the AMD RDPMC instruction uses 0000_0003h in ECX to
 *      access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism,
 *      except that it also supports fixed counters. idx can be used as an
 *      index into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping between pmc and
 *      perf counters is as follows:
 *      * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */
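
/*
 * Illustration (Intel): fixed counter 1 lives in fixed_counters[1], its
 * global PMC index (kvm_pmc.idx) is KVM_FIXED_PMC_BASE_IDX + 1, and the
 * guest reads it via RDPMC with ECX = (1u << 30) | 1 (bit 30 selects the
 * fixed-counter space, the low bits select the counter).
 */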

static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

#define KVM_X86_PMU_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,		     \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

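/*
 * Copy the vendor (Intel/AMD) PMU ops into the global kvm_pmu_ops and patch
 * the corresponding static calls, warning if a mandatory hook is missing.
 */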
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

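/*
 * Record an overflow for the vPMC in the guest's global status and, unless
 * the PMI is being suppressed (e.g. PEBS overflow on an emulated
 * instruction), request PMI injection for the vCPU.
 */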
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM is currently _choosing_ to not generate
			 * records for emulated instructions, avoiding a
			 * BUFFER_OVF PMI when there are no records.  Strictly
			 * speaking, records should be generated for emulated
			 * instructions too, in the right context, to improve
			 * sampling accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}

	if (pmc->intr && !skip_pmi)
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	/*
	 * Ignore asynchronous overflow events for counters that are scheduled
	 * to be reprogrammed, e.g. if a PMI for the previous event races with
	 * KVM's handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
		return;

	__kvm_perf_overflow(pmc, true);

	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
{
	/*
	 * For model-specific PEBS counters with special capabilities
	 * (PDIR, PDIR++, PDIST), KVM raises the event's precise level to the
	 * maximum value (currently 3, for backwards compatibility) so that
	 * the perf subsystem assigns a hardware counter with that capability
	 * to the vPMC.
	 */
	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
		return 3;

	/*
	 * A non-zero precision level turns an ordinary guest event into a
	 * guest PEBS event, which causes the host PEBS PMI handler to
	 * determine whether a PEBS overflow PMI comes from the host counters
	 * or the guest.
	 */
	return 1;
}

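/*
 * The sample period is the number of events remaining until the counter
 * wraps, i.e. the distance from the current counter value to overflow.
 */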
static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
	u64 sample_period = (-counter_value) & pmc_bitmask(pmc);

	if (!sample_period)
		sample_period = pmc_bitmask(pmc) + 1;
	return sample_period;
}

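/*
 * Create a host perf_event to back the vPMC, pinned and counting only while
 * the guest is running, with kvm_perf_overflow() invoked when the programmed
 * sample period elapses.
 */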
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
				 bool exclude_user, bool exclude_kernel,
				 bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    guest_cpuid_is_intel(pmc->vcpu)) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * For most PEBS hardware events, the difference in the software
		 * precision levels of guest and host PEBS events will not affect
		 * the accuracy of the PEBS profiling result, because the "event IP"
		 * in the PEBS record is calibrated on the guest side.
		 */
		attr.precise_ip = pmc_get_pebs_precise_level(pmc);
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
	return 0;
}

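/*
 * Pause the backing perf_event and fold its count, plus any emulated events,
 * into pmc->counter.  Returns true if folding in the emulated count caused
 * the counter to wrap, i.e. if KVM needs to emulate an overflow that the
 * hardware (via perf) did not observe.
 */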
static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;
	u64 prev_counter;

	/* update counter, reset event value to avoid redundant accumulation */
	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_pause(pmc->perf_event, true);

	/*
	 * Snapshot the previous counter *after* accumulating state from perf.
	 * If overflow already happened, hardware (via perf) is responsible for
	 * generating a PMI.  KVM just needs to detect overflow on emulated
	 * counter events that haven't yet been processed.
	 */
	prev_counter = counter & pmc_bitmask(pmc);

	counter += pmc->emulated_counter;
	pmc->counter = counter & pmc_bitmask(pmc);

	pmc->emulated_counter = 0;
	pmc->is_paused = true;

	return pmc->counter < prev_counter;
}

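/*
 * Re-enable the existing perf_event if it still matches the vPMC, i.e.
 * refresh the sample period and verify the PEBS state is unchanged.
 * Returns false if a new perf_event must be created instead.
 */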
static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (is_sampling_event(pmc->perf_event) &&
	    perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
	    (!!pmc->perf_event->attr.precise_ip))
		return false;

	/* Reuse the existing perf_event, as pmc_reprogram_counter() would. */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	return true;
}

static void pmc_release_perf_event(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		perf_event_release_kernel(pmc->perf_event);
		pmc->perf_event = NULL;
		pmc->current_config = 0;
		pmc_to_pmu(pmc)->event_count--;
	}
}

static void pmc_stop_counter(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		pmc->counter = pmc_read_counter(pmc);
		pmc_release_perf_event(pmc);
	}
}

static void pmc_update_sample_period(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event || pmc->is_paused ||
	    !is_sampling_event(pmc->perf_event))
		return;

	perf_event_period(pmc->perf_event,
			  get_sample_period(pmc, pmc->counter));
}

void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
	 * read-modify-write.  Adjust the counter value so that its value is
	 * relative to the current count, as reading the current count from
	 * perf is faster than pausing and reprogramming the event in order to
	 * reset it to '0'.  Note, this very sneakily offsets the accumulated
	 * emulated count too, by using pmc_read_counter()!
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
	pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_GPL(pmc_write_counter);

static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
	u64 a = *(u64 *)pa & mask;
	u64 b = *(u64 *)pb & mask;

	return (a > b) - (a < b);
}

static int filter_sort_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
				   KVM_PMU_MASKED_ENTRY_EXCLUDE));
}

/*
 * For the event filter, searching is done on the 'includes' list and
 * 'excludes' list separately rather than on the 'events' list (which
 * has both).  As a result the exclude bit can be ignored.
 */
static int filter_event_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
}

static int find_filter_index(u64 *events, u64 nevents, u64 key)
{
	u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
			  filter_event_cmp);

	if (!fe)
		return -1;

	return fe - events;
}

static bool is_filter_entry_match(u64 filter_event, u64 umask)
{
	u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
	u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;

	BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
		     (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
		     ARCH_PERFMON_EVENTSEL_UMASK);

	return (umask & mask) == match;
}

static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
{
	u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
	u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
	int i, index;

	index = find_filter_index(events, nevents, event_select);
	if (index < 0)
		return false;

	/*
	 * Entries are sorted by the event select.  Walk the list in both
	 * directions to process all entries with the targeted event select.
	 */
	for (i = index; i < nevents; i++) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	for (i = index - 1; i >= 0; i--) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	return false;
}

static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
				u64 eventsel)
{
	if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
	    !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
		return f->action == KVM_PMU_EVENT_ALLOW;

	return f->action == KVM_PMU_EVENT_DENY;
}

static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
				   int idx)
{
	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

	if (filter->action == KVM_PMU_EVENT_DENY &&
	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;
	if (filter->action == KVM_PMU_EVENT_ALLOW &&
	    !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;

	return true;
}

static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		return true;

	if (pmc_is_gp(pmc))
		return is_gp_event_allowed(filter, pmc->eventsel);

	return is_fixed_event_allowed(filter, pmc->idx);
}

static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
	return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
	       check_pmu_event_filter(pmc);
}

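/*
 * (Re)configure the backing perf_event for the vPMC: fold in pending counts,
 * honor the PMU event filter and enable bits, emulate any pending overflow,
 * and create a new perf_event only if the configuration actually changed.
 */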
static int reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	bool emulate_overflow;
	u8 fixed_ctr_ctrl;

	emulate_overflow = pmc_pause_counter(pmc);

	if (!pmc_event_is_allowed(pmc))
		return 0;

	if (emulate_overflow)
		__kvm_perf_overflow(pmc, false);

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		if (fixed_ctr_ctrl & 0x1)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & 0x2)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & 0x8)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		return 0;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				     (eventsel & pmu->raw_event_mask),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

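/*
 * Process all counters flagged for reprogramming in response to KVM_REQ_PMU,
 * then, if flagged, release perf_events for vPMCs that went unused during
 * the last time slice.
 */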
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int bit;

	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	/*
	 * The reprogramming bitmap can be written asynchronously by something
	 * other than the task that holds vcpu->mutex, take care to clear only
	 * the bits that will actually be processed.
	 */
	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
		/*
		 * If reprogramming fails, e.g. due to contention, re-set the
		 * reprogram bit, i.e. opportunistically try again on the next
		 * PMU refresh.  Don't make a new request as doing so can stall
		 * the guest if reprogramming repeatedly fails.
		 */
		if (reprogram_counter(pmc))
			set_bit(pmc->idx, pmu->reprogram_pmi);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice.  kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
	/*
	 * On Intel, VMX interception has priority over RDPMC exceptions that
	 * aren't already handled by the emulator, i.e. no additional checks
	 * are needed for Intel PMUs.
	 *
	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
	 */
	if (!kvm_pmu_ops.check_rdpmc_early)
		return 0;

	return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

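/*
 * Emulate RDPMC: handle VMware backdoor pseudo-counters, map the guest's ECX
 * value to a vPMC, enforce CR4.PCE/CPL restrictions, and return the counter
 * value masked to its architectural width.
 */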
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_CORE_PERF_GLOBAL_CTRL:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
	default:
		break;
	}
	return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
	       static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;

	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		msr_info->data = pmu->global_status;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
	case MSR_CORE_PERF_GLOBAL_CTRL:
		msr_info->data = pmu->global_ctrl;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		msr_info->data = 0;
		break;
	default:
		return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
	}

	return 0;
}

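/*
 * Handle writes to the architectural global PMU MSRs in common code and
 * forward all other PMU MSR writes to the vendor implementation, marking the
 * targeted vPMC as in use so that its perf_event isn't reclaimed.
 */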
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;
	u64 data = msr_info->data;
	u64 diff;

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		/* Per PPR, Read-only MSR. Writes are ignored. */
		if (!msr_info->host_initiated)
			break;

		if (data & pmu->global_status_mask)
			return 1;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		/*
		 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, clears bits in
		 * GLOBAL_STATUS, and so the set of reserved bits is the same.
		 */
		if (data & pmu->global_status_mask)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;
}

static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	pmu->need_cleanup = false;

	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		pmc_stop_counter(pmc);
		pmc->counter = 0;
		pmc->emulated_counter = 0;

		if (pmc_is_gp(pmc))
			pmc->eventsel = 0;
	}

	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;

	static_call_cond(kvm_x86_pmu_reset)(vcpu);
}

/*
 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
 * and/or PERF_CAPABILITIES.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
		return;

	/*
	 * Stop/release all existing counters/events before realizing the new
	 * vPMU model.
	 */
	kvm_pmu_reset(vcpu);

	pmu->version = 0;
	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;
	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
	pmu->global_ctrl_mask = ~0ull;
	pmu->global_status_mask = ~0ull;
	pmu->fixed_ctr_ctrl_mask = ~0ull;
	pmu->pebs_enable_mask = ~0ull;
	pmu->pebs_data_cfg_mask = ~0ull;
	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

	if (!vcpu->kvm->arch.enable_pmu)
		return;

	static_call(kvm_x86_pmu_refresh)(vcpu);

	/*
	 * At RESET, both Intel and AMD CPUs set all enable bits for general
	 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software written
	 * for v1 PMUs doesn't unknowingly leave GP counters disabled in the
	 * global controls).  Emulate that behavior when refreshing the PMU so
	 * that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
	 */
	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	static_call(kvm_x86_pmu_init)(vcpu);
	kvm_pmu_refresh(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
		if (pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	static_call_cond(kvm_x86_pmu_cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	pmc->emulated_counter++;
	kvm_pmu_request_counter_reprogram(pmc);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config;

	if (pmc_is_gp(pmc)) {
		config = pmc->eventsel;
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	/*
	 * Skip the CPL lookup, which isn't free on Intel, if the result will
	 * be the same regardless of the CPL.
	 */
	if (select_os == select_user)
		return select_os;

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

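/*
 * Increment all enabled and filter-allowed counters that are programmed to
 * count @eventsel at the current CPL, e.g. to account for an instruction or
 * branch that KVM emulated on the guest's behalf.
 */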
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);

	if (!kvm_pmu_has_perf_global_ctrl(pmu))
		bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
	else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
			     (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
		return;

	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
		/*
		 * Ignore checks for edge detect (all events currently emulated
		 * by KVM are always rising edges), pin control (unsupported
		 * by modern CPUs), and counter mask and its invert flag (KVM
		 * doesn't emulate multiple events in a single clock cycle).
		 *
		 * Note, the uppermost nibble of AMD's mask overlaps Intel's
		 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
		 * bits (bits 35:34).  Checking the "in HLE/RTM transaction"
		 * flags is correct as the vCPU can't be in a transaction if
		 * KVM is emulating an instruction.  Checking the reserved bits
		 * might be wrong if they are defined in the future, but so
		 * could ignoring them, so do the simple thing for now.
		 */
		if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
		    !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
			continue;

		kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
{
	u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
		   KVM_PMU_MASKED_ENTRY_UMASK_MASK |
		   KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
		   KVM_PMU_MASKED_ENTRY_EXCLUDE;
	int i;

	for (i = 0; i < filter->nevents; i++) {
		if (filter->events[i] & ~mask)
			return false;
	}

	return true;
}

static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
{
	int i, j;

	for (i = 0, j = 0; i < filter->nevents; i++) {
		/*
		 * Skip events that are impossible to match against a guest
		 * event.  When filtering, only the event select + unit mask
		 * of the guest event is used.  To maintain backwards
		 * compatibility, impossible filters can't be rejected :-(
		 */
		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
					  ARCH_PERFMON_EVENTSEL_UMASK))
			continue;
		/*
		 * Convert userspace events to a common in-kernel event so
		 * only one code path is needed to support both events.  For
		 * the in-kernel events use masked events because they are
		 * flexible enough to handle both cases.  To convert to masked
		 * events all that's needed is to add an "all ones" umask_mask,
		 * (unmasked filter events don't support EXCLUDE).
		 */
		filter->events[j++] = filter->events[i] |
				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
	}

	filter->nevents = j;
}

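/*
 * Normalize the userspace filter into masked-event form, then sort it so the
 * events array splits into an 'includes' list followed by an 'excludes'
 * list, each searchable by event select.
 */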
static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
{
	int i;

	if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
		convert_to_masked_filter(filter);
	else if (!is_masked_filter_valid(filter))
		return -EINVAL;

	/*
	 * Sort entries by event select and includes vs. excludes so that all
	 * entries for a given event select can be processed efficiently during
	 * filtering.  The EXCLUDE flag uses a more significant bit than the
	 * event select, and so the sorted list is also effectively split into
	 * includes and excludes sub-lists.
	 */
	sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
	     filter_sort_cmp, NULL);

	i = filter->nevents;
	/* Find the first EXCLUDE event (only supported for masked events). */
	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
		for (i = 0; i < filter->nevents; i++) {
			if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
				break;
		}
	}

	filter->nr_includes = i;
	filter->nr_excludes = filter->nevents - filter->nr_includes;
	filter->includes = filter->events;
	filter->excludes = filter->events + filter->nr_includes;

	return 0;
}

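/*
 * KVM_SET_PMU_EVENT_FILTER ioctl: validate and install a new PMU event
 * filter for the VM, then force every vCPU to reprogram all of its counters
 * so the filter takes effect immediately.
 */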
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter __user *user_filter = argp;
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm_pmu_event_filter tmp;
	struct kvm_vcpu *vcpu;
	unsigned long i;
	size_t size;
	int r;

	if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	filter->action = tmp.action;
	filter->nevents = tmp.nevents;
	filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
	filter->flags = tmp.flags;

	r = -EFAULT;
	if (copy_from_user(filter->events, user_filter->events,
			   sizeof(filter->events[0]) * filter->nevents))
		goto cleanup;

	r = prepare_filter_lists(filter);
	if (r)
		goto cleanup;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);
	synchronize_srcu_expedited(&kvm->srcu);

	BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
		     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

	kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);

	r = 0;
cleanup:
	kfree(filter);
	return r;
}