cpuid.c source code [linux/arch/x86/kvm/cpuid.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Kernel-based Virtual Machine driver for Linux
4	* cpuid support routines
5	*
6	* derived from arch/x86/kvm/x86.c
7	*
8	* Copyright 2011 Red Hat, Inc. and/or its affiliates.
9	* Copyright IBM Corporation, 2008
10	*/
11	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13	#include <linux/kvm_host.h>
14	#include "linux/lockdep.h"
15	#include <linux/export.h>
16	#include <linux/vmalloc.h>
17	#include <linux/uaccess.h>
18	#include <linux/sched/stat.h>
19
20	#include <asm/processor.h>
21	#include <asm/user.h>
22	#include <asm/fpu/xstate.h>
23	#include <asm/sgx.h>
24	#include <asm/cpuid/api.h>
25	#include "cpuid.h"
26	#include "lapic.h"
27	#include "mmu.h"
28	#include "trace.h"
29	#include "pmu.h"
30	#include "xen.h"
31
32	/*
33	* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
34	* aligned to sizeof(unsigned long) because it's not accessed via bitops.
35	*/
36	u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
37	EXPORT_SYMBOL_GPL(kvm_cpu_caps);
38
39	struct cpuid_xstate_sizes {
40	u32 eax;
41	u32 ebx;
42	u32 ecx;
43	};
44
45	static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init;
46
47	void __init kvm_init_xstate_sizes(void)
48	{
49	u32 ign;
50	int i;
51
52	for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) {
53	struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
54
55	cpuid_count(op: `0xD`, count: i, eax: &xs->eax, ebx: &xs->ebx, ecx: &xs->ecx, edx: &ign);
56	}
57	}
58
59	u32 xstate_required_size(u64 xstate_bv, bool compacted)
60	{
61	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
62	int i;
63
64	xstate_bv &= XFEATURE_MASK_EXTEND;
65	for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) {
66	struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
67	u32 offset;
68
69	if (!(xstate_bv & BIT_ULL(i)))
70	continue;
71
72	/ ECX[1]: 64B alignment in compacted form /
73	if (compacted)
74	offset = (xs->ecx & `0x2`) ? ALIGN(ret, `64`) : ret;
75	else
76	offset = xs->ebx;
77	ret = max(ret, offset + xs->eax);
78	xstate_bv &= ~BIT_ULL(i);
79	}
80
81	return ret;
82	}
83
84	struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(
85	struct kvm_cpuid_entry2 entries, int* nent, u32 function, u64 index)
86	{
87	struct kvm_cpuid_entry2 *e;
88	int i;
89
90	/*
91	* KVM has a semi-arbitrary rule that querying the guest's CPUID model
92	* with IRQs disabled is disallowed. The CPUID model can legitimately
93	* have over one hundred entries, i.e. the lookup is slow, and IRQs are
94	* typically disabled in KVM only when KVM is in a performance critical
95	* path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break
96	* if this rule is violated, this assertion is purely to flag potential
97	* performance issues. If this fires, consider moving the lookup out
98	* of the hotpath, e.g. by caching information during CPUID updates.
99	*/
100	lockdep_assert_irqs_enabled();
101
102	for (i = `0`; i < nent; i++) {
103	e = &entries[i];
104
105	if (e->function != function)
106	continue;
107
108	/*
109	* If the index isn't significant, use the first entry with a
110	* matching function. It's userspace's responsibility to not
111	* provide "duplicate" entries in all cases.
112	*/
113	if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) \|\| e->index == index)
114	return e;
115
116
117	/*
118	* Similarly, use the first matching entry if KVM is doing a
119	* lookup (as opposed to emulating CPUID) for a function that's
120	* architecturally defined as not having a significant index.
121	*/
122	if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) {
123	/*
124	* Direct lookups from KVM should not diverge from what
125	* KVM defines internally (the architectural behavior).
126	*/
127	WARN_ON_ONCE(cpuid_function_is_indexed(function));
128	return e;
129	}
130	}
131
132	return NULL;
133	}
134	EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2);
135
136	static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
137	{
138	struct kvm_cpuid_entry2 *best;
139	u64 xfeatures;
140
141	/*
142	* The existing code assumes virtual address is 48-bit or 57-bit in the
143	* canonical address checks; exit if it is ever changed.
144	*/
145	best = kvm_find_cpuid_entry(vcpu, function: `0x80000008`);
146	if (best) {
147	int vaddr_bits = (best->eax & `0xff00`) >> `8`;
148
149	if (vaddr_bits != `48` && vaddr_bits != `57` && vaddr_bits != `0`)
150	return -EINVAL;
151	}
152
153	/*
154	* Exposing dynamic xfeatures to the guest requires additional
155	* enabling in the FPU, e.g. to expand the guest XSAVE state size.
156	*/
157	best = kvm_find_cpuid_entry_index(vcpu, function: `0xd`, index: `0`);
158	if (!best)
159	return `0`;
160
161	xfeatures = best->eax \| ((u64)best->edx << `32`);
162	xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
163	if (!xfeatures)
164	return `0`;
165
166	return fpu_enable_guest_xfd_features(guest_fpu: &vcpu->arch.guest_fpu, xfeatures);
167	}
168
169	static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu);
170	static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
171
172	/ Check whether the supplied CPUID data is equal to what is already set for the vCPU. /
173	static int kvm_cpuid_check_equal(struct kvm_vcpu vcpu, struct* kvm_cpuid_entry2 *e2,
174	int nent)
175	{
176	struct kvm_cpuid_entry2 *orig;
177	int i;
178
179	/*
180	* Apply runtime CPUID updates to the incoming CPUID entries to avoid
181	* false positives due mismatches on KVM-owned feature flags.
182	*
183	* Note! @e2 and @nent track the _old_ CPUID entries!
184	*/
185	kvm_update_cpuid_runtime(vcpu);
186	kvm_apply_cpuid_pv_features_quirk(vcpu);
187
188	if (nent != vcpu->arch.cpuid_nent)
189	return -EINVAL;
190
191	for (i = `0`; i < nent; i++) {
192	orig = &vcpu->arch.cpuid_entries[i];
193	if (e2[i].function != orig->function \|\|
194	e2[i].index != orig->index \|\|
195	e2[i].flags != orig->flags \|\|
196	e2[i].eax != orig->eax \|\| e2[i].ebx != orig->ebx \|\|
197	e2[i].ecx != orig->ecx \|\| e2[i].edx != orig->edx)
198	return -EINVAL;
199	}
200
201	return `0`;
202	}
203
204	static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
205	const char *sig)
206	{
207	struct kvm_hypervisor_cpuid cpuid = {};
208	struct kvm_cpuid_entry2 *entry;
209	u32 base;
210
211	for_each_possible_cpuid_base_hypervisor(base) {
212	entry = kvm_find_cpuid_entry(vcpu, function: base);
213
214	if (entry) {
215	u32 signature[`3`];
216
217	signature[`0`] = entry->ebx;
218	signature[`1`] = entry->ecx;
219	signature[`2`] = entry->edx;
220
221	if (!memcmp(p: signature, q: sig, size: sizeof(signature))) {
222	cpuid.base = base;
223	cpuid.limit = entry->eax;
224	break;
225	}
226	}
227	}
228
229	return cpuid;
230	}
231
232	static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu)
233	{
234	struct kvm_hypervisor_cpuid kvm_cpuid;
235	struct kvm_cpuid_entry2 *best;
236
237	kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
238	if (!kvm_cpuid.base)
239	return `0`;
240
241	best = kvm_find_cpuid_entry(vcpu, function: kvm_cpuid.base \| KVM_CPUID_FEATURES);
242	if (!best)
243	return `0`;
244
245	if (kvm_hlt_in_guest(kvm: vcpu->kvm))
246	best->eax &= ~(`1` << KVM_FEATURE_PV_UNHALT);
247
248	return best->eax;
249	}
250
251	/*
252	* Calculate guest's supported XCR0 taking into account guest CPUID data and
253	* KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
254	*/
255	static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu)
256	{
257	struct kvm_cpuid_entry2 *best;
258
259	best = kvm_find_cpuid_entry_index(vcpu, function: `0xd`, index: `0`);
260	if (!best)
261	return `0`;
262
263	return (best->eax \| ((u64)best->edx << `32`)) & kvm_caps.supported_xcr0;
264	}
265
266	static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu,
267	struct kvm_cpuid_entry2 *entry,
268	unsigned int x86_feature,
269	bool has_feature)
270	{
271	cpuid_entry_change(entry, x86_feature, set: has_feature);
272	guest_cpu_cap_change(vcpu, x86_feature, guest_has_cap: has_feature);
273	}
274
275	static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
276	{
277	struct kvm_cpuid_entry2 *best;
278
279	vcpu->arch.cpuid_dynamic_bits_dirty = false;
280
281	best = kvm_find_cpuid_entry(vcpu, function: `1`);
282	if (best) {
283	kvm_update_feature_runtime(vcpu, entry: best, X86_FEATURE_OSXSAVE,
284	has_feature: kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE));
285
286	kvm_update_feature_runtime(vcpu, entry: best, X86_FEATURE_APIC,
287	has_feature: vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
288
289	if (!kvm_check_has_quirk(kvm: vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
290	kvm_update_feature_runtime(vcpu, entry: best, X86_FEATURE_MWAIT,
291	has_feature: vcpu->arch.ia32_misc_enable_msr &
292	MSR_IA32_MISC_ENABLE_MWAIT);
293	}
294
295	best = kvm_find_cpuid_entry_index(vcpu, function: `7`, index: `0`);
296	if (best)
297	kvm_update_feature_runtime(vcpu, entry: best, X86_FEATURE_OSPKE,
298	has_feature: kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
299
300
301	best = kvm_find_cpuid_entry_index(vcpu, function: `0xD`, index: `0`);
302	if (best)
303	best->ebx = xstate_required_size(xstate_bv: vcpu->arch.xcr0, compacted: false);
304
305	best = kvm_find_cpuid_entry_index(vcpu, function: `0xD`, index: `1`);
306	if (best && (cpuid_entry_has(entry: best, X86_FEATURE_XSAVES) \|\|
307	cpuid_entry_has(entry: best, X86_FEATURE_XSAVEC)))
308	best->ebx = xstate_required_size(xstate_bv: vcpu->arch.xcr0, compacted: true);
309	}
310
311	static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu)
312	{
313	#ifdef CONFIG_KVM_HYPERV
314	struct kvm_cpuid_entry2 *entry;
315
316	entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE);
317	return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
318	#else
319	return false;
320	#endif
321	}
322
323	static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
324	{
325	struct kvm_cpuid_entry2 *entry;
326
327	entry = kvm_find_cpuid_entry(vcpu, function: `0`);
328	if (!entry)
329	return false;
330
331	return is_guest_vendor_amd(ebx: entry->ebx, ecx: entry->ecx, edx: entry->edx) \|\|
332	is_guest_vendor_hygon(ebx: entry->ebx, ecx: entry->ecx, edx: entry->edx);
333	}
334
335	/*
336	* This isn't truly "unsafe", but except for the cpu_caps initialization code,
337	* all register lookups should use __cpuid_entry_get_reg(), which provides
338	* compile-time validation of the input.
339	*/
340	static u32 cpuid_get_reg_unsafe(struct kvm_cpuid_entry2 *entry, u32 reg)
341	{
342	switch (reg) {
343	case CPUID_EAX:
344	return entry->eax;
345	case CPUID_EBX:
346	return entry->ebx;
347	case CPUID_ECX:
348	return entry->ecx;
349	case CPUID_EDX:
350	return entry->edx;
351	default:
352	WARN_ON_ONCE(`1`);
353	return `0`;
354	}
355	}
356
357	static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func,
358	bool include_partially_emulated);
359
360	void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
361	{
362	struct kvm_lapic *apic = vcpu->arch.apic;
363	struct kvm_cpuid_entry2 *best;
364	struct kvm_cpuid_entry2 *entry;
365	bool allow_gbpages;
366	int i;
367
368	memset(vcpu->arch.cpu_caps, `0`, sizeof(vcpu->arch.cpu_caps));
369	BUILD_BUG_ON(ARRAY_SIZE(reverse_cpuid) != NR_KVM_CPU_CAPS);
370
371	/*
372	* Reset guest capabilities to userspace's guest CPUID definition, i.e.
373	* honor userspace's definition for features that don't require KVM or
374	* hardware management/support (or that KVM simply doesn't care about).
375	*/
376	for (i = `0`; i < NR_KVM_CPU_CAPS; i++) {
377	const struct cpuid_reg cpuid = reverse_cpuid[i];
378	struct kvm_cpuid_entry2 emulated;
379
380	if (!cpuid.function)
381	continue;
382
383	entry = kvm_find_cpuid_entry_index(vcpu, function: cpuid.function, index: cpuid.index);
384	if (!entry)
385	continue;
386
387	cpuid_func_emulated(entry: &emulated, func: cpuid.function, include_partially_emulated: true);
388
389	/*
390	* A vCPU has a feature if it's supported by KVM and is enabled
391	* in guest CPUID. Note, this includes features that are
392	* supported by KVM but aren't advertised to userspace!
393	*/
394	vcpu->arch.cpu_caps[i] = kvm_cpu_caps[i] \|
395	cpuid_get_reg_unsafe(entry: &emulated, reg: cpuid.reg);
396	vcpu->arch.cpu_caps[i] &= cpuid_get_reg_unsafe(entry, reg: cpuid.reg);
397	}
398
399	kvm_update_cpuid_runtime(vcpu);
400
401	/*
402	* If TDP is enabled, let the guest use GBPAGES if they're supported in
403	* hardware. The hardware page walker doesn't let KVM disable GBPAGES,
404	* i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
405	* walk for performance and complexity reasons. Not to mention KVM
406	* _can't_ solve the problem because GVA->GPA walks aren't visible to
407	* KVM once a TDP translation is installed. Mimic hardware behavior so
408	* that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
409	* If TDP is disabled, honor only guest CPUID as KVM has full control
410	* and can install smaller shadow pages if the host lacks 1GiB support.
411	*/
412	allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
413	guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES);
414	guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, guest_has_cap: allow_gbpages);
415
416	best = kvm_find_cpuid_entry(vcpu, function: `1`);
417	if (best && apic) {
418	if (cpuid_entry_has(entry: best, X86_FEATURE_TSC_DEADLINE_TIMER))
419	apic->lapic_timer.timer_mode_mask = `3` << `17`;
420	else
421	apic->lapic_timer.timer_mode_mask = `1` << `17`;
422
423	kvm_apic_set_version(vcpu);
424	}
425
426	vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu);
427
428	vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu);
429
430	vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu);
431	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
432	vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
433
434	kvm_pmu_refresh(vcpu);
435
436	#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
437	vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) \|
438	__cr4_reserved_bits(guest_cpu_cap_has, vcpu);
439	#undef __kvm_cpu_cap_has
440
441	kvm_hv_set_cpuid(vcpu, hyperv_enabled: kvm_cpuid_has_hyperv(vcpu));
442
443	/ Invoke the vendor callback only after the above state is updated. /
444	kvm_x86_call(vcpu_after_set_cpuid)(vcpu);
445
446	/*
447	* Except for the MMU, which needs to do its thing any vendor specific
448	* adjustments to the reserved GPA bits.
449	*/
450	kvm_mmu_after_set_cpuid(vcpu);
451	}
452
453	int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
454	{
455	struct kvm_cpuid_entry2 *best;
456
457	best = kvm_find_cpuid_entry(vcpu, function: `0x80000000`);
458	if (!best \|\| best->eax < `0x80000008`)
459	goto not_found;
460	best = kvm_find_cpuid_entry(vcpu, function: `0x80000008`);
461	if (best)
462	return best->eax & `0xff`;
463	not_found:
464	return `36`;
465	}
466
467	int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu)
468	{
469	struct kvm_cpuid_entry2 *best;
470
471	best = kvm_find_cpuid_entry(vcpu, function: `0x80000000`);
472	if (!best \|\| best->eax < `0x80000008`)
473	goto not_found;
474	best = kvm_find_cpuid_entry(vcpu, function: `0x80000008`);
475	if (best)
476	return (best->eax >> `16`) & `0xff`;
477	not_found:
478	return `0`;
479	}
480
481	/*
482	* This "raw" version returns the reserved GPA bits without any adjustments for
483	* encryption technologies that usurp bits. The raw mask should be used if and
484	* only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
485	*/
486	u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
487	{
488	return rsvd_bits(s: cpuid_maxphyaddr(vcpu), e: `63`);
489	}
490
491	static int kvm_set_cpuid(struct kvm_vcpu vcpu, struct* kvm_cpuid_entry2 *e2,
492	int nent)
493	{
494	u32 vcpu_caps[NR_KVM_CPU_CAPS];
495	int r;
496
497	/*
498	* Swap the existing (old) entries with the incoming (new) entries in
499	* order to massage the new entries, e.g. to account for dynamic bits
500	* that KVM controls, without clobbering the current guest CPUID, which
501	* KVM needs to preserve in order to unwind on failure.
502	*
503	* Similarly, save the vCPU's current cpu_caps so that the capabilities
504	* can be updated alongside the CPUID entries when performing runtime
505	* updates. Full initialization is done if and only if the vCPU hasn't
506	* run, i.e. only if userspace is potentially changing CPUID features.
507	*/
508	swap(vcpu->arch.cpuid_entries, e2);
509	swap(vcpu->arch.cpuid_nent, nent);
510
511	memcpy(vcpu_caps, vcpu->arch.cpu_caps, sizeof(vcpu_caps));
512	BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps));
513
514	/*
515	* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
516	* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
517	* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
518	* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
519	* the core vCPU model on the fly. It would've been better to forbid any
520	* KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
521	* some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
522	* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
523	* whether the supplied CPUID data is equal to what's already set.
524	*/
525	if (kvm_vcpu_has_run(vcpu)) {
526	r = kvm_cpuid_check_equal(vcpu, e2, nent);
527	if (r)
528	goto err;
529	goto success;
530	}
531
532	#ifdef CONFIG_KVM_HYPERV
533	if (kvm_cpuid_has_hyperv(vcpu)) {
534	r = kvm_hv_vcpu_init(vcpu);
535	if (r)
536	goto err;
537	}
538	#endif
539
540	r = kvm_check_cpuid(vcpu);
541	if (r)
542	goto err;
543
544	#ifdef CONFIG_KVM_XEN
545	vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
546	#endif
547	kvm_vcpu_after_set_cpuid(vcpu);
548
549	success:
550	kvfree(addr: e2);
551	return `0`;
552
553	err:
554	memcpy(vcpu->arch.cpu_caps, vcpu_caps, sizeof(vcpu_caps));
555	swap(vcpu->arch.cpuid_entries, e2);
556	swap(vcpu->arch.cpuid_nent, nent);
557	return r;
558	}
559
560	/ when an old userspace process fills a new kernel module /
561	int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
562	struct kvm_cpuid *cpuid,
563	struct kvm_cpuid_entry __user *entries)
564	{
565	int r, i;
566	struct kvm_cpuid_entry *e = NULL;
567	struct kvm_cpuid_entry2 *e2 = NULL;
568
569	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
570	return -E2BIG;
571
572	if (cpuid->nent) {
573	e = vmemdup_array_user(src: entries, n: cpuid->nent, size: sizeof(*e));
574	if (IS_ERR(ptr: e))
575	return PTR_ERR(ptr: e);
576
577	e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
578	if (!e2) {
579	r = -ENOMEM;
580	goto out_free_cpuid;
581	}
582	}
583	for (i = `0`; i < cpuid->nent; i++) {
584	e2[i].function = e[i].function;
585	e2[i].eax = e[i].eax;
586	e2[i].ebx = e[i].ebx;
587	e2[i].ecx = e[i].ecx;
588	e2[i].edx = e[i].edx;
589	e2[i].index = `0`;
590	e2[i].flags = `0`;
591	e2[i].padding[`0`] = `0`;
592	e2[i].padding[`1`] = `0`;
593	e2[i].padding[`2`] = `0`;
594	}
595
596	r = kvm_set_cpuid(vcpu, e2, nent: cpuid->nent);
597	if (r)
598	kvfree(addr: e2);
599
600	out_free_cpuid:
601	kvfree(addr: e);
602
603	return r;
604	}
605
606	int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
607	struct kvm_cpuid2 *cpuid,
608	struct kvm_cpuid_entry2 __user *entries)
609	{
610	struct kvm_cpuid_entry2 *e2 = NULL;
611	int r;
612
613	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
614	return -E2BIG;
615
616	if (cpuid->nent) {
617	e2 = vmemdup_array_user(src: entries, n: cpuid->nent, size: sizeof(*e2));
618	if (IS_ERR(ptr: e2))
619	return PTR_ERR(ptr: e2);
620	}
621
622	r = kvm_set_cpuid(vcpu, e2, nent: cpuid->nent);
623	if (r)
624	kvfree(addr: e2);
625
626	return r;
627	}
628
629	int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
630	struct kvm_cpuid2 *cpuid,
631	struct kvm_cpuid_entry2 __user *entries)
632	{
633	if (cpuid->nent < vcpu->arch.cpuid_nent)
634	return -E2BIG;
635
636	if (vcpu->arch.cpuid_dynamic_bits_dirty)
637	kvm_update_cpuid_runtime(vcpu);
638
639	if (copy_to_user(to: entries, from: vcpu->arch.cpuid_entries,
640	n: vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
641	return -EFAULT;
642
643	cpuid->nent = vcpu->arch.cpuid_nent;
644	return `0`;
645	}
646
647	static __always_inline u32 raw_cpuid_get(struct cpuid_reg cpuid)
648	{
649	struct kvm_cpuid_entry2 entry;
650	u32 base;
651
652	/*
653	* KVM only supports features defined by Intel (0x0), AMD (0x80000000),
654	* and Centaur (0xc0000000). WARN if a feature for new vendor base is
655	* defined, as this and other code would need to be updated.
656	*/
657	base = cpuid.function & `0xffff0000`;
658	if (WARN_ON_ONCE(base && base != `0x80000000` && base != `0xc0000000`))
659	return `0`;
660
661	if (cpuid_eax(op: base) < cpuid.function)
662	return `0`;
663
664	cpuid_count(op: cpuid.function, count: cpuid.index,
665	eax: &entry.eax, ebx: &entry.ebx, ecx: &entry.ecx, edx: &entry.edx);
666
667	return *__cpuid_entry_get_reg(entry: &entry, reg: cpuid.reg);
668	}
669
/*
 * For kernel-defined leafs, mask KVM's supported feature set with the kernel's
 * capabilities as well as raw CPUID.  For KVM-defined leafs, consult only raw
 * CPUID, as KVM is the one and only authority (in the kernel).
 *
 * @feature_initializers is a list of F()-style macro invocations that
 * accumulate bits into the local kvm_cpu_cap_* variables.  The final value
 * of kvm_cpu_caps[leaf] is computed in this order:
 *  1. start from the declared features;
 *  2. for leafs below NCAPINTS, AND with the kernel's capabilities;
 *  3. OR in passthrough features;
 *  4. AND with (raw CPUID | synthesized features);
 *  5. OR in emulated features.
 */
#define kvm_cpu_cap_init(leaf, feature_initializers...)			\
do {									\
	const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);	\
	const u32 __maybe_unused kvm_cpu_cap_init_in_progress = leaf;	\
	const u32 *kernel_cpu_caps = boot_cpu_data.x86_capability;	\
	u32 kvm_cpu_cap_passthrough = 0;				\
	u32 kvm_cpu_cap_synthesized = 0;				\
	u32 kvm_cpu_cap_emulated = 0;					\
	u32 kvm_cpu_cap_features = 0;					\
									\
	feature_initializers						\
									\
	kvm_cpu_caps[leaf] = kvm_cpu_cap_features;			\
									\
	if (leaf < NCAPINTS)						\
		kvm_cpu_caps[leaf] &= kernel_cpu_caps[leaf];		\
									\
	kvm_cpu_caps[leaf] |= kvm_cpu_cap_passthrough;			\
	kvm_cpu_caps[leaf] &= (raw_cpuid_get(cpuid) |			\
			       kvm_cpu_cap_synthesized);		\
	kvm_cpu_caps[leaf] |= kvm_cpu_cap_emulated;			\
} while (0)
697
/*
 * Assert that the feature bit being declared, e.g. via F(), is in the CPUID
 * word that's being initialized.  0x8000_0001.EDX usage of 0x1.EDX features,
 * which AMD duplicated into 0x8000_0001.EDX, is exempted by declaring those
 * features with ALIASED_1_EDX_F(), which doesn't invoke this check.
 */
#define KVM_VALIDATE_CPU_CAP_USAGE(name)				\
do {									\
	u32 __leaf = __feature_leaf(X86_FEATURE_##name);		\
									\
	BUILD_BUG_ON(__leaf != kvm_cpu_cap_init_in_progress);		\
} while (0)
709
/*
 * Feature - Declare a feature for the leaf being initialized by
 * kvm_cpu_cap_init(), after validating that it belongs to that leaf.
 */
#define F(name)							\
({								\
	KVM_VALIDATE_CPU_CAP_USAGE(name);			\
	kvm_cpu_cap_features |= feature_bit(name);		\
})
715
/*
 * Scattered Flag - For features that are scattered by cpufeatures.h.  The
 * feature is declared only if the boot CPU has it.
 */
#define SCATTERED_F(name)					\
({								\
	BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES);	\
	KVM_VALIDATE_CPU_CAP_USAGE(name);			\
	if (boot_cpu_has(X86_FEATURE_##name))			\
		F(name);					\
})
724
/* Features that KVM supports only on 64-bit kernels. */
#define X86_64_F(name)						\
({								\
	KVM_VALIDATE_CPU_CAP_USAGE(name);			\
	if (IS_ENABLED(CONFIG_X86_64))				\
		F(name);					\
})
732
/*
 * Emulated Feature - For features that KVM emulates in software irrespective
 * of host CPU/kernel support.  Emulated bits are OR'd into the leaf after
 * all masking in kvm_cpu_cap_init().
 */
#define EMULATED_F(name)					\
({								\
	kvm_cpu_cap_emulated |= feature_bit(name);		\
	F(name);						\
})
742
/*
 * Synthesized Feature - For features that are synthesized into boot_cpu_data,
 * i.e. may not be present in the raw CPUID, but can still be advertised to
 * userspace.  Primarily used for mitigation related feature flags.
 */
#define SYNTHESIZED_F(name)					\
({								\
	kvm_cpu_cap_synthesized |= feature_bit(name);		\
	F(name);						\
})
753
/*
 * Passthrough Feature - For features that KVM supports based purely on raw
 * hardware CPUID, i.e. that KVM virtualizes even if the host kernel doesn't
 * use the feature.  Simply force set the feature in KVM's capabilities, raw
 * CPUID support will be factored in by kvm_cpu_cap_init().
 */
#define PASSTHROUGH_F(name)					\
({								\
	kvm_cpu_cap_passthrough |= feature_bit(name);		\
	F(name);						\
})
765
/*
 * Aliased Features - For features in 0x8000_0001.EDX that are duplicates of
 * identical 0x1.EDX features, and thus are aliased from 0x1 to 0x8000_0001.
 * Usable only while CPUID_8000_0001_EDX is being initialized, and only for
 * features whose leaf is CPUID_1_EDX.
 */
#define ALIASED_1_EDX_F(name)							\
({										\
	BUILD_BUG_ON(__feature_leaf(X86_FEATURE_##name) != CPUID_1_EDX);	\
	BUILD_BUG_ON(kvm_cpu_cap_init_in_progress != CPUID_8000_0001_EDX);	\
	kvm_cpu_cap_features |= feature_bit(name);				\
})
776
/*
 * Vendor Features - For features that KVM supports, but are added in later
 * because they require additional vendor enabling.  Only validates the leaf;
 * no capability bit is set here.
 */
#define VENDOR_F(name)						\
({								\
	KVM_VALIDATE_CPU_CAP_USAGE(name);			\
})
785
/*
 * Runtime Features - For features that KVM dynamically sets/clears at runtime,
 * e.g. when CR4 changes, but which are never advertised to userspace.  Only
 * validates the leaf; no capability bit is set here.
 */
#define RUNTIME_F(name)						\
({								\
	KVM_VALIDATE_CPU_CAP_USAGE(name);			\
})
794
/*
 * Undefine the MSR bit macro to avoid token concatenation issues when
 * processing X86_FEATURE_SPEC_CTRL_SSBD.
 */
#undef SPEC_CTRL_SSBD

/* DS is defined by ptrace-abi.h on 32-bit builds. */
#undef DS
803
804	void kvm_set_cpu_caps(void)
805	{
806	memset(kvm_cpu_caps, `0`, sizeof(kvm_cpu_caps));
807
808	BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
809	sizeof(boot_cpu_data.x86_capability));
810
811	kvm_cpu_cap_init(CPUID_1_ECX,
812	F(XMM3),
813	F(PCLMULQDQ),
814	VENDOR_F(DTES64),
815	/*
816	* NOTE: MONITOR (and MWAIT) are emulated as NOP, but not
817	* advertised to guests via CPUID! MWAIT is also technically a
818	* runtime flag thanks to IA32_MISC_ENABLES; mark it as such so
819	* that KVM is aware that it's a known, unadvertised flag.
820	*/
821	RUNTIME_F(MWAIT),
822	/ DS-CPL /
823	VENDOR_F(VMX),
824	/ SMX, EST /
825	/ TM2 /
826	F(SSSE3),
827	/ CNXT-ID /
828	/ Reserved /
829	F(FMA),
830	F(CX16),
831	/ xTPR Update /
832	F(PDCM),
833	F(PCID),
834	/ Reserved, DCA /
835	F(XMM4_1),
836	F(XMM4_2),
837	EMULATED_F(X2APIC),
838	F(MOVBE),
839	F(POPCNT),
840	EMULATED_F(TSC_DEADLINE_TIMER),
841	F(AES),
842	F(XSAVE),
843	RUNTIME_F(OSXSAVE),
844	F(AVX),
845	F(F16C),
846	F(RDRAND),
847	EMULATED_F(HYPERVISOR),
848	);
849
850	kvm_cpu_cap_init(CPUID_1_EDX,
851	F(FPU),
852	F(VME),
853	F(DE),
854	F(PSE),
855	F(TSC),
856	F(MSR),
857	F(PAE),
858	F(MCE),
859	F(CX8),
860	F(APIC),
861	/ Reserved /
862	F(SEP),
863	F(MTRR),
864	F(PGE),
865	F(MCA),
866	F(CMOV),
867	F(PAT),
868	F(PSE36),
869	/ PSN /
870	F(CLFLUSH),
871	/ Reserved /
872	VENDOR_F(DS),
873	/ ACPI /
874	F(MMX),
875	F(FXSR),
876	F(XMM),
877	F(XMM2),
878	F(SELFSNOOP),
879	/ HTT, TM, Reserved, PBE /
880	);
881
882	kvm_cpu_cap_init(CPUID_7_0_EBX,
883	F(FSGSBASE),
884	EMULATED_F(TSC_ADJUST),
885	F(SGX),
886	F(BMI1),
887	F(HLE),
888	F(AVX2),
889	F(FDP_EXCPTN_ONLY),
890	F(SMEP),
891	F(BMI2),
892	F(ERMS),
893	F(INVPCID),
894	F(RTM),
895	F(ZERO_FCS_FDS),
896	VENDOR_F(MPX),
897	F(AVX512F),
898	F(AVX512DQ),
899	F(RDSEED),
900	F(ADX),
901	F(SMAP),
902	F(AVX512IFMA),
903	F(CLFLUSHOPT),
904	F(CLWB),
905	VENDOR_F(INTEL_PT),
906	F(AVX512PF),
907	F(AVX512ER),
908	F(AVX512CD),
909	F(SHA_NI),
910	F(AVX512BW),
911	F(AVX512VL),
912	);
913
914	kvm_cpu_cap_init(CPUID_7_ECX,
915	F(AVX512VBMI),
916	PASSTHROUGH_F(LA57),
917	F(PKU),
918	RUNTIME_F(OSPKE),
919	F(RDPID),
920	F(AVX512_VPOPCNTDQ),
921	F(UMIP),
922	F(AVX512_VBMI2),
923	F(GFNI),
924	F(VAES),
925	F(VPCLMULQDQ),
926	F(AVX512_VNNI),
927	F(AVX512_BITALG),
928	F(CLDEMOTE),
929	F(MOVDIRI),
930	F(MOVDIR64B),
931	VENDOR_F(WAITPKG),
932	F(SGX_LC),
933	F(BUS_LOCK_DETECT),
934	);
935
936	/*
937	* PKU not yet implemented for shadow paging and requires OSPKE
938	* to be set on the host. Clear it if that is not the case
939	*/
940	if (!tdp_enabled \|\| !boot_cpu_has(X86_FEATURE_OSPKE))
941	kvm_cpu_cap_clear(X86_FEATURE_PKU);
942
943	kvm_cpu_cap_init(CPUID_7_EDX,
944	F(AVX512_4VNNIW),
945	F(AVX512_4FMAPS),
946	F(SPEC_CTRL),
947	F(SPEC_CTRL_SSBD),
948	EMULATED_F(ARCH_CAPABILITIES),
949	F(INTEL_STIBP),
950	F(MD_CLEAR),
951	F(AVX512_VP2INTERSECT),
952	F(FSRM),
953	F(SERIALIZE),
954	F(TSXLDTRK),
955	F(AVX512_FP16),
956	F(AMX_TILE),
957	F(AMX_INT8),
958	F(AMX_BF16),
959	F(FLUSH_L1D),
960	);
961
962	if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) &&
963	boot_cpu_has(X86_FEATURE_AMD_IBPB) &&
964	boot_cpu_has(X86_FEATURE_AMD_IBRS))
965	kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
966	if (boot_cpu_has(X86_FEATURE_STIBP))
967	kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
968	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
969	kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
970
971	kvm_cpu_cap_init(CPUID_7_1_EAX,
972	F(SHA512),
973	F(SM3),
974	F(SM4),
975	F(AVX_VNNI),
976	F(AVX512_BF16),
977	F(CMPCCXADD),
978	F(FZRM),
979	F(FSRS),
980	F(FSRC),
981	F(WRMSRNS),
982	F(AMX_FP16),
983	F(AVX_IFMA),
984	F(LAM),
985	);
986
987	kvm_cpu_cap_init(CPUID_7_1_EDX,
988	F(AVX_VNNI_INT8),
989	F(AVX_NE_CONVERT),
990	F(AMX_COMPLEX),
991	F(AVX_VNNI_INT16),
992	F(PREFETCHITI),
993	F(AVX10),
994	);
995
996	kvm_cpu_cap_init(CPUID_7_2_EDX,
997	F(INTEL_PSFD),
998	F(IPRED_CTRL),
999	F(RRSBA_CTRL),
1000	F(DDPD_U),
1001	F(BHI_CTRL),
1002	F(MCDT_NO),
1003	);
1004
1005	kvm_cpu_cap_init(CPUID_D_1_EAX,
1006	F(XSAVEOPT),
1007	F(XSAVEC),
1008	F(XGETBV1),
1009	F(XSAVES),
1010	X86_64_F(XFD),
1011	);
1012
1013	kvm_cpu_cap_init(CPUID_12_EAX,
1014	SCATTERED_F(SGX1),
1015	SCATTERED_F(SGX2),
1016	SCATTERED_F(SGX_EDECCSSA),
1017	);
1018
1019	kvm_cpu_cap_init(CPUID_24_0_EBX,
1020	F(AVX10_128),
1021	F(AVX10_256),
1022	F(AVX10_512),
1023	);
1024
1025	kvm_cpu_cap_init(CPUID_8000_0001_ECX,
1026	F(LAHF_LM),
1027	F(CMP_LEGACY),
1028	VENDOR_F(SVM),
1029	/ ExtApicSpace /
1030	F(CR8_LEGACY),
1031	F(ABM),
1032	F(SSE4A),
1033	F(MISALIGNSSE),
1034	F(`3DNOWPREFETCH`),
1035	F(OSVW),
1036	/ IBS /
1037	F(XOP),
1038	/ SKINIT, WDT, LWP /
1039	F(FMA4),
1040	F(TBM),
1041	F(TOPOEXT),
1042	VENDOR_F(PERFCTR_CORE),
1043	);
1044
1045	kvm_cpu_cap_init(CPUID_8000_0001_EDX,
1046	ALIASED_1_EDX_F(FPU),
1047	ALIASED_1_EDX_F(VME),
1048	ALIASED_1_EDX_F(DE),
1049	ALIASED_1_EDX_F(PSE),
1050	ALIASED_1_EDX_F(TSC),
1051	ALIASED_1_EDX_F(MSR),
1052	ALIASED_1_EDX_F(PAE),
1053	ALIASED_1_EDX_F(MCE),
1054	ALIASED_1_EDX_F(CX8),
1055	ALIASED_1_EDX_F(APIC),
1056	/ Reserved /
1057	F(SYSCALL),
1058	ALIASED_1_EDX_F(MTRR),
1059	ALIASED_1_EDX_F(PGE),
1060	ALIASED_1_EDX_F(MCA),
1061	ALIASED_1_EDX_F(CMOV),
1062	ALIASED_1_EDX_F(PAT),
1063	ALIASED_1_EDX_F(PSE36),
1064	/ Reserved /
1065	F(NX),
1066	/ Reserved /
1067	F(MMXEXT),
1068	ALIASED_1_EDX_F(MMX),
1069	ALIASED_1_EDX_F(FXSR),
1070	F(FXSR_OPT),
1071	X86_64_F(GBPAGES),
1072	F(RDTSCP),
1073	/ Reserved /
1074	X86_64_F(LM),
1075	F(`3DNOWEXT`),
1076	F(`3DNOW`),
1077	);
1078
1079	if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
1080	kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
1081
1082	kvm_cpu_cap_init(CPUID_8000_0007_EDX,
1083	SCATTERED_F(CONSTANT_TSC),
1084	);
1085
1086	kvm_cpu_cap_init(CPUID_8000_0008_EBX,
1087	F(CLZERO),
1088	F(XSAVEERPTR),
1089	F(WBNOINVD),
1090	F(AMD_IBPB),
1091	F(AMD_IBRS),
1092	F(AMD_SSBD),
1093	F(VIRT_SSBD),
1094	F(AMD_SSB_NO),
1095	F(AMD_STIBP),
1096	F(AMD_STIBP_ALWAYS_ON),
1097	F(AMD_IBRS_SAME_MODE),
1098	F(AMD_PSFD),
1099	F(AMD_IBPB_RET),
1100	);
1101
1102	/*
1103	* AMD has separate bits for each SPEC_CTRL bit.
1104	* arch/x86/kernel/cpu/bugs.c is kind enough to
1105	* record that in cpufeatures so use them.
1106	*/
1107	if (boot_cpu_has(X86_FEATURE_IBPB)) {
1108	kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
1109	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
1110	!boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB))
1111	kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB_RET);
1112	}
1113	if (boot_cpu_has(X86_FEATURE_IBRS))
1114	kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
1115	if (boot_cpu_has(X86_FEATURE_STIBP))
1116	kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
1117	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
1118	kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
1119	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
1120	kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);
1121	/*
1122	* The preference is to use SPEC CTRL MSR instead of the
1123	* VIRT_SPEC MSR.
1124	*/
1125	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
1126	!boot_cpu_has(X86_FEATURE_AMD_SSBD))
1127	kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
1128
1129	/ All SVM features required additional vendor module enabling. /
1130	kvm_cpu_cap_init(CPUID_8000_000A_EDX,
1131	VENDOR_F(NPT),
1132	VENDOR_F(VMCBCLEAN),
1133	VENDOR_F(FLUSHBYASID),
1134	VENDOR_F(NRIPS),
1135	VENDOR_F(TSCRATEMSR),
1136	VENDOR_F(V_VMSAVE_VMLOAD),
1137	VENDOR_F(LBRV),
1138	VENDOR_F(PAUSEFILTER),
1139	VENDOR_F(PFTHRESHOLD),
1140	VENDOR_F(VGIF),
1141	VENDOR_F(VNMI),
1142	VENDOR_F(SVME_ADDR_CHK),
1143	);
1144
1145	kvm_cpu_cap_init(CPUID_8000_001F_EAX,
1146	VENDOR_F(SME),
1147	VENDOR_F(SEV),
1148	/ VM_PAGE_FLUSH /
1149	VENDOR_F(SEV_ES),
1150	F(SME_COHERENT),
1151	);
1152
1153	kvm_cpu_cap_init(CPUID_8000_0021_EAX,
1154	F(NO_NESTED_DATA_BP),
1155	F(WRMSR_XX_BASE_NS),
1156	/*
1157	* Synthesize "LFENCE is serializing" into the AMD-defined entry
1158	* in KVM's supported CPUID, i.e. if the feature is reported as
1159	* supported by the kernel. LFENCE_RDTSC was a Linux-defined
1160	* synthetic feature long before AMD joined the bandwagon, e.g.
1161	* LFENCE is serializing on most CPUs that support SSE2. On
1162	* CPUs that don't support AMD's leaf, ANDing with the raw host
1163	* CPUID will drop the flags, and reporting support in AMD's
1164	* leaf can make it easier for userspace to detect the feature.
1165	*/
1166	SYNTHESIZED_F(LFENCE_RDTSC),
1167	/ SmmPgCfgLock /
1168	F(NULL_SEL_CLR_BASE),
1169	/ UpperAddressIgnore /
1170	F(AUTOIBRS),
1171	F(PREFETCHI),
1172	EMULATED_F(NO_SMM_CTL_MSR),
1173	/ PrefetchCtlMsr /
1174	/ GpOnUserCpuid /
1175	/ EPSF /
1176	SYNTHESIZED_F(SBPB),
1177	SYNTHESIZED_F(IBPB_BRTYPE),
1178	SYNTHESIZED_F(SRSO_NO),
1179	F(SRSO_USER_KERNEL_NO),
1180	);
1181
1182	kvm_cpu_cap_init(CPUID_8000_0022_EAX,
1183	F(PERFMON_V2),
1184	);
1185
1186	if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
1187	kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE);
1188
1189	kvm_cpu_cap_init(CPUID_C000_0001_EDX,
1190	F(XSTORE),
1191	F(XSTORE_EN),
1192	F(XCRYPT),
1193	F(XCRYPT_EN),
1194	F(ACE2),
1195	F(ACE2_EN),
1196	F(PHE),
1197	F(PHE_EN),
1198	F(PMM),
1199	F(PMM_EN),
1200	);
1201
1202	/*
1203	* Hide RDTSCP and RDPID if either feature is reported as supported but
1204	* probing MSR_TSC_AUX failed. This is purely a sanity check and
1205	* should never happen, but the guest will likely crash if RDTSCP or
1206	* RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
1207	* the past. For example, the sanity check may fire if this instance of
1208	* KVM is running as L1 on top of an older, broken KVM.
1209	*/
1210	if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) \|\|
1211	kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
1212	!kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
1213	kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
1214	kvm_cpu_cap_clear(X86_FEATURE_RDPID);
1215	}
1216	}
1217	EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
1218
1219	#undef F
1220	#undef SCATTERED_F
1221	#undef X86_64_F
1222	#undef EMULATED_F
1223	#undef SYNTHESIZED_F
1224	#undef PASSTHROUGH_F
1225	#undef ALIASED_1_EDX_F
1226	#undef VENDOR_F
1227	#undef RUNTIME_F
1228
/*
 * Cursor over a caller-provided array of CPUID entries that is filled
 * incrementally.
 *
 * @entries: backing storage for the entries
 * @maxnent: capacity of @entries, in number of entries
 * @nent:    number of entries handed out so far
 */
struct kvm_cpuid_array {
	struct kvm_cpuid_entry2 *entries;
	int maxnent;
	int nent;
};
1234
1235	static struct kvm_cpuid_entry2 get_next_cpuid(struct* kvm_cpuid_array *array)
1236	{
1237	if (array->nent >= array->maxnent)
1238	return NULL;
1239
1240	return &array->entries[array->nent++];
1241	}
1242
1243	static struct kvm_cpuid_entry2 do_host_cpuid(struct* kvm_cpuid_array *array,
1244	u32 function, u32 index)
1245	{
1246	struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
1247
1248	if (!entry)
1249	return NULL;
1250
1251	memset(entry, `0`, sizeof(*entry));
1252	entry->function = function;
1253	entry->index = index;
1254	switch (function & `0xC0000000`) {
1255	case `0x40000000`:
1256	/ Hypervisor leaves are always synthesized by __do_cpuid_func. /
1257	return entry;
1258
1259	case `0x80000000`:
1260	/*
1261	* 0x80000021 is sometimes synthesized by __do_cpuid_func, which
1262	* would result in out-of-bounds calls to do_host_cpuid.
1263	*/
1264	{
1265	static int max_cpuid_80000000;
1266	if (!READ_ONCE(max_cpuid_80000000))
1267	WRITE_ONCE(max_cpuid_80000000, cpuid_eax(`0x80000000`));
1268	if (function > READ_ONCE(max_cpuid_80000000))
1269	return entry;
1270	}
1271	break;
1272
1273	default:
1274	break;
1275	}
1276
1277	cpuid_count(op: entry->function, count: entry->index,
1278	eax: &entry->eax, ebx: &entry->ebx, ecx: &entry->ecx, edx: &entry->edx);
1279
1280	if (cpuid_function_is_indexed(function))
1281	entry->flags \|= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1282
1283	return entry;
1284	}
1285
1286	static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func,
1287	bool include_partially_emulated)
1288	{
1289	memset(entry, `0`, sizeof(*entry));
1290
1291	entry->function = func;
1292	entry->index = `0`;
1293	entry->flags = `0`;
1294
1295	switch (func) {
1296	case `0`:
1297	entry->eax = `7`;
1298	return `1`;
1299	case `1`:
1300	entry->ecx = feature_bit(MOVBE);
1301	/*
1302	* KVM allows userspace to enumerate MONITOR+MWAIT support to
1303	* the guest, but the MWAIT feature flag is never advertised
1304	* to userspace because MONITOR+MWAIT aren't virtualized by
1305	* hardware, can't be faithfully emulated in software (KVM
1306	* emulates them as NOPs), and allowing the guest to execute
1307	* them natively requires enabling a per-VM capability.
1308	*/
1309	if (include_partially_emulated)
1310	entry->ecx \|= feature_bit(MWAIT);
1311	return `1`;
1312	case `7`:
1313	entry->flags \|= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1314	entry->eax = `0`;
1315	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
1316	entry->ecx = feature_bit(RDPID);
1317	return `1`;
1318	default:
1319	return `0`;
1320	}
1321	}
1322
1323	static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
1324	{
1325	if (array->nent >= array->maxnent)
1326	return -E2BIG;
1327
1328	array->nent += cpuid_func_emulated(entry: &array->entries[array->nent], func, include_partially_emulated: false);
1329	return `0`;
1330	}
1331
1332	static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
1333	{
1334	struct kvm_cpuid_entry2 *entry;
1335	int r, i, max_idx;
1336
1337	/ all calls to cpuid_count() should be made on the same cpu /
1338	get_cpu();
1339
1340	r = -E2BIG;
1341
1342	entry = do_host_cpuid(array, function, index: `0`);
1343	if (!entry)
1344	goto out;
1345
1346	switch (function) {
1347	case `0`:
1348	/ Limited to the highest leaf implemented in KVM. /
1349	entry->eax = min(entry->eax, `0x24U`);
1350	break;
1351	case `1`:
1352	cpuid_entry_override(entry, leaf: CPUID_1_EDX);
1353	cpuid_entry_override(entry, leaf: CPUID_1_ECX);
1354	break;
1355	case `2`:
1356	/*
1357	* On ancient CPUs, function 2 entries are STATEFUL. That is,
1358	* CPUID(function=2, index=0) may return different results each
1359	* time, with the least-significant byte in EAX enumerating the
1360	* number of times software should do CPUID(2, 0).
1361	*
1362	* Modern CPUs, i.e. every CPU KVM has ever run on are less
1363	* idiotic. Intel's SDM states that EAX & 0xff "will always
1364	* return 01H. Software should ignore this value and not
1365	* interpret it as an informational descriptor", while AMD's
1366	* APM states that CPUID(2) is reserved.
1367	*
1368	* WARN if a frankenstein CPU that supports virtualization and
1369	* a stateful CPUID.0x2 is encountered.
1370	*/
1371	WARN_ON_ONCE((entry->eax & `0xff`) > `1`);
1372	break;
1373	/ functions 4 and 0x8000001d have additional index. /
1374	case `4`:
1375	case `0x8000001d`:
1376	/*
1377	* Read entries until the cache type in the previous entry is
1378	* zero, i.e. indicates an invalid entry.
1379	*/
1380	for (i = `1`; entry->eax & `0x1f`; ++i) {
1381	entry = do_host_cpuid(array, function, index: i);
1382	if (!entry)
1383	goto out;
1384	}
1385	break;
1386	case `6`: / Thermal management /
1387	entry->eax = `0x4`; / allow ARAT /
1388	entry->ebx = `0`;
1389	entry->ecx = `0`;
1390	entry->edx = `0`;
1391	break;
1392	/ function 7 has additional index. /
1393	case `7`:
1394	max_idx = entry->eax = min(entry->eax, `2u`);
1395	cpuid_entry_override(entry, leaf: CPUID_7_0_EBX);
1396	cpuid_entry_override(entry, leaf: CPUID_7_ECX);
1397	cpuid_entry_override(entry, leaf: CPUID_7_EDX);
1398
1399	/ KVM only supports up to 0x7.2, capped above via min(). /
1400	if (max_idx >= `1`) {
1401	entry = do_host_cpuid(array, function, index: `1`);
1402	if (!entry)
1403	goto out;
1404
1405	cpuid_entry_override(entry, leaf: CPUID_7_1_EAX);
1406	cpuid_entry_override(entry, leaf: CPUID_7_1_EDX);
1407	entry->ebx = `0`;
1408	entry->ecx = `0`;
1409	}
1410	if (max_idx >= `2`) {
1411	entry = do_host_cpuid(array, function, index: `2`);
1412	if (!entry)
1413	goto out;
1414
1415	cpuid_entry_override(entry, leaf: CPUID_7_2_EDX);
1416	entry->ecx = `0`;
1417	entry->ebx = `0`;
1418	entry->eax = `0`;
1419	}
1420	break;
1421	case `0xa`: { / Architectural Performance Monitoring /
1422	union cpuid10_eax eax = { };
1423	union cpuid10_edx edx = { };
1424
1425	if (!enable_pmu \|\| !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
1426	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1427	break;
1428	}
1429
1430	eax.split.version_id = kvm_pmu_cap.version;
1431	eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
1432	eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
1433	eax.split.mask_length = kvm_pmu_cap.events_mask_len;
1434	edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
1435	edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
1436
1437	if (kvm_pmu_cap.version)
1438	edx.split.anythread_deprecated = `1`;
1439
1440	entry->eax = eax.full;
1441	entry->ebx = kvm_pmu_cap.events_mask;
1442	entry->ecx = `0`;
1443	entry->edx = edx.full;
1444	break;
1445	}
1446	case `0x1f`:
1447	case `0xb`:
1448	/*
1449	* No topology; a valid topology is indicated by the presence
1450	* of subleaf 1.
1451	*/
1452	entry->eax = entry->ebx = entry->ecx = `0`;
1453	break;
1454	case `0xd`: {
1455	u64 permitted_xcr0 = kvm_get_filtered_xcr0();
1456	u64 permitted_xss = kvm_caps.supported_xss;
1457
1458	entry->eax &= permitted_xcr0;
1459	entry->ebx = xstate_required_size(xstate_bv: permitted_xcr0, compacted: false);
1460	entry->ecx = entry->ebx;
1461	entry->edx &= permitted_xcr0 >> `32`;
1462	if (!permitted_xcr0)
1463	break;
1464
1465	entry = do_host_cpuid(array, function, index: `1`);
1466	if (!entry)
1467	goto out;
1468
1469	cpuid_entry_override(entry, leaf: CPUID_D_1_EAX);
1470	if (entry->eax & (feature_bit(XSAVES) \| feature_bit(XSAVEC)))
1471	entry->ebx = xstate_required_size(xstate_bv: permitted_xcr0 \| permitted_xss,
1472	compacted: true);
1473	else {
1474	WARN_ON_ONCE(permitted_xss != `0`);
1475	entry->ebx = `0`;
1476	}
1477	entry->ecx &= permitted_xss;
1478	entry->edx &= permitted_xss >> `32`;
1479
1480	for (i = `2`; i < `64`; ++i) {
1481	bool s_state;
1482	if (permitted_xcr0 & BIT_ULL(i))
1483	s_state = false;
1484	else if (permitted_xss & BIT_ULL(i))
1485	s_state = true;
1486	else
1487	continue;
1488
1489	entry = do_host_cpuid(array, function, index: i);
1490	if (!entry)
1491	goto out;
1492
1493	/*
1494	* The supported check above should have filtered out
1495	* invalid sub-leafs. Only valid sub-leafs should
1496	* reach this point, and they should have a non-zero
1497	* save state size. Furthermore, check whether the
1498	* processor agrees with permitted_xcr0/permitted_xss
1499	* on whether this is an XCR0- or IA32_XSS-managed area.
1500	*/
1501	if (WARN_ON_ONCE(!entry->eax \|\| (entry->ecx & `0x1`) != s_state)) {
1502	--array->nent;
1503	continue;
1504	}
1505
1506	if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
1507	entry->ecx &= ~BIT_ULL(`2`);
1508	entry->edx = `0`;
1509	}
1510	break;
1511	}
1512	case `0x12`:
1513	/ Intel SGX /
1514	if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
1515	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1516	break;
1517	}
1518
1519	/*
1520	* Index 0: Sub-features, MISCSELECT (a.k.a extended features)
1521	* and max enclave sizes. The SGX sub-features and MISCSELECT
1522	* are restricted by kernel and KVM capabilities (like most
1523	* feature flags), while enclave size is unrestricted.
1524	*/
1525	cpuid_entry_override(entry, leaf: CPUID_12_EAX);
1526	entry->ebx &= SGX_MISC_EXINFO;
1527
1528	entry = do_host_cpuid(array, function, index: `1`);
1529	if (!entry)
1530	goto out;
1531
1532	/*
1533	* Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la
1534	* feature flags. Advertise all supported flags, including
1535	* privileged attributes that require explicit opt-in from
1536	* userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
1537	* expected to derive it from supported XCR0.
1538	*/
1539	entry->eax &= SGX_ATTR_PRIV_MASK \| SGX_ATTR_UNPRIV_MASK;
1540	entry->ebx &= `0`;
1541	break;
1542	/ Intel PT /
1543	case `0x14`:
1544	if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
1545	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1546	break;
1547	}
1548
1549	for (i = `1`, max_idx = entry->eax; i <= max_idx; ++i) {
1550	if (!do_host_cpuid(array, function, index: i))
1551	goto out;
1552	}
1553	break;
1554	/ Intel AMX TILE /
1555	case `0x1d`:
1556	if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
1557	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1558	break;
1559	}
1560
1561	for (i = `1`, max_idx = entry->eax; i <= max_idx; ++i) {
1562	if (!do_host_cpuid(array, function, index: i))
1563	goto out;
1564	}
1565	break;
1566	case `0x1e`: / TMUL information /
1567	if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
1568	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1569	break;
1570	}
1571	break;
1572	case `0x24`: {
1573	u8 avx10_version;
1574
1575	if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) {
1576	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1577	break;
1578	}
1579
1580	/*
1581	* The AVX10 version is encoded in EBX[7:0]. Note, the version
1582	* is guaranteed to be >=1 if AVX10 is supported. Note #2, the
1583	* version needs to be captured before overriding EBX features!
1584	*/
1585	avx10_version = min_t(u8, entry->ebx & `0xff`, `1`);
1586	cpuid_entry_override(entry, leaf: CPUID_24_0_EBX);
1587	entry->ebx \|= avx10_version;
1588
1589	entry->eax = `0`;
1590	entry->ecx = `0`;
1591	entry->edx = `0`;
1592	break;
1593	}
1594	case KVM_CPUID_SIGNATURE: {
1595	const u32 sigptr = (const* u32 *)KVM_SIGNATURE;
1596	entry->eax = KVM_CPUID_FEATURES;
1597	entry->ebx = sigptr[`0`];
1598	entry->ecx = sigptr[`1`];
1599	entry->edx = sigptr[`2`];
1600	break;
1601	}
1602	case KVM_CPUID_FEATURES:
1603	entry->eax = (`1` << KVM_FEATURE_CLOCKSOURCE) \|
1604	(`1` << KVM_FEATURE_NOP_IO_DELAY) \|
1605	(`1` << KVM_FEATURE_CLOCKSOURCE2) \|
1606	(`1` << KVM_FEATURE_ASYNC_PF) \|
1607	(`1` << KVM_FEATURE_PV_EOI) \|
1608	(`1` << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) \|
1609	(`1` << KVM_FEATURE_PV_UNHALT) \|
1610	(`1` << KVM_FEATURE_PV_TLB_FLUSH) \|
1611	(`1` << KVM_FEATURE_ASYNC_PF_VMEXIT) \|
1612	(`1` << KVM_FEATURE_PV_SEND_IPI) \|
1613	(`1` << KVM_FEATURE_POLL_CONTROL) \|
1614	(`1` << KVM_FEATURE_PV_SCHED_YIELD) \|
1615	(`1` << KVM_FEATURE_ASYNC_PF_INT);
1616
1617	if (sched_info_on())
1618	entry->eax \|= (`1` << KVM_FEATURE_STEAL_TIME);
1619
1620	entry->ebx = `0`;
1621	entry->ecx = `0`;
1622	entry->edx = `0`;
1623	break;
1624	case `0x80000000`:
1625	entry->eax = min(entry->eax, `0x80000022`);
1626	/*
1627	* Serializing LFENCE is reported in a multitude of ways, and
1628	* NullSegClearsBase is not reported in CPUID on Zen2; help
1629	* userspace by providing the CPUID leaf ourselves.
1630	*
1631	* However, only do it if the host has CPUID leaf 0x8000001d.
1632	* QEMU thinks that it can query the host blindly for that
1633	* CPUID leaf if KVM reports that it supports 0x8000001d or
1634	* above. The processor merrily returns values from the
1635	* highest Intel leaf which QEMU tries to use as the guest's
1636	* 0x8000001d. Even worse, this can result in an infinite
1637	* loop if said highest leaf has no subleaves indexed by ECX.
1638	*/
1639	if (entry->eax >= `0x8000001d` &&
1640	(static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
1641	\|\| !static_cpu_has_bug(X86_BUG_NULL_SEG)))
1642	entry->eax = max(entry->eax, `0x80000021`);
1643	break;
1644	case `0x80000001`:
1645	entry->ebx &= ~GENMASK(`27`, `16`);
1646	cpuid_entry_override(entry, leaf: CPUID_8000_0001_EDX);
1647	cpuid_entry_override(entry, leaf: CPUID_8000_0001_ECX);
1648	break;
1649	case `0x80000005`:
1650	/ Pass host L1 cache and TLB info. /
1651	break;
1652	case `0x80000006`:
1653	/ Drop reserved bits, pass host L2 cache and TLB info. /
1654	entry->edx &= ~GENMASK(`17`, `16`);
1655	break;
1656	case `0x80000007`: / Advanced power management /
1657	cpuid_entry_override(entry, leaf: CPUID_8000_0007_EDX);
1658
1659	/ mask against host /
1660	entry->edx &= boot_cpu_data.x86_power;
1661	entry->eax = entry->ebx = entry->ecx = `0`;
1662	break;
1663	case `0x80000008`: {
1664	/*
1665	* GuestPhysAddrSize (EAX[23:16]) is intended for software
1666	* use.
1667	*
1668	* KVM's ABI is to report the effective MAXPHYADDR for the
1669	* guest in PhysAddrSize (phys_as), and the maximum
1670	* addressable GPA in GuestPhysAddrSize (g_phys_as).
1671	*
1672	* GuestPhysAddrSize is valid if and only if TDP is enabled,
1673	* in which case the max GPA that can be addressed by KVM may
1674	* be less than the max GPA that can be legally generated by
1675	* the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't
1676	* support 5-level TDP.
1677	*/
1678	unsigned int virt_as = max((entry->eax >> `8`) & `0xff`, `48U`);
1679	unsigned int phys_as, g_phys_as;
1680
1681	/*
1682	* If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
1683	* the guest operates in the same PA space as the host, i.e.
1684	* reductions in MAXPHYADDR for memory encryption affect shadow
1685	* paging, too.
1686	*
1687	* If TDP is enabled, use the raw bare metal MAXPHYADDR as
1688	* reductions to the HPAs do not affect GPAs. The max
1689	* addressable GPA is the same as the max effective GPA, except
1690	* that it's capped at 48 bits if 5-level TDP isn't supported
1691	* (hardware processes bits 51:48 only when walking the fifth
1692	* level page table).
1693	*/
1694	if (!tdp_enabled) {
1695	phys_as = boot_cpu_data.x86_phys_bits;
1696	g_phys_as = `0`;
1697	} else {
1698	phys_as = entry->eax & `0xff`;
1699	g_phys_as = phys_as;
1700	if (kvm_mmu_get_max_tdp_level() < `5`)
1701	g_phys_as = min(g_phys_as, `48U`);
1702	}
1703
1704	entry->eax = phys_as \| (virt_as << `8`) \| (g_phys_as << `16`);
1705	entry->ecx &= ~(GENMASK(`31`, `16`) \| GENMASK(`11`, `8`));
1706	entry->edx = `0`;
1707	cpuid_entry_override(entry, leaf: CPUID_8000_0008_EBX);
1708	break;
1709	}
1710	case `0x8000000A`:
1711	if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
1712	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1713	break;
1714	}
1715	entry->eax = `1`; / SVM revision 1 /
1716	entry->ebx = `8`; / Lets support 8 ASIDs in case we add proper*
1717	ASID emulation to nested SVM /*
1718	entry->ecx = `0`; / Reserved /
1719	cpuid_entry_override(entry, leaf: CPUID_8000_000A_EDX);
1720	break;
1721	case `0x80000019`:
1722	entry->ecx = entry->edx = `0`;
1723	break;
1724	case `0x8000001a`:
1725	entry->eax &= GENMASK(`2`, `0`);
1726	entry->ebx = entry->ecx = entry->edx = `0`;
1727	break;
1728	case `0x8000001e`:
1729	/ Do not return host topology information. /
1730	entry->eax = entry->ebx = entry->ecx = `0`;
1731	entry->edx = `0`; / reserved /
1732	break;
1733	case `0x8000001F`:
1734	if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
1735	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1736	} else {
1737	cpuid_entry_override(entry, leaf: CPUID_8000_001F_EAX);
1738	/ Clear NumVMPL since KVM does not support VMPL. /
1739	entry->ebx &= ~GENMASK(`31`, `12`);
1740	/*
1741	* Enumerate '0' for "PA bits reduction", the adjusted
1742	* MAXPHYADDR is enumerated directly (see 0x80000008).
1743	*/
1744	entry->ebx &= ~GENMASK(`11`, `6`);
1745	}
1746	break;
1747	case `0x80000020`:
1748	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1749	break;
1750	case `0x80000021`:
1751	entry->ebx = entry->ecx = entry->edx = `0`;
1752	cpuid_entry_override(entry, leaf: CPUID_8000_0021_EAX);
1753	break;
1754	/ AMD Extended Performance Monitoring and Debug /
1755	case `0x80000022`: {
1756	union cpuid_0x80000022_ebx ebx = { };
1757
1758	entry->ecx = entry->edx = `0`;
1759	if (!enable_pmu \|\| !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
1760	entry->eax = entry->ebx = `0`;
1761	break;
1762	}
1763
1764	cpuid_entry_override(entry, leaf: CPUID_8000_0022_EAX);
1765
1766	ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
1767	entry->ebx = ebx.full;
1768	break;
1769	}
1770	/Add support for Centaur's CPUID instruction/
1771	case `0xC0000000`:
1772	/Just support up to 0xC0000004 now/
1773	entry->eax = min(entry->eax, `0xC0000004`);
1774	break;
1775	case `0xC0000001`:
1776	cpuid_entry_override(entry, leaf: CPUID_C000_0001_EDX);
1777	break;
1778	case `3`: / Processor serial number /
1779	case `5`: / MONITOR/MWAIT /
1780	case `0xC0000002`:
1781	case `0xC0000003`:
1782	case `0xC0000004`:
1783	default:
1784	entry->eax = entry->ebx = entry->ecx = entry->edx = `0`;
1785	break;
1786	}
1787
1788	r = `0`;
1789
1790	out:
1791	put_cpu();
1792
1793	return r;
1794	}
1795
1796	static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
1797	unsigned int type)
1798	{
1799	if (type == KVM_GET_EMULATED_CPUID)
1800	return __do_cpuid_func_emulated(array, func);
1801
1802	return __do_cpuid_func(array, function: func);
1803	}
1804
1805	#define CENTAUR_CPUID_SIGNATURE 0xC0000000
1806
1807	static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
1808	unsigned int type)
1809	{
1810	u32 limit;
1811	int r;
1812
1813	if (func == CENTAUR_CPUID_SIGNATURE &&
1814	boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR)
1815	return `0`;
1816
1817	r = do_cpuid_func(array, func, type);
1818	if (r)
1819	return r;
1820
1821	limit = array->entries[array->nent - `1`].eax;
1822	for (func = func + `1`; func <= limit; ++func) {
1823	r = do_cpuid_func(array, func, type);
1824	if (r)
1825	break;
1826	}
1827
1828	return r;
1829	}
1830
1831	static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
1832	__u32 num_entries, unsigned int ioctl_type)
1833	{
1834	int i;
1835	__u32 pad[`3`];
1836
1837	if (ioctl_type != KVM_GET_EMULATED_CPUID)
1838	return false;
1839
1840	/*
1841	* We want to make sure that ->padding is being passed clean from
1842	* userspace in case we want to use it for something in the future.
1843	*
1844	* Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
1845	* have to give ourselves satisfied only with the emulated side. /me
1846	* sheds a tear.
1847	*/
1848	for (i = `0`; i < num_entries; i++) {
1849	if (copy_from_user(to: pad, from: entries[i].padding, n: sizeof(pad)))
1850	return true;
1851
1852	if (pad[`0`] \|\| pad[`1`] \|\| pad[`2`])
1853	return true;
1854	}
1855	return false;
1856	}
1857
1858	int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
1859	struct kvm_cpuid_entry2 __user *entries,
1860	unsigned int type)
1861	{
1862	static const u32 funcs[] = {
1863	`0`, `0x80000000`, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
1864	};
1865
1866	struct kvm_cpuid_array array = {
1867	.nent = `0`,
1868	};
1869	int r, i;
1870
1871	if (cpuid->nent < `1`)
1872	return -E2BIG;
1873	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1874	cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1875
1876	if (sanity_check_entries(entries, num_entries: cpuid->nent, ioctl_type: type))
1877	return -EINVAL;
1878
1879	array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL);
1880	if (!array.entries)
1881	return -ENOMEM;
1882
1883	array.maxnent = cpuid->nent;
1884
1885	for (i = `0`; i < ARRAY_SIZE(funcs); i++) {
1886	r = get_cpuid_func(array: &array, func: funcs[i], type);
1887	if (r)
1888	goto out_free;
1889	}
1890	cpuid->nent = array.nent;
1891
1892	if (copy_to_user(to: entries, from: array.entries,
1893	n: array.nent * sizeof(struct kvm_cpuid_entry2)))
1894	r = -EFAULT;
1895
1896	out_free:
1897	kvfree(addr: array.entries);
1898	return r;
1899	}
1900
1901	/*
1902	* Intel CPUID semantics treats any query for an out-of-range leaf as if the
1903	* highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics
1904	* returns all zeroes for any undefined leaf, whether or not the leaf is in
1905	* range. Centaur/VIA follows Intel semantics.
1906	*
1907	* A leaf is considered out-of-range if its function is higher than the maximum
1908	* supported leaf of its associated class or if its associated class does not
1909	* exist.
1910	*
1911	* There are three primary classes to be considered, with their respective
1912	* ranges described as "<base> - <top>[,<base2> - <top2>] inclusive. A primary
1913	* class exists if a guest CPUID entry for its <base> leaf exists. For a given
1914	* class, CPUID.<base>.EAX contains the max supported leaf for the class.
1915	*
1916	* - Basic: 0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff
1917	* - Hypervisor: 0x40000000 - 0x4fffffff
1918	* - Extended: 0x80000000 - 0xbfffffff
1919	* - Centaur: 0xc0000000 - 0xcfffffff
1920	*
1921	* The Hypervisor class is further subdivided into sub-classes that each act as
1922	* their own independent class associated with a 0x100 byte range. E.g. if Qemu
1923	* is advertising support for both HyperV and KVM, the resulting Hypervisor
1924	* CPUID sub-classes are:
1925	*
1926	* - HyperV: 0x40000000 - 0x400000ff
1927	* - KVM: 0x40000100 - 0x400001ff
1928	*/
1929	static struct kvm_cpuid_entry2 *
1930	get_out_of_range_cpuid_entry(struct kvm_vcpu vcpu, u32 fn_ptr, u32 index)
1931	{
1932	struct kvm_cpuid_entry2 basic, class;
1933	u32 function = *fn_ptr;
1934
1935	basic = kvm_find_cpuid_entry(vcpu, function: `0`);
1936	if (!basic)
1937	return NULL;
1938
1939	if (is_guest_vendor_amd(ebx: basic->ebx, ecx: basic->ecx, edx: basic->edx) \|\|
1940	is_guest_vendor_hygon(ebx: basic->ebx, ecx: basic->ecx, edx: basic->edx))
1941	return NULL;
1942
1943	if (function >= `0x40000000` && function <= `0x4fffffff`)
1944	class = kvm_find_cpuid_entry(vcpu, function: function & `0xffffff00`);
1945	else if (function >= `0xc0000000`)
1946	class = kvm_find_cpuid_entry(vcpu, function: `0xc0000000`);
1947	else
1948	class = kvm_find_cpuid_entry(vcpu, function: function & `0x80000000`);
1949
1950	if (class && function <= class->eax)
1951	return NULL;
1952
1953	/*
1954	* Leaf specific adjustments are also applied when redirecting to the
1955	* max basic entry, e.g. if the max basic leaf is 0xb but there is no
1956	* entry for CPUID.0xb.index (see below), then the output value for EDX
1957	* needs to be pulled from CPUID.0xb.1.
1958	*/
1959	*fn_ptr = basic->eax;
1960
1961	/*
1962	* The class does not exist or the requested function is out of range;
1963	* the effective CPUID entry is the max basic leaf. Note, the index of
1964	* the original requested leaf is observed!
1965	*/
1966	return kvm_find_cpuid_entry_index(vcpu, function: basic->eax, index);
1967	}
1968
1969	bool kvm_cpuid(struct kvm_vcpu vcpu, u32 eax, u32 *ebx,
1970	u32 ecx, u32 edx, bool exact_only)
1971	{
1972	u32 orig_function = eax, function = eax, index = *ecx;
1973	struct kvm_cpuid_entry2 *entry;
1974	bool exact, used_max_basic = false;
1975
1976	if (vcpu->arch.cpuid_dynamic_bits_dirty)
1977	kvm_update_cpuid_runtime(vcpu);
1978
1979	entry = kvm_find_cpuid_entry_index(vcpu, function, index);
1980	exact = !!entry;
1981
1982	if (!entry && !exact_only) {
1983	entry = get_out_of_range_cpuid_entry(vcpu, fn_ptr: &function, index);
1984	used_max_basic = !!entry;
1985	}
1986
1987	if (entry) {
1988	*eax = entry->eax;
1989	*ebx = entry->ebx;
1990	*ecx = entry->ecx;
1991	*edx = entry->edx;
1992	if (function == `7` && index == `0`) {
1993	u64 data;
1994	if ((*ebx & (feature_bit(RTM) \| feature_bit(HLE))) &&
1995	!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, data: &data, host_initiated: true) &&
1996	(data & TSX_CTRL_CPUID_CLEAR))
1997	*ebx &= ~(feature_bit(RTM) \| feature_bit(HLE));
1998	} else if (function == `0x80000007`) {
1999	if (kvm_hv_invtsc_suppressed(vcpu))
2000	*edx &= ~feature_bit(CONSTANT_TSC);
2001	} else if (IS_ENABLED(CONFIG_KVM_XEN) &&
2002	kvm_xen_is_tsc_leaf(vcpu, function)) {
2003	/*
2004	* Update guest TSC frequency information if necessary.
2005	* Ignore failures, there is no sane value that can be
2006	* provided if KVM can't get the TSC frequency.
2007	*/
2008	if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
2009	kvm_guest_time_update(v: vcpu);
2010
2011	if (index == `1`) {
2012	*ecx = vcpu->arch.pvclock_tsc_mul;
2013	*edx = vcpu->arch.pvclock_tsc_shift;
2014	} else if (index == `2`) {
2015	*eax = vcpu->arch.hw_tsc_khz;
2016	}
2017	}
2018	} else {
2019	eax = ebx = ecx = edx = `0`;
2020	/*
2021	* When leaf 0BH or 1FH is defined, CL is pass-through
2022	* and EDX is always the x2APIC ID, even for undefined
2023	* subleaves. Index 1 will exist iff the leaf is
2024	* implemented, so we pass through CL iff leaf 1
2025	* exists. EDX can be copied from any existing index.
2026	*/
2027	if (function == `0xb` \|\| function == `0x1f`) {
2028	entry = kvm_find_cpuid_entry_index(vcpu, function, index: `1`);
2029	if (entry) {
2030	*ecx = index & `0xff`;
2031	*edx = entry->edx;
2032	}
2033	}
2034	}
2035	trace_kvm_cpuid(function: orig_function, index, rax: eax, rbx: ebx, rcx: ecx, rdx: edx, found: exact,
2036	used_max_basic);
2037	return exact;
2038	}
2039	EXPORT_SYMBOL_GPL(kvm_cpuid);
2040
2041	int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2042	{
2043	u32 eax, ebx, ecx, edx;
2044
2045	if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, required_cpl: `0`))
2046	return `1`;
2047
2048	eax = kvm_rax_read(vcpu);
2049	ecx = kvm_rcx_read(vcpu);
2050	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
2051	kvm_rax_write(vcpu, val: eax);
2052	kvm_rbx_write(vcpu, val: ebx);
2053	kvm_rcx_write(vcpu, val: ecx);
2054	kvm_rdx_write(vcpu, val: edx);
2055	return kvm_skip_emulated_instruction(vcpu);
2056	}
2057	EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2058

Provided by KDAB

Definitions

source code of linux/arch/x86/kvm/cpuid.c