1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * KVM paravirt_ops implementation |
4 | * |
5 | * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
6 | * Copyright IBM Corporation, 2007 |
7 | * Authors: Anthony Liguori <aliguori@us.ibm.com> |
8 | */ |
9 | |
10 | #define pr_fmt(fmt) "kvm-guest: " fmt |
11 | |
12 | #include <linux/context_tracking.h> |
13 | #include <linux/init.h> |
14 | #include <linux/irq.h> |
15 | #include <linux/kernel.h> |
16 | #include <linux/kvm_para.h> |
17 | #include <linux/cpu.h> |
18 | #include <linux/mm.h> |
19 | #include <linux/highmem.h> |
20 | #include <linux/hardirq.h> |
21 | #include <linux/notifier.h> |
22 | #include <linux/reboot.h> |
23 | #include <linux/hash.h> |
24 | #include <linux/sched.h> |
25 | #include <linux/slab.h> |
26 | #include <linux/kprobes.h> |
27 | #include <linux/nmi.h> |
28 | #include <linux/swait.h> |
29 | #include <linux/syscore_ops.h> |
30 | #include <linux/cc_platform.h> |
31 | #include <linux/efi.h> |
32 | #include <asm/timer.h> |
33 | #include <asm/cpu.h> |
34 | #include <asm/traps.h> |
35 | #include <asm/desc.h> |
36 | #include <asm/tlbflush.h> |
37 | #include <asm/apic.h> |
38 | #include <asm/apicdef.h> |
39 | #include <asm/hypervisor.h> |
40 | #include <asm/tlb.h> |
41 | #include <asm/cpuidle_haltpoll.h> |
42 | #include <asm/ptrace.h> |
43 | #include <asm/reboot.h> |
44 | #include <asm/svm.h> |
45 | #include <asm/e820/api.h> |
46 | |
47 | DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); |
48 | |
49 | static int kvmapf = 1; |
50 | |
51 | static int __init parse_no_kvmapf(char *arg) |
52 | { |
53 | kvmapf = 0; |
54 | return 0; |
55 | } |
56 | |
early_param("no-kvmapf", parse_no_kvmapf);
58 | |
59 | static int steal_acc = 1; |
60 | static int __init parse_no_stealacc(char *arg) |
61 | { |
62 | steal_acc = 0; |
63 | return 0; |
64 | } |
65 | |
early_param("no-steal-acc", parse_no_stealacc);
67 | |
68 | static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); |
69 | DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; |
70 | static int has_steal_clock = 0; |
71 | |
72 | static int has_guest_poll = 0; |
73 | /* |
74 | * No need for any "IO delay" on KVM |
75 | */ |
76 | static void kvm_io_delay(void) |
77 | { |
78 | } |
79 | |
80 | #define KVM_TASK_SLEEP_HASHBITS 8 |
81 | #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) |
82 | |
83 | struct kvm_task_sleep_node { |
84 | struct hlist_node link; |
85 | struct swait_queue_head wq; |
86 | u32 token; |
87 | int cpu; |
88 | }; |
89 | |
90 | static struct kvm_task_sleep_head { |
91 | raw_spinlock_t lock; |
92 | struct hlist_head list; |
93 | } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; |
94 | |
95 | static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, |
96 | u32 token) |
97 | { |
98 | struct hlist_node *p; |
99 | |
100 | hlist_for_each(p, &b->list) { |
101 | struct kvm_task_sleep_node *n = |
102 | hlist_entry(p, typeof(*n), link); |
103 | if (n->token == token) |
104 | return n; |
105 | } |
106 | |
107 | return NULL; |
108 | } |
109 | |
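/*
 * Queue @n on the hash bucket for @token so the faulting task can sleep
 * until the host reports the page as present.  Returns false if a wakeup
 * for @token already arrived (a dummy node is found), in which case the
 * caller does not need to wait at all.
 */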
110 | static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) |
111 | { |
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
113 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; |
114 | struct kvm_task_sleep_node *e; |
115 | |
116 | raw_spin_lock(&b->lock); |
117 | e = _find_apf_task(b, token); |
118 | if (e) { |
		/* dummy entry exists -> wake up was delivered ahead of PF */
		hlist_del(&e->link);
		raw_spin_unlock(&b->lock);
		kfree(e);
123 | return false; |
124 | } |
125 | |
126 | n->token = token; |
127 | n->cpu = smp_processor_id(); |
128 | init_swait_queue_head(&n->wq); |
	hlist_add_head(&n->link, &b->list);
130 | raw_spin_unlock(&b->lock); |
131 | return true; |
132 | } |
133 | |
134 | /* |
135 | * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled |
136 | * @token: Token to identify the sleep node entry |
137 | * |
138 | * Invoked from the async pagefault handling code or from the VM exit page |
139 | * fault handler. In both cases RCU is watching. |
140 | */ |
141 | void kvm_async_pf_task_wait_schedule(u32 token) |
142 | { |
143 | struct kvm_task_sleep_node n; |
144 | DECLARE_SWAITQUEUE(wait); |
145 | |
146 | lockdep_assert_irqs_disabled(); |
147 | |
	if (!kvm_async_pf_queue_task(token, &n))
149 | return; |
150 | |
151 | for (;;) { |
		prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
154 | break; |
155 | |
156 | local_irq_enable(); |
157 | schedule(); |
158 | local_irq_disable(); |
159 | } |
	finish_swait(&n.wq, &wait);
161 | } |
162 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); |
163 | |
164 | static void apf_task_wake_one(struct kvm_task_sleep_node *n) |
165 | { |
	hlist_del_init(&n->link);
	if (swq_has_sleeper(&n->wq))
		swake_up_one(&n->wq);
169 | } |
170 | |
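/*
 * Wake every async #PF waiter still queued on the current CPU.  Used when
 * a broadcast wakeup (token == ~0) is received and when the CPU is taken
 * offline.
 */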
171 | static void apf_task_wake_all(void) |
172 | { |
173 | int i; |
174 | |
175 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { |
176 | struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; |
177 | struct kvm_task_sleep_node *n; |
178 | struct hlist_node *p, *next; |
179 | |
180 | raw_spin_lock(&b->lock); |
181 | hlist_for_each_safe(p, next, &b->list) { |
182 | n = hlist_entry(p, typeof(*n), link); |
183 | if (n->cpu == smp_processor_id()) |
184 | apf_task_wake_one(n); |
185 | } |
186 | raw_spin_unlock(&b->lock); |
187 | } |
188 | } |
189 | |
190 | void kvm_async_pf_task_wake(u32 token) |
191 | { |
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
193 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; |
194 | struct kvm_task_sleep_node *n, *dummy = NULL; |
195 | |
196 | if (token == ~0) { |
197 | apf_task_wake_all(); |
198 | return; |
199 | } |
200 | |
201 | again: |
202 | raw_spin_lock(&b->lock); |
203 | n = _find_apf_task(b, token); |
204 | if (!n) { |
		/*
		 * Async #PF not yet handled, add a dummy entry for the token.
		 * Allocating the token must be done outside of the raw lock
		 * as the allocator is preemptible on PREEMPT_RT kernels.
		 */
		if (!dummy) {
			raw_spin_unlock(&b->lock);
			dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
213 | |
214 | /* |
215 | * Continue looping on allocation failure, eventually |
216 | * the async #PF will be handled and allocating a new |
217 | * node will be unnecessary. |
218 | */ |
219 | if (!dummy) |
220 | cpu_relax(); |
221 | |
222 | /* |
223 | * Recheck for async #PF completion before enqueueing |
224 | * the dummy token to avoid duplicate list entries. |
225 | */ |
226 | goto again; |
227 | } |
228 | dummy->token = token; |
229 | dummy->cpu = smp_processor_id(); |
230 | init_swait_queue_head(&dummy->wq); |
		hlist_add_head(&dummy->link, &b->list);
232 | dummy = NULL; |
233 | } else { |
234 | apf_task_wake_one(n); |
235 | } |
236 | raw_spin_unlock(&b->lock); |
237 | |
238 | /* A dummy token might be allocated and ultimately not used. */ |
	kfree(dummy);
240 | } |
241 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); |
242 | |
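/*
 * Read and clear the async #PF flags that the host wrote into this CPU's
 * shared apf_reason area.  Returns 0 if async #PF is not enabled on this
 * CPU.
 */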
243 | noinstr u32 kvm_read_and_reset_apf_flags(void) |
244 | { |
245 | u32 flags = 0; |
246 | |
247 | if (__this_cpu_read(apf_reason.enabled)) { |
248 | flags = __this_cpu_read(apf_reason.flags); |
249 | __this_cpu_write(apf_reason.flags, 0); |
250 | } |
251 | |
252 | return flags; |
253 | } |
254 | EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); |
255 | |
256 | noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) |
257 | { |
258 | u32 flags = kvm_read_and_reset_apf_flags(); |
259 | irqentry_state_t state; |
260 | |
261 | if (!flags) |
262 | return false; |
263 | |
264 | state = irqentry_enter(regs); |
265 | instrumentation_begin(); |
266 | |
267 | /* |
268 | * If the host managed to inject an async #PF into an interrupt |
269 | * disabled region, then die hard as this is not going to end well |
270 | * and the host side is seriously broken. |
271 | */ |
272 | if (unlikely(!(regs->flags & X86_EFLAGS_IF))) |
		panic("Host injected async #PF in interrupt disabled region\n");
274 | |
275 | if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { |
276 | if (unlikely(!(user_mode(regs)))) |
			panic("Host injected async #PF in kernel mode\n");
278 | /* Page is swapped out by the host. */ |
279 | kvm_async_pf_task_wait_schedule(token); |
280 | } else { |
		WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
282 | } |
283 | |
284 | instrumentation_end(); |
285 | irqentry_exit(regs, state); |
286 | return true; |
287 | } |
288 | |
289 | DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) |
290 | { |
291 | struct pt_regs *old_regs = set_irq_regs(regs); |
292 | u32 token; |
293 | |
294 | apic_eoi(); |
295 | |
296 | inc_irq_stat(irq_hv_callback_count); |
297 | |
298 | if (__this_cpu_read(apf_reason.enabled)) { |
299 | token = __this_cpu_read(apf_reason.token); |
300 | kvm_async_pf_task_wake(token); |
301 | __this_cpu_write(apf_reason.token, 0); |
		wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
303 | } |
304 | |
305 | set_irq_regs(old_regs); |
306 | } |
307 | |
308 | static void __init paravirt_ops_setup(void) |
309 | { |
	pv_info.name = "KVM";
311 | |
312 | if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) |
313 | pv_ops.cpu.io_delay = kvm_io_delay; |
314 | |
315 | #ifdef CONFIG_X86_IO_APIC |
316 | no_timer_check = 1; |
317 | #endif |
318 | } |
319 | |
320 | static void kvm_register_steal_time(void) |
321 | { |
322 | int cpu = smp_processor_id(); |
323 | struct kvm_steal_time *st = &per_cpu(steal_time, cpu); |
324 | |
325 | if (!has_steal_clock) |
326 | return; |
327 | |
	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
	pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
		(unsigned long long) slow_virt_to_phys(st));
331 | } |
332 | |
333 | static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; |
334 | |
335 | static notrace __maybe_unused void kvm_guest_apic_eoi_write(void) |
336 | { |
337 | /** |
338 | * This relies on __test_and_clear_bit to modify the memory |
339 | * in a way that is atomic with respect to the local CPU. |
340 | * The hypervisor only accesses this memory from the local CPU so |
341 | * there's no need for lock or memory barriers. |
342 | * An optimization barrier is implied in apic write. |
343 | */ |
344 | if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) |
345 | return; |
346 | apic_native_eoi(); |
347 | } |
348 | |
349 | static void kvm_guest_cpu_init(void) |
350 | { |
351 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { |
352 | u64 pa; |
353 | |
354 | WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); |
355 | |
356 | pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); |
357 | pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; |
358 | |
359 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) |
360 | pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; |
361 | |
362 | wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); |
363 | |
		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
		__this_cpu_write(apf_reason.enabled, 1);
		pr_debug("setup async PF for cpu %d\n", smp_processor_id());
367 | } |
368 | |
369 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { |
370 | unsigned long pa; |
371 | |
372 | /* Size alignment is implied but just to make it explicit. */ |
373 | BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); |
374 | __this_cpu_write(kvm_apic_eoi, 0); |
375 | pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) |
376 | | KVM_MSR_ENABLED; |
		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
378 | } |
379 | |
380 | if (has_steal_clock) |
381 | kvm_register_steal_time(); |
382 | } |
383 | |
384 | static void kvm_pv_disable_apf(void) |
385 | { |
386 | if (!__this_cpu_read(apf_reason.enabled)) |
387 | return; |
388 | |
	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
390 | __this_cpu_write(apf_reason.enabled, 0); |
391 | |
	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
393 | } |
394 | |
395 | static void kvm_disable_steal_time(void) |
396 | { |
397 | if (!has_steal_clock) |
398 | return; |
399 | |
400 | wrmsr(MSR_KVM_STEAL_TIME, 0, 0); |
401 | } |
402 | |
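/*
 * Read the steal time the host publishes for @cpu.  An odd ->version or a
 * version that changes across the reads means the host was updating the
 * record concurrently, so retry until a consistent snapshot is seen.
 */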
403 | static u64 kvm_steal_clock(int cpu) |
404 | { |
405 | u64 steal; |
406 | struct kvm_steal_time *src; |
407 | int version; |
408 | |
409 | src = &per_cpu(steal_time, cpu); |
410 | do { |
411 | version = src->version; |
412 | virt_rmb(); |
413 | steal = src->steal; |
414 | virt_rmb(); |
415 | } while ((version & 1) || (version != src->version)); |
416 | |
417 | return steal; |
418 | } |
419 | |
420 | static inline void __set_percpu_decrypted(void *ptr, unsigned long size) |
421 | { |
	early_set_memory_decrypted((unsigned long) ptr, size);
423 | } |
424 | |
425 | /* |
 * Iterate through all possible CPUs and map the memory regions pointed
 * to by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
428 | * |
429 | * Note: we iterate through all possible CPUs to ensure that CPUs |
 * hotplugged later will have their per-cpu variables already mapped as
431 | * decrypted. |
432 | */ |
433 | static void __init sev_map_percpu_data(void) |
434 | { |
435 | int cpu; |
436 | |
	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
438 | return; |
439 | |
440 | for_each_possible_cpu(cpu) { |
		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
444 | } |
445 | } |
446 | |
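/*
 * Tear down every paravirt feature that shares memory with the host on
 * this CPU (steal time, PV EOI, migration control, async #PF, kvmclock).
 * On a regular offline (not a shutdown/kexec) also wake any async #PF
 * waiters still queued on this CPU.
 */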
447 | static void kvm_guest_cpu_offline(bool shutdown) |
448 | { |
449 | kvm_disable_steal_time(); |
450 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
		wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
454 | kvm_pv_disable_apf(); |
455 | if (!shutdown) |
456 | apf_task_wake_all(); |
457 | kvmclock_disable(); |
458 | } |
459 | |
460 | static int kvm_cpu_online(unsigned int cpu) |
461 | { |
462 | unsigned long flags; |
463 | |
464 | local_irq_save(flags); |
465 | kvm_guest_cpu_init(); |
466 | local_irq_restore(flags); |
467 | return 0; |
468 | } |
469 | |
470 | #ifdef CONFIG_SMP |
471 | |
472 | static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); |
473 | |
474 | static bool pv_tlb_flush_supported(void) |
475 | { |
476 | return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && |
477 | !kvm_para_has_hint(KVM_HINTS_REALTIME) && |
478 | kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && |
479 | !boot_cpu_has(X86_FEATURE_MWAIT) && |
480 | (num_possible_cpus() != 1)); |
481 | } |
482 | |
483 | static bool pv_ipi_supported(void) |
484 | { |
485 | return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) && |
486 | (num_possible_cpus() != 1)); |
487 | } |
488 | |
489 | static bool pv_sched_yield_supported(void) |
490 | { |
491 | return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && |
492 | !kvm_para_has_hint(KVM_HINTS_REALTIME) && |
493 | kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && |
494 | !boot_cpu_has(X86_FEATURE_MWAIT) && |
495 | (num_possible_cpus() != 1)); |
496 | } |
497 | |
498 | #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) |
499 | |
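/*
 * Send @vector to all CPUs in @mask with the KVM_HC_SEND_IPI hypercall.
 * Destination APIC IDs are gathered into a bitmap that covers a window of
 * KVM_IPI_CLUSTER_SIZE IDs starting at 'min'; whenever the next APIC ID
 * does not fit into the current window, the pending batch is flushed with
 * a hypercall and a new window is started.
 */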
500 | static void __send_ipi_mask(const struct cpumask *mask, int vector) |
501 | { |
502 | unsigned long flags; |
503 | int cpu, min = 0, max = 0; |
504 | #ifdef CONFIG_X86_64 |
505 | __uint128_t ipi_bitmap = 0; |
506 | #else |
507 | u64 ipi_bitmap = 0; |
508 | #endif |
509 | u32 apic_id, icr; |
510 | long ret; |
511 | |
	if (cpumask_empty(mask))
513 | return; |
514 | |
515 | local_irq_save(flags); |
516 | |
517 | switch (vector) { |
518 | default: |
519 | icr = APIC_DM_FIXED | vector; |
520 | break; |
521 | case NMI_VECTOR: |
522 | icr = APIC_DM_NMI; |
523 | break; |
524 | } |
525 | |
526 | for_each_cpu(cpu, mask) { |
527 | apic_id = per_cpu(x86_cpu_to_apicid, cpu); |
528 | if (!ipi_bitmap) { |
529 | min = max = apic_id; |
530 | } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { |
531 | ipi_bitmap <<= min - apic_id; |
532 | min = apic_id; |
533 | } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { |
534 | max = apic_id < max ? max : apic_id; |
535 | } else { |
			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
			WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
				  ret);
540 | min = max = apic_id; |
541 | ipi_bitmap = 0; |
542 | } |
543 | __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); |
544 | } |
545 | |
546 | if (ipi_bitmap) { |
		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
		WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
			  ret);
551 | } |
552 | |
553 | local_irq_restore(flags); |
554 | } |
555 | |
556 | static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) |
557 | { |
558 | __send_ipi_mask(mask, vector); |
559 | } |
560 | |
561 | static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) |
562 | { |
563 | unsigned int this_cpu = smp_processor_id(); |
564 | struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); |
565 | const struct cpumask *local_mask; |
566 | |
	cpumask_copy(new_mask, mask);
	cpumask_clear_cpu(this_cpu, new_mask);
	local_mask = new_mask;
	__send_ipi_mask(local_mask, vector);
571 | } |
572 | |
573 | static int __init setup_efi_kvm_sev_migration(void) |
574 | { |
	efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
576 | efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; |
577 | efi_status_t status; |
578 | unsigned long size; |
579 | bool enabled; |
580 | |
	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
582 | !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) |
583 | return 0; |
584 | |
585 | if (!efi_enabled(EFI_BOOT)) |
586 | return 0; |
587 | |
588 | if (!efi_enabled(EFI_RUNTIME_SERVICES)) { |
		pr_info("%s : EFI runtime services are not enabled\n", __func__);
590 | return 0; |
591 | } |
592 | |
593 | size = sizeof(enabled); |
594 | |
595 | /* Get variable contents into buffer */ |
596 | status = efi.get_variable(efi_sev_live_migration_enabled, |
597 | &efi_variable_guid, NULL, &size, &enabled); |
598 | |
	if (status == EFI_NOT_FOUND) {
		pr_info("%s : EFI live migration variable not found\n", __func__);
		return 0;
	}

	if (status != EFI_SUCCESS) {
		pr_info("%s : EFI variable retrieval failed\n", __func__);
		return 0;
	}

	if (enabled == 0) {
		pr_info("%s: live migration disabled in EFI\n", __func__);
		return 0;
	}

	pr_info("%s : live migration enabled in EFI\n", __func__);
615 | wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); |
616 | |
617 | return 1; |
618 | } |
619 | |
620 | late_initcall(setup_efi_kvm_sev_migration); |
621 | |
622 | /* |
623 | * Set the IPI entry points |
624 | */ |
625 | static __init void kvm_setup_pv_ipi(void) |
626 | { |
627 | apic_update_callback(send_IPI_mask, kvm_send_ipi_mask); |
628 | apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself); |
	pr_info("setup PV IPIs\n");
630 | } |
631 | |
632 | static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) |
633 | { |
634 | int cpu; |
635 | |
636 | native_send_call_func_ipi(mask); |
637 | |
638 | /* Make sure other vCPUs get a chance to run if they need to. */ |
639 | for_each_cpu(cpu, mask) { |
640 | if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) { |
641 | kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); |
642 | break; |
643 | } |
644 | } |
645 | } |
646 | |
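/*
 * Paravirt remote TLB flush: vCPUs that the host has marked as preempted
 * are not sent an IPI.  Instead, KVM_VCPU_FLUSH_TLB is set in their
 * steal_time record so the host performs the flush when the vCPU next
 * enters the guest.
 */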
647 | static void kvm_flush_tlb_multi(const struct cpumask *cpumask, |
648 | const struct flush_tlb_info *info) |
649 | { |
650 | u8 state; |
651 | int cpu; |
652 | struct kvm_steal_time *src; |
653 | struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); |
654 | |
	cpumask_copy(flushmask, cpumask);
	/*
	 * We have to call flush only on online vCPUs. And
	 * queue flush_on_enter for preempted vCPUs.
	 */
660 | for_each_cpu(cpu, flushmask) { |
661 | /* |
662 | * The local vCPU is never preempted, so we do not explicitly |
		 * skip the check for the local vCPU - it will never be cleared from
664 | * flushmask. |
665 | */ |
666 | src = &per_cpu(steal_time, cpu); |
667 | state = READ_ONCE(src->preempted); |
668 | if ((state & KVM_VCPU_PREEMPTED)) { |
669 | if (try_cmpxchg(&src->preempted, &state, |
670 | state | KVM_VCPU_FLUSH_TLB)) |
				__cpumask_clear_cpu(cpu, flushmask);
672 | } |
673 | } |
674 | |
	native_flush_tlb_multi(flushmask, info);
676 | } |
677 | |
678 | static __init int kvm_alloc_cpumask(void) |
679 | { |
680 | int cpu; |
681 | |
682 | if (!kvm_para_available() || nopv) |
683 | return 0; |
684 | |
685 | if (pv_tlb_flush_supported() || pv_ipi_supported()) |
686 | for_each_possible_cpu(cpu) { |
687 | zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), |
688 | GFP_KERNEL, cpu_to_node(cpu)); |
689 | } |
690 | |
691 | return 0; |
692 | } |
693 | arch_initcall(kvm_alloc_cpumask); |
694 | |
695 | static void __init kvm_smp_prepare_boot_cpu(void) |
696 | { |
697 | /* |
698 | * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() |
699 | * shares the guest physical address with the hypervisor. |
700 | */ |
701 | sev_map_percpu_data(); |
702 | |
703 | kvm_guest_cpu_init(); |
704 | native_smp_prepare_boot_cpu(); |
705 | kvm_spinlock_init(); |
706 | } |
707 | |
708 | static int kvm_cpu_down_prepare(unsigned int cpu) |
709 | { |
710 | unsigned long flags; |
711 | |
712 | local_irq_save(flags); |
	kvm_guest_cpu_offline(false);
714 | local_irq_restore(flags); |
715 | return 0; |
716 | } |
717 | |
718 | #endif |
719 | |
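/*
 * syscore suspend callback: tear down the paravirt features shared with
 * the host and remember whether guest halt polling was active (bit 0 of
 * MSR_KVM_POLL_CONTROL clear) so that kvm_resume() can restore that state.
 */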
720 | static int kvm_suspend(void) |
721 | { |
722 | u64 val = 0; |
723 | |
	kvm_guest_cpu_offline(false);
725 | |
726 | #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL |
727 | if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) |
728 | rdmsrl(MSR_KVM_POLL_CONTROL, val); |
729 | has_guest_poll = !(val & 1); |
730 | #endif |
731 | return 0; |
732 | } |
733 | |
734 | static void kvm_resume(void) |
735 | { |
736 | kvm_cpu_online(raw_smp_processor_id()); |
737 | |
738 | #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL |
739 | if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) |
		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
741 | #endif |
742 | } |
743 | |
744 | static struct syscore_ops kvm_syscore_ops = { |
745 | .suspend = kvm_suspend, |
746 | .resume = kvm_resume, |
747 | }; |
748 | |
749 | static void kvm_pv_guest_cpu_reboot(void *unused) |
750 | { |
	kvm_guest_cpu_offline(true);
752 | } |
753 | |
754 | static int kvm_pv_reboot_notify(struct notifier_block *nb, |
755 | unsigned long code, void *unused) |
756 | { |
757 | if (code == SYS_RESTART) |
		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
759 | return NOTIFY_DONE; |
760 | } |
761 | |
762 | static struct notifier_block kvm_pv_reboot_nb = { |
763 | .notifier_call = kvm_pv_reboot_notify, |
764 | }; |
765 | |
766 | /* |
767 | * After a PV feature is registered, the host will keep writing to the |
768 | * registered memory location. If the guest happens to shutdown, this memory |
 * won't be valid. In cases like kexec, in which you install a new kernel,
 * this means the host would keep writing to a random memory location.
771 | */ |
772 | #ifdef CONFIG_KEXEC_CORE |
773 | static void kvm_crash_shutdown(struct pt_regs *regs) |
774 | { |
	kvm_guest_cpu_offline(true);
776 | native_machine_crash_shutdown(regs); |
777 | } |
778 | #endif |
779 | |
780 | #if defined(CONFIG_X86_32) || !defined(CONFIG_SMP) |
781 | bool __kvm_vcpu_is_preempted(long cpu); |
782 | |
783 | __visible bool __kvm_vcpu_is_preempted(long cpu) |
784 | { |
785 | struct kvm_steal_time *src = &per_cpu(steal_time, cpu); |
786 | |
787 | return !!(src->preempted & KVM_VCPU_PREEMPTED); |
788 | } |
789 | PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); |
790 | |
791 | #else |
792 | |
793 | #include <asm/asm-offsets.h> |
794 | |
795 | extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); |
796 | |
797 | /* |
798 | * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and |
799 | * restoring to/from the stack. |
800 | */ |
801 | #define PV_VCPU_PREEMPTED_ASM \ |
802 | "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \ |
803 | "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ |
804 | "setne %al\n\t" |
805 | |
806 | DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, |
807 | PV_VCPU_PREEMPTED_ASM, .text); |
808 | #endif |
809 | |
810 | static void __init kvm_guest_init(void) |
811 | { |
812 | int i; |
813 | |
814 | paravirt_ops_setup(); |
815 | register_reboot_notifier(&kvm_pv_reboot_nb); |
816 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) |
817 | raw_spin_lock_init(&async_pf_sleepers[i].lock); |
818 | |
819 | if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { |
820 | has_steal_clock = 1; |
821 | static_call_update(pv_steal_clock, kvm_steal_clock); |
822 | |
823 | pv_ops.lock.vcpu_is_preempted = |
824 | PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); |
825 | } |
826 | |
827 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
828 | apic_update_callback(eoi, kvm_guest_apic_eoi_write); |
829 | |
830 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { |
831 | static_branch_enable(&kvm_async_pf_enabled); |
		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
833 | } |
834 | |
835 | #ifdef CONFIG_SMP |
836 | if (pv_tlb_flush_supported()) { |
837 | pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; |
838 | pv_ops.mmu.tlb_remove_table = tlb_remove_table; |
		pr_info("KVM setup pv remote TLB flush\n");
840 | } |
841 | |
842 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
843 | if (pv_sched_yield_supported()) { |
844 | smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; |
		pr_info("setup PV sched yield\n");
846 | } |
	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
		pr_err("failed to install cpu hotplug callbacks\n");
850 | #else |
851 | sev_map_percpu_data(); |
852 | kvm_guest_cpu_init(); |
853 | #endif |
854 | |
855 | #ifdef CONFIG_KEXEC_CORE |
856 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
857 | #endif |
858 | |
	register_syscore_ops(&kvm_syscore_ops);
860 | |
861 | /* |
862 | * Hard lockup detection is enabled by default. Disable it, as guests |
863 | * can get false positives too easily, for example if the host is |
864 | * overcommitted. |
865 | */ |
866 | hardlockup_detector_disable(); |
867 | } |
868 | |
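/*
 * Locate the base of KVM's CPUID signature leaves.  Returns 0 on ancient
 * CPUs without CPUID support or when no hypervisor is advertised; the
 * result is cached by kvm_cpuid_base() below.
 */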
869 | static noinline uint32_t __kvm_cpuid_base(void) |
870 | { |
871 | if (boot_cpu_data.cpuid_level < 0) |
872 | return 0; /* So we don't blow up on old processors */ |
873 | |
874 | if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) |
		return hypervisor_cpuid_base(KVM_SIGNATURE, 0);
876 | |
877 | return 0; |
878 | } |
879 | |
880 | static inline uint32_t kvm_cpuid_base(void) |
881 | { |
882 | static int kvm_cpuid_base = -1; |
883 | |
884 | if (kvm_cpuid_base == -1) |
885 | kvm_cpuid_base = __kvm_cpuid_base(); |
886 | |
887 | return kvm_cpuid_base; |
888 | } |
889 | |
890 | bool kvm_para_available(void) |
891 | { |
892 | return kvm_cpuid_base() != 0; |
893 | } |
894 | EXPORT_SYMBOL_GPL(kvm_para_available); |
895 | |
896 | unsigned int kvm_arch_para_features(void) |
897 | { |
	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
899 | } |
900 | |
901 | unsigned int kvm_arch_para_hints(void) |
902 | { |
	return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
904 | } |
905 | EXPORT_SYMBOL_GPL(kvm_arch_para_hints); |
906 | |
907 | static uint32_t __init kvm_detect(void) |
908 | { |
909 | return kvm_cpuid_base(); |
910 | } |
911 | |
912 | static void __init kvm_apic_init(void) |
913 | { |
914 | #ifdef CONFIG_SMP |
915 | if (pv_ipi_supported()) |
916 | kvm_setup_pv_ipi(); |
917 | #endif |
918 | } |
919 | |
920 | static bool __init kvm_msi_ext_dest_id(void) |
921 | { |
922 | return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); |
923 | } |
924 | |
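/*
 * notify_page_enc_status_changed() callback: tell the host via
 * KVM_HC_MAP_GPA_RANGE that @npages 4K pages starting at @pfn changed
 * their encryption status, so the shared-pages tracking used for SEV
 * live migration stays accurate.
 */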
925 | static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) |
926 | { |
	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
			   KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
929 | } |
930 | |
931 | static void __init kvm_init_platform(void) |
932 | { |
	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
934 | kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { |
935 | unsigned long nr_pages; |
936 | int i; |
937 | |
938 | pv_ops.mmu.notify_page_enc_status_changed = |
939 | kvm_sev_hc_page_enc_status; |
940 | |
941 | /* |
942 | * Reset the host's shared pages list related to kernel |
943 | * specific page encryption status settings before we load a |
944 | * new kernel by kexec. Reset the page encryption status |
		 * during early boot instead of just before kexec to avoid SMP
946 | * races during kvm_pv_guest_cpu_reboot(). |
947 | * NOTE: We cannot reset the complete shared pages list |
948 | * here as we need to retain the UEFI/OVMF firmware |
949 | * specific settings. |
950 | */ |
951 | |
952 | for (i = 0; i < e820_table->nr_entries; i++) { |
953 | struct e820_entry *entry = &e820_table->entries[i]; |
954 | |
955 | if (entry->type != E820_TYPE_RAM) |
956 | continue; |
957 | |
958 | nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); |
959 | |
			kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
					   nr_pages,
					   KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
963 | } |
964 | |
965 | /* |
966 | * Ensure that _bss_decrypted section is marked as decrypted in the |
967 | * shared pages list. |
968 | */ |
		early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
						__end_bss_decrypted - __start_bss_decrypted, 0);
971 | |
972 | /* |
973 | * If not booted using EFI, enable Live migration support. |
974 | */ |
975 | if (!efi_enabled(EFI_BOOT)) |
976 | wrmsrl(MSR_KVM_MIGRATION_CONTROL, |
977 | KVM_MIGRATION_READY); |
978 | } |
979 | kvmclock_init(); |
980 | x86_platform.apic_post_init = kvm_apic_init; |
981 | } |
982 | |
983 | #if defined(CONFIG_AMD_MEM_ENCRYPT) |
984 | static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) |
985 | { |
986 | /* RAX and CPL are already in the GHCB */ |
	ghcb_set_rbx(ghcb, regs->bx);
	ghcb_set_rcx(ghcb, regs->cx);
	ghcb_set_rdx(ghcb, regs->dx);
	ghcb_set_rsi(ghcb, regs->si);
991 | } |
992 | |
993 | static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) |
994 | { |
995 | /* No checking of the return state needed */ |
996 | return true; |
997 | } |
998 | #endif |
999 | |
1000 | const __initconst struct hypervisor_x86 x86_hyper_kvm = { |
	.name = "KVM",
1002 | .detect = kvm_detect, |
1003 | .type = X86_HYPER_KVM, |
1004 | .init.guest_late_init = kvm_guest_init, |
1005 | .init.x2apic_available = kvm_para_available, |
1006 | .init.msi_ext_dest_id = kvm_msi_ext_dest_id, |
1007 | .init.init_platform = kvm_init_platform, |
1008 | #if defined(CONFIG_AMD_MEM_ENCRYPT) |
1009 | .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, |
1010 | .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, |
1011 | #endif |
1012 | }; |
1013 | |
1014 | static __init int activate_jump_labels(void) |
1015 | { |
1016 | if (has_steal_clock) { |
		static_key_slow_inc(&paravirt_steal_enabled);
		if (steal_acc)
			static_key_slow_inc(&paravirt_steal_rq_enabled);
1020 | } |
1021 | |
1022 | return 0; |
1023 | } |
1024 | arch_initcall(activate_jump_labels); |
1025 | |
1026 | #ifdef CONFIG_PARAVIRT_SPINLOCKS |
1027 | |
1028 | /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ |
1029 | static void kvm_kick_cpu(int cpu) |
1030 | { |
1031 | unsigned long flags = 0; |
1032 | u32 apicid; |
1033 | |
1034 | apicid = per_cpu(x86_cpu_to_apicid, cpu); |
	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
1036 | } |
1037 | |
1038 | #include <asm/qspinlock.h> |
1039 | |
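/*
 * Wait callback for the PV qspinlock slow path: halt this vCPU while the
 * byte at @ptr still holds @val and rely on a kvm_kick_cpu() from another
 * vCPU (or any interrupt) to wake it up.  Never halt in NMI context.
 */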
1040 | static void kvm_wait(u8 *ptr, u8 val) |
1041 | { |
1042 | if (in_nmi()) |
1043 | return; |
1044 | |
	/*
	 * Halt until it's our turn and we get kicked. Note that we do a safe
	 * halt for the irq enabled case to avoid a hang when the lock info is
	 * overwritten in the irq spinlock slowpath and no spurious interrupt
	 * occurs to save us.
	 */
1050 | if (irqs_disabled()) { |
1051 | if (READ_ONCE(*ptr) == val) |
1052 | halt(); |
1053 | } else { |
1054 | local_irq_disable(); |
1055 | |
1056 | /* safe_halt() will enable IRQ */ |
1057 | if (READ_ONCE(*ptr) == val) |
1058 | safe_halt(); |
1059 | else |
1060 | local_irq_enable(); |
1061 | } |
1062 | } |
1063 | |
1064 | /* |
1065 | * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. |
1066 | */ |
1067 | void __init kvm_spinlock_init(void) |
1068 | { |
1069 | /* |
1070 | * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an |
1071 | * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is |
1072 | * preferred over native qspinlock when vCPU is preempted. |
1073 | */ |
1074 | if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { |
		pr_info("PV spinlocks disabled, no host support\n");
1076 | return; |
1077 | } |
1078 | |
1079 | /* |
1080 | * Disable PV spinlocks and use native qspinlock when dedicated pCPUs |
1081 | * are available. |
1082 | */ |
1083 | if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { |
		pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
1085 | goto out; |
1086 | } |
1087 | |
1088 | if (num_possible_cpus() == 1) { |
		pr_info("PV spinlocks disabled, single CPU\n");
1090 | goto out; |
1091 | } |
1092 | |
1093 | if (nopvspin) { |
		pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
1095 | goto out; |
1096 | } |
1097 | |
	pr_info("PV spinlocks enabled\n");
1099 | |
1100 | __pv_init_lock_hash(); |
1101 | pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; |
1102 | pv_ops.lock.queued_spin_unlock = |
1103 | PV_CALLEE_SAVE(__pv_queued_spin_unlock); |
1104 | pv_ops.lock.wait = kvm_wait; |
1105 | pv_ops.lock.kick = kvm_kick_cpu; |
1106 | |
	/*
	 * When PV spinlocks are enabled, they are preferred over
	 * virt_spin_lock(), so virt_spin_lock_key's value is meaningless.
	 * Just disable it anyway.
	 */
1112 | out: |
1113 | static_branch_disable(&virt_spin_lock_key); |
1114 | } |
1115 | |
1116 | #endif /* CONFIG_PARAVIRT_SPINLOCKS */ |
1117 | |
1118 | #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL |
1119 | |
1120 | static void kvm_disable_host_haltpoll(void *i) |
1121 | { |
	wrmsrl(MSR_KVM_POLL_CONTROL, 0);
1123 | } |
1124 | |
1125 | static void kvm_enable_host_haltpoll(void *i) |
1126 | { |
	wrmsrl(MSR_KVM_POLL_CONTROL, 1);
1128 | } |
1129 | |
1130 | void arch_haltpoll_enable(unsigned int cpu) |
1131 | { |
1132 | if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { |
		pr_err_once("host does not support poll control\n");
		pr_err_once("host upgrade recommended\n");
1135 | return; |
1136 | } |
1137 | |
	/* Enabling guest halt poll disables host-side halt poll */
	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
1140 | } |
1141 | EXPORT_SYMBOL_GPL(arch_haltpoll_enable); |
1142 | |
1143 | void arch_haltpoll_disable(unsigned int cpu) |
1144 | { |
1145 | if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) |
1146 | return; |
1147 | |
	/* Disabling guest halt poll re-enables host-side halt poll */
	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
1150 | } |
1151 | EXPORT_SYMBOL_GPL(arch_haltpoll_disable); |
1152 | #endif |
1153 | |