1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * KVM paravirt_ops implementation |
4 | * |
5 | * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
6 | * Copyright IBM Corporation, 2007 |
7 | * Authors: Anthony Liguori <aliguori@us.ibm.com> |
8 | */ |
9 | |
10 | #define pr_fmt(fmt) "kvm-guest: " fmt |
11 | |
12 | #include <linux/context_tracking.h> |
13 | #include <linux/init.h> |
14 | #include <linux/irq.h> |
15 | #include <linux/kernel.h> |
16 | #include <linux/kvm_para.h> |
17 | #include <linux/cpu.h> |
18 | #include <linux/mm.h> |
19 | #include <linux/highmem.h> |
20 | #include <linux/hardirq.h> |
21 | #include <linux/notifier.h> |
22 | #include <linux/reboot.h> |
23 | #include <linux/hash.h> |
24 | #include <linux/sched.h> |
25 | #include <linux/slab.h> |
26 | #include <linux/kprobes.h> |
27 | #include <linux/nmi.h> |
28 | #include <linux/swait.h> |
29 | #include <linux/syscore_ops.h> |
30 | #include <linux/cc_platform.h> |
31 | #include <linux/efi.h> |
32 | #include <asm/timer.h> |
33 | #include <asm/cpu.h> |
34 | #include <asm/traps.h> |
35 | #include <asm/desc.h> |
36 | #include <asm/tlbflush.h> |
37 | #include <asm/apic.h> |
38 | #include <asm/apicdef.h> |
39 | #include <asm/hypervisor.h> |
40 | #include <asm/tlb.h> |
41 | #include <asm/cpuidle_haltpoll.h> |
42 | #include <asm/ptrace.h> |
43 | #include <asm/reboot.h> |
44 | #include <asm/svm.h> |
45 | #include <asm/e820/api.h> |
46 | |
47 | DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); |
48 | |
49 | static int kvmapf = 1; |
50 | |
51 | static int __init parse_no_kvmapf(char *arg) |
52 | { |
53 | kvmapf = 0; |
54 | return 0; |
55 | } |
56 | |
early_param("no-kvmapf", parse_no_kvmapf);
58 | |
59 | static int steal_acc = 1; |
60 | static int __init parse_no_stealacc(char *arg) |
61 | { |
62 | steal_acc = 0; |
63 | return 0; |
64 | } |
65 | |
early_param("no-steal-acc", parse_no_stealacc);
67 | |
68 | static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); |
69 | DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; |
70 | static int has_steal_clock = 0; |
71 | |
72 | static int has_guest_poll = 0; |
73 | /* |
74 | * No need for any "IO delay" on KVM |
75 | */ |
76 | static void kvm_io_delay(void) |
77 | { |
78 | } |
79 | |
80 | #define KVM_TASK_SLEEP_HASHBITS 8 |
81 | #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) |
82 | |
83 | struct kvm_task_sleep_node { |
84 | struct hlist_node link; |
85 | struct swait_queue_head wq; |
86 | u32 token; |
87 | int cpu; |
88 | }; |
89 | |
90 | static struct kvm_task_sleep_head { |
91 | raw_spinlock_t lock; |
92 | struct hlist_head list; |
93 | } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; |
94 | |
95 | static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, |
96 | u32 token) |
97 | { |
98 | struct hlist_node *p; |
99 | |
100 | hlist_for_each(p, &b->list) { |
101 | struct kvm_task_sleep_node *n = |
102 | hlist_entry(p, typeof(*n), link); |
103 | if (n->token == token) |
104 | return n; |
105 | } |
106 | |
107 | return NULL; |
108 | } |
109 | |
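/*
 * Queue @n on the hash bucket for @token so the faulting task can sleep
 * until the host reports the page as present.  Returns false if a wakeup
 * for @token already arrived (a dummy node is found), in which case the
 * caller does not need to wait at all.
 */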
110 | static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) |
111 | { |
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
113 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; |
114 | struct kvm_task_sleep_node *e; |
115 | |
116 | raw_spin_lock(&b->lock); |
117 | e = _find_apf_task(b, token); |
118 | if (e) { |
		/* dummy entry exists -> wake up was delivered ahead of PF */
		hlist_del(&e->link);
		raw_spin_unlock(&b->lock);
		kfree(e);
123 | return false; |
124 | } |
125 | |
126 | n->token = token; |
127 | n->cpu = smp_processor_id(); |
128 | init_swait_queue_head(&n->wq); |
	hlist_add_head(&n->link, &b->list);
130 | raw_spin_unlock(&b->lock); |
131 | return true; |
132 | } |
133 | |
134 | /* |
135 | * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled |
136 | * @token: Token to identify the sleep node entry |
137 | * |
138 | * Invoked from the async pagefault handling code or from the VM exit page |
139 | * fault handler. In both cases RCU is watching. |
140 | */ |
141 | void kvm_async_pf_task_wait_schedule(u32 token) |
142 | { |
143 | struct kvm_task_sleep_node n; |
144 | DECLARE_SWAITQUEUE(wait); |
145 | |
146 | lockdep_assert_irqs_disabled(); |
147 | |
	if (!kvm_async_pf_queue_task(token, &n))
149 | return; |
150 | |
151 | for (;;) { |
		prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
154 | break; |
155 | |
156 | local_irq_enable(); |
157 | schedule(); |
158 | local_irq_disable(); |
159 | } |
	finish_swait(&n.wq, &wait);
161 | } |
162 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); |
163 | |
164 | static void apf_task_wake_one(struct kvm_task_sleep_node *n) |
165 | { |
	hlist_del_init(&n->link);
	if (swq_has_sleeper(&n->wq))
		swake_up_one(&n->wq);
169 | } |
170 | |
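/*
 * Wake every async #PF waiter still queued on the current CPU.  Used when
 * a broadcast wakeup (token == ~0) is received and when the CPU is taken
 * offline.
 */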
171 | static void apf_task_wake_all(void) |
172 | { |
173 | int i; |
174 | |
175 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { |
176 | struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; |
177 | struct kvm_task_sleep_node *n; |
178 | struct hlist_node *p, *next; |
179 | |
180 | raw_spin_lock(&b->lock); |
181 | hlist_for_each_safe(p, next, &b->list) { |
182 | n = hlist_entry(p, typeof(*n), link); |
183 | if (n->cpu == smp_processor_id()) |
184 | apf_task_wake_one(n); |
185 | } |
186 | raw_spin_unlock(&b->lock); |
187 | } |
188 | } |
189 | |
190 | void kvm_async_pf_task_wake(u32 token) |
191 | { |
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
193 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; |
194 | struct kvm_task_sleep_node *n, *dummy = NULL; |
195 | |
196 | if (token == ~0) { |
197 | apf_task_wake_all(); |
198 | return; |
199 | } |
200 | |
201 | again: |
202 | raw_spin_lock(&b->lock); |
203 | n = _find_apf_task(b, token); |
204 | if (!n) { |
		/*
		 * Async #PF not yet handled, add a dummy entry for the token.
		 * Allocating the token must be done outside of the raw lock
		 * as the allocator is preemptible on PREEMPT_RT kernels.
		 */
		if (!dummy) {
			raw_spin_unlock(&b->lock);
			dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
213 | |
214 | /* |
215 | * Continue looping on allocation failure, eventually |
216 | * the async #PF will be handled and allocating a new |
217 | * node will be unnecessary. |
218 | */ |
219 | if (!dummy) |
220 | cpu_relax(); |
221 | |
222 | /* |
223 | * Recheck for async #PF completion before enqueueing |
224 | * the dummy token to avoid duplicate list entries. |
225 | */ |
226 | goto again; |
227 | } |
228 | dummy->token = token; |
229 | dummy->cpu = smp_processor_id(); |
230 | init_swait_queue_head(&dummy->wq); |
		hlist_add_head(&dummy->link, &b->list);
232 | dummy = NULL; |
233 | } else { |
234 | apf_task_wake_one(n); |
235 | } |
236 | raw_spin_unlock(&b->lock); |
237 | |
238 | /* A dummy token might be allocated and ultimately not used. */ |
	kfree(dummy);
240 | } |
241 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); |
242 | |
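/*
 * Read and clear the async #PF flags that the host wrote into this CPU's
 * shared apf_reason area.  Returns 0 if async #PF is not enabled on this
 * CPU.
 */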
243 | noinstr u32 kvm_read_and_reset_apf_flags(void) |
244 | { |
245 | u32 flags = 0; |
246 | |
247 | if (__this_cpu_read(apf_reason.enabled)) { |
248 | flags = __this_cpu_read(apf_reason.flags); |
249 | __this_cpu_write(apf_reason.flags, 0); |
250 | } |
251 | |
252 | return flags; |
253 | } |
254 | EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); |
255 | |
256 | noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) |
257 | { |
258 | u32 flags = kvm_read_and_reset_apf_flags(); |
259 | irqentry_state_t state; |
260 | |
261 | if (!flags) |
262 | return false; |
263 | |
264 | state = irqentry_enter(regs); |
265 | instrumentation_begin(); |
266 | |
267 | /* |
268 | * If the host managed to inject an async #PF into an interrupt |
269 | * disabled region, then die hard as this is not going to end well |
270 | * and the host side is seriously broken. |
271 | */ |
272 | if (unlikely(!(regs->flags & X86_EFLAGS_IF))) |
		panic("Host injected async #PF in interrupt disabled region\n");
274 | |
275 | if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { |
276 | if (unlikely(!(user_mode(regs)))) |
			panic("Host injected async #PF in kernel mode\n");
278 | /* Page is swapped out by the host. */ |
279 | kvm_async_pf_task_wait_schedule(token); |
280 | } else { |
		WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
282 | } |
283 | |
284 | instrumentation_end(); |
285 | irqentry_exit(regs, state); |
286 | return true; |
287 | } |
288 | |
289 | DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) |
290 | { |
291 | struct pt_regs *old_regs = set_irq_regs(regs); |
292 | u32 token; |
293 | |
294 | apic_eoi(); |
295 | |
296 | inc_irq_stat(irq_hv_callback_count); |
297 | |
298 | if (__this_cpu_read(apf_reason.enabled)) { |
299 | token = __this_cpu_read(apf_reason.token); |
300 | kvm_async_pf_task_wake(token); |
301 | __this_cpu_write(apf_reason.token, 0); |
		wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
303 | } |
304 | |
305 | set_irq_regs(old_regs); |
306 | } |
307 | |
308 | static void __init paravirt_ops_setup(void) |
309 | { |
	pv_info.name = "KVM";
311 | |
312 | if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) |
313 | pv_ops.cpu.io_delay = kvm_io_delay; |
314 | |
315 | #ifdef CONFIG_X86_IO_APIC |
316 | no_timer_check = 1; |
317 | #endif |
318 | } |
319 | |
320 | static void kvm_register_steal_time(void) |
321 | { |
322 | int cpu = smp_processor_id(); |
323 | struct kvm_steal_time *st = &per_cpu(steal_time, cpu); |
324 | |
325 | if (!has_steal_clock) |
326 | return; |
327 | |
	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
	pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
		(unsigned long long) slow_virt_to_phys(st));
331 | } |
332 | |
333 | static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; |
334 | |
335 | static notrace __maybe_unused void kvm_guest_apic_eoi_write(void) |
336 | { |
337 | /** |
338 | * This relies on __test_and_clear_bit to modify the memory |
339 | * in a way that is atomic with respect to the local CPU. |
340 | * The hypervisor only accesses this memory from the local CPU so |
341 | * there's no need for lock or memory barriers. |
342 | * An optimization barrier is implied in apic write. |
343 | */ |
344 | if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) |
345 | return; |
346 | apic_native_eoi(); |
347 | } |
348 | |
349 | static void kvm_guest_cpu_init(void) |
350 | { |
351 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { |
352 | u64 pa; |
353 | |
354 | WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); |
355 | |
356 | pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); |
357 | pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; |
358 | |
359 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) |
360 | pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; |
361 | |
362 | wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); |
363 | |
		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
		__this_cpu_write(apf_reason.enabled, 1);
		pr_debug("setup async PF for cpu %d\n", smp_processor_id());
367 | } |
368 | |
369 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { |
370 | unsigned long pa; |
371 | |
372 | /* Size alignment is implied but just to make it explicit. */ |
373 | BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); |
374 | __this_cpu_write(kvm_apic_eoi, 0); |
375 | pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) |
376 | | KVM_MSR_ENABLED; |
		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
378 | } |
379 | |
380 | if (has_steal_clock) |
381 | kvm_register_steal_time(); |
382 | } |
383 | |
384 | static void kvm_pv_disable_apf(void) |
385 | { |
386 | if (!__this_cpu_read(apf_reason.enabled)) |
387 | return; |
388 | |
	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
390 | __this_cpu_write(apf_reason.enabled, 0); |
391 | |
	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
393 | } |
394 | |
395 | static void kvm_disable_steal_time(void) |
396 | { |
397 | if (!has_steal_clock) |
398 | return; |
399 | |
400 | wrmsr(MSR_KVM_STEAL_TIME, 0, 0); |
401 | } |
402 | |
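/*
 * Read the steal time the host publishes for @cpu.  An odd ->version or a
 * version that changes across the reads means the host was updating the
 * record concurrently, so retry until a consistent snapshot is seen.
 */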
403 | static u64 kvm_steal_clock(int cpu) |
404 | { |
405 | u64 steal; |
406 | struct kvm_steal_time *src; |
407 | int version; |
408 | |
409 | src = &per_cpu(steal_time, cpu); |
410 | do { |
411 | version = src->version; |
412 | virt_rmb(); |
413 | steal = src->steal; |
414 | virt_rmb(); |
415 | } while ((version & 1) || (version != src->version)); |
416 | |
417 | return steal; |
418 | } |
419 | |
420 | static inline void __set_percpu_decrypted(void *ptr, unsigned long size) |
421 | { |
	early_set_memory_decrypted((unsigned long) ptr, size);
423 | } |
424 | |
425 | /* |
 * Iterate through all possible CPUs and map the memory regions pointed
 * to by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
428 | * |
429 | * Note: we iterate through all possible CPUs to ensure that CPUs |
 * hotplugged later will have their per-cpu variables already mapped as
431 | * decrypted. |
432 | */ |
433 | static void __init sev_map_percpu_data(void) |
434 | { |
435 | int cpu; |
436 | |
	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
438 | return; |
439 | |
440 | for_each_possible_cpu(cpu) { |
		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
444 | } |
445 | } |
446 | |
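/*
 * Tear down every paravirt feature that shares memory with the host on
 * this CPU (steal time, PV EOI, migration control, async #PF, kvmclock).
 * On a regular offline (not a shutdown/kexec) also wake any async #PF
 * waiters still queued on this CPU.
 */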
447 | static void kvm_guest_cpu_offline(bool shutdown) |
448 | { |
449 | kvm_disable_steal_time(); |
450 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
		wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
454 | kvm_pv_disable_apf(); |
455 | if (!shutdown) |
456 | apf_task_wake_all(); |
457 | kvmclock_disable(); |
458 | } |
459 | |
460 | static int kvm_cpu_online(unsigned int cpu) |
461 | { |
462 | unsigned long flags; |
463 | |
464 | local_irq_save(flags); |
465 | kvm_guest_cpu_init(); |
466 | local_irq_restore(flags); |
467 | return 0; |
468 | } |
469 | |
470 | #ifdef CONFIG_SMP |
471 | |
472 | static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); |
473 | |
474 | static bool pv_tlb_flush_supported(void) |
475 | { |
476 | return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && |
477 | !kvm_para_has_hint(KVM_HINTS_REALTIME) && |
478 | kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && |
479 | !boot_cpu_has(X86_FEATURE_MWAIT) && |
480 | (num_possible_cpus() != 1)); |
481 | } |
482 | |
483 | static bool pv_ipi_supported(void) |
484 | { |
485 | return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) && |
486 | (num_possible_cpus() != 1)); |
487 | } |
488 | |
489 | static bool pv_sched_yield_supported(void) |
490 | { |
491 | return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && |
492 | !kvm_para_has_hint(KVM_HINTS_REALTIME) && |
493 | kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && |
494 | !boot_cpu_has(X86_FEATURE_MWAIT) && |
495 | (num_possible_cpus() != 1)); |
496 | } |
497 | |
498 | #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) |
499 | |
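/*
 * Send @vector to all CPUs in @mask with the KVM_HC_SEND_IPI hypercall.
 * Destination APIC IDs are gathered into a bitmap that covers a window of
 * KVM_IPI_CLUSTER_SIZE IDs starting at 'min'; whenever the next APIC ID
 * does not fit into the current window, the pending batch is flushed with
 * a hypercall and a new window is started.
 */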
500 | static void __send_ipi_mask(const struct cpumask *mask, int vector) |
501 | { |
502 | unsigned long flags; |
503 | int cpu, min = 0, max = 0; |
504 | #ifdef CONFIG_X86_64 |
505 | __uint128_t ipi_bitmap = 0; |
506 | #else |
507 | u64 ipi_bitmap = 0; |
508 | #endif |
509 | u32 apic_id, icr; |
510 | long ret; |
511 | |
	if (cpumask_empty(mask))
513 | return; |
514 | |
515 | local_irq_save(flags); |
516 | |
517 | switch (vector) { |
518 | default: |
519 | icr = APIC_DM_FIXED | vector; |
520 | break; |
521 | case NMI_VECTOR: |
522 | icr = APIC_DM_NMI; |
523 | break; |
524 | } |
525 | |
526 | for_each_cpu(cpu, mask) { |
527 | apic_id = per_cpu(x86_cpu_to_apicid, cpu); |
528 | if (!ipi_bitmap) { |
529 | min = max = apic_id; |
530 | } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { |
531 | ipi_bitmap <<= min - apic_id; |
532 | min = apic_id; |
533 | } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { |
534 | max = apic_id < max ? max : apic_id; |
535 | } else { |
			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
			WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
				  ret);
540 | min = max = apic_id; |
541 | ipi_bitmap = 0; |
542 | } |
543 | __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); |
544 | } |
545 | |
546 | if (ipi_bitmap) { |
		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
		WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
			  ret);
551 | } |
552 | |
553 | local_irq_restore(flags); |
554 | } |
555 | |
556 | static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) |
557 | { |
558 | __send_ipi_mask(mask, vector); |
559 | } |
560 | |
561 | static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) |
562 | { |
563 | unsigned int this_cpu = smp_processor_id(); |
564 | struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); |
565 | const struct cpumask *local_mask; |
566 | |
	cpumask_copy(new_mask, mask);
	cpumask_clear_cpu(this_cpu, new_mask);
	local_mask = new_mask;
	__send_ipi_mask(local_mask, vector);
571 | } |
572 | |
573 | static int __init setup_efi_kvm_sev_migration(void) |
574 | { |
	efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
576 | efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; |
577 | efi_status_t status; |
578 | unsigned long size; |
579 | bool enabled; |
580 | |
	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
582 | !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) |
583 | return 0; |
584 | |
585 | if (!efi_enabled(EFI_BOOT)) |
586 | return 0; |
587 | |
588 | if (!efi_enabled(EFI_RUNTIME_SERVICES)) { |
		pr_info("%s : EFI runtime services are not enabled\n", __func__);
590 | return 0; |
591 | } |
592 | |
593 | size = sizeof(enabled); |
594 | |
595 | /* Get variable contents into buffer */ |
596 | status = efi.get_variable(efi_sev_live_migration_enabled, |
597 | &efi_variable_guid, NULL, &size, &enabled); |
598 | |
	if (status == EFI_NOT_FOUND) {
		pr_info("%s : EFI live migration variable not found\n", __func__);
		return 0;
	}

	if (status != EFI_SUCCESS) {
		pr_info("%s : EFI variable retrieval failed\n", __func__);
		return 0;
	}

	if (enabled == 0) {
		pr_info("%s: live migration disabled in EFI\n", __func__);
		return 0;
	}

	pr_info("%s : live migration enabled in EFI\n", __func__);
615 | wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); |
616 | |
617 | return 1; |
618 | } |
619 | |
620 | late_initcall(setup_efi_kvm_sev_migration); |
621 | |
622 | /* |
623 | * Set the IPI entry points |
624 | */ |
625 | static __init void kvm_setup_pv_ipi(void) |
626 | { |
627 | apic_update_callback(send_IPI_mask, kvm_send_ipi_mask); |
628 | apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself); |
	pr_info("setup PV IPIs\n");
630 | } |
631 | |
632 | static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) |
633 | { |
634 | int cpu; |
635 | |
636 | native_send_call_func_ipi(mask); |
637 | |
638 | /* Make sure other vCPUs get a chance to run if they need to. */ |
639 | for_each_cpu(cpu, mask) { |
640 | if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) { |
641 | kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); |
642 | break; |
643 | } |
644 | } |
645 | } |
646 | |
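/*
 * Paravirt remote TLB flush: vCPUs that the host has marked as preempted
 * are not sent an IPI.  Instead, KVM_VCPU_FLUSH_TLB is set in their
 * steal_time record so the host performs the flush when the vCPU next
 * enters the guest.
 */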
647 | static void kvm_flush_tlb_multi(const struct cpumask *cpumask, |
648 | const struct flush_tlb_info *info) |
649 | { |
650 | u8 state; |
651 | int cpu; |
652 | struct kvm_steal_time *src; |
653 | struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); |
654 | |
	cpumask_copy(flushmask, cpumask);
	/*
	 * We have to call flush only on online vCPUs. And
	 * queue flush_on_enter for preempted vCPUs.
	 */
660 | for_each_cpu(cpu, flushmask) { |
661 | /* |
662 | * The local vCPU is never preempted, so we do not explicitly |
		 * skip the check for the local vCPU - it will never be cleared from
664 | * flushmask. |
665 | */ |
666 | src = &per_cpu(steal_time, cpu); |
667 | state = READ_ONCE(src->preempted); |
668 | if ((state & KVM_VCPU_PREEMPTED)) { |
669 | if (try_cmpxchg(&src->preempted, &state, |
670 | state | KVM_VCPU_FLUSH_TLB)) |
				__cpumask_clear_cpu(cpu, flushmask);
672 | } |
673 | } |
674 | |
	native_flush_tlb_multi(flushmask, info);
676 | } |
677 | |
678 | static __init int kvm_alloc_cpumask(void) |
679 | { |
680 | int cpu; |
681 | |
682 | if (!kvm_para_available() || nopv) |
683 | return 0; |
684 | |
685 | if (pv_tlb_flush_supported() || pv_ipi_supported()) |
686 | for_each_possible_cpu(cpu) { |
687 | zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), |
688 | GFP_KERNEL, cpu_to_node(cpu)); |
689 | } |
690 | |
691 | return 0; |
692 | } |
693 | arch_initcall(kvm_alloc_cpumask); |
694 | |
695 | static void __init kvm_smp_prepare_boot_cpu(void) |
696 | { |
697 | /* |
698 | * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() |
699 | * shares the guest physical address with the hypervisor. |
700 | */ |
701 | sev_map_percpu_data(); |
702 | |
703 | kvm_guest_cpu_init(); |
704 | native_smp_prepare_boot_cpu(); |
705 | kvm_spinlock_init(); |
706 | } |
707 | |
708 | static int kvm_cpu_down_prepare(unsigned int cpu) |
709 | { |
710 | unsigned long flags; |
711 | |
712 | local_irq_save(flags); |
	kvm_guest_cpu_offline(false);
714 | local_irq_restore(flags); |
715 | return 0; |
716 | } |
717 | |
718 | #endif |
719 | |
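/*
 * syscore suspend callback: tear down the paravirt features shared with
 * the host and remember whether guest halt polling was active (bit 0 of
 * MSR_KVM_POLL_CONTROL clear) so that kvm_resume() can restore that state.
 */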
720 | static int kvm_suspend(void) |
721 | { |
722 | u64 val = 0; |
723 | |
	kvm_guest_cpu_offline(false);
725 | |
726 | #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL |
727 | if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) |
728 | rdmsrl(MSR_KVM_POLL_CONTROL, val); |
729 | has_guest_poll = !(val & 1); |
730 | #endif |
731 | return 0; |
732 | } |
733 | |
734 | static void kvm_resume(void) |
735 | { |
736 | kvm_cpu_online(raw_smp_processor_id()); |
737 | |
738 | #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL |
739 | if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) |
		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
741 | #endif |
742 | } |
743 | |
744 | static struct syscore_ops kvm_syscore_ops = { |
745 | .suspend = kvm_suspend, |
746 | .resume = kvm_resume, |
747 | }; |
748 | |
749 | static void kvm_pv_guest_cpu_reboot(void *unused) |
750 | { |
	kvm_guest_cpu_offline(true);
752 | } |
753 | |
754 | static int kvm_pv_reboot_notify(struct notifier_block *nb, |
755 | unsigned long code, void *unused) |
756 | { |
757 | if (code == SYS_RESTART) |
		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
759 | return NOTIFY_DONE; |
760 | } |
761 | |
762 | static struct notifier_block kvm_pv_reboot_nb = { |
763 | .notifier_call = kvm_pv_reboot_notify, |
764 | }; |
765 | |
766 | /* |
767 | * After a PV feature is registered, the host will keep writing to the |
768 | * registered memory location. If the guest happens to shutdown, this memory |
 * won't be valid. In cases like kexec, in which you install a new kernel,
 * this means the host would keep writing to a random memory location.
771 | */ |
772 | #ifdef CONFIG_KEXEC_CORE |
773 | static void kvm_crash_shutdown(struct pt_regs *regs) |
774 | { |
	kvm_guest_cpu_offline(true);
776 | native_machine_crash_shutdown(regs); |
777 | } |
778 | #endif |
779 | |
780 | #if defined(CONFIG_X86_32) || !defined(CONFIG_SMP) |
781 | bool __kvm_vcpu_is_preempted(long cpu); |
782 | |
783 | __visible bool __kvm_vcpu_is_preempted(long cpu) |
784 | { |
785 | struct kvm_steal_time *src = &per_cpu(steal_time, cpu); |
786 | |
787 | return !!(src->preempted & KVM_VCPU_PREEMPTED); |
788 | } |
789 | PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); |
790 | |
791 | #else |
792 | |
793 | #include <asm/asm-offsets.h> |
794 | |
795 | extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); |
796 | |
797 | /* |
798 | * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and |
799 | * restoring to/from the stack. |
800 | */ |
801 | #define PV_VCPU_PREEMPTED_ASM \ |
802 | "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \ |
803 | "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ |
804 | "setne %al\n\t" |
805 | |
806 | DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, |
807 | PV_VCPU_PREEMPTED_ASM, .text); |
808 | #endif |
809 | |
810 | static void __init kvm_guest_init(void) |
811 | { |
812 | int i; |
813 | |
814 | paravirt_ops_setup(); |
815 | register_reboot_notifier(&kvm_pv_reboot_nb); |
816 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) |
817 | raw_spin_lock_init(&async_pf_sleepers[i].lock); |
818 | |
819 | if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { |
820 | has_steal_clock = 1; |
821 | static_call_update(pv_steal_clock, kvm_steal_clock); |
822 | |
823 | pv_ops.lock.vcpu_is_preempted = |
824 | PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); |
825 | } |
826 | |
827 | if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) |
828 | apic_update_callback(eoi, kvm_guest_apic_eoi_write); |
829 | |
830 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { |
831 | static_branch_enable(&kvm_async_pf_enabled); |
		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
833 | } |
834 | |
835 | #ifdef CONFIG_SMP |
836 | if (pv_tlb_flush_supported()) { |
837 | pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; |
838 | pv_ops.mmu.tlb_remove_table = tlb_remove_table; |
		pr_info("KVM setup pv remote TLB flush\n");
840 | } |
841 | |
842 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
843 | if (pv_sched_yield_supported()) { |
844 | smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; |
		pr_info("setup PV sched yield\n");
846 | } |
	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
		pr_err("failed to install cpu hotplug callbacks\n");
850 | #else |
851 | sev_map_percpu_data(); |
852 | kvm_guest_cpu_init(); |
853 | #endif |
854 | |
855 | #ifdef CONFIG_KEXEC_CORE |
856 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
857 | #endif |
858 | |
	register_syscore_ops(&kvm_syscore_ops);
860 | |
861 | /* |
862 | * Hard lockup detection is enabled by default. Disable it, as guests |
863 | * can get false positives too easily, for example if the host is |
864 | * overcommitted. |
865 | */ |
866 | hardlockup_detector_disable(); |
867 | } |
868 | |
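/*
 * Locate the base of KVM's CPUID signature leaves.  Returns 0 on ancient
 * CPUs without CPUID support or when no hypervisor is advertised; the
 * result is cached by kvm_cpuid_base() below.
 */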
869 | static noinline uint32_t __kvm_cpuid_base(void) |
870 | { |
871 | if (boot_cpu_data.cpuid_level < 0) |
872 | return 0; /* So we don't blow up on old processors */ |
873 | |
874 | if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) |
		return hypervisor_cpuid_base(KVM_SIGNATURE, 0);
876 | |
877 | return 0; |
878 | } |
879 | |
880 | static inline uint32_t kvm_cpuid_base(void) |
881 | { |
882 | static int kvm_cpuid_base = -1; |
883 | |
884 | if (kvm_cpuid_base == -1) |
885 | kvm_cpuid_base = __kvm_cpuid_base(); |
886 | |
887 | return kvm_cpuid_base; |
888 | } |
889 | |
890 | bool kvm_para_available(void) |
891 | { |
892 | return kvm_cpuid_base() != 0; |
893 | } |
894 | EXPORT_SYMBOL_GPL(kvm_para_available); |
895 | |
896 | unsigned int kvm_arch_para_features(void) |
897 | { |
	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
899 | } |
900 | |
901 | unsigned int kvm_arch_para_hints(void) |
902 | { |
	return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
904 | } |
905 | EXPORT_SYMBOL_GPL(kvm_arch_para_hints); |
906 | |
907 | static uint32_t __init kvm_detect(void) |
908 | { |
909 | return kvm_cpuid_base(); |
910 | } |
911 | |
912 | static void __init kvm_apic_init(void) |
913 | { |
914 | #ifdef CONFIG_SMP |
915 | if (pv_ipi_supported()) |
916 | kvm_setup_pv_ipi(); |
917 | #endif |
918 | } |
919 | |
920 | static bool __init kvm_msi_ext_dest_id(void) |
921 | { |
922 | return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); |
923 | } |
924 | |
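/*
 * notify_page_enc_status_changed() callback: tell the host via
 * KVM_HC_MAP_GPA_RANGE that @npages 4K pages starting at @pfn changed
 * their encryption status, so the shared-pages tracking used for SEV
 * live migration stays accurate.
 */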
925 | static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) |
926 | { |
	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
			   KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
929 | } |
930 | |
931 | static void __init kvm_init_platform(void) |
932 | { |
	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
934 | kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { |
935 | unsigned long nr_pages; |
936 | int i; |
937 | |
938 | pv_ops.mmu.notify_page_enc_status_changed = |
939 | kvm_sev_hc_page_enc_status; |
940 | |
941 | /* |
942 | * Reset the host's shared pages list related to kernel |
943 | * specific page encryption status settings before we load a |
944 | * new kernel by kexec. Reset the page encryption status |
		 * during early boot instead of just before kexec to avoid SMP
946 | * races during kvm_pv_guest_cpu_reboot(). |
947 | * NOTE: We cannot reset the complete shared pages list |
948 | * here as we need to retain the UEFI/OVMF firmware |
949 | * specific settings. |
950 | */ |
951 | |
952 | for (i = 0; i < e820_table->nr_entries; i++) { |
953 | struct e820_entry *entry = &e820_table->entries[i]; |
954 | |
955 | if (entry->type != E820_TYPE_RAM) |
956 | continue; |
957 | |
958 | nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); |
959 | |
			kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
					   nr_pages,
					   KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
963 | } |
964 | |
965 | /* |
966 | * Ensure that _bss_decrypted section is marked as decrypted in the |
967 | * shared pages list. |
968 | */ |
		early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
						__end_bss_decrypted - __start_bss_decrypted, 0);
971 | |
972 | /* |
973 | * If not booted using EFI, enable Live migration support. |
974 | */ |
975 | if (!efi_enabled(EFI_BOOT)) |
976 | wrmsrl(MSR_KVM_MIGRATION_CONTROL, |
977 | KVM_MIGRATION_READY); |
978 | } |
979 | kvmclock_init(); |
980 | x86_platform.apic_post_init = kvm_apic_init; |
981 | } |
982 | |
983 | #if defined(CONFIG_AMD_MEM_ENCRYPT) |
984 | static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) |
985 | { |
986 | /* RAX and CPL are already in the GHCB */ |
	ghcb_set_rbx(ghcb, regs->bx);
	ghcb_set_rcx(ghcb, regs->cx);
	ghcb_set_rdx(ghcb, regs->dx);
	ghcb_set_rsi(ghcb, regs->si);
991 | } |
992 | |
993 | static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) |
994 | { |
995 | /* No checking of the return state needed */ |
996 | return true; |
997 | } |
998 | #endif |
999 | |
1000 | const __initconst struct hypervisor_x86 x86_hyper_kvm = { |
	.name = "KVM",
1002 | .detect = kvm_detect, |
1003 | .type = X86_HYPER_KVM, |
1004 | .init.guest_late_init = kvm_guest_init, |
1005 | .init.x2apic_available = kvm_para_available, |
1006 | .init.msi_ext_dest_id = kvm_msi_ext_dest_id, |
1007 | .init.init_platform = kvm_init_platform, |
1008 | #if defined(CONFIG_AMD_MEM_ENCRYPT) |
1009 | .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, |
1010 | .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, |
1011 | #endif |
1012 | }; |
1013 | |
1014 | static __init int activate_jump_labels(void) |
1015 | { |
1016 | if (has_steal_clock) { |
		static_key_slow_inc(&paravirt_steal_enabled);
		if (steal_acc)
			static_key_slow_inc(&paravirt_steal_rq_enabled);
1020 | } |
1021 | |
1022 | return 0; |
1023 | } |
1024 | arch_initcall(activate_jump_labels); |
1025 | |
1026 | #ifdef CONFIG_PARAVIRT_SPINLOCKS |
1027 | |
1028 | /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ |
1029 | static void kvm_kick_cpu(int cpu) |
1030 | { |
1031 | unsigned long flags = 0; |
1032 | u32 apicid; |
1033 | |
1034 | apicid = per_cpu(x86_cpu_to_apicid, cpu); |
	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
1036 | } |
1037 | |
1038 | #include <asm/qspinlock.h> |
1039 | |
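/*
 * Wait callback for the PV qspinlock slow path: halt this vCPU while the
 * byte at @ptr still holds @val and rely on a kvm_kick_cpu() from another
 * vCPU (or any interrupt) to wake it up.  Never halt in NMI context.
 */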
1040 | static void kvm_wait(u8 *ptr, u8 val) |
1041 | { |
1042 | if (in_nmi()) |
1043 | return; |
1044 | |
	/*
	 * Halt until it's our turn and we get kicked. Note that we do a safe
	 * halt for the irq enabled case to avoid a hang when the lock info is
	 * overwritten in the irq spinlock slowpath and no spurious interrupt
	 * occurs to save us.
	 */
1050 | if (irqs_disabled()) { |
1051 | if (READ_ONCE(*ptr) == val) |
1052 | halt(); |
1053 | } else { |
1054 | local_irq_disable(); |
1055 | |
1056 | /* safe_halt() will enable IRQ */ |
1057 | if (READ_ONCE(*ptr) == val) |
1058 | safe_halt(); |
1059 | else |
1060 | local_irq_enable(); |
1061 | } |
1062 | } |
1063 | |
1064 | /* |
1065 | * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. |
1066 | */ |
1067 | void __init kvm_spinlock_init(void) |
1068 | { |
1069 | /* |
1070 | * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an |
1071 | * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is |
1072 | * preferred over native qspinlock when vCPU is preempted. |
1073 | */ |
1074 | if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { |
		pr_info("PV spinlocks disabled, no host support\n");
1076 | return; |
1077 | } |
1078 | |
1079 | /* |
1080 | * Disable PV spinlocks and use native qspinlock when dedicated pCPUs |
1081 | * are available. |
1082 | */ |
1083 | if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { |
		pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
1085 | goto out; |
1086 | } |
1087 | |
1088 | if (num_possible_cpus() == 1) { |
		pr_info("PV spinlocks disabled, single CPU\n");
1090 | goto out; |
1091 | } |
1092 | |
1093 | if (nopvspin) { |
		pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
1095 | goto out; |
1096 | } |
1097 | |
	pr_info("PV spinlocks enabled\n");
1099 | |
1100 | __pv_init_lock_hash(); |
1101 | pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; |
1102 | pv_ops.lock.queued_spin_unlock = |
1103 | PV_CALLEE_SAVE(__pv_queued_spin_unlock); |
1104 | pv_ops.lock.wait = kvm_wait; |
1105 | pv_ops.lock.kick = kvm_kick_cpu; |
1106 | |
	/*
	 * When PV spinlocks are enabled, they are preferred over
	 * virt_spin_lock(), so virt_spin_lock_key's value is meaningless.
	 * Just disable it anyway.
	 */
1112 | out: |
1113 | static_branch_disable(&virt_spin_lock_key); |
1114 | } |
1115 | |
1116 | #endif /* CONFIG_PARAVIRT_SPINLOCKS */ |
1117 | |
1118 | #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL |
1119 | |
1120 | static void kvm_disable_host_haltpoll(void *i) |
1121 | { |
	wrmsrl(MSR_KVM_POLL_CONTROL, 0);
1123 | } |
1124 | |
1125 | static void kvm_enable_host_haltpoll(void *i) |
1126 | { |
	wrmsrl(MSR_KVM_POLL_CONTROL, 1);
1128 | } |
1129 | |
1130 | void arch_haltpoll_enable(unsigned int cpu) |
1131 | { |
1132 | if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { |
		pr_err_once("host does not support poll control\n");
		pr_err_once("host upgrade recommended\n");
1135 | return; |
1136 | } |
1137 | |
	/* Enabling guest halt poll disables host-side halt poll */
	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
1140 | } |
1141 | EXPORT_SYMBOL_GPL(arch_haltpoll_enable); |
1142 | |
1143 | void arch_haltpoll_disable(unsigned int cpu) |
1144 | { |
1145 | if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) |
1146 | return; |
1147 | |
	/* Disabling guest halt poll re-enables host-side halt poll */
	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
1150 | } |
1151 | EXPORT_SYMBOL_GPL(arch_haltpoll_disable); |
1152 | #endif |
1153 | |