| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 3 | |
| 4 | #include <linux/kvm_host.h> |
| 5 | #include <linux/kvm_irqfd.h> |
| 6 | |
| 7 | #include <asm/irq_remapping.h> |
| 8 | #include <asm/cpu.h> |
| 9 | |
| 10 | #include "lapic.h" |
| 11 | #include "irq.h" |
| 12 | #include "posted_intr.h" |
| 13 | #include "trace.h" |
| 14 | #include "vmx.h" |
| 15 | #include "tdx.h" |
| 16 | |
| 17 | /* |
| 18 | * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() |
| 19 | * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when |
| 20 | * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled. |
| 21 | * The vCPUs posted interrupt descriptor is updated at the same time to set its |
| 22 | * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices |
| 23 | * wake the target vCPUs. vCPUs are removed from the list and the notification |
| 24 | * vector is reset when the vCPU is scheduled in. |
| 25 | */ |
| 26 | static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); |
| 27 | /* |
| 28 | * Protect the per-CPU list with a per-CPU spinlock to handle task migration. |
| 29 | * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the |
| 30 | * ->sched_in() path will need to take the vCPU off the list of the _previous_ |
| 31 | * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will |
| 32 | * occur if a wakeup IRQ arrives and attempts to acquire the lock. |
| 33 | */ |
| 34 | static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); |
| 35 | |
| 36 | #define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING |
| 37 | |
| 38 | static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) |
| 39 | { |
| 40 | return &(to_vt(vcpu)->pi_desc); |
| 41 | } |
| 42 | |
| 43 | static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) |
| 44 | { |
| 45 | /* |
| 46 | * PID.ON can be set at any time by a different vCPU or by hardware, |
| 47 | * e.g. a device. PID.control must be written atomically, and the |
| 48 | * update must be retried with a fresh snapshot an ON change causes |
| 49 | * the cmpxchg to fail. |
| 50 | */ |
| 51 | if (!try_cmpxchg64(&pi_desc->control, pold, new)) |
| 52 | return -EBUSY; |
| 53 | |
| 54 | return 0; |
| 55 | } |
| 56 | |
| 57 | void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) |
| 58 | { |
| 59 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); |
| 60 | struct vcpu_vt *vt = to_vt(vcpu); |
| 61 | struct pi_desc old, new; |
| 62 | unsigned long flags; |
| 63 | unsigned int dest; |
| 64 | |
| 65 | /* |
| 66 | * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and |
| 67 | * PI.SN up-to-date even if there is no assigned device or if APICv is |
| 68 | * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC. |
| 69 | */ |
| 70 | if (!enable_apicv || !lapic_in_kernel(vcpu)) |
| 71 | return; |
| 72 | |
| 73 | /* |
| 74 | * If the vCPU wasn't on the wakeup list and wasn't migrated, then the |
| 75 | * full update can be skipped as neither the vector nor the destination |
| 76 | * needs to be changed. Clear SN even if there is no assigned device, |
| 77 | * again for simplicity. |
| 78 | */ |
| 79 | if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { |
| 80 | if (pi_test_and_clear_sn(pi_desc)) |
| 81 | goto after_clear_sn; |
| 82 | return; |
| 83 | } |
| 84 | |
| 85 | local_irq_save(flags); |
| 86 | |
| 87 | /* |
| 88 | * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup |
| 89 | * list of the _previous_ pCPU, which will not be the same as the |
| 90 | * current pCPU if the task was migrated. |
| 91 | */ |
| 92 | if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { |
| 93 | raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); |
| 94 | |
| 95 | /* |
| 96 | * In addition to taking the wakeup lock for the regular/IRQ |
| 97 | * context, tell lockdep it is being taken for the "sched out" |
| 98 | * context as well. vCPU loads happens in task context, and |
| 99 | * this is taking the lock of the *previous* CPU, i.e. can race |
| 100 | * with both the scheduler and the wakeup handler. |
| 101 | */ |
| 102 | raw_spin_lock(spinlock); |
| 103 | spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); |
| 104 | list_del(entry: &vt->pi_wakeup_list); |
| 105 | spin_release(&spinlock->dep_map, _RET_IP_); |
| 106 | raw_spin_unlock(spinlock); |
| 107 | } |
| 108 | |
| 109 | dest = cpu_physical_id(cpu); |
| 110 | if (!x2apic_mode) |
| 111 | dest = (dest << 8) & 0xFF00; |
| 112 | |
| 113 | old.control = READ_ONCE(pi_desc->control); |
| 114 | do { |
| 115 | new.control = old.control; |
| 116 | |
| 117 | /* |
| 118 | * Clear SN (as above) and refresh the destination APIC ID to |
| 119 | * handle task migration (@cpu != vcpu->cpu). |
| 120 | */ |
| 121 | new.ndst = dest; |
| 122 | __pi_clear_sn(pi_desc: &new); |
| 123 | |
| 124 | /* |
| 125 | * Restore the notification vector; in the blocking case, the |
| 126 | * descriptor was modified on "put" to use the wakeup vector. |
| 127 | */ |
| 128 | new.nv = POSTED_INTR_VECTOR; |
| 129 | } while (pi_try_set_control(pi_desc, pold: &old.control, new: new.control)); |
| 130 | |
| 131 | local_irq_restore(flags); |
| 132 | |
| 133 | after_clear_sn: |
| 134 | |
| 135 | /* |
| 136 | * Clear SN before reading the bitmap. The VT-d firmware |
| 137 | * writes the bitmap and reads SN atomically (5.2.3 in the |
| 138 | * spec), so it doesn't really have a memory barrier that |
| 139 | * pairs with this, but we cannot do that and we need one. |
| 140 | */ |
| 141 | smp_mb__after_atomic(); |
| 142 | |
| 143 | if (!pi_is_pir_empty(pi_desc)) |
| 144 | pi_set_on(pi_desc); |
| 145 | } |
| 146 | |
| 147 | static bool vmx_can_use_vtd_pi(struct kvm *kvm) |
| 148 | { |
| 149 | /* |
| 150 | * Note, reading the number of possible bypass IRQs can race with a |
| 151 | * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures |
| 152 | * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK. |
| 153 | */ |
| 154 | return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && |
| 155 | READ_ONCE(kvm->arch.nr_possible_bypass_irqs); |
| 156 | } |
| 157 | |
| 158 | /* |
| 159 | * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set |
| 160 | * WAKEUP as the notification vector in the PI descriptor. |
| 161 | */ |
| 162 | static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) |
| 163 | { |
| 164 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); |
| 165 | struct vcpu_vt *vt = to_vt(vcpu); |
| 166 | struct pi_desc old, new; |
| 167 | |
| 168 | lockdep_assert_irqs_disabled(); |
| 169 | |
| 170 | /* |
| 171 | * Acquire the wakeup lock using the "sched out" context to workaround |
| 172 | * a lockdep false positive. When this is called, schedule() holds |
| 173 | * various per-CPU scheduler locks. When the wakeup handler runs, it |
| 174 | * holds this CPU's wakeup lock while calling try_to_wake_up(), which |
| 175 | * can eventually take the aforementioned scheduler locks, which causes |
| 176 | * lockdep to assume there is deadlock. |
| 177 | * |
| 178 | * Deadlock can't actually occur because IRQs are disabled for the |
| 179 | * entirety of the sched_out critical section, i.e. the wakeup handler |
| 180 | * can't run while the scheduler locks are held. |
| 181 | */ |
| 182 | raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), |
| 183 | PI_LOCK_SCHED_OUT); |
| 184 | list_add_tail(new: &vt->pi_wakeup_list, |
| 185 | head: &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); |
| 186 | raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); |
| 187 | |
| 188 | WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking" ); |
| 189 | |
| 190 | old.control = READ_ONCE(pi_desc->control); |
| 191 | do { |
| 192 | /* set 'NV' to 'wakeup vector' */ |
| 193 | new.control = old.control; |
| 194 | new.nv = POSTED_INTR_WAKEUP_VECTOR; |
| 195 | } while (pi_try_set_control(pi_desc, pold: &old.control, new: new.control)); |
| 196 | |
| 197 | /* |
| 198 | * Send a wakeup IPI to this CPU if an interrupt may have been posted |
| 199 | * before the notification vector was updated, in which case the IRQ |
| 200 | * will arrive on the non-wakeup vector. An IPI is needed as calling |
| 201 | * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not |
| 202 | * enabled until it is safe to call try_to_wake_up() on the task being |
| 203 | * scheduled out). |
| 204 | */ |
| 205 | if (pi_test_on(pi_desc: &new)) |
| 206 | __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); |
| 207 | } |
| 208 | |
| 209 | static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) |
| 210 | { |
| 211 | /* |
| 212 | * The default posted interrupt vector does nothing when |
| 213 | * invoked outside guest mode. Return whether a blocked vCPU |
| 214 | * can be the target of posted interrupts, as is the case when |
| 215 | * using either IPI virtualization or VT-d PI, so that the |
| 216 | * notification vector is switched to the one that calls |
| 217 | * back to the pi_wakeup_handler() function. |
| 218 | */ |
| 219 | return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) || |
| 220 | vmx_can_use_vtd_pi(kvm: vcpu->kvm); |
| 221 | } |
| 222 | |
| 223 | void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) |
| 224 | { |
| 225 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); |
| 226 | |
| 227 | if (!vmx_needs_pi_wakeup(vcpu)) |
| 228 | return; |
| 229 | |
| 230 | /* |
| 231 | * If the vCPU is blocking with IRQs enabled and ISN'T being preempted, |
| 232 | * enable the wakeup handler so that notification IRQ wakes the vCPU as |
| 233 | * expected. There is no need to enable the wakeup handler if the vCPU |
| 234 | * is preempted between setting its wait state and manually scheduling |
| 235 | * out, as the task is still runnable, i.e. doesn't need a wake event |
| 236 | * from KVM to be scheduled in. |
| 237 | * |
| 238 | * If the wakeup handler isn't being enabled, Suppress Notifications as |
| 239 | * the cost of propagating PIR.IRR to PID.ON is negligible compared to |
| 240 | * the cost of a spurious IRQ, and vCPU put/load is a slow path. |
| 241 | */ |
| 242 | if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) && |
| 243 | ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || |
| 244 | (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) |
| 245 | pi_enable_wakeup_handler(vcpu); |
| 246 | else |
| 247 | pi_set_sn(pi_desc); |
| 248 | } |
| 249 | |
| 250 | /* |
| 251 | * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. |
| 252 | */ |
| 253 | void pi_wakeup_handler(void) |
| 254 | { |
| 255 | int cpu = smp_processor_id(); |
| 256 | struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); |
| 257 | raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); |
| 258 | struct vcpu_vt *vt; |
| 259 | |
| 260 | raw_spin_lock(spinlock); |
| 261 | list_for_each_entry(vt, wakeup_list, pi_wakeup_list) { |
| 262 | |
| 263 | if (pi_test_on(pi_desc: &vt->pi_desc)) |
| 264 | kvm_vcpu_wake_up(vcpu: vt_to_vcpu(vt)); |
| 265 | } |
| 266 | raw_spin_unlock(spinlock); |
| 267 | } |
| 268 | |
| 269 | void __init pi_init_cpu(int cpu) |
| 270 | { |
| 271 | INIT_LIST_HEAD(list: &per_cpu(wakeup_vcpus_on_cpu, cpu)); |
| 272 | raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); |
| 273 | } |
| 274 | |
| 275 | void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu) |
| 276 | { |
| 277 | struct pi_desc *pi = vcpu_to_pi_desc(vcpu); |
| 278 | |
| 279 | pi_clear_on(pi_desc: pi); |
| 280 | memset(pi->pir, 0, sizeof(pi->pir)); |
| 281 | } |
| 282 | |
| 283 | bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) |
| 284 | { |
| 285 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); |
| 286 | |
| 287 | return pi_test_on(pi_desc) || |
| 288 | (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); |
| 289 | } |
| 290 | |
| 291 | |
| 292 | /* |
| 293 | * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as |
| 294 | * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup |
| 295 | * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put(). |
| 296 | */ |
| 297 | void vmx_pi_start_bypass(struct kvm *kvm) |
| 298 | { |
| 299 | if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm))) |
| 300 | return; |
| 301 | |
| 302 | kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); |
| 303 | } |
| 304 | |
| 305 | int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, |
| 306 | unsigned int host_irq, uint32_t guest_irq, |
| 307 | struct kvm_vcpu *vcpu, u32 vector) |
| 308 | { |
| 309 | if (vcpu) { |
| 310 | struct intel_iommu_pi_data pi_data = { |
| 311 | .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)), |
| 312 | .vector = vector, |
| 313 | }; |
| 314 | |
| 315 | return irq_set_vcpu_affinity(irq: host_irq, vcpu_info: &pi_data); |
| 316 | } else { |
| 317 | return irq_set_vcpu_affinity(irq: host_irq, NULL); |
| 318 | } |
| 319 | } |
| 320 | |