posted_intr.c source code [linux/arch/x86/kvm/vmx/posted_intr.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4	#include <linux/kvm_host.h>
5
6	#include <asm/irq_remapping.h>
7	#include <asm/cpu.h>
8
9	#include "lapic.h"
10	#include "irq.h"
11	#include "posted_intr.h"
12	#include "trace.h"
13	#include "vmx.h"
14
15	/*
16	* Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
17	* when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when
18	* the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled.
19	* The vCPUs posted interrupt descriptor is updated at the same time to set its
20	* notification vector to WAKEUP_VECTOR, so that posted interrupt from devices
21	* wake the target vCPUs. vCPUs are removed from the list and the notification
22	* vector is reset when the vCPU is scheduled in.
23	*/
24	static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
25	/*
26	* Protect the per-CPU list with a per-CPU spinlock to handle task migration.
27	* When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
28	* ->sched_in() path will need to take the vCPU off the list of the _previous_
29	* CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
30	* occur if a wakeup IRQ arrives and attempts to acquire the lock.
31	*/
32	static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
33
34	static inline struct pi_desc vcpu_to_pi_desc(struct* kvm_vcpu *vcpu)
35	{
36	return &(to_vmx(vcpu)->pi_desc);
37	}
38
39	static int pi_try_set_control(struct pi_desc pi_desc, u64 pold, u64 new)
40	{
41	/*
42	* PID.ON can be set at any time by a different vCPU or by hardware,
43	* e.g. a device. PID.control must be written atomically, and the
44	* update must be retried with a fresh snapshot an ON change causes
45	* the cmpxchg to fail.
46	*/
47	if (!try_cmpxchg64(&pi_desc->control, pold, new))
48	return -EBUSY;
49
50	return `0`;
51	}
52
53	void vmx_vcpu_pi_load(struct kvm_vcpu vcpu, int* cpu)
54	{
55	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
56	struct vcpu_vmx *vmx = to_vmx(vcpu);
57	struct pi_desc old, new;
58	unsigned long flags;
59	unsigned int dest;
60
61	/*
62	* To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
63	* PI.SN up-to-date even if there is no assigned device or if APICv is
64	* deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC.
65	*/
66	if (!enable_apicv \|\| !lapic_in_kernel(vcpu))
67	return;
68
69	/*
70	* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
71	* full update can be skipped as neither the vector nor the destination
72	* needs to be changed.
73	*/
74	if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
75	/*
76	* Clear SN if it was set due to being preempted. Again, do
77	* this even if there is no assigned device for simplicity.
78	*/
79	if (pi_test_and_clear_sn(pi_desc))
80	goto after_clear_sn;
81	return;
82	}
83
84	local_irq_save(flags);
85
86	/*
87	* If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
88	* list of the _previous_ pCPU, which will not be the same as the
89	* current pCPU if the task was migrated.
90	*/
91	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
92	raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
93	list_del(entry: &vmx->pi_wakeup_list);
94	raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
95	}
96
97	dest = cpu_physical_id(cpu);
98	if (!x2apic_mode)
99	dest = (dest << `8`) & `0xFF00`;
100
101	old.control = READ_ONCE(pi_desc->control);
102	do {
103	new.control = old.control;
104
105	/*
106	* Clear SN (as above) and refresh the destination APIC ID to
107	* handle task migration (@cpu != vcpu->cpu).
108	*/
109	new.ndst = dest;
110	new.sn = `0`;
111
112	/*
113	* Restore the notification vector; in the blocking case, the
114	* descriptor was modified on "put" to use the wakeup vector.
115	*/
116	new.nv = POSTED_INTR_VECTOR;
117	} while (pi_try_set_control(pi_desc, pold: &old.control, new: new.control));
118
119	local_irq_restore(flags);
120
121	after_clear_sn:
122
123	/*
124	* Clear SN before reading the bitmap. The VT-d firmware
125	* writes the bitmap and reads SN atomically (5.2.3 in the
126	* spec), so it doesn't really have a memory barrier that
127	* pairs with this, but we cannot do that and we need one.
128	*/
129	smp_mb__after_atomic();
130
131	if (!pi_is_pir_empty(pi_desc))
132	pi_set_on(pi_desc);
133	}
134
135	static bool vmx_can_use_vtd_pi(struct kvm *kvm)
136	{
137	return irqchip_in_kernel(kvm) && enable_apicv &&
138	kvm_arch_has_assigned_device(kvm) &&
139	irq_remapping_cap(cap: IRQ_POSTING_CAP);
140	}
141
142	/*
143	* Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
144	* WAKEUP as the notification vector in the PI descriptor.
145	*/
146	static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
147	{
148	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
149	struct vcpu_vmx *vmx = to_vmx(vcpu);
150	struct pi_desc old, new;
151	unsigned long flags;
152
153	local_irq_save(flags);
154
155	raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
156	list_add_tail(new: &vmx->pi_wakeup_list,
157	head: &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
158	raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
159
160	WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
161
162	old.control = READ_ONCE(pi_desc->control);
163	do {
164	/ set 'NV' to 'wakeup vector' /
165	new.control = old.control;
166	new.nv = POSTED_INTR_WAKEUP_VECTOR;
167	} while (pi_try_set_control(pi_desc, pold: &old.control, new: new.control));
168
169	/*
170	* Send a wakeup IPI to this CPU if an interrupt may have been posted
171	* before the notification vector was updated, in which case the IRQ
172	* will arrive on the non-wakeup vector. An IPI is needed as calling
173	* try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
174	* enabled until it is safe to call try_to_wake_up() on the task being
175	* scheduled out).
176	*/
177	if (pi_test_on(pi_desc: &new))
178	__apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
179
180	local_irq_restore(flags);
181	}
182
183	static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
184	{
185	/*
186	* The default posted interrupt vector does nothing when
187	* invoked outside guest mode. Return whether a blocked vCPU
188	* can be the target of posted interrupts, as is the case when
189	* using either IPI virtualization or VT-d PI, so that the
190	* notification vector is switched to the one that calls
191	* back to the pi_wakeup_handler() function.
192	*/
193	return vmx_can_use_ipiv(vcpu) \|\| vmx_can_use_vtd_pi(kvm: vcpu->kvm);
194	}
195
196	void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
197	{
198	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
199
200	if (!vmx_needs_pi_wakeup(vcpu))
201	return;
202
203	if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
204	pi_enable_wakeup_handler(vcpu);
205
206	/*
207	* Set SN when the vCPU is preempted. Note, the vCPU can both be seen
208	* as blocking and preempted, e.g. if it's preempted between setting
209	* its wait state and manually scheduling out.
210	*/
211	if (vcpu->preempted)
212	pi_set_sn(pi_desc);
213	}
214
215	/*
216	* Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
217	*/
218	void pi_wakeup_handler(void)
219	{
220	int cpu = smp_processor_id();
221	struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
222	raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
223	struct vcpu_vmx *vmx;
224
225	raw_spin_lock(spinlock);
226	list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) {
227
228	if (pi_test_on(pi_desc: &vmx->pi_desc))
229	kvm_vcpu_wake_up(vcpu: &vmx->vcpu);
230	}
231	raw_spin_unlock(spinlock);
232	}
233
234	void __init pi_init_cpu(int cpu)
235	{
236	INIT_LIST_HEAD(list: &per_cpu(wakeup_vcpus_on_cpu, cpu));
237	raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
238	}
239
240	bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
241	{
242	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
243
244	return pi_test_on(pi_desc) \|\|
245	(pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
246	}
247
248
249	/*
250	* Bail out of the block loop if the VM has an assigned
251	* device, but the blocking vCPU didn't reconfigure the
252	* PI.NV to the wakeup vector, i.e. the assigned device
253	* came along after the initial check in vmx_vcpu_pi_put().
254	*/
255	void vmx_pi_start_assignment(struct kvm *kvm)
256	{
257	if (!irq_remapping_cap(cap: IRQ_POSTING_CAP))
258	return;
259
260	kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
261	}
262
263	/*
264	* vmx_pi_update_irte - set IRTE for Posted-Interrupts
265	*
266	* @kvm: kvm
267	* @host_irq: host irq of the interrupt
268	* @guest_irq: gsi of the interrupt
269	* @set: set or unset PI
270	* returns 0 on success, < 0 on failure
271	*/
272	int vmx_pi_update_irte(struct kvm kvm, unsigned* int host_irq,
273	uint32_t guest_irq, bool set)
274	{
275	struct kvm_kernel_irq_routing_entry *e;
276	struct kvm_irq_routing_table *irq_rt;
277	struct kvm_lapic_irq irq;
278	struct kvm_vcpu *vcpu;
279	struct vcpu_data vcpu_info;
280	int idx, ret = `0`;
281
282	if (!vmx_can_use_vtd_pi(kvm))
283	return `0`;
284
285	idx = srcu_read_lock(ssp: &kvm->irq_srcu);
286	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
287	if (guest_irq >= irq_rt->nr_rt_entries \|\|
288	hlist_empty(h: &irq_rt->map[guest_irq])) {
289	pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
290	guest_irq, irq_rt->nr_rt_entries);
291	goto out;
292	}
293
294	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
295	if (e->type != KVM_IRQ_ROUTING_MSI)
296	continue;
297	/*
298	* VT-d PI cannot support posting multicast/broadcast
299	* interrupts to a vCPU, we still use interrupt remapping
300	* for these kind of interrupts.
301	*
302	* For lowest-priority interrupts, we only support
303	* those with single CPU as the destination, e.g. user
304	* configures the interrupts via /proc/irq or uses
305	* irqbalance to make the interrupts single-CPU.
306	*
307	* We will support full lowest-priority interrupt later.
308	*
309	* In addition, we can only inject generic interrupts using
310	* the PI mechanism, refuse to route others through it.
311	*/
312
313	kvm_set_msi_irq(kvm, e, irq: &irq);
314	if (!kvm_intr_is_single_vcpu(kvm, irq: &irq, dest_vcpu: &vcpu) \|\|
315	!kvm_irq_is_postable(irq: &irq)) {
316	/*
317	* Make sure the IRTE is in remapped mode if
318	* we don't handle it in posted mode.
319	*/
320	ret = irq_set_vcpu_affinity(irq: host_irq, NULL);
321	if (ret < `0`) {
322	printk(KERN_INFO
323	"failed to back to remapped mode, irq: %u\n",
324	host_irq);
325	goto out;
326	}
327
328	continue;
329	}
330
331	vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
332	vcpu_info.vector = irq.vector;
333
334	trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
335	vcpu_info.vector, vcpu_info.pi_desc_addr, set);
336
337	if (set)
338	ret = irq_set_vcpu_affinity(irq: host_irq, vcpu_info: &vcpu_info);
339	else
340	ret = irq_set_vcpu_affinity(irq: host_irq, NULL);
341
342	if (ret < `0`) {
343	printk(KERN_INFO "%s: failed to update PI IRTE\n",
344	__func__);
345	goto out;
346	}
347	}
348
349	ret = `0`;
350	out:
351	srcu_read_unlock(ssp: &kvm->irq_srcu, idx);
352	return ret;
353	}
354

source code of linux/arch/x86/kvm/vmx/posted_intr.c