xen.c source code [linux/arch/x86/kvm/xen.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
4	* Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5	*
6	* KVM Xen emulation
7	*/
8	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
10	#include "x86.h"
11	#include "xen.h"
12	#include "hyperv.h"
13	#include "lapic.h"
14
15	#include <linux/eventfd.h>
16	#include <linux/kvm_host.h>
17	#include <linux/sched/stat.h>
18
19	#include <trace/events/kvm.h>
20	#include <xen/interface/xen.h>
21	#include <xen/interface/vcpu.h>
22	#include <xen/interface/version.h>
23	#include <xen/interface/event_channel.h>
24	#include <xen/interface/sched.h>
25
26	#include <asm/xen/cpuid.h>
27
28	#include "cpuid.h"
29	#include "trace.h"
30
31	static int kvm_xen_set_evtchn(struct kvm_xen_evtchn xe, struct* kvm *kvm);
32	static int kvm_xen_setattr_evtchn(struct kvm kvm, struct* kvm_xen_hvm_attr *data);
33	static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu vcpu, u64 param, u64 r);
34
35	DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
36
37	static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
38	{
39	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
40	struct pvclock_wall_clock *wc;
41	gpa_t gpa = gfn_to_gpa(gfn);
42	u32 *wc_sec_hi;
43	u32 wc_version;
44	u64 wall_nsec;
45	int ret = `0`;
46	int idx = srcu_read_lock(ssp: &kvm->srcu);
47
48	if (gfn == KVM_XEN_INVALID_GFN) {
49	kvm_gpc_deactivate(gpc);
50	goto out;
51	}
52
53	do {
54	ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
55	if (ret)
56	goto out;
57
58	/*
59	* This code mirrors kvm_write_wall_clock() except that it writes
60	* directly through the pfn cache and doesn't mark the page dirty.
61	*/
62	wall_nsec = kvm_get_wall_clock_epoch(kvm);
63
64	/ It could be invalid again already, so we need to check /
65	read_lock_irq(&gpc->lock);
66
67	if (gpc->valid)
68	break;
69
70	read_unlock_irq(&gpc->lock);
71	} while (`1`);
72
73	/ Paranoia checks on the 32-bit struct layout /
74	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != `0x900`);
75	BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != `0x924`);
76	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != `0`);
77
78	#ifdef CONFIG_X86_64
79	/ Paranoia checks on the 64-bit struct layout /
80	BUILD_BUG_ON(offsetof(struct shared_info, wc) != `0xc00`);
81	BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != `0xc0c`);
82
83	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
84	struct shared_info *shinfo = gpc->khva;
85
86	wc_sec_hi = &shinfo->wc_sec_hi;
87	wc = &shinfo->wc;
88	} else
89	#endif
90	{
91	struct compat_shared_info *shinfo = gpc->khva;
92
93	wc_sec_hi = &shinfo->arch.wc_sec_hi;
94	wc = &shinfo->wc;
95	}
96
97	/ Increment and ensure an odd value /
98	wc_version = wc->version = (wc->version + `1`) \| `1`;
99	smp_wmb();
100
101	wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
102	wc->sec = (u32)wall_nsec;
103	*wc_sec_hi = wall_nsec >> `32`;
104	smp_wmb();
105
106	wc->version = wc_version + `1`;
107	read_unlock_irq(&gpc->lock);
108
109	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
110
111	out:
112	srcu_read_unlock(ssp: &kvm->srcu, idx);
113	return ret;
114	}
115
116	void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
117	{
118	if (atomic_read(v: &vcpu->arch.xen.timer_pending) > `0`) {
119	struct kvm_xen_evtchn e;
120
121	e.vcpu_id = vcpu->vcpu_id;
122	e.vcpu_idx = vcpu->vcpu_idx;
123	e.port = vcpu->arch.xen.timer_virq;
124	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
125
126	kvm_xen_set_evtchn(xe: &e, kvm: vcpu->kvm);
127
128	vcpu->arch.xen.timer_expires = `0`;
129	atomic_set(v: &vcpu->arch.xen.timer_pending, i: `0`);
130	}
131	}
132
133	static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
134	{
135	struct kvm_vcpu vcpu = container_of(timer, struct* kvm_vcpu,
136	arch.xen.timer);
137	struct kvm_xen_evtchn e;
138	int rc;
139
140	if (atomic_read(v: &vcpu->arch.xen.timer_pending))
141	return HRTIMER_NORESTART;
142
143	e.vcpu_id = vcpu->vcpu_id;
144	e.vcpu_idx = vcpu->vcpu_idx;
145	e.port = vcpu->arch.xen.timer_virq;
146	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
147
148	rc = kvm_xen_set_evtchn_fast(xe: &e, kvm: vcpu->kvm);
149	if (rc != -EWOULDBLOCK) {
150	vcpu->arch.xen.timer_expires = `0`;
151	return HRTIMER_NORESTART;
152	}
153
154	atomic_inc(v: &vcpu->arch.xen.timer_pending);
155	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
156	kvm_vcpu_kick(vcpu);
157
158	return HRTIMER_NORESTART;
159	}
160
161	static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
162	{
163	/*
164	* Avoid races with the old timer firing. Checking timer_expires
165	* to avoid calling hrtimer_cancel() will only have false positives
166	* so is fine.
167	*/
168	if (vcpu->arch.xen.timer_expires)
169	hrtimer_cancel(timer: &vcpu->arch.xen.timer);
170
171	atomic_set(v: &vcpu->arch.xen.timer_pending, i: `0`);
172	vcpu->arch.xen.timer_expires = guest_abs;
173
174	if (delta_ns <= `0`) {
175	xen_timer_callback(timer: &vcpu->arch.xen.timer);
176	} else {
177	ktime_t ktime_now = ktime_get();
178	hrtimer_start(timer: &vcpu->arch.xen.timer,
179	ktime_add_ns(ktime_now, delta_ns),
180	mode: HRTIMER_MODE_ABS_HARD);
181	}
182	}
183
184	static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
185	{
186	hrtimer_cancel(timer: &vcpu->arch.xen.timer);
187	vcpu->arch.xen.timer_expires = `0`;
188	atomic_set(v: &vcpu->arch.xen.timer_pending, i: `0`);
189	}
190
191	static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
192	{
193	hrtimer_init(timer: &vcpu->arch.xen.timer, CLOCK_MONOTONIC,
194	mode: HRTIMER_MODE_ABS_HARD);
195	vcpu->arch.xen.timer.function = xen_timer_callback;
196	}
197
198	static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
199	{
200	struct kvm_vcpu_xen *vx = &v->arch.xen;
201	struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
202	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
203	size_t user_len, user_len1, user_len2;
204	struct vcpu_runstate_info rs;
205	unsigned long flags;
206	size_t times_ofs;
207	uint8_t *update_bit = NULL;
208	uint64_t entry_time;
209	uint64_t *rs_times;
210	int *rs_state;
211
212	/*
213	* The only difference between 32-bit and 64-bit versions of the
214	* runstate struct is the alignment of uint64_t in 32-bit, which
215	* means that the 64-bit version has an additional 4 bytes of
216	* padding after the first field 'state'. Let's be really really
217	* paranoid about that, and matching it with our internal data
218	* structures that we memcpy into it...
219	*/
220	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != `0`);
221	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != `0`);
222	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != `0x2c`);
223	#ifdef CONFIG_X86_64
224	/*
225	* The 64-bit structure has 4 bytes of padding before 'state_entry_time'
226	* so each subsequent field is shifted by 4, and it's 4 bytes longer.
227	*/
228	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
229	offsetof(struct compat_vcpu_runstate_info, state_entry_time) + `4`);
230	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
231	offsetof(struct compat_vcpu_runstate_info, time) + `4`);
232	BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != `0x2c` + `4`);
233	#endif
234	/*
235	* The state field is in the same place at the start of both structs,
236	* and is the same size (int) as vx->current_runstate.
237	*/
238	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
239	offsetof(struct compat_vcpu_runstate_info, state));
240	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
241	sizeof(vx->current_runstate));
242	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
243	sizeof(vx->current_runstate));
244
245	/*
246	* The state_entry_time field is 64 bits in both versions, and the
247	* XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
248	* is little-endian means that it's in the last byte of the word.
249	* That detail is important later.
250	*/
251	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
252	sizeof(uint64_t));
253	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
254	sizeof(uint64_t));
255	BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> `56`) != `0x80`);
256
257	/*
258	* The time array is four 64-bit quantities in both versions, matching
259	* the vx->runstate_times and immediately following state_entry_time.
260	*/
261	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
262	offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
263	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
264	offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
265	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
266	sizeof_field(struct compat_vcpu_runstate_info, time));
267	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
268	sizeof(vx->runstate_times));
269
270	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
271	user_len = sizeof(struct vcpu_runstate_info);
272	times_ofs = offsetof(struct vcpu_runstate_info,
273	state_entry_time);
274	} else {
275	user_len = sizeof(struct compat_vcpu_runstate_info);
276	times_ofs = offsetof(struct compat_vcpu_runstate_info,
277	state_entry_time);
278	}
279
280	/*
281	* There are basically no alignment constraints. The guest can set it
282	* up so it crosses from one page to the next, and at arbitrary byte
283	* alignment (and the 32-bit ABI doesn't align the 64-bit integers
284	* anyway, even if the overall struct had been 64-bit aligned).
285	*/
286	if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
287	user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
288	user_len2 = user_len - user_len1;
289	} else {
290	user_len1 = user_len;
291	user_len2 = `0`;
292	}
293	BUG_ON(user_len1 + user_len2 != user_len);
294
295	retry:
296	/*
297	* Attempt to obtain the GPC lock on both (if there are two)
298	* gfn_to_pfn caches that cover the region.
299	*/
300	if (atomic) {
301	local_irq_save(flags);
302	if (!read_trylock(&gpc1->lock)) {
303	local_irq_restore(flags);
304	return;
305	}
306	} else {
307	read_lock_irqsave(&gpc1->lock, flags);
308	}
309	while (!kvm_gpc_check(gpc: gpc1, len: user_len1)) {
310	read_unlock_irqrestore(&gpc1->lock, flags);
311
312	/ When invoked from kvm_sched_out() we cannot sleep /
313	if (atomic)
314	return;
315
316	if (kvm_gpc_refresh(gpc: gpc1, len: user_len1))
317	return;
318
319	read_lock_irqsave(&gpc1->lock, flags);
320	}
321
322	if (likely(!user_len2)) {
323	/*
324	* Set up three pointers directly to the runstate_info
325	* struct in the guest (via the GPC).
326	*
327	* • @rs_state → state field
328	* • @rs_times → state_entry_time field.
329	* • @update_bit → last byte of state_entry_time, which
330	* contains the XEN_RUNSTATE_UPDATE bit.
331	*/
332	rs_state = gpc1->khva;
333	rs_times = gpc1->khva + times_ofs;
334	if (v->kvm->arch.xen.runstate_update_flag)
335	update_bit = ((void *)(&rs_times[`1`])) - `1`;
336	} else {
337	/*
338	* The guest's runstate_info is split across two pages and we
339	* need to hold and validate both GPCs simultaneously. We can
340	* declare a lock ordering GPC1 > GPC2 because nothing else
341	* takes them more than one at a time. Set a subclass on the
342	* gpc1 lock to make lockdep shut up about it.
343	*/
344	lock_set_subclass(lock: &gpc1->lock.dep_map, subclass: `1`, _THIS_IP_);
345	if (atomic) {
346	if (!read_trylock(&gpc2->lock)) {
347	read_unlock_irqrestore(&gpc1->lock, flags);
348	return;
349	}
350	} else {
351	read_lock(&gpc2->lock);
352	}
353
354	if (!kvm_gpc_check(gpc: gpc2, len: user_len2)) {
355	read_unlock(&gpc2->lock);
356	read_unlock_irqrestore(&gpc1->lock, flags);
357
358	/ When invoked from kvm_sched_out() we cannot sleep /
359	if (atomic)
360	return;
361
362	/*
363	* Use kvm_gpc_activate() here because if the runstate
364	* area was configured in 32-bit mode and only extends
365	* to the second page now because the guest changed to
366	* 64-bit mode, the second GPC won't have been set up.
367	*/
368	if (kvm_gpc_activate(gpc: gpc2, gpa: gpc1->gpa + user_len1,
369	len: user_len2))
370	return;
371
372	/*
373	* We dropped the lock on GPC1 so we have to go all the
374	* way back and revalidate that too.
375	*/
376	goto retry;
377	}
378
379	/*
380	* In this case, the runstate_info struct will be assembled on
381	* the kernel stack (compat or not as appropriate) and will
382	* be copied to GPC1/GPC2 with a dual memcpy. Set up the three
383	* rs pointers accordingly.
384	*/
385	rs_times = &rs.state_entry_time;
386
387	/*
388	* The rs_state pointer points to the start of what we'll
389	* copy to the guest, which in the case of a compat guest
390	* is the 32-bit field that the compiler thinks is padding.
391	*/
392	rs_state = ((void *)rs_times) - times_ofs;
393
394	/*
395	* The update_bit is still directly in the guest memory,
396	* via one GPC or the other.
397	*/
398	if (v->kvm->arch.xen.runstate_update_flag) {
399	if (user_len1 >= times_ofs + sizeof(uint64_t))
400	update_bit = gpc1->khva + times_ofs +
401	sizeof(uint64_t) - `1`;
402	else
403	update_bit = gpc2->khva + times_ofs +
404	sizeof(uint64_t) - `1` - user_len1;
405	}
406
407	#ifdef CONFIG_X86_64
408	/*
409	* Don't leak kernel memory through the padding in the 64-bit
410	* version of the struct.
411	*/
412	memset(&rs, `0`, offsetof(struct vcpu_runstate_info, state_entry_time));
413	#endif
414	}
415
416	/*
417	* First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
418	* state_entry_time field, directly in the guest. We need to set
419	* that (and write-barrier) before writing to the rest of the
420	* structure, and clear it last. Just as Xen does, we address the
421	* single byte in which it resides because it might be in a
422	* different cache line to the rest of the 64-bit word, due to
423	* the (lack of) alignment constraints.
424	*/
425	entry_time = vx->runstate_entry_time;
426	if (update_bit) {
427	entry_time \|= XEN_RUNSTATE_UPDATE;
428	*update_bit = (vx->runstate_entry_time \| XEN_RUNSTATE_UPDATE) >> `56`;
429	smp_wmb();
430	}
431
432	/*
433	* Now assemble the actual structure, either on our kernel stack
434	* or directly in the guest according to how the rs_state and
435	* rs_times pointers were set up above.
436	*/
437	*rs_state = vx->current_runstate;
438	rs_times[`0`] = entry_time;
439	memcpy(rs_times + `1`, vx->runstate_times, sizeof(vx->runstate_times));
440
441	/ For the split case, we have to then copy it to the guest. /
442	if (user_len2) {
443	memcpy(gpc1->khva, rs_state, user_len1);
444	memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
445	}
446	smp_wmb();
447
448	/ Finally, clear the XEN_RUNSTATE_UPDATE bit. /
449	if (update_bit) {
450	entry_time &= ~XEN_RUNSTATE_UPDATE;
451	*update_bit = entry_time >> `56`;
452	smp_wmb();
453	}
454
455	if (user_len2)
456	read_unlock(&gpc2->lock);
457
458	read_unlock_irqrestore(&gpc1->lock, flags);
459
460	mark_page_dirty_in_slot(kvm: v->kvm, memslot: gpc1->memslot, gfn: gpc1->gpa >> PAGE_SHIFT);
461	if (user_len2)
462	mark_page_dirty_in_slot(kvm: v->kvm, memslot: gpc2->memslot, gfn: gpc2->gpa >> PAGE_SHIFT);
463	}
464
465	void kvm_xen_update_runstate(struct kvm_vcpu v, int* state)
466	{
467	struct kvm_vcpu_xen *vx = &v->arch.xen;
468	u64 now = get_kvmclock_ns(kvm: v->kvm);
469	u64 delta_ns = now - vx->runstate_entry_time;
470	u64 run_delay = current->sched_info.run_delay;
471
472	if (unlikely(!vx->runstate_entry_time))
473	vx->current_runstate = RUNSTATE_offline;
474
475	/*
476	* Time waiting for the scheduler isn't "stolen" if the
477	* vCPU wasn't running anyway.
478	*/
479	if (vx->current_runstate == RUNSTATE_running) {
480	u64 steal_ns = run_delay - vx->last_steal;
481
482	delta_ns -= steal_ns;
483
484	vx->runstate_times[RUNSTATE_runnable] += steal_ns;
485	}
486	vx->last_steal = run_delay;
487
488	vx->runstate_times[vx->current_runstate] += delta_ns;
489	vx->current_runstate = state;
490	vx->runstate_entry_time = now;
491
492	if (vx->runstate_cache.active)
493	kvm_xen_update_runstate_guest(v, atomic: state == RUNSTATE_runnable);
494	}
495
496	static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
497	{
498	struct kvm_lapic_irq irq = { };
499	int r;
500
501	irq.dest_id = v->vcpu_id;
502	irq.vector = v->arch.xen.upcall_vector;
503	irq.dest_mode = APIC_DEST_PHYSICAL;
504	irq.shorthand = APIC_DEST_NOSHORT;
505	irq.delivery_mode = APIC_DM_FIXED;
506	irq.level = `1`;
507
508	/ The fast version will always work for physical unicast /
509	WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
510	}
511
512	/*
513	* On event channel delivery, the vcpu_info may not have been accessible.
514	* In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
515	* need to be marked into the vcpu_info (and evtchn_upcall_pending set).
516	* Do so now that we can sleep in the context of the vCPU to bring the
517	* page in, and refresh the pfn cache for it.
518	*/
519	void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
520	{
521	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
522	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
523	unsigned long flags;
524
525	if (!evtchn_pending_sel)
526	return;
527
528	/*
529	* Yes, this is an open-coded loop. But that's just what put_user()
530	* does anyway. Page it in and retry the instruction. We're just a
531	* little more honest about it.
532	*/
533	read_lock_irqsave(&gpc->lock, flags);
534	while (!kvm_gpc_check(gpc, len: sizeof(struct vcpu_info))) {
535	read_unlock_irqrestore(&gpc->lock, flags);
536
537	if (kvm_gpc_refresh(gpc, len: sizeof(struct vcpu_info)))
538	return;
539
540	read_lock_irqsave(&gpc->lock, flags);
541	}
542
543	/ Now gpc->khva is a valid kernel address for the vcpu_info /
544	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
545	struct vcpu_info *vi = gpc->khva;
546
547	asm volatile(LOCK_PREFIX "orq %0, %1\n"
548	"notq %0\n"
549	LOCK_PREFIX "andq %0, %2\n"
550	: "=r" (evtchn_pending_sel),
551	"+m" (vi->evtchn_pending_sel),
552	"+m" (v->arch.xen.evtchn_pending_sel)
553	: "0" (evtchn_pending_sel));
554	WRITE_ONCE(vi->evtchn_upcall_pending, `1`);
555	} else {
556	u32 evtchn_pending_sel32 = evtchn_pending_sel;
557	struct compat_vcpu_info *vi = gpc->khva;
558
559	asm volatile(LOCK_PREFIX "orl %0, %1\n"
560	"notl %0\n"
561	LOCK_PREFIX "andl %0, %2\n"
562	: "=r" (evtchn_pending_sel32),
563	"+m" (vi->evtchn_pending_sel),
564	"+m" (v->arch.xen.evtchn_pending_sel)
565	: "0" (evtchn_pending_sel32));
566	WRITE_ONCE(vi->evtchn_upcall_pending, `1`);
567	}
568	read_unlock_irqrestore(&gpc->lock, flags);
569
570	/ For the per-vCPU lapic vector, deliver it as MSI. /
571	if (v->arch.xen.upcall_vector)
572	kvm_xen_inject_vcpu_vector(v);
573
574	mark_page_dirty_in_slot(kvm: v->kvm, memslot: gpc->memslot, gfn: gpc->gpa >> PAGE_SHIFT);
575	}
576
577	int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
578	{
579	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
580	unsigned long flags;
581	u8 rc = `0`;
582
583	/*
584	* If the global upcall vector (HVMIRQ_callback_vector) is set and
585	* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
586	*/
587
588	/ No need for compat handling here /
589	BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
590	offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
591	BUILD_BUG_ON(sizeof(rc) !=
592	sizeof_field(struct vcpu_info, evtchn_upcall_pending));
593	BUILD_BUG_ON(sizeof(rc) !=
594	sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
595
596	read_lock_irqsave(&gpc->lock, flags);
597	while (!kvm_gpc_check(gpc, len: sizeof(struct vcpu_info))) {
598	read_unlock_irqrestore(&gpc->lock, flags);
599
600	/*
601	* This function gets called from kvm_vcpu_block() after setting the
602	* task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
603	* from a HLT. So we really mustn't sleep. If the page ended up absent
604	* at that point, just return 1 in order to trigger an immediate wake,
605	* and we'll end up getting called again from a context where we can
606	* fault in the page and wait for it.
607	*/
608	if (in_atomic() \|\| !task_is_running(current))
609	return `1`;
610
611	if (kvm_gpc_refresh(gpc, len: sizeof(struct vcpu_info))) {
612	/*
613	* If this failed, userspace has screwed up the
614	* vcpu_info mapping. No interrupts for you.
615	*/
616	return `0`;
617	}
618	read_lock_irqsave(&gpc->lock, flags);
619	}
620
621	rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
622	read_unlock_irqrestore(&gpc->lock, flags);
623	return rc;
624	}
625
626	int kvm_xen_hvm_set_attr(struct kvm kvm, struct* kvm_xen_hvm_attr *data)
627	{
628	int r = -ENOENT;
629
630
631	switch (data->type) {
632	case KVM_XEN_ATTR_TYPE_LONG_MODE:
633	if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
634	r = -EINVAL;
635	} else {
636	mutex_lock(&kvm->arch.xen.xen_lock);
637	kvm->arch.xen.long_mode = !!data->u.long_mode;
638	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
639	r = `0`;
640	}
641	break;
642
643	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
644	mutex_lock(&kvm->arch.xen.xen_lock);
645	r = kvm_xen_shared_info_init(kvm, gfn: data->u.shared_info.gfn);
646	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
647	break;
648
649	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
650	if (data->u.vector && data->u.vector < `0x10`)
651	r = -EINVAL;
652	else {
653	mutex_lock(&kvm->arch.xen.xen_lock);
654	kvm->arch.xen.upcall_vector = data->u.vector;
655	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
656	r = `0`;
657	}
658	break;
659
660	case KVM_XEN_ATTR_TYPE_EVTCHN:
661	r = kvm_xen_setattr_evtchn(kvm, data);
662	break;
663
664	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
665	mutex_lock(&kvm->arch.xen.xen_lock);
666	kvm->arch.xen.xen_version = data->u.xen_version;
667	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
668	r = `0`;
669	break;
670
671	case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
672	if (!sched_info_on()) {
673	r = -EOPNOTSUPP;
674	break;
675	}
676	mutex_lock(&kvm->arch.xen.xen_lock);
677	kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
678	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
679	r = `0`;
680	break;
681
682	default:
683	break;
684	}
685
686	return r;
687	}
688
689	int kvm_xen_hvm_get_attr(struct kvm kvm, struct* kvm_xen_hvm_attr *data)
690	{
691	int r = -ENOENT;
692
693	mutex_lock(&kvm->arch.xen.xen_lock);
694
695	switch (data->type) {
696	case KVM_XEN_ATTR_TYPE_LONG_MODE:
697	data->u.long_mode = kvm->arch.xen.long_mode;
698	r = `0`;
699	break;
700
701	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
702	if (kvm->arch.xen.shinfo_cache.active)
703	data->u.shared_info.gfn = gpa_to_gfn(gpa: kvm->arch.xen.shinfo_cache.gpa);
704	else
705	data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
706	r = `0`;
707	break;
708
709	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
710	data->u.vector = kvm->arch.xen.upcall_vector;
711	r = `0`;
712	break;
713
714	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
715	data->u.xen_version = kvm->arch.xen.xen_version;
716	r = `0`;
717	break;
718
719	case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
720	if (!sched_info_on()) {
721	r = -EOPNOTSUPP;
722	break;
723	}
724	data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
725	r = `0`;
726	break;
727
728	default:
729	break;
730	}
731
732	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
733	return r;
734	}
735
736	int kvm_xen_vcpu_set_attr(struct kvm_vcpu vcpu, struct* kvm_xen_vcpu_attr *data)
737	{
738	int idx, r = -ENOENT;
739
740	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
741	idx = srcu_read_lock(ssp: &vcpu->kvm->srcu);
742
743	switch (data->type) {
744	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
745	/ No compat necessary here. /
746	BUILD_BUG_ON(sizeof(struct vcpu_info) !=
747	sizeof(struct compat_vcpu_info));
748	BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
749	offsetof(struct compat_vcpu_info, time));
750
751	if (data->u.gpa == KVM_XEN_INVALID_GPA) {
752	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_info_cache);
753	r = `0`;
754	break;
755	}
756
757	r = kvm_gpc_activate(gpc: &vcpu->arch.xen.vcpu_info_cache,
758	gpa: data->u.gpa, len: sizeof(struct vcpu_info));
759	if (!r)
760	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
761
762	break;
763
764	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
765	if (data->u.gpa == KVM_XEN_INVALID_GPA) {
766	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_time_info_cache);
767	r = `0`;
768	break;
769	}
770
771	r = kvm_gpc_activate(gpc: &vcpu->arch.xen.vcpu_time_info_cache,
772	gpa: data->u.gpa,
773	len: sizeof(struct pvclock_vcpu_time_info));
774	if (!r)
775	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
776	break;
777
778	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
779	size_t sz, sz1, sz2;
780
781	if (!sched_info_on()) {
782	r = -EOPNOTSUPP;
783	break;
784	}
785	if (data->u.gpa == KVM_XEN_INVALID_GPA) {
786	r = `0`;
787	deactivate_out:
788	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate_cache);
789	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate2_cache);
790	break;
791	}
792
793	/*
794	* If the guest switches to 64-bit mode after setting the runstate
795	* address, that's actually OK. kvm_xen_update_runstate_guest()
796	* will cope.
797	*/
798	if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
799	sz = sizeof(struct vcpu_runstate_info);
800	else
801	sz = sizeof(struct compat_vcpu_runstate_info);
802
803	/ How much fits in the (first) page? /
804	sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
805	r = kvm_gpc_activate(gpc: &vcpu->arch.xen.runstate_cache,
806	gpa: data->u.gpa, len: sz1);
807	if (r)
808	goto deactivate_out;
809
810	/ Either map the second page, or deactivate the second GPC /
811	if (sz1 >= sz) {
812	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate2_cache);
813	} else {
814	sz2 = sz - sz1;
815	BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
816	r = kvm_gpc_activate(gpc: &vcpu->arch.xen.runstate2_cache,
817	gpa: data->u.gpa + sz1, len: sz2);
818	if (r)
819	goto deactivate_out;
820	}
821
822	kvm_xen_update_runstate_guest(v: vcpu, atomic: false);
823	break;
824	}
825	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
826	if (!sched_info_on()) {
827	r = -EOPNOTSUPP;
828	break;
829	}
830	if (data->u.runstate.state > RUNSTATE_offline) {
831	r = -EINVAL;
832	break;
833	}
834
835	kvm_xen_update_runstate(v: vcpu, state: data->u.runstate.state);
836	r = `0`;
837	break;
838
839	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
840	if (!sched_info_on()) {
841	r = -EOPNOTSUPP;
842	break;
843	}
844	if (data->u.runstate.state > RUNSTATE_offline) {
845	r = -EINVAL;
846	break;
847	}
848	if (data->u.runstate.state_entry_time !=
849	(data->u.runstate.time_running +
850	data->u.runstate.time_runnable +
851	data->u.runstate.time_blocked +
852	data->u.runstate.time_offline)) {
853	r = -EINVAL;
854	break;
855	}
856	if (get_kvmclock_ns(kvm: vcpu->kvm) <
857	data->u.runstate.state_entry_time) {
858	r = -EINVAL;
859	break;
860	}
861
862	vcpu->arch.xen.current_runstate = data->u.runstate.state;
863	vcpu->arch.xen.runstate_entry_time =
864	data->u.runstate.state_entry_time;
865	vcpu->arch.xen.runstate_times[RUNSTATE_running] =
866	data->u.runstate.time_running;
867	vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
868	data->u.runstate.time_runnable;
869	vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
870	data->u.runstate.time_blocked;
871	vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
872	data->u.runstate.time_offline;
873	vcpu->arch.xen.last_steal = current->sched_info.run_delay;
874	r = `0`;
875	break;
876
877	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
878	if (!sched_info_on()) {
879	r = -EOPNOTSUPP;
880	break;
881	}
882	if (data->u.runstate.state > RUNSTATE_offline &&
883	data->u.runstate.state != (u64)-`1`) {
884	r = -EINVAL;
885	break;
886	}
887	/ The adjustment must add up /
888	if (data->u.runstate.state_entry_time !=
889	(data->u.runstate.time_running +
890	data->u.runstate.time_runnable +
891	data->u.runstate.time_blocked +
892	data->u.runstate.time_offline)) {
893	r = -EINVAL;
894	break;
895	}
896
897	if (get_kvmclock_ns(kvm: vcpu->kvm) <
898	(vcpu->arch.xen.runstate_entry_time +
899	data->u.runstate.state_entry_time)) {
900	r = -EINVAL;
901	break;
902	}
903
904	vcpu->arch.xen.runstate_entry_time +=
905	data->u.runstate.state_entry_time;
906	vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
907	data->u.runstate.time_running;
908	vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
909	data->u.runstate.time_runnable;
910	vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
911	data->u.runstate.time_blocked;
912	vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
913	data->u.runstate.time_offline;
914
915	if (data->u.runstate.state <= RUNSTATE_offline)
916	kvm_xen_update_runstate(v: vcpu, state: data->u.runstate.state);
917	else if (vcpu->arch.xen.runstate_cache.active)
918	kvm_xen_update_runstate_guest(v: vcpu, atomic: false);
919	r = `0`;
920	break;
921
922	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
923	if (data->u.vcpu_id >= KVM_MAX_VCPUS)
924	r = -EINVAL;
925	else {
926	vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
927	r = `0`;
928	}
929	break;
930
931	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
932	if (data->u.timer.port &&
933	data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
934	r = -EINVAL;
935	break;
936	}
937
938	if (!vcpu->arch.xen.timer.function)
939	kvm_xen_init_timer(vcpu);
940
941	/ Stop the timer (if it's running) before changing the vector /
942	kvm_xen_stop_timer(vcpu);
943	vcpu->arch.xen.timer_virq = data->u.timer.port;
944
945	/ Start the timer if the new value has a valid vector+expiry. /
946	if (data->u.timer.port && data->u.timer.expires_ns)
947	kvm_xen_start_timer(vcpu, guest_abs: data->u.timer.expires_ns,
948	delta_ns: data->u.timer.expires_ns -
949	get_kvmclock_ns(kvm: vcpu->kvm));
950
951	r = `0`;
952	break;
953
954	case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
955	if (data->u.vector && data->u.vector < `0x10`)
956	r = -EINVAL;
957	else {
958	vcpu->arch.xen.upcall_vector = data->u.vector;
959	r = `0`;
960	}
961	break;
962
963	default:
964	break;
965	}
966
967	srcu_read_unlock(ssp: &vcpu->kvm->srcu, idx);
968	mutex_unlock(lock: &vcpu->kvm->arch.xen.xen_lock);
969	return r;
970	}
971
972	int kvm_xen_vcpu_get_attr(struct kvm_vcpu vcpu, struct* kvm_xen_vcpu_attr *data)
973	{
974	int r = -ENOENT;
975
976	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
977
978	switch (data->type) {
979	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
980	if (vcpu->arch.xen.vcpu_info_cache.active)
981	data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
982	else
983	data->u.gpa = KVM_XEN_INVALID_GPA;
984	r = `0`;
985	break;
986
987	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
988	if (vcpu->arch.xen.vcpu_time_info_cache.active)
989	data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
990	else
991	data->u.gpa = KVM_XEN_INVALID_GPA;
992	r = `0`;
993	break;
994
995	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
996	if (!sched_info_on()) {
997	r = -EOPNOTSUPP;
998	break;
999	}
1000	if (vcpu->arch.xen.runstate_cache.active) {
1001	data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
1002	r = `0`;
1003	}
1004	break;
1005
1006	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
1007	if (!sched_info_on()) {
1008	r = -EOPNOTSUPP;
1009	break;
1010	}
1011	data->u.runstate.state = vcpu->arch.xen.current_runstate;
1012	r = `0`;
1013	break;
1014
1015	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
1016	if (!sched_info_on()) {
1017	r = -EOPNOTSUPP;
1018	break;
1019	}
1020	data->u.runstate.state = vcpu->arch.xen.current_runstate;
1021	data->u.runstate.state_entry_time =
1022	vcpu->arch.xen.runstate_entry_time;
1023	data->u.runstate.time_running =
1024	vcpu->arch.xen.runstate_times[RUNSTATE_running];
1025	data->u.runstate.time_runnable =
1026	vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
1027	data->u.runstate.time_blocked =
1028	vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
1029	data->u.runstate.time_offline =
1030	vcpu->arch.xen.runstate_times[RUNSTATE_offline];
1031	r = `0`;
1032	break;
1033
1034	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
1035	r = -EINVAL;
1036	break;
1037
1038	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
1039	data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
1040	r = `0`;
1041	break;
1042
1043	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
1044	/*
1045	* Ensure a consistent snapshot of state is captured, with a
1046	* timer either being pending, or the event channel delivered
1047	* to the corresponding bit in the shared_info. Not still
1048	* lurking in the timer_pending flag for deferred delivery.
1049	* Purely as an optimisation, if the timer_expires field is
1050	* zero, that means the timer isn't active (or even in the
1051	* timer_pending flag) and there is no need to cancel it.
1052	*/
1053	if (vcpu->arch.xen.timer_expires) {
1054	hrtimer_cancel(timer: &vcpu->arch.xen.timer);
1055	kvm_xen_inject_timer_irqs(vcpu);
1056	}
1057
1058	data->u.timer.port = vcpu->arch.xen.timer_virq;
1059	data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
1060	data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
1061
1062	/*
1063	* The hrtimer may trigger and raise the IRQ immediately,
1064	* while the returned state causes it to be set up and
1065	* raised again on the destination system after migration.
1066	* That's fine, as the guest won't even have had a chance
1067	* to run and handle the interrupt. Asserting an already
1068	* pending event channel is idempotent.
1069	*/
1070	if (vcpu->arch.xen.timer_expires)
1071	hrtimer_start_expires(timer: &vcpu->arch.xen.timer,
1072	mode: HRTIMER_MODE_ABS_HARD);
1073
1074	r = `0`;
1075	break;
1076
1077	case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
1078	data->u.vector = vcpu->arch.xen.upcall_vector;
1079	r = `0`;
1080	break;
1081
1082	default:
1083	break;
1084	}
1085
1086	mutex_unlock(lock: &vcpu->kvm->arch.xen.xen_lock);
1087	return r;
1088	}
1089
1090	int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
1091	{
1092	struct kvm *kvm = vcpu->kvm;
1093	u32 page_num = data & ~PAGE_MASK;
1094	u64 page_addr = data & PAGE_MASK;
1095	bool lm = is_long_mode(vcpu);
1096
1097	/ Latch long_mode for shared_info pages etc. /
1098	vcpu->kvm->arch.xen.long_mode = lm;
1099
1100	/*
1101	* If Xen hypercall intercept is enabled, fill the hypercall
1102	* page with VMCALL/VMMCALL instructions since that's what
1103	* we catch. Else the VMM has provided the hypercall pages
1104	* with instructions of its own choosing, so use those.
1105	*/
1106	if (kvm_xen_hypercall_enabled(kvm)) {
1107	u8 instructions[`32`];
1108	int i;
1109
1110	if (page_num)
1111	return `1`;
1112
1113	/ mov imm32, %eax /
1114	instructions[`0`] = `0xb8`;
1115
1116	/ vmcall / vmmcall /
1117	static_call(kvm_x86_patch_hypercall)(vcpu, instructions + `5`);
1118
1119	/ ret /
1120	instructions[`8`] = `0xc3`;
1121
1122	/ int3 to pad /
1123	memset(instructions + `9`, `0xcc`, sizeof(instructions) - `9`);
1124
1125	for (i = `0`; i < PAGE_SIZE / sizeof(instructions); i++) {
1126	(u32 )&instructions[`1`] = i;
1127	if (kvm_vcpu_write_guest(vcpu,
1128	gpa: page_addr + (i * sizeof(instructions)),
1129	data: instructions, len: sizeof(instructions)))
1130	return `1`;
1131	}
1132	} else {
1133	/*
1134	* Note, truncation is a non-issue as 'lm' is guaranteed to be
1135	* false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
1136	*/
1137	hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
1138	: kvm->arch.xen_hvm_config.blob_addr_32;
1139	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1140	: kvm->arch.xen_hvm_config.blob_size_32;
1141	u8 *page;
1142	int ret;
1143
1144	if (page_num >= blob_size)
1145	return `1`;
1146
1147	blob_addr += page_num * PAGE_SIZE;
1148
1149	page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
1150	if (IS_ERR(ptr: page))
1151	return PTR_ERR(ptr: page);
1152
1153	ret = kvm_vcpu_write_guest(vcpu, gpa: page_addr, data: page, PAGE_SIZE);
1154	kfree(objp: page);
1155	if (ret)
1156	return `1`;
1157	}
1158	return `0`;
1159	}
1160
1161	int kvm_xen_hvm_config(struct kvm kvm, struct* kvm_xen_hvm_config *xhc)
1162	{
1163	/ Only some feature flags need to be enabled by userspace /
1164	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL \|
1165	KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
1166
1167	if (xhc->flags & ~permitted_flags)
1168	return -EINVAL;
1169
1170	/*
1171	* With hypercall interception the kernel generates its own
1172	* hypercall page so it must not be provided.
1173	*/
1174	if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
1175	(xhc->blob_addr_32 \|\| xhc->blob_addr_64 \|\|
1176	xhc->blob_size_32 \|\| xhc->blob_size_64))
1177	return -EINVAL;
1178
1179	mutex_lock(&kvm->arch.xen.xen_lock);
1180
1181	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
1182	static_branch_inc(&kvm_xen_enabled.key);
1183	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
1184	static_branch_slow_dec_deferred(&kvm_xen_enabled);
1185
1186	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
1187
1188	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
1189	return `0`;
1190	}
1191
1192	static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
1193	{
1194	kvm_rax_write(vcpu, val: result);
1195	return kvm_skip_emulated_instruction(vcpu);
1196	}
1197
1198	static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
1199	{
1200	struct kvm_run *run = vcpu->run;
1201
1202	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
1203	return `1`;
1204
1205	return kvm_xen_hypercall_set_result(vcpu, result: run->xen.u.hcall.result);
1206	}
1207
1208	static inline int max_evtchn_port(struct kvm *kvm)
1209	{
1210	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
1211	return EVTCHN_2L_NR_CHANNELS;
1212	else
1213	return COMPAT_EVTCHN_2L_NR_CHANNELS;
1214	}
1215
1216	static bool wait_pending_event(struct kvm_vcpu vcpu, int* nr_ports,
1217	evtchn_port_t *ports)
1218	{
1219	struct kvm *kvm = vcpu->kvm;
1220	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1221	unsigned long *pending_bits;
1222	unsigned long flags;
1223	bool ret = true;
1224	int idx, i;
1225
1226	idx = srcu_read_lock(ssp: &kvm->srcu);
1227	read_lock_irqsave(&gpc->lock, flags);
1228	if (!kvm_gpc_check(gpc, PAGE_SIZE))
1229	goto out_rcu;
1230
1231	ret = false;
1232	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1233	struct shared_info *shinfo = gpc->khva;
1234	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1235	} else {
1236	struct compat_shared_info *shinfo = gpc->khva;
1237	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1238	}
1239
1240	for (i = `0`; i < nr_ports; i++) {
1241	if (test_bit(ports[i], pending_bits)) {
1242	ret = true;
1243	break;
1244	}
1245	}
1246
1247	out_rcu:
1248	read_unlock_irqrestore(&gpc->lock, flags);
1249	srcu_read_unlock(ssp: &kvm->srcu, idx);
1250
1251	return ret;
1252	}
1253
1254	static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
1255	u64 param, u64 *r)
1256	{
1257	struct sched_poll sched_poll;
1258	evtchn_port_t port, *ports;
1259	struct x86_exception e;
1260	int i;
1261
1262	if (!lapic_in_kernel(vcpu) \|\|
1263	!(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
1264	return false;
1265
1266	if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
1267	struct compat_sched_poll sp32;
1268
1269	/ Sanity check that the compat struct definition is correct /
1270	BUILD_BUG_ON(sizeof(sp32) != `16`);
1271
1272	if (kvm_read_guest_virt(vcpu, addr: param, val: &sp32, bytes: sizeof(sp32), exception: &e)) {
1273	*r = -EFAULT;
1274	return true;
1275	}
1276
1277	/*
1278	* This is a 32-bit pointer to an array of evtchn_port_t which
1279	* are uint32_t, so once it's converted no further compat
1280	* handling is needed.
1281	*/
1282	sched_poll.ports = (void )(unsigned* long)(sp32.ports);
1283	sched_poll.nr_ports = sp32.nr_ports;
1284	sched_poll.timeout = sp32.timeout;
1285	} else {
1286	if (kvm_read_guest_virt(vcpu, addr: param, val: &sched_poll,
1287	bytes: sizeof(sched_poll), exception: &e)) {
1288	*r = -EFAULT;
1289	return true;
1290	}
1291	}
1292
1293	if (unlikely(sched_poll.nr_ports > `1`)) {
1294	/ Xen (unofficially) limits number of pollers to 128 /
1295	if (sched_poll.nr_ports > `128`) {
1296	*r = -EINVAL;
1297	return true;
1298	}
1299
1300	ports = kmalloc_array(n: sched_poll.nr_ports,
1301	size: sizeof(*ports), GFP_KERNEL);
1302	if (!ports) {
1303	*r = -ENOMEM;
1304	return true;
1305	}
1306	} else
1307	ports = &port;
1308
1309	if (kvm_read_guest_virt(vcpu, addr: (gva_t)sched_poll.ports, val: ports,
1310	bytes: sched_poll.nr_ports * sizeof(*ports), exception: &e)) {
1311	*r = -EFAULT;
1312	return true;
1313	}
1314
1315	for (i = `0`; i < sched_poll.nr_ports; i++) {
1316	if (ports[i] >= max_evtchn_port(kvm: vcpu->kvm)) {
1317	*r = -EINVAL;
1318	goto out;
1319	}
1320	}
1321
1322	if (sched_poll.nr_ports == `1`)
1323	vcpu->arch.xen.poll_evtchn = port;
1324	else
1325	vcpu->arch.xen.poll_evtchn = -`1`;
1326
1327	set_bit(nr: vcpu->vcpu_idx, addr: vcpu->kvm->arch.xen.poll_mask);
1328
1329	if (!wait_pending_event(vcpu, nr_ports: sched_poll.nr_ports, ports)) {
1330	vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
1331
1332	if (sched_poll.timeout)
1333	mod_timer(timer: &vcpu->arch.xen.poll_timer,
1334	expires: jiffies + nsecs_to_jiffies(n: sched_poll.timeout));
1335
1336	kvm_vcpu_halt(vcpu);
1337
1338	if (sched_poll.timeout)
1339	del_timer(timer: &vcpu->arch.xen.poll_timer);
1340
1341	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
1342	}
1343
1344	vcpu->arch.xen.poll_evtchn = `0`;
1345	*r = `0`;
1346	out:
1347	/ Really, this is only needed in case of timeout /
1348	clear_bit(nr: vcpu->vcpu_idx, addr: vcpu->kvm->arch.xen.poll_mask);
1349
1350	if (unlikely(sched_poll.nr_ports > `1`))
1351	kfree(objp: ports);
1352	return true;
1353	}
1354
1355	static void cancel_evtchn_poll(struct timer_list *t)
1356	{
1357	struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
1358
1359	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1360	kvm_vcpu_kick(vcpu);
1361	}
1362
1363	static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
1364	int cmd, u64 param, u64 *r)
1365	{
1366	switch (cmd) {
1367	case SCHEDOP_poll:
1368	if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
1369	return true;
1370	fallthrough;
1371	case SCHEDOP_yield:
1372	kvm_vcpu_on_spin(vcpu, yield_to_kernel_mode: true);
1373	*r = `0`;
1374	return true;
1375	default:
1376	break;
1377	}
1378
1379	return false;
1380	}
1381
1382	struct compat_vcpu_set_singleshot_timer {
1383	uint64_t timeout_abs_ns;
1384	uint32_t flags;
1385	} __attribute__((packed));
1386
1387	static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu vcpu, bool longmode, int* cmd,
1388	int vcpu_id, u64 param, u64 *r)
1389	{
1390	struct vcpu_set_singleshot_timer oneshot;
1391	struct x86_exception e;
1392	s64 delta;
1393
1394	if (!kvm_xen_timer_enabled(vcpu))
1395	return false;
1396
1397	switch (cmd) {
1398	case VCPUOP_set_singleshot_timer:
1399	if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1400	*r = -EINVAL;
1401	return true;
1402	}
1403
1404	/*
1405	* The only difference for 32-bit compat is the 4 bytes of
1406	* padding after the interesting part of the structure. So
1407	* for a faithful emulation of Xen we have to try to copy
1408	* the padding and return -EFAULT if we can't. Otherwise we
1409	* might as well just have copied the 12-byte 32-bit struct.
1410	*/
1411	BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1412	offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1413	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1414	sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1415	BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
1416	offsetof(struct vcpu_set_singleshot_timer, flags));
1417	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
1418	sizeof_field(struct vcpu_set_singleshot_timer, flags));
1419
1420	if (kvm_read_guest_virt(vcpu, addr: param, val: &oneshot, bytes: longmode ? sizeof(oneshot) :
1421	sizeof(struct compat_vcpu_set_singleshot_timer), exception: &e)) {
1422	*r = -EFAULT;
1423	return true;
1424	}
1425
1426	/ A delta <= 0 results in an immediate callback, which is what we want /
1427	delta = oneshot.timeout_abs_ns - get_kvmclock_ns(kvm: vcpu->kvm);
1428	kvm_xen_start_timer(vcpu, guest_abs: oneshot.timeout_abs_ns, delta_ns: delta);
1429	*r = `0`;
1430	return true;
1431
1432	case VCPUOP_stop_singleshot_timer:
1433	if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1434	*r = -EINVAL;
1435	return true;
1436	}
1437	kvm_xen_stop_timer(vcpu);
1438	*r = `0`;
1439	return true;
1440	}
1441
1442	return false;
1443	}
1444
1445	static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
1446	u64 *r)
1447	{
1448	if (!kvm_xen_timer_enabled(vcpu))
1449	return false;
1450
1451	if (timeout) {
1452	uint64_t guest_now = get_kvmclock_ns(kvm: vcpu->kvm);
1453	int64_t delta = timeout - guest_now;
1454
1455	/ Xen has a 'Linux workaround' in do_set_timer_op() which*
1456	* checks for negative absolute timeout values (caused by
1457	* integer overflow), and for values about 13 days in the
1458	* future (2^50ns) which would be caused by jiffies
1459	* overflow. For those cases, it sets the timeout 100ms in
1460	* the future (not too soon, since if a guest really did
1461	* set a long timeout on purpose we don't want to keep
1462	* churning CPU time by waking it up).
1463	*/
1464	if (unlikely((int64_t)timeout < `0` \|\|
1465	(delta > `0` && (uint32_t) (delta >> `50`) != `0`))) {
1466	delta = `100` * NSEC_PER_MSEC;
1467	timeout = guest_now + delta;
1468	}
1469
1470	kvm_xen_start_timer(vcpu, guest_abs: timeout, delta_ns: delta);
1471	} else {
1472	kvm_xen_stop_timer(vcpu);
1473	}
1474
1475	*r = `0`;
1476	return true;
1477	}
1478
1479	int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
1480	{
1481	bool longmode;
1482	u64 input, params[`6`], r = -ENOSYS;
1483	bool handled = false;
1484	u8 cpl;
1485
1486	input = (u64)kvm_register_read(vcpu, reg: VCPU_REGS_RAX);
1487
1488	/ Hyper-V hypercalls get bit 31 set in EAX /
1489	if ((input & `0x80000000`) &&
1490	kvm_hv_hypercall_enabled(vcpu))
1491	return kvm_hv_hypercall(vcpu);
1492
1493	longmode = is_64_bit_hypercall(vcpu);
1494	if (!longmode) {
1495	params[`0`] = (u32)kvm_rbx_read(vcpu);
1496	params[`1`] = (u32)kvm_rcx_read(vcpu);
1497	params[`2`] = (u32)kvm_rdx_read(vcpu);
1498	params[`3`] = (u32)kvm_rsi_read(vcpu);
1499	params[`4`] = (u32)kvm_rdi_read(vcpu);
1500	params[`5`] = (u32)kvm_rbp_read(vcpu);
1501	}
1502	#ifdef CONFIG_X86_64
1503	else {
1504	params[`0`] = (u64)kvm_rdi_read(vcpu);
1505	params[`1`] = (u64)kvm_rsi_read(vcpu);
1506	params[`2`] = (u64)kvm_rdx_read(vcpu);
1507	params[`3`] = (u64)kvm_r10_read(vcpu);
1508	params[`4`] = (u64)kvm_r8_read(vcpu);
1509	params[`5`] = (u64)kvm_r9_read(vcpu);
1510	}
1511	#endif
1512	cpl = static_call(kvm_x86_get_cpl)(vcpu);
1513	trace_kvm_xen_hypercall(cpl, nr: input, a0: params[`0`], a1: params[`1`], a2: params[`2`],
1514	a3: params[`3`], a4: params[`4`], a5: params[`5`]);
1515
1516	/*
1517	* Only allow hypercall acceleration for CPL0. The rare hypercalls that
1518	* are permitted in guest userspace can be handled by the VMM.
1519	*/
1520	if (unlikely(cpl > `0`))
1521	goto handle_in_userspace;
1522
1523	switch (input) {
1524	case __HYPERVISOR_xen_version:
1525	if (params[`0`] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
1526	r = vcpu->kvm->arch.xen.xen_version;
1527	handled = true;
1528	}
1529	break;
1530	case __HYPERVISOR_event_channel_op:
1531	if (params[`0`] == EVTCHNOP_send)
1532	handled = kvm_xen_hcall_evtchn_send(vcpu, param: params[`1`], r: &r);
1533	break;
1534	case __HYPERVISOR_sched_op:
1535	handled = kvm_xen_hcall_sched_op(vcpu, longmode, cmd: params[`0`],
1536	param: params[`1`], r: &r);
1537	break;
1538	case __HYPERVISOR_vcpu_op:
1539	handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, cmd: params[`0`], vcpu_id: params[`1`],
1540	param: params[`2`], r: &r);
1541	break;
1542	case __HYPERVISOR_set_timer_op: {
1543	u64 timeout = params[`0`];
1544	/ In 32-bit mode, the 64-bit timeout is in two 32-bit params. /
1545	if (!longmode)
1546	timeout \|= params[`1`] << `32`;
1547	handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, r: &r);
1548	break;
1549	}
1550	default:
1551	break;
1552	}
1553
1554	if (handled)
1555	return kvm_xen_hypercall_set_result(vcpu, result: r);
1556
1557	handle_in_userspace:
1558	vcpu->run->exit_reason = KVM_EXIT_XEN;
1559	vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
1560	vcpu->run->xen.u.hcall.longmode = longmode;
1561	vcpu->run->xen.u.hcall.cpl = cpl;
1562	vcpu->run->xen.u.hcall.input = input;
1563	vcpu->run->xen.u.hcall.params[`0`] = params[`0`];
1564	vcpu->run->xen.u.hcall.params[`1`] = params[`1`];
1565	vcpu->run->xen.u.hcall.params[`2`] = params[`2`];
1566	vcpu->run->xen.u.hcall.params[`3`] = params[`3`];
1567	vcpu->run->xen.u.hcall.params[`4`] = params[`4`];
1568	vcpu->run->xen.u.hcall.params[`5`] = params[`5`];
1569	vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
1570	vcpu->arch.complete_userspace_io =
1571	kvm_xen_hypercall_complete_userspace;
1572
1573	return `0`;
1574	}
1575
1576	static void kvm_xen_check_poller(struct kvm_vcpu vcpu, int* port)
1577	{
1578	int poll_evtchn = vcpu->arch.xen.poll_evtchn;
1579
1580	if ((poll_evtchn == port \|\| poll_evtchn == -`1`) &&
1581	test_and_clear_bit(nr: vcpu->vcpu_idx, addr: vcpu->kvm->arch.xen.poll_mask)) {
1582	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1583	kvm_vcpu_kick(vcpu);
1584	}
1585	}
1586
1587	/*
1588	* The return value from this function is propagated to kvm_set_irq() API,
1589	* so it returns:
1590	* < 0 Interrupt was ignored (masked or not delivered for other reasons)
1591	* = 0 Interrupt was coalesced (previous irq is still pending)
1592	* > 0 Number of CPUs interrupt was delivered to
1593	*
1594	* It is also called directly from kvm_arch_set_irq_inatomic(), where the
1595	* only check on its return value is a comparison with -EWOULDBLOCK'.
1596	*/
1597	int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn xe, struct* kvm *kvm)
1598	{
1599	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1600	struct kvm_vcpu *vcpu;
1601	unsigned long pending_bits, mask_bits;
1602	unsigned long flags;
1603	int port_word_bit;
1604	bool kick_vcpu = false;
1605	int vcpu_idx, idx, rc;
1606
1607	vcpu_idx = READ_ONCE(xe->vcpu_idx);
1608	if (vcpu_idx >= `0`)
1609	vcpu = kvm_get_vcpu(kvm, i: vcpu_idx);
1610	else {
1611	vcpu = kvm_get_vcpu_by_id(kvm, id: xe->vcpu_id);
1612	if (!vcpu)
1613	return -EINVAL;
1614	WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
1615	}
1616
1617	if (!vcpu->arch.xen.vcpu_info_cache.active)
1618	return -EINVAL;
1619
1620	if (xe->port >= max_evtchn_port(kvm))
1621	return -EINVAL;
1622
1623	rc = -EWOULDBLOCK;
1624
1625	idx = srcu_read_lock(ssp: &kvm->srcu);
1626
1627	read_lock_irqsave(&gpc->lock, flags);
1628	if (!kvm_gpc_check(gpc, PAGE_SIZE))
1629	goto out_rcu;
1630
1631	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1632	struct shared_info *shinfo = gpc->khva;
1633	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1634	mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1635	port_word_bit = xe->port / `64`;
1636	} else {
1637	struct compat_shared_info *shinfo = gpc->khva;
1638	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1639	mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1640	port_word_bit = xe->port / `32`;
1641	}
1642
1643	/*
1644	* If this port wasn't already set, and if it isn't masked, then
1645	* we try to set the corresponding bit in the in-kernel shadow of
1646	* evtchn_pending_sel for the target vCPU. And if that wasn't
1647	* already set, then we kick the vCPU in question to write to the
1648	* real evtchn_pending_sel in its own guest vcpu_info struct.
1649	*/
1650	if (test_and_set_bit(nr: xe->port, addr: pending_bits)) {
1651	rc = `0`; / It was already raised /
1652	} else if (test_bit(xe->port, mask_bits)) {
1653	rc = -ENOTCONN; / Masked /
1654	kvm_xen_check_poller(vcpu, port: xe->port);
1655	} else {
1656	rc = `1`; / Delivered to the bitmap in shared_info. /
1657	/ Now switch to the vCPU's vcpu_info to set the index and pending_sel /
1658	read_unlock_irqrestore(&gpc->lock, flags);
1659	gpc = &vcpu->arch.xen.vcpu_info_cache;
1660
1661	read_lock_irqsave(&gpc->lock, flags);
1662	if (!kvm_gpc_check(gpc, len: sizeof(struct vcpu_info))) {
1663	/*
1664	* Could not access the vcpu_info. Set the bit in-kernel
1665	* and prod the vCPU to deliver it for itself.
1666	*/
1667	if (!test_and_set_bit(nr: port_word_bit, addr: &vcpu->arch.xen.evtchn_pending_sel))
1668	kick_vcpu = true;
1669	goto out_rcu;
1670	}
1671
1672	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1673	struct vcpu_info *vcpu_info = gpc->khva;
1674	if (!test_and_set_bit(nr: port_word_bit, addr: &vcpu_info->evtchn_pending_sel)) {
1675	WRITE_ONCE(vcpu_info->evtchn_upcall_pending, `1`);
1676	kick_vcpu = true;
1677	}
1678	} else {
1679	struct compat_vcpu_info *vcpu_info = gpc->khva;
1680	if (!test_and_set_bit(nr: port_word_bit,
1681	addr: (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
1682	WRITE_ONCE(vcpu_info->evtchn_upcall_pending, `1`);
1683	kick_vcpu = true;
1684	}
1685	}
1686
1687	/ For the per-vCPU lapic vector, deliver it as MSI. /
1688	if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
1689	kvm_xen_inject_vcpu_vector(v: vcpu);
1690	kick_vcpu = false;
1691	}
1692	}
1693
1694	out_rcu:
1695	read_unlock_irqrestore(&gpc->lock, flags);
1696	srcu_read_unlock(ssp: &kvm->srcu, idx);
1697
1698	if (kick_vcpu) {
1699	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1700	kvm_vcpu_kick(vcpu);
1701	}
1702
1703	return rc;
1704	}
1705
1706	static int kvm_xen_set_evtchn(struct kvm_xen_evtchn xe, struct* kvm *kvm)
1707	{
1708	bool mm_borrowed = false;
1709	int rc;
1710
1711	rc = kvm_xen_set_evtchn_fast(xe, kvm);
1712	if (rc != -EWOULDBLOCK)
1713	return rc;
1714
1715	if (current->mm != kvm->mm) {
1716	/*
1717	* If not on a thread which already belongs to this KVM,
1718	* we'd better be in the irqfd workqueue.
1719	*/
1720	if (WARN_ON_ONCE(current->mm))
1721	return -EINVAL;
1722
1723	kthread_use_mm(mm: kvm->mm);
1724	mm_borrowed = true;
1725	}
1726
1727	mutex_lock(&kvm->arch.xen.xen_lock);
1728
1729	/*
1730	* It is theoretically possible for the page to be unmapped
1731	* and the MMU notifier to invalidate the shared_info before
1732	* we even get to use it. In that case, this looks like an
1733	* infinite loop. It was tempting to do it via the userspace
1734	* HVA instead... but that just hides the fact that it's
1735	* an infinite loop, because if a fault occurs and it waits
1736	* for the page to come back, it can still immediately
1737	* fault and have to wait again, repeatedly.
1738	*
1739	* Conversely, the page could also have been reinstated by
1740	* another thread before we even obtain the mutex above, so
1741	* check again first before remapping it.
1742	*/
1743	do {
1744	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1745	int idx;
1746
1747	rc = kvm_xen_set_evtchn_fast(xe, kvm);
1748	if (rc != -EWOULDBLOCK)
1749	break;
1750
1751	idx = srcu_read_lock(ssp: &kvm->srcu);
1752	rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
1753	srcu_read_unlock(ssp: &kvm->srcu, idx);
1754	} while(!rc);
1755
1756	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
1757
1758	if (mm_borrowed)
1759	kthread_unuse_mm(mm: kvm->mm);
1760
1761	return rc;
1762	}
1763
1764	/ This is the version called from kvm_set_irq() as the .set function /
1765	static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry e, struct* kvm *kvm,
1766	int irq_source_id, int level, bool line_status)
1767	{
1768	if (!level)
1769	return -EINVAL;
1770
1771	return kvm_xen_set_evtchn(xe: &e->xen_evtchn, kvm);
1772	}
1773
1774	/*
1775	* Set up an event channel interrupt from the KVM IRQ routing table.
1776	* Used for e.g. PIRQ from passed through physical devices.
1777	*/
1778	int kvm_xen_setup_evtchn(struct kvm *kvm,
1779	struct kvm_kernel_irq_routing_entry *e,
1780	const struct kvm_irq_routing_entry *ue)
1781
1782	{
1783	struct kvm_vcpu *vcpu;
1784
1785	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
1786	return -EINVAL;
1787
1788	/ We only support 2 level event channels for now /
1789	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1790	return -EINVAL;
1791
1792	/*
1793	* Xen gives us interesting mappings from vCPU index to APIC ID,
1794	* which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
1795	* to find it. Do that once at setup time, instead of every time.
1796	* But beware that on live update / live migration, the routing
1797	* table might be reinstated before the vCPU threads have finished
1798	* recreating their vCPUs.
1799	*/
1800	vcpu = kvm_get_vcpu_by_id(kvm, id: ue->u.xen_evtchn.vcpu);
1801	if (vcpu)
1802	e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
1803	else
1804	e->xen_evtchn.vcpu_idx = -`1`;
1805
1806	e->xen_evtchn.port = ue->u.xen_evtchn.port;
1807	e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
1808	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
1809	e->set = evtchn_set_fn;
1810
1811	return `0`;
1812	}
1813
1814	/*
1815	* Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
1816	*/
1817	int kvm_xen_hvm_evtchn_send(struct kvm kvm, struct* kvm_irq_routing_xen_evtchn *uxe)
1818	{
1819	struct kvm_xen_evtchn e;
1820	int ret;
1821
1822	if (!uxe->port \|\| uxe->port >= max_evtchn_port(kvm))
1823	return -EINVAL;
1824
1825	/ We only support 2 level event channels for now /
1826	if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1827	return -EINVAL;
1828
1829	e.port = uxe->port;
1830	e.vcpu_id = uxe->vcpu;
1831	e.vcpu_idx = -`1`;
1832	e.priority = uxe->priority;
1833
1834	ret = kvm_xen_set_evtchn(xe: &e, kvm);
1835
1836	/*
1837	* None of that 'return 1 if it actually got delivered' nonsense.
1838	* We don't care if it was masked (-ENOTCONN) either.
1839	*/
1840	if (ret > `0` \|\| ret == -ENOTCONN)
1841	ret = `0`;
1842
1843	return ret;
1844	}
1845
1846	/*
1847	* Support for outbound event channel events via the EVTCHNOP_send hypercall.
1848	*/
1849	struct evtchnfd {
1850	u32 send_port;
1851	u32 type;
1852	union {
1853	struct kvm_xen_evtchn port;
1854	struct {
1855	u32 port; / zero /
1856	struct eventfd_ctx *ctx;
1857	} eventfd;
1858	} deliver;
1859	};
1860
1861	/*
1862	* Update target vCPU or priority for a registered sending channel.
1863	*/
1864	static int kvm_xen_eventfd_update(struct kvm *kvm,
1865	struct kvm_xen_hvm_attr *data)
1866	{
1867	u32 port = data->u.evtchn.send_port;
1868	struct evtchnfd *evtchnfd;
1869	int ret;
1870
1871	/ Protect writes to evtchnfd as well as the idr lookup. /
1872	mutex_lock(&kvm->arch.xen.xen_lock);
1873	evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, id: port);
1874
1875	ret = -ENOENT;
1876	if (!evtchnfd)
1877	goto out_unlock;
1878
1879	/ For an UPDATE, nothing may change except the priority/vcpu /
1880	ret = -EINVAL;
1881	if (evtchnfd->type != data->u.evtchn.type)
1882	goto out_unlock;
1883
1884	/*
1885	* Port cannot change, and if it's zero that was an eventfd
1886	* which can't be changed either.
1887	*/
1888	if (!evtchnfd->deliver.port.port \|\|
1889	evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
1890	goto out_unlock;
1891
1892	/ We only support 2 level event channels for now /
1893	if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1894	goto out_unlock;
1895
1896	evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
1897	if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
1898	evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
1899	evtchnfd->deliver.port.vcpu_idx = -`1`;
1900	}
1901	ret = `0`;
1902	out_unlock:
1903	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
1904	return ret;
1905	}
1906
1907	/*
1908	* Configure the target (eventfd or local port delivery) for sending on
1909	* a given event channel.
1910	*/
1911	static int kvm_xen_eventfd_assign(struct kvm *kvm,
1912	struct kvm_xen_hvm_attr *data)
1913	{
1914	u32 port = data->u.evtchn.send_port;
1915	struct eventfd_ctx *eventfd = NULL;
1916	struct evtchnfd *evtchnfd;
1917	int ret = -EINVAL;
1918
1919	evtchnfd = kzalloc(size: sizeof(struct evtchnfd), GFP_KERNEL);
1920	if (!evtchnfd)
1921	return -ENOMEM;
1922
1923	switch(data->u.evtchn.type) {
1924	case EVTCHNSTAT_ipi:
1925	/ IPI must map back to the same port# /
1926	if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
1927	goto out_noeventfd; / -EINVAL /
1928	break;
1929
1930	case EVTCHNSTAT_interdomain:
1931	if (data->u.evtchn.deliver.port.port) {
1932	if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
1933	goto out_noeventfd; / -EINVAL /
1934	} else {
1935	eventfd = eventfd_ctx_fdget(fd: data->u.evtchn.deliver.eventfd.fd);
1936	if (IS_ERR(ptr: eventfd)) {
1937	ret = PTR_ERR(ptr: eventfd);
1938	goto out_noeventfd;
1939	}
1940	}
1941	break;
1942
1943	case EVTCHNSTAT_virq:
1944	case EVTCHNSTAT_closed:
1945	case EVTCHNSTAT_unbound:
1946	case EVTCHNSTAT_pirq:
1947	default: / Unknown event channel type /
1948	goto out; / -EINVAL /
1949	}
1950
1951	evtchnfd->send_port = data->u.evtchn.send_port;
1952	evtchnfd->type = data->u.evtchn.type;
1953	if (eventfd) {
1954	evtchnfd->deliver.eventfd.ctx = eventfd;
1955	} else {
1956	/ We only support 2 level event channels for now /
1957	if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1958	goto out; / -EINVAL; /
1959
1960	evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
1961	evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
1962	evtchnfd->deliver.port.vcpu_idx = -`1`;
1963	evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
1964	}
1965
1966	mutex_lock(&kvm->arch.xen.xen_lock);
1967	ret = idr_alloc(&kvm->arch.xen.evtchn_ports, ptr: evtchnfd, start: port, end: port + `1`,
1968	GFP_KERNEL);
1969	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
1970	if (ret >= `0`)
1971	return `0`;
1972
1973	if (ret == -ENOSPC)
1974	ret = -EEXIST;
1975	out:
1976	if (eventfd)
1977	eventfd_ctx_put(ctx: eventfd);
1978	out_noeventfd:
1979	kfree(objp: evtchnfd);
1980	return ret;
1981	}
1982
1983	static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
1984	{
1985	struct evtchnfd *evtchnfd;
1986
1987	mutex_lock(&kvm->arch.xen.xen_lock);
1988	evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, id: port);
1989	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
1990
1991	if (!evtchnfd)
1992	return -ENOENT;
1993
1994	synchronize_srcu(ssp: &kvm->srcu);
1995	if (!evtchnfd->deliver.port.port)
1996	eventfd_ctx_put(ctx: evtchnfd->deliver.eventfd.ctx);
1997	kfree(objp: evtchnfd);
1998	return `0`;
1999	}
2000
2001	static int kvm_xen_eventfd_reset(struct kvm *kvm)
2002	{
2003	struct evtchnfd evtchnfd, *all_evtchnfds;
2004	int i;
2005	int n = `0`;
2006
2007	mutex_lock(&kvm->arch.xen.xen_lock);
2008
2009	/*
2010	* Because synchronize_srcu() cannot be called inside the
2011	* critical section, first collect all the evtchnfd objects
2012	* in an array as they are removed from evtchn_ports.
2013	*/
2014	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
2015	n++;
2016
2017	all_evtchnfds = kmalloc_array(n, size: sizeof(struct evtchnfd *), GFP_KERNEL);
2018	if (!all_evtchnfds) {
2019	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
2020	return -ENOMEM;
2021	}
2022
2023	n = `0`;
2024	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
2025	all_evtchnfds[n++] = evtchnfd;
2026	idr_remove(&kvm->arch.xen.evtchn_ports, id: evtchnfd->send_port);
2027	}
2028	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
2029
2030	synchronize_srcu(ssp: &kvm->srcu);
2031
2032	while (n--) {
2033	evtchnfd = all_evtchnfds[n];
2034	if (!evtchnfd->deliver.port.port)
2035	eventfd_ctx_put(ctx: evtchnfd->deliver.eventfd.ctx);
2036	kfree(objp: evtchnfd);
2037	}
2038	kfree(objp: all_evtchnfds);
2039
2040	return `0`;
2041	}
2042
2043	static int kvm_xen_setattr_evtchn(struct kvm kvm, struct* kvm_xen_hvm_attr *data)
2044	{
2045	u32 port = data->u.evtchn.send_port;
2046
2047	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
2048	return kvm_xen_eventfd_reset(kvm);
2049
2050	if (!port \|\| port >= max_evtchn_port(kvm))
2051	return -EINVAL;
2052
2053	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
2054	return kvm_xen_eventfd_deassign(kvm, port);
2055	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
2056	return kvm_xen_eventfd_update(kvm, data);
2057	if (data->u.evtchn.flags)
2058	return -EINVAL;
2059
2060	return kvm_xen_eventfd_assign(kvm, data);
2061	}
2062
2063	static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu vcpu, u64 param, u64 r)
2064	{
2065	struct evtchnfd *evtchnfd;
2066	struct evtchn_send send;
2067	struct x86_exception e;
2068
2069	/ Sanity check: this structure is the same for 32-bit and 64-bit /
2070	BUILD_BUG_ON(sizeof(send) != `4`);
2071	if (kvm_read_guest_virt(vcpu, addr: param, val: &send, bytes: sizeof(send), exception: &e)) {
2072	*r = -EFAULT;
2073	return true;
2074	}
2075
2076	/*
2077	* evtchnfd is protected by kvm->srcu; the idr lookup instead
2078	* is protected by RCU.
2079	*/
2080	rcu_read_lock();
2081	evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, id: send.port);
2082	rcu_read_unlock();
2083	if (!evtchnfd)
2084	return false;
2085
2086	if (evtchnfd->deliver.port.port) {
2087	int ret = kvm_xen_set_evtchn(xe: &evtchnfd->deliver.port, kvm: vcpu->kvm);
2088	if (ret < `0` && ret != -ENOTCONN)
2089	return false;
2090	} else {
2091	eventfd_signal(ctx: evtchnfd->deliver.eventfd.ctx, n: `1`);
2092	}
2093
2094	*r = `0`;
2095	return true;
2096	}
2097
2098	void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
2099	{
2100	vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
2101	vcpu->arch.xen.poll_evtchn = `0`;
2102
2103	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, `0`);
2104
2105	kvm_gpc_init(gpc: &vcpu->arch.xen.runstate_cache, kvm: vcpu->kvm, NULL,
2106	usage: KVM_HOST_USES_PFN);
2107	kvm_gpc_init(gpc: &vcpu->arch.xen.runstate2_cache, kvm: vcpu->kvm, NULL,
2108	usage: KVM_HOST_USES_PFN);
2109	kvm_gpc_init(gpc: &vcpu->arch.xen.vcpu_info_cache, kvm: vcpu->kvm, NULL,
2110	usage: KVM_HOST_USES_PFN);
2111	kvm_gpc_init(gpc: &vcpu->arch.xen.vcpu_time_info_cache, kvm: vcpu->kvm, NULL,
2112	usage: KVM_HOST_USES_PFN);
2113	}
2114
2115	void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
2116	{
2117	if (kvm_xen_timer_enabled(vcpu))
2118	kvm_xen_stop_timer(vcpu);
2119
2120	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate_cache);
2121	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate2_cache);
2122	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_info_cache);
2123	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_time_info_cache);
2124
2125	del_timer_sync(timer: &vcpu->arch.xen.poll_timer);
2126	}
2127
2128	void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
2129	{
2130	struct kvm_cpuid_entry2 *entry;
2131	u32 function;
2132
2133	if (!vcpu->arch.xen.cpuid.base)
2134	return;
2135
2136	function = vcpu->arch.xen.cpuid.base \| XEN_CPUID_LEAF(`3`);
2137	if (function > vcpu->arch.xen.cpuid.limit)
2138	return;
2139
2140	entry = kvm_find_cpuid_entry_index(vcpu, function, index: `1`);
2141	if (entry) {
2142	entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
2143	entry->edx = vcpu->arch.hv_clock.tsc_shift;
2144	}
2145
2146	entry = kvm_find_cpuid_entry_index(vcpu, function, index: `2`);
2147	if (entry)
2148	entry->eax = vcpu->arch.hw_tsc_khz;
2149	}
2150
2151	void kvm_xen_init_vm(struct kvm *kvm)
2152	{
2153	mutex_init(&kvm->arch.xen.xen_lock);
2154	idr_init(idr: &kvm->arch.xen.evtchn_ports);
2155	kvm_gpc_init(gpc: &kvm->arch.xen.shinfo_cache, kvm, NULL, usage: KVM_HOST_USES_PFN);
2156	}
2157
2158	void kvm_xen_destroy_vm(struct kvm *kvm)
2159	{
2160	struct evtchnfd *evtchnfd;
2161	int i;
2162
2163	kvm_gpc_deactivate(gpc: &kvm->arch.xen.shinfo_cache);
2164
2165	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
2166	if (!evtchnfd->deliver.port.port)
2167	eventfd_ctx_put(ctx: evtchnfd->deliver.eventfd.ctx);
2168	kfree(objp: evtchnfd);
2169	}
2170	idr_destroy(&kvm->arch.xen.evtchn_ports);
2171
2172	if (kvm->arch.xen_hvm_config.msr)
2173	static_branch_slow_dec_deferred(&kvm_xen_enabled);
2174	}
2175

source code of linux/arch/x86/kvm/xen.c