book3s_hv.c source code [linux/arch/powerpc/kvm/book3s_hv.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
4	* Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
5	*
6	* Authors:
7	* Paul Mackerras <paulus@au1.ibm.com>
8	* Alexander Graf <agraf@suse.de>
9	* Kevin Wolf <mail@kevin-wolf.de>
10	*
11	* Description: KVM functions specific to running on Book 3S
12	* processors in hypervisor mode (specifically POWER7 and later).
13	*
14	* This file is derived from arch/powerpc/kvm/book3s.c,
15	* by Alexander Graf <agraf@suse.de>.
16	*/
17
18	#include <linux/kvm_host.h>
19	#include <linux/kernel.h>
20	#include <linux/err.h>
21	#include <linux/slab.h>
22	#include <linux/preempt.h>
23	#include <linux/sched/signal.h>
24	#include <linux/sched/stat.h>
25	#include <linux/delay.h>
26	#include <linux/export.h>
27	#include <linux/fs.h>
28	#include <linux/anon_inodes.h>
29	#include <linux/cpu.h>
30	#include <linux/cpumask.h>
31	#include <linux/spinlock.h>
32	#include <linux/page-flags.h>
33	#include <linux/srcu.h>
34	#include <linux/miscdevice.h>
35	#include <linux/debugfs.h>
36	#include <linux/gfp.h>
37	#include <linux/vmalloc.h>
38	#include <linux/highmem.h>
39	#include <linux/hugetlb.h>
40	#include <linux/kvm_irqfd.h>
41	#include <linux/irqbypass.h>
42	#include <linux/module.h>
43	#include <linux/compiler.h>
44	#include <linux/of.h>
45	#include <linux/irqdomain.h>
46	#include <linux/smp.h>
47
48	#include <asm/ftrace.h>
49	#include <asm/reg.h>
50	#include <asm/ppc-opcode.h>
51	#include <asm/asm-prototypes.h>
52	#include <asm/archrandom.h>
53	#include <asm/debug.h>
54	#include <asm/disassemble.h>
55	#include <asm/cputable.h>
56	#include <asm/cacheflush.h>
57	#include <linux/uaccess.h>
58	#include <asm/interrupt.h>
59	#include <asm/io.h>
60	#include <asm/kvm_ppc.h>
61	#include <asm/kvm_book3s.h>
62	#include <asm/mmu_context.h>
63	#include <asm/lppaca.h>
64	#include <asm/pmc.h>
65	#include <asm/processor.h>
66	#include <asm/cputhreads.h>
67	#include <asm/page.h>
68	#include <asm/hvcall.h>
69	#include <asm/switch_to.h>
70	#include <asm/smp.h>
71	#include <asm/dbell.h>
72	#include <asm/hmi.h>
73	#include <asm/pnv-pci.h>
74	#include <asm/mmu.h>
75	#include <asm/opal.h>
76	#include <asm/xics.h>
77	#include <asm/xive.h>
78	#include <asm/hw_breakpoint.h>
79	#include <asm/kvm_book3s_uvmem.h>
80	#include <asm/ultravisor.h>
81	#include <asm/dtl.h>
82	#include <asm/plpar_wrappers.h>
83
84	#include <trace/events/ipi.h>
85
86	#include "book3s.h"
87	#include "book3s_hv.h"
88
89	#define CREATE_TRACE_POINTS
90	#include "trace_hv.h"
91
/* #define EXIT_DEBUG */
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */

/* Used to indicate that a guest page fault needs to be handled */
#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
/* Used to indicate that a guest passthrough interrupt needs to be handled */
#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)

/* Used as a "null" value for timebase values */
#define TB_NIL	(~(u64)0)
103
104	static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/`4` + `1`);
105
106	static int dynamic_mt_modes = `6`;
107	module_param(dynamic_mt_modes, int, `0644`);
108	MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
109	static int target_smt_mode;
110	module_param(target_smt_mode, int, `0644`);
111	MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
112
113	static bool one_vm_per_core;
114	module_param(one_vm_per_core, bool, S_IRUGO \| S_IWUSR);
115	MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");
116
117	#ifdef CONFIG_KVM_XICS
118	static const struct kernel_param_ops module_param_ops = {
119	.set = param_set_int,
120	.get = param_get_int,
121	};
122
123	module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, `0644`);
124	MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
125
126	module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, `0644`);
127	MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
128	#endif
129
130	/ If set, guests are allowed to create and control nested guests /
131	static bool nested = true;
132	module_param(nested, bool, S_IRUGO \| S_IWUSR);
133	MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
134
135	static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
136
137	/*
138	* RWMR values for POWER8. These control the rate at which PURR
139	* and SPURR count and should be set according to the number of
140	* online threads in the vcore being run.
141	*/
142	#define RWMR_RPA_P8_1THREAD 0x164520C62609AECAUL
143	#define RWMR_RPA_P8_2THREAD 0x7FFF2908450D8DA9UL
144	#define RWMR_RPA_P8_3THREAD 0x164520C62609AECAUL
145	#define RWMR_RPA_P8_4THREAD 0x199A421245058DA9UL
146	#define RWMR_RPA_P8_5THREAD 0x164520C62609AECAUL
147	#define RWMR_RPA_P8_6THREAD 0x164520C62609AECAUL
148	#define RWMR_RPA_P8_7THREAD 0x164520C62609AECAUL
149	#define RWMR_RPA_P8_8THREAD 0x164520C62609AECAUL
150
151	static unsigned long p8_rwmr_values[MAX_SMT_THREADS + `1`] = {
152	RWMR_RPA_P8_1THREAD,
153	RWMR_RPA_P8_1THREAD,
154	RWMR_RPA_P8_2THREAD,
155	RWMR_RPA_P8_3THREAD,
156	RWMR_RPA_P8_4THREAD,
157	RWMR_RPA_P8_5THREAD,
158	RWMR_RPA_P8_6THREAD,
159	RWMR_RPA_P8_7THREAD,
160	RWMR_RPA_P8_8THREAD,
161	};
162
163	static inline struct kvm_vcpu next_runnable_thread(struct* kvmppc_vcore *vc,
164	int *ip)
165	{
166	int i = *ip;
167	struct kvm_vcpu *vcpu;
168
169	while (++i < MAX_SMT_THREADS) {
170	vcpu = READ_ONCE(vc->runnable_threads[i]);
171	if (vcpu) {
172	*ip = i;
173	return vcpu;
174	}
175	}
176	return NULL;
177	}
178
/*
 * Used to traverse the list of runnable threads for a given vcore.
 * @i is the int cursor (started at -1), @vcpu receives each entry.
 */
#define for_each_runnable_thread(i, vcpu, vc) \
	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
182
183	static bool kvmppc_ipi_thread(int cpu)
184	{
185	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
186
187	/ If we're a nested hypervisor, fall back to ordinary IPIs for now /
188	if (kvmhv_on_pseries())
189	return false;
190
191	/ On POWER9 we can use msgsnd to IPI any cpu /
192	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
193	msg \|= get_hard_smp_processor_id(cpu);
194	smp_mb();
195	__asm__ __volatile__ (PPC_MSGSND(%`0`) : : "r" (msg));
196	return true;
197	}
198
199	/ On POWER8 for IPIs to threads in the same core, use msgsnd /
200	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
201	preempt_disable();
202	if (cpu_first_thread_sibling(cpu) ==
203	cpu_first_thread_sibling(smp_processor_id())) {
204	msg \|= cpu_thread_in_core(cpu);
205	smp_mb();
206	__asm__ __volatile__ (PPC_MSGSND(%`0`) : : "r" (msg));
207	preempt_enable();
208	return true;
209	}
210	preempt_enable();
211	}
212
213	#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
214	if (cpu >= `0` && cpu < nr_cpu_ids) {
215	if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
216	xics_wake_cpu(cpu);
217	return true;
218	}
219	opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
220	return true;
221	}
222	#endif
223
224	return false;
225	}
226
227	static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
228	{
229	int cpu;
230	struct rcuwait *waitp;
231
232	/*
233	* rcuwait_wake_up contains smp_mb() which orders prior stores that
234	* create pending work vs below loads of cpu fields. The other side
235	* is the barrier in vcpu run that orders setting the cpu fields vs
236	* testing for pending work.
237	*/
238
239	waitp = kvm_arch_vcpu_get_wait(vcpu);
240	if (rcuwait_wake_up(w: waitp))
241	++vcpu->stat.generic.halt_wakeup;
242
243	cpu = READ_ONCE(vcpu->arch.thread_cpu);
244	if (cpu >= `0` && kvmppc_ipi_thread(cpu))
245	return;
246
247	/ CPU points to the first thread of the core /
248	cpu = vcpu->cpu;
249	if (cpu >= `0` && cpu < nr_cpu_ids && cpu_online(cpu))
250	smp_send_reschedule(cpu);
251	}
252
253	/*
254	* We use the vcpu_load/put functions to measure stolen time.
255	*
256	* Stolen time is counted as time when either the vcpu is able to
257	* run as part of a virtual core, but the task running the vcore
258	* is preempted or sleeping, or when the vcpu needs something done
259	* in the kernel by the task running the vcpu, but that task is
260	* preempted or sleeping. Those two things have to be counted
261	* separately, since one of the vcpu tasks will take on the job
262	* of running the core, and the other vcpu tasks in the vcore will
263	* sleep waiting for it to do that, but that sleep shouldn't count
264	* as stolen time.
265	*
266	* Hence we accumulate stolen time when the vcpu can run as part of
267	* a vcore using vc->stolen_tb, and the stolen time when the vcpu
268	* needs its task to do other things in the kernel (for example,
269	* service a page fault) in busy_stolen. We don't accumulate
270	* stolen time for a vcore when it is inactive, or for a vcpu
271	* when it is in state RUNNING or NOTREADY. NOTREADY is a bit of
272	* a misnomer; it means that the vcpu task is not executing in
273	* the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
274	* the kernel. We don't have any way of dividing up that time
275	* between time that the vcpu is genuinely stopped, time that
276	* the task is actively working on behalf of the vcpu, and time
277	* that the task is preempted, so we don't count any of it as
278	* stolen.
279	*
280	* Updates to busy_stolen are protected by arch.tbacct_lock;
281	* updates to vc->stolen_tb are protected by the vcore->stoltb_lock
282	* lock. The stolen times are measured in units of timebase ticks.
283	* (Note that the != TB_NIL checks below are purely defensive;
284	* they should never fail.)
285	*
286	* The POWER9 path is simpler, one vcpu per virtual core so the
287	* former case does not exist. If a vcpu is preempted when it is
288	* BUSY_IN_HOST and not ceded or otherwise blocked, then accumulate
289	* the stolen cycles in busy_stolen. RUNNING is not a preemptible
290	* state in the P9 path.
291	*/
292
293	static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb)
294	{
295	unsigned long flags;
296
297	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
298
299	spin_lock_irqsave(&vc->stoltb_lock, flags);
300	vc->preempt_tb = tb;
301	spin_unlock_irqrestore(lock: &vc->stoltb_lock, flags);
302	}
303
304	static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb)
305	{
306	unsigned long flags;
307
308	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
309
310	spin_lock_irqsave(&vc->stoltb_lock, flags);
311	if (vc->preempt_tb != TB_NIL) {
312	vc->stolen_tb += tb - vc->preempt_tb;
313	vc->preempt_tb = TB_NIL;
314	}
315	spin_unlock_irqrestore(lock: &vc->stoltb_lock, flags);
316	}
317
318	static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu vcpu, int* cpu)
319	{
320	struct kvmppc_vcore *vc = vcpu->arch.vcore;
321	unsigned long flags;
322	u64 now;
323
324	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
325	if (vcpu->arch.busy_preempt != TB_NIL) {
326	WARN_ON_ONCE(vcpu->arch.state != KVMPPC_VCPU_BUSY_IN_HOST);
327	vc->stolen_tb += mftb() - vcpu->arch.busy_preempt;
328	vcpu->arch.busy_preempt = TB_NIL;
329	}
330	return;
331	}
332
333	now = mftb();
334
335	/*
336	* We can test vc->runner without taking the vcore lock,
337	* because only this task ever sets vc->runner to this
338	* vcpu, and once it is set to this vcpu, only this task
339	* ever sets it to NULL.
340	*/
341	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
342	kvmppc_core_end_stolen(vc, tb: now);
343
344	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
345	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
346	vcpu->arch.busy_preempt != TB_NIL) {
347	vcpu->arch.busy_stolen += now - vcpu->arch.busy_preempt;
348	vcpu->arch.busy_preempt = TB_NIL;
349	}
350	spin_unlock_irqrestore(lock: &vcpu->arch.tbacct_lock, flags);
351	}
352
353	static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
354	{
355	struct kvmppc_vcore *vc = vcpu->arch.vcore;
356	unsigned long flags;
357	u64 now;
358
359	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
360	/*
361	* In the P9 path, RUNNABLE is not preemptible
362	* (nor takes host interrupts)
363	*/
364	WARN_ON_ONCE(vcpu->arch.state == KVMPPC_VCPU_RUNNABLE);
365	/*
366	* Account stolen time when preempted while the vcpu task is
367	* running in the kernel (but not in qemu, which is INACTIVE).
368	*/
369	if (task_is_running(current) &&
370	vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
371	vcpu->arch.busy_preempt = mftb();
372	return;
373	}
374
375	now = mftb();
376
377	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
378	kvmppc_core_start_stolen(vc, tb: now);
379
380	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
381	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
382	vcpu->arch.busy_preempt = now;
383	spin_unlock_irqrestore(lock: &vcpu->arch.tbacct_lock, flags);
384	}
385
386	static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
387	{
388	vcpu->arch.pvr = pvr;
389	}
390
391	/ Dummy value used in computing PCR value below /
392	#define PCR_ARCH_31 (PCR_ARCH_300 << 1)
393
394	static inline unsigned long map_pcr_to_cap(unsigned long pcr)
395	{
396	unsigned long cap = `0`;
397
398	switch (pcr) {
399	case PCR_ARCH_300:
400	cap = H_GUEST_CAP_POWER9;
401	break;
402	case PCR_ARCH_31:
403	cap = H_GUEST_CAP_POWER10;
404	break;
405	default:
406	break;
407	}
408
409	return cap;
410	}
411
412	static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
413	{
414	unsigned long host_pcr_bit = `0`, guest_pcr_bit = `0`, cap = `0`;
415	struct kvmppc_vcore *vc = vcpu->arch.vcore;
416
417	/ We can (emulate) our own architecture version and anything older /
418	if (cpu_has_feature(CPU_FTR_ARCH_31))
419	host_pcr_bit = PCR_ARCH_31;
420	else if (cpu_has_feature(CPU_FTR_ARCH_300))
421	host_pcr_bit = PCR_ARCH_300;
422	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
423	host_pcr_bit = PCR_ARCH_207;
424	else if (cpu_has_feature(CPU_FTR_ARCH_206))
425	host_pcr_bit = PCR_ARCH_206;
426	else
427	host_pcr_bit = PCR_ARCH_205;
428
429	/ Determine lowest PCR bit needed to run guest in given PVR level /
430	guest_pcr_bit = host_pcr_bit;
431	if (arch_compat) {
432	switch (arch_compat) {
433	case PVR_ARCH_205:
434	guest_pcr_bit = PCR_ARCH_205;
435	break;
436	case PVR_ARCH_206:
437	case PVR_ARCH_206p:
438	guest_pcr_bit = PCR_ARCH_206;
439	break;
440	case PVR_ARCH_207:
441	guest_pcr_bit = PCR_ARCH_207;
442	break;
443	case PVR_ARCH_300:
444	guest_pcr_bit = PCR_ARCH_300;
445	break;
446	case PVR_ARCH_31:
447	case PVR_ARCH_31_P11:
448	guest_pcr_bit = PCR_ARCH_31;
449	break;
450	default:
451	return -EINVAL;
452	}
453	}
454
455	/ Check requested PCR bits don't exceed our capabilities /
456	if (guest_pcr_bit > host_pcr_bit)
457	return -EINVAL;
458
459	if (kvmhv_on_pseries() && kvmhv_is_nestedv2()) {
460	/*
461	* 'arch_compat == 0' would mean the guest should default to
462	* L1's compatibility. In this case, the guest would pick
463	* host's PCR and evaluate the corresponding capabilities.
464	*/
465	cap = map_pcr_to_cap(pcr: guest_pcr_bit);
466	if (!(cap & nested_capabilities))
467	return -EINVAL;
468	}
469
470	spin_lock(lock: &vc->lock);
471	vc->arch_compat = arch_compat;
472	kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LOGICAL_PVR);
473	/*
474	* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
475	* Also set all reserved PCR bits
476	*/
477	vc->pcr = (host_pcr_bit - guest_pcr_bit) \| PCR_MASK;
478	spin_unlock(lock: &vc->lock);
479
480	return `0`;
481	}
482
483	static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
484	{
485	int r;
486
487	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
488	pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
489	vcpu->arch.regs.nip, vcpu->arch.shregs.msr, vcpu->arch.trap);
490	for (r = `0`; r < `16`; ++r)
491	pr_err("r%2d = %.16lx r%d = %.16lx\n",
492	r, kvmppc_get_gpr(vcpu, r),
493	r+`16`, kvmppc_get_gpr(vcpu, r+`16`));
494	pr_err("ctr = %.16lx lr = %.16lx\n",
495	vcpu->arch.regs.ctr, vcpu->arch.regs.link);
496	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
497	vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
498	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
499	vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
500	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
501	vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
502	pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n",
503	vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
504	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
505	pr_err("fault dar = %.16lx dsisr = %.8x\n",
506	vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
507	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
508	for (r = `0`; r < vcpu->arch.slb_max; ++r)
509	pr_err(" ESID = %.16llx VSID = %.16llx\n",
510	vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
511	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.16lx\n",
512	vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
513	vcpu->arch.last_inst);
514	}
515
/* Look up a vcpu of @kvm by vcpu id; thin wrapper around kvm_get_vcpu_by_id(). */
static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
{
	return kvm_get_vcpu_by_id(kvm, id);
}
520
521	static void init_vpa(struct kvm_vcpu vcpu, struct* lppaca *vpa)
522	{
523	vpa->__old_status \|= LPPACA_OLD_SHARED_PROC;
524	vpa->yield_count = cpu_to_be32(`1`);
525	}
526
527	static int set_vpa(struct kvm_vcpu vcpu, struct* kvmppc_vpa *v,
528	unsigned long addr, unsigned long len)
529	{
530	/ check address is cacheline aligned /
531	if (addr & (L1_CACHE_BYTES - `1`))
532	return -EINVAL;
533	spin_lock(lock: &vcpu->arch.vpa_update_lock);
534	if (v->next_gpa != addr \|\| v->len != len) {
535	v->next_gpa = addr;
536	v->len = addr ? len : `0`;
537	v->update_pending = `1`;
538	}
539	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
540	return `0`;
541	}
542
543	/ Length for a per-processor buffer is passed in at offset 4 in the buffer /
544	struct reg_vpa {
545	u32 dummy;
546	union {
547	__be16 hword;
548	__be32 word;
549	} length;
550	};
551
552	static int vpa_is_registered(struct kvmppc_vpa *vpap)
553	{
554	if (vpap->update_pending)
555	return vpap->next_gpa != `0`;
556	return vpap->pinned_addr != NULL;
557	}
558
559	static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
560	unsigned long flags,
561	unsigned long vcpuid, unsigned long vpa)
562	{
563	struct kvm *kvm = vcpu->kvm;
564	unsigned long len, nb;
565	void *va;
566	struct kvm_vcpu *tvcpu;
567	int err;
568	int subfunc;
569	struct kvmppc_vpa *vpap;
570
571	tvcpu = kvmppc_find_vcpu(kvm, id: vcpuid);
572	if (!tvcpu)
573	return H_PARAMETER;
574
575	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
576	if (subfunc == H_VPA_REG_VPA \|\| subfunc == H_VPA_REG_DTL \|\|
577	subfunc == H_VPA_REG_SLB) {
578	/ Registering new area - address must be cache-line aligned /
579	if ((vpa & (L1_CACHE_BYTES - `1`)) \|\| !vpa)
580	return H_PARAMETER;
581
582	/ convert logical addr to kernel addr and read length /
583	va = kvmppc_pin_guest_page(kvm, vpa, &nb);
584	if (va == NULL)
585	return H_PARAMETER;
586	if (subfunc == H_VPA_REG_VPA)
587	len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
588	else
589	len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
590	kvmppc_unpin_guest_page(kvm, va, vpa, false);
591
592	/ Check length /
593	if (len > nb \|\| len < sizeof(struct reg_vpa))
594	return H_PARAMETER;
595	} else {
596	vpa = `0`;
597	len = `0`;
598	}
599
600	err = H_PARAMETER;
601	vpap = NULL;
602	spin_lock(lock: &tvcpu->arch.vpa_update_lock);
603
604	switch (subfunc) {
605	case H_VPA_REG_VPA: / register VPA /
606	/*
607	* The size of our lppaca is 1kB because of the way we align
608	* it for the guest to avoid crossing a 4kB boundary. We only
609	* use 640 bytes of the structure though, so we should accept
610	* clients that set a size of 640.
611	*/
612	BUILD_BUG_ON(sizeof(struct lppaca) != `640`);
613	if (len < sizeof(struct lppaca))
614	break;
615	vpap = &tvcpu->arch.vpa;
616	err = `0`;
617	break;
618
619	case H_VPA_REG_DTL: / register DTL /
620	if (len < sizeof(struct dtl_entry))
621	break;
622	len -= len % sizeof(struct dtl_entry);
623
624	/ Check that they have previously registered a VPA /
625	err = H_RESOURCE;
626	if (!vpa_is_registered(vpap: &tvcpu->arch.vpa))
627	break;
628
629	vpap = &tvcpu->arch.dtl;
630	err = `0`;
631	break;
632
633	case H_VPA_REG_SLB: / register SLB shadow buffer /
634	/ Check that they have previously registered a VPA /
635	err = H_RESOURCE;
636	if (!vpa_is_registered(vpap: &tvcpu->arch.vpa))
637	break;
638
639	vpap = &tvcpu->arch.slb_shadow;
640	err = `0`;
641	break;
642
643	case H_VPA_DEREG_VPA: / deregister VPA /
644	/ Check they don't still have a DTL or SLB buf registered /
645	err = H_RESOURCE;
646	if (vpa_is_registered(vpap: &tvcpu->arch.dtl) \|\|
647	vpa_is_registered(vpap: &tvcpu->arch.slb_shadow))
648	break;
649
650	vpap = &tvcpu->arch.vpa;
651	err = `0`;
652	break;
653
654	case H_VPA_DEREG_DTL: / deregister DTL /
655	vpap = &tvcpu->arch.dtl;
656	err = `0`;
657	break;
658
659	case H_VPA_DEREG_SLB: / deregister SLB shadow buffer /
660	vpap = &tvcpu->arch.slb_shadow;
661	err = `0`;
662	break;
663	}
664
665	if (vpap) {
666	vpap->next_gpa = vpa;
667	vpap->len = len;
668	vpap->update_pending = `1`;
669	}
670
671	spin_unlock(lock: &tvcpu->arch.vpa_update_lock);
672
673	return err;
674	}
675
676	static void kvmppc_update_vpa(struct kvm_vcpu vcpu, struct* kvmppc_vpa *vpap,
677	struct kvmppc_vpa *old_vpap)
678	{
679	struct kvm *kvm = vcpu->kvm;
680	void *va;
681	unsigned long nb;
682	unsigned long gpa;
683
684	/*
685	* We need to pin the page pointed to by vpap->next_gpa,
686	* but we can't call kvmppc_pin_guest_page under the lock
687	* as it does get_user_pages() and down_read(). So we
688	* have to drop the lock, pin the page, then get the lock
689	* again and check that a new area didn't get registered
690	* in the meantime.
691	*/
692	for (;;) {
693	gpa = vpap->next_gpa;
694	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
695	va = NULL;
696	nb = `0`;
697	if (gpa)
698	va = kvmppc_pin_guest_page(kvm, gpa, &nb);
699	spin_lock(lock: &vcpu->arch.vpa_update_lock);
700	if (gpa == vpap->next_gpa)
701	break;
702	/ sigh... unpin that one and try again /
703	if (va)
704	kvmppc_unpin_guest_page(kvm, va, gpa, false);
705	}
706
707	vpap->update_pending = `0`;
708	if (va && nb < vpap->len) {
709	/*
710	* If it's now too short, it must be that userspace
711	* has changed the mappings underlying guest memory,
712	* so unregister the region.
713	*/
714	kvmppc_unpin_guest_page(kvm, va, gpa, false);
715	va = NULL;
716	}
717	old_vpap = vpap;
718
719	vpap->gpa = gpa;
720	vpap->pinned_addr = va;
721	vpap->dirty = false;
722	if (va)
723	vpap->pinned_end = va + vpap->len;
724	}
725
726	static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
727	{
728	struct kvm *kvm = vcpu->kvm;
729	struct kvmppc_vpa old_vpa = { `0` };
730
731	if (!(vcpu->arch.vpa.update_pending \|\|
732	vcpu->arch.slb_shadow.update_pending \|\|
733	vcpu->arch.dtl.update_pending))
734	return;
735
736	spin_lock(lock: &vcpu->arch.vpa_update_lock);
737	if (vcpu->arch.vpa.update_pending) {
738	kvmppc_update_vpa(vcpu, vpap: &vcpu->arch.vpa, old_vpap: &old_vpa);
739	if (old_vpa.pinned_addr) {
740	if (kvmhv_is_nestedv2())
741	kvmhv_nestedv2_set_vpa(vcpu, ~`0ull`);
742	kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
743	old_vpa.dirty);
744	}
745	if (vcpu->arch.vpa.pinned_addr) {
746	init_vpa(vcpu, vpa: vcpu->arch.vpa.pinned_addr);
747	if (kvmhv_is_nestedv2())
748	kvmhv_nestedv2_set_vpa(vcpu, __pa(vcpu->arch.vpa.pinned_addr));
749	}
750	}
751	if (vcpu->arch.dtl.update_pending) {
752	kvmppc_update_vpa(vcpu, vpap: &vcpu->arch.dtl, old_vpap: &old_vpa);
753	if (old_vpa.pinned_addr)
754	kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
755	old_vpa.dirty);
756	vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
757	vcpu->arch.dtl_index = `0`;
758	}
759	if (vcpu->arch.slb_shadow.update_pending) {
760	kvmppc_update_vpa(vcpu, vpap: &vcpu->arch.slb_shadow, old_vpap: &old_vpa);
761	if (old_vpa.pinned_addr)
762	kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
763	old_vpa.dirty);
764	}
765
766	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
767	}
768
769	/*
770	* Return the accumulated stolen time for the vcore up until `now'.
771	* The caller should hold the vcore lock.
772	*/
773	static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
774	{
775	u64 p;
776	unsigned long flags;
777
778	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
779
780	spin_lock_irqsave(&vc->stoltb_lock, flags);
781	p = vc->stolen_tb;
782	if (vc->vcore_state != VCORE_INACTIVE &&
783	vc->preempt_tb != TB_NIL)
784	p += now - vc->preempt_tb;
785	spin_unlock_irqrestore(lock: &vc->stoltb_lock, flags);
786	return p;
787	}
788
789	static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
790	struct lppaca *vpa,
791	unsigned int pcpu, u64 now,
792	unsigned long stolen)
793	{
794	struct dtl_entry *dt;
795
796	dt = vcpu->arch.dtl_ptr;
797
798	if (!dt)
799	return;
800
801	dt->dispatch_reason = `7`;
802	dt->preempt_reason = `0`;
803	dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid);
804	dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
805	dt->ready_to_enqueue_time = `0`;
806	dt->waiting_to_ready_time = `0`;
807	dt->timebase = cpu_to_be64(now);
808	dt->fault_addr = `0`;
809	dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
810	dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
811
812	++dt;
813	if (dt == vcpu->arch.dtl.pinned_end)
814	dt = vcpu->arch.dtl.pinned_addr;
815	vcpu->arch.dtl_ptr = dt;
816	/ order writing dt vs. writing vpa->dtl_idx /*
817	smp_wmb();
818	vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
819
820	/ vcpu->arch.dtl.dirty is set by the caller /
821	}
822
823	static void kvmppc_update_vpa_dispatch(struct kvm_vcpu *vcpu,
824	struct kvmppc_vcore *vc)
825	{
826	struct lppaca *vpa;
827	unsigned long stolen;
828	unsigned long core_stolen;
829	u64 now;
830	unsigned long flags;
831
832	vpa = vcpu->arch.vpa.pinned_addr;
833	if (!vpa)
834	return;
835
836	now = mftb();
837
838	core_stolen = vcore_stolen_time(vc, now);
839	stolen = core_stolen - vcpu->arch.stolen_logged;
840	vcpu->arch.stolen_logged = core_stolen;
841	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
842	stolen += vcpu->arch.busy_stolen;
843	vcpu->arch.busy_stolen = `0`;
844	spin_unlock_irqrestore(lock: &vcpu->arch.tbacct_lock, flags);
845
846	vpa->enqueue_dispatch_tb = cpu_to_be64(be64_to_cpu(vpa->enqueue_dispatch_tb) + stolen);
847
848	__kvmppc_create_dtl_entry(vcpu, vpa, pcpu: vc->pcpu, now: now + kvmppc_get_tb_offset(vcpu), stolen);
849
850	vcpu->arch.vpa.dirty = true;
851	}
852
853	static void kvmppc_update_vpa_dispatch_p9(struct kvm_vcpu *vcpu,
854	struct kvmppc_vcore *vc,
855	u64 now)
856	{
857	struct lppaca *vpa;
858	unsigned long stolen;
859	unsigned long stolen_delta;
860
861	vpa = vcpu->arch.vpa.pinned_addr;
862	if (!vpa)
863	return;
864
865	stolen = vc->stolen_tb;
866	stolen_delta = stolen - vcpu->arch.stolen_logged;
867	vcpu->arch.stolen_logged = stolen;
868
869	vpa->enqueue_dispatch_tb = cpu_to_be64(stolen);
870
871	__kvmppc_create_dtl_entry(vcpu, vpa, pcpu: vc->pcpu, now, stolen: stolen_delta);
872
873	vcpu->arch.vpa.dirty = true;
874	}
875
876	/ See if there is a doorbell interrupt pending for a vcpu /
877	static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
878	{
879	int thr;
880	struct kvmppc_vcore *vc;
881
882	if (vcpu->arch.doorbell_request)
883	return true;
884	if (cpu_has_feature(CPU_FTR_ARCH_300))
885	return false;
886	/*
887	* Ensure that the read of vcore->dpdes comes after the read
888	* of vcpu->doorbell_request. This barrier matches the
889	* smp_wmb() in kvmppc_guest_entry_inject().
890	*/
891	smp_rmb();
892	vc = vcpu->arch.vcore;
893	thr = vcpu->vcpu_id - vc->first_vcpuid;
894	return !!(vc->dpdes & (`1` << thr));
895	}
896
897	static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
898	{
899	if (kvmppc_get_arch_compat(vcpu) >= PVR_ARCH_207)
900	return true;
901	if ((!kvmppc_get_arch_compat(vcpu)) &&
902	cpu_has_feature(CPU_FTR_ARCH_207S))
903	return true;
904	return false;
905	}
906
907	static int kvmppc_h_set_mode(struct kvm_vcpu vcpu, unsigned* long mflags,
908	unsigned long resource, unsigned long value1,
909	unsigned long value2)
910	{
911	switch (resource) {
912	case H_SET_MODE_RESOURCE_SET_CIABR:
913	if (!kvmppc_power8_compatible(vcpu))
914	return H_P2;
915	if (value2)
916	return H_P4;
917	if (mflags)
918	return H_UNSUPPORTED_FLAG_START;
919	/ Guests can't breakpoint the hypervisor /
920	if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
921	return H_P3;
922	kvmppc_set_ciabr_hv(vcpu, val: value1);
923	return H_SUCCESS;
924	case H_SET_MODE_RESOURCE_SET_DAWR0:
925	if (!kvmppc_power8_compatible(vcpu))
926	return H_P2;
927	if (!ppc_breakpoint_available())
928	return H_P2;
929	if (mflags)
930	return H_UNSUPPORTED_FLAG_START;
931	if (value2 & DABRX_HYP)
932	return H_P4;
933	kvmppc_set_dawr0_hv(vcpu, val: value1);
934	kvmppc_set_dawrx0_hv(vcpu, val: value2);
935	return H_SUCCESS;
936	case H_SET_MODE_RESOURCE_SET_DAWR1:
937	if (!kvmppc_power8_compatible(vcpu))
938	return H_P2;
939	if (!ppc_breakpoint_available())
940	return H_P2;
941	if (!cpu_has_feature(CPU_FTR_DAWR1))
942	return H_P2;
943	if (!vcpu->kvm->arch.dawr1_enabled)
944	return H_FUNCTION;
945	if (mflags)
946	return H_UNSUPPORTED_FLAG_START;
947	if (value2 & DABRX_HYP)
948	return H_P4;
949	kvmppc_set_dawr1_hv(vcpu, val: value1);
950	kvmppc_set_dawrx1_hv(vcpu, val: value2);
951	return H_SUCCESS;
952	case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
953	/*
954	* KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
955	* Keep this in synch with kvmppc_filter_guest_lpcr_hv.
956	*/
957	if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
958	kvmhv_vcpu_is_radix(vcpu) && mflags == `3`)
959	return H_UNSUPPORTED_FLAG_START;
960	return H_TOO_HARD;
961	default:
962	return H_TOO_HARD;
963	}
964	}
965
966	/ Copy guest memory in place - must reside within a single memslot /
967	static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
968	unsigned long len)
969	{
970	struct kvm_memory_slot *to_memslot = NULL;
971	struct kvm_memory_slot *from_memslot = NULL;
972	unsigned long to_addr, from_addr;
973	int r;
974
975	/ Get HPA for from address /
976	from_memslot = gfn_to_memslot(kvm, gfn: from >> PAGE_SHIFT);
977	if (!from_memslot)
978	return -EFAULT;
979	if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
980	<< PAGE_SHIFT))
981	return -EINVAL;
982	from_addr = gfn_to_hva_memslot(slot: from_memslot, gfn: from >> PAGE_SHIFT);
983	if (kvm_is_error_hva(addr: from_addr))
984	return -EFAULT;
985	from_addr \|= (from & (PAGE_SIZE - `1`));
986
987	/ Get HPA for to address /
988	to_memslot = gfn_to_memslot(kvm, gfn: to >> PAGE_SHIFT);
989	if (!to_memslot)
990	return -EFAULT;
991	if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
992	<< PAGE_SHIFT))
993	return -EINVAL;
994	to_addr = gfn_to_hva_memslot(slot: to_memslot, gfn: to >> PAGE_SHIFT);
995	if (kvm_is_error_hva(addr: to_addr))
996	return -EFAULT;
997	to_addr \|= (to & (PAGE_SIZE - `1`));
998
999	/ Perform copy /
1000	r = raw_copy_in_user((void __user )to_addr, (void* __user *)from_addr,
1001	len);
1002	if (r)
1003	return -EFAULT;
1004	mark_page_dirty(kvm, gfn: to >> PAGE_SHIFT);
1005	return `0`;
1006	}
1007
1008	static long kvmppc_h_page_init(struct kvm_vcpu vcpu, unsigned* long flags,
1009	unsigned long dest, unsigned long src)
1010	{
1011	u64 pg_sz = SZ_4K; / 4K page size /
1012	u64 pg_mask = SZ_4K - `1`;
1013	int ret;
1014
1015	/ Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) /
1016	if (flags & ~(H_ICACHE_INVALIDATE \| H_ICACHE_SYNCHRONIZE \|
1017	H_ZERO_PAGE \| H_COPY_PAGE \| H_PAGE_SET_LOANED))
1018	return H_PARAMETER;
1019
1020	/ dest (and src if copy_page flag set) must be page aligned /
1021	if ((dest & pg_mask) \|\| ((flags & H_COPY_PAGE) && (src & pg_mask)))
1022	return H_PARAMETER;
1023
1024	/ zero and/or copy the page as determined by the flags /
1025	if (flags & H_COPY_PAGE) {
1026	ret = kvmppc_copy_guest(kvm: vcpu->kvm, to: dest, from: src, len: pg_sz);
1027	if (ret < `0`)
1028	return H_PARAMETER;
1029	} else if (flags & H_ZERO_PAGE) {
1030	ret = kvm_clear_guest(kvm: vcpu->kvm, gpa: dest, len: pg_sz);
1031	if (ret < `0`)
1032	return H_PARAMETER;
1033	}
1034
1035	/ We can ignore the remaining flags /
1036
1037	return H_SUCCESS;
1038	}
1039
1040	static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
1041	{
1042	struct kvmppc_vcore *vcore = target->arch.vcore;
1043
1044	/*
1045	* We expect to have been called by the real mode handler
1046	* (kvmppc_rm_h_confer()) which would have directly returned
1047	* H_SUCCESS if the source vcore wasn't idle (e.g. if it may
1048	* have useful work to do and should not confer) so we don't
1049	* recheck that here.
1050	*
1051	* In the case of the P9 single vcpu per vcore case, the real
1052	* mode handler is not called but no other threads are in the
1053	* source vcore.
1054	*/
1055	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
1056	spin_lock(lock: &vcore->lock);
1057	if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
1058	vcore->vcore_state != VCORE_INACTIVE &&
1059	vcore->runner)
1060	target = vcore->runner;
1061	spin_unlock(lock: &vcore->lock);
1062	}
1063
1064	return kvm_vcpu_yield_to(target);
1065	}
1066
1067	static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
1068	{
1069	int yield_count = `0`;
1070	struct lppaca *lppaca;
1071
1072	spin_lock(lock: &vcpu->arch.vpa_update_lock);
1073	lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
1074	if (lppaca)
1075	yield_count = be32_to_cpu(lppaca->yield_count);
1076	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
1077	return yield_count;
1078	}
1079
1080	/*
1081	* H_RPT_INVALIDATE hcall handler for nested guests.
1082	*
1083	* Handles only nested process-scoped invalidation requests in L0.
1084	*/
1085	static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
1086	{
1087	unsigned long type = kvmppc_get_gpr(vcpu, `6`);
1088	unsigned long pid, pg_sizes, start, end;
1089
1090	/*
1091	* The partition-scoped invalidations aren't handled here in L0.
1092	*/
1093	if (type & H_RPTI_TYPE_NESTED)
1094	return RESUME_HOST;
1095
1096	pid = kvmppc_get_gpr(vcpu, `4`);
1097	pg_sizes = kvmppc_get_gpr(vcpu, `7`);
1098	start = kvmppc_get_gpr(vcpu, `8`);
1099	end = kvmppc_get_gpr(vcpu, `9`);
1100
1101	do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
1102	type, pg_sizes, start, end);
1103
1104	kvmppc_set_gpr(vcpu, `3`, H_SUCCESS);
1105	return RESUME_GUEST;
1106	}
1107
1108	static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
1109	unsigned long id, unsigned long target,
1110	unsigned long type, unsigned long pg_sizes,
1111	unsigned long start, unsigned long end)
1112	{
1113	if (!kvm_is_radix(vcpu->kvm))
1114	return H_UNSUPPORTED;
1115
1116	if (end < start)
1117	return H_P5;
1118
1119	/*
1120	* Partition-scoped invalidation for nested guests.
1121	*/
1122	if (type & H_RPTI_TYPE_NESTED) {
1123	if (!nesting_enabled(vcpu->kvm))
1124	return H_FUNCTION;
1125
1126	/ Support only cores as target /
1127	if (target != H_RPTI_TARGET_CMMU)
1128	return H_P2;
1129
1130	return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
1131	start, end);
1132	}
1133
1134	/*
1135	* Process-scoped invalidation for L1 guests.
1136	*/
1137	do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
1138	type, pg_sizes, start, end);
1139	return H_SUCCESS;
1140	}
1141
1142	int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
1143	{
1144	struct kvm *kvm = vcpu->kvm;
1145	unsigned long req = kvmppc_get_gpr(vcpu, `3`);
1146	unsigned long target, ret = H_SUCCESS;
1147	int yield_count;
1148	struct kvm_vcpu *tvcpu;
1149	int idx, rc;
1150
1151	if (req <= MAX_HCALL_OPCODE &&
1152	!test_bit(req/`4`, vcpu->kvm->arch.enabled_hcalls))
1153	return RESUME_HOST;
1154
1155	switch (req) {
1156	case H_REMOVE:
1157	ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, `4`),
1158	kvmppc_get_gpr(vcpu, `5`),
1159	kvmppc_get_gpr(vcpu, `6`));
1160	if (ret == H_TOO_HARD)
1161	return RESUME_HOST;
1162	break;
1163	case H_ENTER:
1164	ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, `4`),
1165	kvmppc_get_gpr(vcpu, `5`),
1166	kvmppc_get_gpr(vcpu, `6`),
1167	kvmppc_get_gpr(vcpu, `7`));
1168	if (ret == H_TOO_HARD)
1169	return RESUME_HOST;
1170	break;
1171	case H_READ:
1172	ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, `4`),
1173	kvmppc_get_gpr(vcpu, `5`));
1174	if (ret == H_TOO_HARD)
1175	return RESUME_HOST;
1176	break;
1177	case H_CLEAR_MOD:
1178	ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, `4`),
1179	kvmppc_get_gpr(vcpu, `5`));
1180	if (ret == H_TOO_HARD)
1181	return RESUME_HOST;
1182	break;
1183	case H_CLEAR_REF:
1184	ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, `4`),
1185	kvmppc_get_gpr(vcpu, `5`));
1186	if (ret == H_TOO_HARD)
1187	return RESUME_HOST;
1188	break;
1189	case H_PROTECT:
1190	ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, `4`),
1191	kvmppc_get_gpr(vcpu, `5`),
1192	kvmppc_get_gpr(vcpu, `6`));
1193	if (ret == H_TOO_HARD)
1194	return RESUME_HOST;
1195	break;
1196	case H_BULK_REMOVE:
1197	ret = kvmppc_h_bulk_remove(vcpu);
1198	if (ret == H_TOO_HARD)
1199	return RESUME_HOST;
1200	break;
1201
1202	case H_CEDE:
1203	break;
1204	case H_PROD:
1205	target = kvmppc_get_gpr(vcpu, `4`);
1206	tvcpu = kvmppc_find_vcpu(kvm, id: target);
1207	if (!tvcpu) {
1208	ret = H_PARAMETER;
1209	break;
1210	}
1211	tvcpu->arch.prodded = `1`;
1212	smp_mb(); / This orders prodded store vs ceded load /
1213	if (tvcpu->arch.ceded)
1214	kvmppc_fast_vcpu_kick_hv(vcpu: tvcpu);
1215	break;
1216	case H_CONFER:
1217	target = kvmppc_get_gpr(vcpu, `4`);
1218	if (target == -`1`)
1219	break;
1220	tvcpu = kvmppc_find_vcpu(kvm, id: target);
1221	if (!tvcpu) {
1222	ret = H_PARAMETER;
1223	break;
1224	}
1225	yield_count = kvmppc_get_gpr(vcpu, `5`);
1226	if (kvmppc_get_yield_count(vcpu: tvcpu) != yield_count)
1227	break;
1228	kvm_arch_vcpu_yield_to(target: tvcpu);
1229	break;
1230	case H_REGISTER_VPA:
1231	ret = do_h_register_vpa(vcpu, flags: kvmppc_get_gpr(vcpu, `4`),
1232	vcpuid: kvmppc_get_gpr(vcpu, `5`),
1233	vpa: kvmppc_get_gpr(vcpu, `6`));
1234	break;
1235	case H_RTAS:
1236	if (list_empty(&kvm->arch.rtas_tokens))
1237	return RESUME_HOST;
1238
1239	idx = srcu_read_lock(ssp: &kvm->srcu);
1240	rc = kvmppc_rtas_hcall(vcpu);
1241	srcu_read_unlock(ssp: &kvm->srcu, idx);
1242
1243	if (rc == -ENOENT)
1244	return RESUME_HOST;
1245	else if (rc == `0`)
1246	break;
1247
1248	/ Send the error out to userspace via KVM_RUN /
1249	return rc;
1250	case H_LOGICAL_CI_LOAD:
1251	ret = kvmppc_h_logical_ci_load(vcpu);
1252	if (ret == H_TOO_HARD)
1253	return RESUME_HOST;
1254	break;
1255	case H_LOGICAL_CI_STORE:
1256	ret = kvmppc_h_logical_ci_store(vcpu);
1257	if (ret == H_TOO_HARD)
1258	return RESUME_HOST;
1259	break;
1260	case H_SET_MODE:
1261	ret = kvmppc_h_set_mode(vcpu, mflags: kvmppc_get_gpr(vcpu, `4`),
1262	resource: kvmppc_get_gpr(vcpu, `5`),
1263	value1: kvmppc_get_gpr(vcpu, `6`),
1264	value2: kvmppc_get_gpr(vcpu, `7`));
1265	if (ret == H_TOO_HARD)
1266	return RESUME_HOST;
1267	break;
1268	case H_XIRR:
1269	case H_CPPR:
1270	case H_EOI:
1271	case H_IPI:
1272	case H_IPOLL:
1273	case H_XIRR_X:
1274	if (kvmppc_xics_enabled(vcpu)) {
1275	if (xics_on_xive()) {
1276	ret = H_NOT_AVAILABLE;
1277	return RESUME_GUEST;
1278	}
1279	ret = kvmppc_xics_hcall(vcpu, req);
1280	break;
1281	}
1282	return RESUME_HOST;
1283	case H_SET_DABR:
1284	ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, `4`));
1285	break;
1286	case H_SET_XDABR:
1287	ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, `4`),
1288	kvmppc_get_gpr(vcpu, `5`));
1289	break;
1290	#ifdef CONFIG_SPAPR_TCE_IOMMU
1291	case H_GET_TCE:
1292	ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, `4`),
1293	kvmppc_get_gpr(vcpu, `5`));
1294	if (ret == H_TOO_HARD)
1295	return RESUME_HOST;
1296	break;
1297	case H_PUT_TCE:
1298	ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, `4`),
1299	kvmppc_get_gpr(vcpu, `5`),
1300	kvmppc_get_gpr(vcpu, `6`));
1301	if (ret == H_TOO_HARD)
1302	return RESUME_HOST;
1303	break;
1304	case H_PUT_TCE_INDIRECT:
1305	ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, `4`),
1306	kvmppc_get_gpr(vcpu, `5`),
1307	kvmppc_get_gpr(vcpu, `6`),
1308	kvmppc_get_gpr(vcpu, `7`));
1309	if (ret == H_TOO_HARD)
1310	return RESUME_HOST;
1311	break;
1312	case H_STUFF_TCE:
1313	ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, `4`),
1314	kvmppc_get_gpr(vcpu, `5`),
1315	kvmppc_get_gpr(vcpu, `6`),
1316	kvmppc_get_gpr(vcpu, `7`));
1317	if (ret == H_TOO_HARD)
1318	return RESUME_HOST;
1319	break;
1320	#endif
1321	case H_RANDOM: {
1322	unsigned long rand;
1323
1324	if (!arch_get_random_seed_longs(&rand, `1`))
1325	ret = H_HARDWARE;
1326	kvmppc_set_gpr(vcpu, `4`, rand);
1327	break;
1328	}
1329	case H_RPT_INVALIDATE:
1330	ret = kvmppc_h_rpt_invalidate(vcpu, id: kvmppc_get_gpr(vcpu, `4`),
1331	target: kvmppc_get_gpr(vcpu, `5`),
1332	type: kvmppc_get_gpr(vcpu, `6`),
1333	pg_sizes: kvmppc_get_gpr(vcpu, `7`),
1334	start: kvmppc_get_gpr(vcpu, `8`),
1335	end: kvmppc_get_gpr(vcpu, `9`));
1336	break;
1337
1338	case H_SET_PARTITION_TABLE:
1339	ret = H_FUNCTION;
1340	if (nesting_enabled(kvm))
1341	ret = kvmhv_set_partition_table(vcpu);
1342	break;
1343	case H_ENTER_NESTED:
1344	ret = H_FUNCTION;
1345	if (!nesting_enabled(kvm))
1346	break;
1347	ret = kvmhv_enter_nested_guest(vcpu);
1348	if (ret == H_INTERRUPT) {
1349	kvmppc_set_gpr(vcpu, `3`, `0`);
1350	vcpu->arch.hcall_needed = `0`;
1351	return -EINTR;
1352	} else if (ret == H_TOO_HARD) {
1353	kvmppc_set_gpr(vcpu, `3`, `0`);
1354	vcpu->arch.hcall_needed = `0`;
1355	return RESUME_HOST;
1356	}
1357	break;
1358	case H_TLB_INVALIDATE:
1359	ret = H_FUNCTION;
1360	if (nesting_enabled(kvm))
1361	ret = kvmhv_do_nested_tlbie(vcpu);
1362	break;
1363	case H_COPY_TOFROM_GUEST:
1364	ret = H_FUNCTION;
1365	if (nesting_enabled(kvm))
1366	ret = kvmhv_copy_tofrom_guest_nested(vcpu);
1367	break;
1368	case H_PAGE_INIT:
1369	ret = kvmppc_h_page_init(vcpu, flags: kvmppc_get_gpr(vcpu, `4`),
1370	dest: kvmppc_get_gpr(vcpu, `5`),
1371	src: kvmppc_get_gpr(vcpu, `6`));
1372	break;
1373	case H_SVM_PAGE_IN:
1374	ret = H_UNSUPPORTED;
1375	if (kvmppc_get_srr1(vcpu) & MSR_S)
1376	ret = kvmppc_h_svm_page_in(kvm,
1377	kvmppc_get_gpr(vcpu, `4`),
1378	kvmppc_get_gpr(vcpu, `5`),
1379	kvmppc_get_gpr(vcpu, `6`));
1380	break;
1381	case H_SVM_PAGE_OUT:
1382	ret = H_UNSUPPORTED;
1383	if (kvmppc_get_srr1(vcpu) & MSR_S)
1384	ret = kvmppc_h_svm_page_out(kvm,
1385	kvmppc_get_gpr(vcpu, `4`),
1386	kvmppc_get_gpr(vcpu, `5`),
1387	kvmppc_get_gpr(vcpu, `6`));
1388	break;
1389	case H_SVM_INIT_START:
1390	ret = H_UNSUPPORTED;
1391	if (kvmppc_get_srr1(vcpu) & MSR_S)
1392	ret = kvmppc_h_svm_init_start(kvm);
1393	break;
1394	case H_SVM_INIT_DONE:
1395	ret = H_UNSUPPORTED;
1396	if (kvmppc_get_srr1(vcpu) & MSR_S)
1397	ret = kvmppc_h_svm_init_done(kvm);
1398	break;
1399	case H_SVM_INIT_ABORT:
1400	/*
1401	* Even if that call is made by the Ultravisor, the SSR1 value
1402	* is the guest context one, with the secure bit clear as it has
1403	* not yet been secured. So we can't check it here.
1404	* Instead the kvm->arch.secure_guest flag is checked inside
1405	* kvmppc_h_svm_init_abort().
1406	*/
1407	ret = kvmppc_h_svm_init_abort(kvm);
1408	break;
1409
1410	default:
1411	return RESUME_HOST;
1412	}
1413	WARN_ON_ONCE(ret == H_TOO_HARD);
1414	kvmppc_set_gpr(vcpu, `3`, ret);
1415	vcpu->arch.hcall_needed = `0`;
1416	return RESUME_GUEST;
1417	}
1418
1419	/*
1420	* Handle H_CEDE in the P9 path where we don't call the real-mode hcall
1421	* handlers in book3s_hv_rmhandlers.S.
1422	*
1423	* This has to be done early, not in kvmppc_pseries_do_hcall(), so
1424	* that the cede logic in kvmppc_run_single_vcpu() works properly.
1425	*/
1426	static void kvmppc_cede(struct kvm_vcpu *vcpu)
1427	{
1428	__kvmppc_set_msr_hv(vcpu, __kvmppc_get_msr_hv(vcpu) \| MSR_EE);
1429	vcpu->arch.ceded = `1`;
1430	smp_mb();
1431	if (vcpu->arch.prodded) {
1432	vcpu->arch.prodded = `0`;
1433	smp_mb();
1434	vcpu->arch.ceded = `0`;
1435	}
1436	}
1437
1438	static int kvmppc_hcall_impl_hv(unsigned long cmd)
1439	{
1440	switch (cmd) {
1441	case H_CEDE:
1442	case H_PROD:
1443	case H_CONFER:
1444	case H_REGISTER_VPA:
1445	case H_SET_MODE:
1446	#ifdef CONFIG_SPAPR_TCE_IOMMU
1447	case H_GET_TCE:
1448	case H_PUT_TCE:
1449	case H_PUT_TCE_INDIRECT:
1450	case H_STUFF_TCE:
1451	#endif
1452	case H_LOGICAL_CI_LOAD:
1453	case H_LOGICAL_CI_STORE:
1454	#ifdef CONFIG_KVM_XICS
1455	case H_XIRR:
1456	case H_CPPR:
1457	case H_EOI:
1458	case H_IPI:
1459	case H_IPOLL:
1460	case H_XIRR_X:
1461	#endif
1462	case H_PAGE_INIT:
1463	case H_RPT_INVALIDATE:
1464	return `1`;
1465	}
1466
1467	/ See if it's in the real-mode table /
1468	return kvmppc_hcall_impl_hv_realmode(cmd);
1469	}
1470
1471	static int kvmppc_emulate_debug_inst(struct kvm_vcpu *vcpu)
1472	{
1473	ppc_inst_t last_inst;
1474
1475	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
1476	EMULATE_DONE) {
1477	/*
1478	* Fetch failed, so return to guest and
1479	* try executing it again.
1480	*/
1481	return RESUME_GUEST;
1482	}
1483
1484	if (ppc_inst_val(last_inst) == KVMPPC_INST_SW_BREAKPOINT) {
1485	vcpu->run->exit_reason = KVM_EXIT_DEBUG;
1486	vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
1487	return RESUME_HOST;
1488	} else {
1489	kvmppc_core_queue_program(vcpu, SRR1_PROGILL \|
1490	(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1491	return RESUME_GUEST;
1492	}
1493	}
1494
/*
 * Empty callback.  kvmppc_read_dpdes() passes it to
 * smp_call_function_single() purely to interrupt the target CPU.
 */
static void do_nothing(void *x)
{
	/* intentionally empty */
}
1498
1499	static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
1500	{
1501	int thr, cpu, pcpu, nthreads;
1502	struct kvm_vcpu *v;
1503	unsigned long dpdes;
1504
1505	nthreads = vcpu->kvm->arch.emul_smt_mode;
1506	dpdes = `0`;
1507	cpu = vcpu->vcpu_id & ~(nthreads - `1`);
1508	for (thr = `0`; thr < nthreads; ++thr, ++cpu) {
1509	v = kvmppc_find_vcpu(kvm: vcpu->kvm, id: cpu);
1510	if (!v)
1511	continue;
1512	/*
1513	* If the vcpu is currently running on a physical cpu thread,
1514	* interrupt it in order to pull it out of the guest briefly,
1515	* which will update its vcore->dpdes value.
1516	*/
1517	pcpu = READ_ONCE(v->cpu);
1518	if (pcpu >= `0`)
1519	smp_call_function_single(cpuid: pcpu, func: do_nothing, NULL, wait: `1`);
1520	if (kvmppc_doorbell_pending(vcpu: v))
1521	dpdes \|= `1` << thr;
1522	}
1523	return dpdes;
1524	}
1525
1526	/*
1527	* On POWER9, emulate doorbell-related instructions in order to
1528	* give the guest the illusion of running on a multi-threaded core.
1529	* The instructions emulated are msgsndp, msgclrp, mfspr TIR,
1530	* and mfspr DPDES.
1531	*/
1532	static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
1533	{
1534	u32 inst, rb, thr;
1535	unsigned long arg;
1536	struct kvm *kvm = vcpu->kvm;
1537	struct kvm_vcpu *tvcpu;
1538	ppc_inst_t pinst;
1539
1540	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst) != EMULATE_DONE)
1541	return RESUME_GUEST;
1542	inst = ppc_inst_val(pinst);
1543	if (get_op(inst) != `31`)
1544	return EMULATE_FAIL;
1545	rb = get_rb(inst);
1546	thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - `1`);
1547	switch (get_xop(inst)) {
1548	case OP_31_XOP_MSGSNDP:
1549	arg = kvmppc_get_gpr(vcpu, rb);
1550	if (((arg >> `27`) & `0x1f`) != PPC_DBELL_SERVER)
1551	break;
1552	arg &= `0x7f`;
1553	if (arg >= kvm->arch.emul_smt_mode)
1554	break;
1555	tvcpu = kvmppc_find_vcpu(kvm, id: vcpu->vcpu_id - thr + arg);
1556	if (!tvcpu)
1557	break;
1558	if (!tvcpu->arch.doorbell_request) {
1559	tvcpu->arch.doorbell_request = `1`;
1560	kvmppc_fast_vcpu_kick_hv(vcpu: tvcpu);
1561	}
1562	break;
1563	case OP_31_XOP_MSGCLRP:
1564	arg = kvmppc_get_gpr(vcpu, rb);
1565	if (((arg >> `27`) & `0x1f`) != PPC_DBELL_SERVER)
1566	break;
1567	vcpu->arch.vcore->dpdes = `0`;
1568	vcpu->arch.doorbell_request = `0`;
1569	break;
1570	case OP_31_XOP_MFSPR:
1571	switch (get_sprn(inst)) {
1572	case SPRN_TIR:
1573	arg = thr;
1574	break;
1575	case SPRN_DPDES:
1576	arg = kvmppc_read_dpdes(vcpu);
1577	break;
1578	default:
1579	return EMULATE_FAIL;
1580	}
1581	kvmppc_set_gpr(vcpu, get_rt(inst), arg);
1582	break;
1583	default:
1584	return EMULATE_FAIL;
1585	}
1586	kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + `4`);
1587	return RESUME_GUEST;
1588	}
1589
1590	/*
1591	* If the lppaca had pmcregs_in_use clear when we exited the guest, then
1592	* HFSCR_PM is cleared for next entry. If the guest then tries to access
1593	* the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM
1594	* back in the guest HFSCR will cause the next entry to load the PMU SPRs and
1595	* allow the guest access to continue.
1596	*/
1597	static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
1598	{
1599	if (!(vcpu->arch.hfscr_permitted & HFSCR_PM))
1600	return EMULATE_FAIL;
1601
1602	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) \| HFSCR_PM);
1603
1604	return RESUME_GUEST;
1605	}
1606
1607	static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu)
1608	{
1609	if (!(vcpu->arch.hfscr_permitted & HFSCR_EBB))
1610	return EMULATE_FAIL;
1611
1612	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) \| HFSCR_EBB);
1613
1614	return RESUME_GUEST;
1615	}
1616
1617	static int kvmppc_tm_unavailable(struct kvm_vcpu *vcpu)
1618	{
1619	if (!(vcpu->arch.hfscr_permitted & HFSCR_TM))
1620	return EMULATE_FAIL;
1621
1622	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) \| HFSCR_TM);
1623
1624	return RESUME_GUEST;
1625	}
1626
1627	static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
1628	struct task_struct *tsk)
1629	{
1630	struct kvm_run *run = vcpu->run;
1631	int r = RESUME_HOST;
1632
1633	vcpu->stat.sum_exits++;
1634
1635	/*
1636	* This can happen if an interrupt occurs in the last stages
1637	* of guest entry or the first stages of guest exit (i.e. after
1638	* setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
1639	* and before setting it to KVM_GUEST_MODE_HOST_HV).
1640	* That can happen due to a bug, or due to a machine check
1641	* occurring at just the wrong time.
1642	*/
1643	if (!kvmhv_is_nestedv2() && (__kvmppc_get_msr_hv(vcpu) & MSR_HV)) {
1644	printk(KERN_EMERG "KVM trap in HV mode!\n");
1645	printk(KERN_EMERG "trap=0x%x \| pc=0x%lx \| msr=0x%llx\n",
1646	vcpu->arch.trap, kvmppc_get_pc(vcpu),
1647	vcpu->arch.shregs.msr);
1648	kvmppc_dump_regs(vcpu);
1649	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
1650	run->hw.hardware_exit_reason = vcpu->arch.trap;
1651	return RESUME_HOST;
1652	}
1653	run->exit_reason = KVM_EXIT_UNKNOWN;
1654	run->ready_for_interrupt_injection = `1`;
1655	switch (vcpu->arch.trap) {
1656	/ We're good on these - the host merely wanted to get our attention /
1657	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
1658	WARN_ON_ONCE(`1`); / Should never happen /
1659	vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
1660	fallthrough;
1661	case BOOK3S_INTERRUPT_HV_DECREMENTER:
1662	vcpu->stat.dec_exits++;
1663	r = RESUME_GUEST;
1664	break;
1665	case BOOK3S_INTERRUPT_EXTERNAL:
1666	case BOOK3S_INTERRUPT_H_DOORBELL:
1667	case BOOK3S_INTERRUPT_H_VIRT:
1668	vcpu->stat.ext_intr_exits++;
1669	r = RESUME_GUEST;
1670	break;
1671	/ SR/HMI/PMI are HV interrupts that host has handled. Resume guest./
1672	case BOOK3S_INTERRUPT_HMI:
1673	case BOOK3S_INTERRUPT_PERFMON:
1674	case BOOK3S_INTERRUPT_SYSTEM_RESET:
1675	r = RESUME_GUEST;
1676	break;
1677	case BOOK3S_INTERRUPT_MACHINE_CHECK: {
1678	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
1679	DEFAULT_RATELIMIT_BURST);
1680	/*
1681	* Print the MCE event to host console. Ratelimit so the guest
1682	* can't flood the host log.
1683	*/
1684	if (__ratelimit(&rs))
1685	machine_check_print_event_info(&vcpu->arch.mce_evt,false, true);
1686
1687	/*
1688	* If the guest can do FWNMI, exit to userspace so it can
1689	* deliver a FWNMI to the guest.
1690	* Otherwise we synthesize a machine check for the guest
1691	* so that it knows that the machine check occurred.
1692	*/
1693	if (!vcpu->kvm->arch.fwnmi_enabled) {
1694	ulong flags = (__kvmppc_get_msr_hv(vcpu) & `0x083c0000`) \|
1695	(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
1696	kvmppc_core_queue_machine_check(vcpu, flags);
1697	r = RESUME_GUEST;
1698	break;
1699	}
1700
1701	/ Exit to guest with KVM_EXIT_NMI as exit reason /
1702	run->exit_reason = KVM_EXIT_NMI;
1703	run->hw.hardware_exit_reason = vcpu->arch.trap;
1704	/ Clear out the old NMI status from run->flags /
1705	run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
1706	/ Now set the NMI status /
1707	if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
1708	run->flags \|= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
1709	else
1710	run->flags \|= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
1711
1712	r = RESUME_HOST;
1713	break;
1714	}
1715	case BOOK3S_INTERRUPT_PROGRAM:
1716	{
1717	ulong flags;
1718	/*
1719	* Normally program interrupts are delivered directly
1720	* to the guest by the hardware, but we can get here
1721	* as a result of a hypervisor emulation interrupt
1722	* (e40) getting turned into a 700 by BML RTAS.
1723	*/
1724	flags = (__kvmppc_get_msr_hv(vcpu) & `0x1f0000ull`) \|
1725	(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
1726	kvmppc_core_queue_program(vcpu, flags);
1727	r = RESUME_GUEST;
1728	break;
1729	}
1730	case BOOK3S_INTERRUPT_SYSCALL:
1731	{
1732	int i;
1733
1734	if (!kvmhv_is_nestedv2() && unlikely(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
1735	/*
1736	* Guest userspace executed sc 1. This can only be
1737	* reached by the P9 path because the old path
1738	* handles this case in realmode hcall handlers.
1739	*/
1740	if (!kvmhv_vcpu_is_radix(vcpu)) {
1741	/*
1742	* A guest could be running PR KVM, so this
1743	* may be a PR KVM hcall. It must be reflected
1744	* to the guest kernel as a sc interrupt.
1745	*/
1746	kvmppc_core_queue_syscall(vcpu);
1747	} else {
1748	/*
1749	* Radix guests can not run PR KVM or nested HV
1750	* hash guests which might run PR KVM, so this
1751	* is always a privilege fault. Send a program
1752	* check to guest kernel.
1753	*/
1754	kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
1755	}
1756	r = RESUME_GUEST;
1757	break;
1758	}
1759
1760	/*
1761	* hcall - gather args and set exit_reason. This will next be
1762	* handled by kvmppc_pseries_do_hcall which may be able to deal
1763	* with it and resume guest, or may punt to userspace.
1764	*/
1765	run->papr_hcall.nr = kvmppc_get_gpr(vcpu, `3`);
1766	for (i = `0`; i < `9`; ++i)
1767	run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, `4` + i);
1768	run->exit_reason = KVM_EXIT_PAPR_HCALL;
1769	vcpu->arch.hcall_needed = `1`;
1770	r = RESUME_HOST;
1771	break;
1772	}
1773	/*
1774	* We get these next two if the guest accesses a page which it thinks
1775	* it has mapped but which is not actually present, either because
1776	* it is for an emulated I/O device or because the corresonding
1777	* host page has been paged out.
1778	*
1779	* Any other HDSI/HISI interrupts have been handled already for P7/8
1780	* guests. For POWER9 hash guests not using rmhandlers, basic hash
1781	* fault handling is done here.
1782	*/
1783	case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
1784	unsigned long vsid;
1785	long err;
1786
1787	if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
1788	unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) {
1789	r = RESUME_GUEST; / Just retry if it's the canary /
1790	break;
1791	}
1792
1793	if (kvm_is_radix(vcpu->kvm) \|\| !cpu_has_feature(CPU_FTR_ARCH_300)) {
1794	/*
1795	* Radix doesn't require anything, and pre-ISAv3.0 hash
1796	* already attempted to handle this in rmhandlers. The
1797	* hash fault handling below is v3 only (it uses ASDR
1798	* via fault_gpa).
1799	*/
1800	r = RESUME_PAGE_FAULT;
1801	break;
1802	}
1803
1804	if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE \| DSISR_PROTFAULT))) {
1805	kvmppc_core_queue_data_storage(vcpu,
1806	kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
1807	vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
1808	r = RESUME_GUEST;
1809	break;
1810	}
1811
1812	if (!(__kvmppc_get_msr_hv(vcpu) & MSR_DR))
1813	vsid = vcpu->kvm->arch.vrma_slb_v;
1814	else
1815	vsid = vcpu->arch.fault_gpa;
1816
1817	err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
1818	vsid, vcpu->arch.fault_dsisr, true);
1819	if (err == `0`) {
1820	r = RESUME_GUEST;
1821	} else if (err == -`1` \|\| err == -`2`) {
1822	r = RESUME_PAGE_FAULT;
1823	} else {
1824	kvmppc_core_queue_data_storage(vcpu,
1825	kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
1826	vcpu->arch.fault_dar, err);
1827	r = RESUME_GUEST;
1828	}
1829	break;
1830	}
1831	case BOOK3S_INTERRUPT_H_INST_STORAGE: {
1832	unsigned long vsid;
1833	long err;
1834
1835	vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1836	vcpu->arch.fault_dsisr = __kvmppc_get_msr_hv(vcpu) &
1837	DSISR_SRR1_MATCH_64S;
1838	if (kvm_is_radix(vcpu->kvm) \|\| !cpu_has_feature(CPU_FTR_ARCH_300)) {
1839	/*
1840	* Radix doesn't require anything, and pre-ISAv3.0 hash
1841	* already attempted to handle this in rmhandlers. The
1842	* hash fault handling below is v3 only (it uses ASDR
1843	* via fault_gpa).
1844	*/
1845	if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
1846	vcpu->arch.fault_dsisr \|= DSISR_ISSTORE;
1847	r = RESUME_PAGE_FAULT;
1848	break;
1849	}
1850
1851	if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
1852	kvmppc_core_queue_inst_storage(vcpu,
1853	vcpu->arch.fault_dsisr \|
1854	(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1855	r = RESUME_GUEST;
1856	break;
1857	}
1858
1859	if (!(__kvmppc_get_msr_hv(vcpu) & MSR_IR))
1860	vsid = vcpu->kvm->arch.vrma_slb_v;
1861	else
1862	vsid = vcpu->arch.fault_gpa;
1863
1864	err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
1865	vsid, vcpu->arch.fault_dsisr, false);
1866	if (err == `0`) {
1867	r = RESUME_GUEST;
1868	} else if (err == -`1`) {
1869	r = RESUME_PAGE_FAULT;
1870	} else {
1871	kvmppc_core_queue_inst_storage(vcpu,
1872	err \| (kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1873	r = RESUME_GUEST;
1874	}
1875	break;
1876	}
1877
1878	/*
1879	* This occurs if the guest executes an illegal instruction.
1880	* If the guest debug is disabled, generate a program interrupt
1881	* to the guest. If guest debug is enabled, we need to check
1882	* whether the instruction is a software breakpoint instruction.
1883	* Accordingly return to Guest or Host.
1884	*/
1885	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
1886	if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
1887	vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
1888	swab32(vcpu->arch.emul_inst) :
1889	vcpu->arch.emul_inst;
1890	if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
1891	r = kvmppc_emulate_debug_inst(vcpu);
1892	} else {
1893	kvmppc_core_queue_program(vcpu, SRR1_PROGILL \|
1894	(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1895	r = RESUME_GUEST;
1896	}
1897	break;
1898
1899	#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1900	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
1901	/*
1902	* This occurs for various TM-related instructions that
1903	* we need to emulate on POWER9 DD2.2. We have already
1904	* handled the cases where the guest was in real-suspend
1905	* mode and was transitioning to transactional state.
1906	*/
1907	r = kvmhv_p9_tm_emulation(vcpu);
1908	if (r != -`1`)
1909	break;
1910	fallthrough; / go to facility unavailable handler /
1911	#endif
1912
1913	/*
1914	* This occurs if the guest (kernel or userspace), does something that
1915	* is prohibited by HFSCR.
1916	* On POWER9, this could be a doorbell instruction that we need
1917	* to emulate.
1918	* Otherwise, we just generate a program interrupt to the guest.
1919	*/
1920	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
1921	u64 cause = kvmppc_get_hfscr_hv(vcpu) >> `56`;
1922
1923	r = EMULATE_FAIL;
1924	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1925	if (cause == FSCR_MSGP_LG)
1926	r = kvmppc_emulate_doorbell_instr(vcpu);
1927	if (cause == FSCR_PM_LG)
1928	r = kvmppc_pmu_unavailable(vcpu);
1929	if (cause == FSCR_EBB_LG)
1930	r = kvmppc_ebb_unavailable(vcpu);
1931	if (cause == FSCR_TM_LG)
1932	r = kvmppc_tm_unavailable(vcpu);
1933	}
1934	if (r == EMULATE_FAIL) {
1935	kvmppc_core_queue_program(vcpu, SRR1_PROGILL \|
1936	(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
1937	r = RESUME_GUEST;
1938	}
1939	break;
1940	}
1941
1942	case BOOK3S_INTERRUPT_HV_RM_HARD:
1943	r = RESUME_PASSTHROUGH;
1944	break;
1945	default:
1946	kvmppc_dump_regs(vcpu);
1947	printk(KERN_EMERG "trap=0x%x \| pc=0x%lx \| msr=0x%llx\n",
1948	vcpu->arch.trap, kvmppc_get_pc(vcpu),
1949	__kvmppc_get_msr_hv(vcpu));
1950	run->hw.hardware_exit_reason = vcpu->arch.trap;
1951	r = RESUME_HOST;
1952	break;
1953	}
1954
1955	return r;
1956	}
1957
1958	static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
1959	{
1960	int r;
1961	int srcu_idx;
1962
1963	vcpu->stat.sum_exits++;
1964
1965	/*
1966	* This can happen if an interrupt occurs in the last stages
1967	* of guest entry or the first stages of guest exit (i.e. after
1968	* setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
1969	* and before setting it to KVM_GUEST_MODE_HOST_HV).
1970	* That can happen due to a bug, or due to a machine check
1971	* occurring at just the wrong time.
1972	*/
1973	if (__kvmppc_get_msr_hv(vcpu) & MSR_HV) {
1974	pr_emerg("KVM trap in HV mode while nested!\n");
1975	pr_emerg("trap=0x%x \| pc=0x%lx \| msr=0x%llx\n",
1976	vcpu->arch.trap, kvmppc_get_pc(vcpu),
1977	__kvmppc_get_msr_hv(vcpu));
1978	kvmppc_dump_regs(vcpu);
1979	return RESUME_HOST;
1980	}
1981	switch (vcpu->arch.trap) {
1982	/ We're good on these - the host merely wanted to get our attention /
1983	case BOOK3S_INTERRUPT_HV_DECREMENTER:
1984	vcpu->stat.dec_exits++;
1985	r = RESUME_GUEST;
1986	break;
1987	case BOOK3S_INTERRUPT_EXTERNAL:
1988	vcpu->stat.ext_intr_exits++;
1989	r = RESUME_HOST;
1990	break;
1991	case BOOK3S_INTERRUPT_H_DOORBELL:
1992	case BOOK3S_INTERRUPT_H_VIRT:
1993	vcpu->stat.ext_intr_exits++;
1994	r = RESUME_GUEST;
1995	break;
1996	/ These need to go to the nested HV /
1997	case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
1998	vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
1999	vcpu->stat.dec_exits++;
2000	r = RESUME_HOST;
2001	break;
2002	/ SR/HMI/PMI are HV interrupts that host has handled. Resume guest./
2003	case BOOK3S_INTERRUPT_HMI:
2004	case BOOK3S_INTERRUPT_PERFMON:
2005	case BOOK3S_INTERRUPT_SYSTEM_RESET:
2006	r = RESUME_GUEST;
2007	break;
2008	case BOOK3S_INTERRUPT_MACHINE_CHECK:
2009	{
2010	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
2011	DEFAULT_RATELIMIT_BURST);
2012	/ Pass the machine check to the L1 guest /
2013	r = RESUME_HOST;
2014	/ Print the MCE event to host console. /
2015	if (__ratelimit(&rs))
2016	machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
2017	break;
2018	}
2019	/*
2020	* We get these next two if the guest accesses a page which it thinks
2021	* it has mapped but which is not actually present, either because
2022	* it is for an emulated I/O device or because the corresonding
2023	* host page has been paged out.
2024	*/
2025	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
2026	srcu_idx = srcu_read_lock(ssp: &vcpu->kvm->srcu);
2027	r = kvmhv_nested_page_fault(vcpu);
2028	srcu_read_unlock(ssp: &vcpu->kvm->srcu, idx: srcu_idx);
2029	break;
2030	case BOOK3S_INTERRUPT_H_INST_STORAGE:
2031	vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
2032	vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
2033	DSISR_SRR1_MATCH_64S;
2034	if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
2035	vcpu->arch.fault_dsisr \|= DSISR_ISSTORE;
2036	srcu_idx = srcu_read_lock(ssp: &vcpu->kvm->srcu);
2037	r = kvmhv_nested_page_fault(vcpu);
2038	srcu_read_unlock(ssp: &vcpu->kvm->srcu, idx: srcu_idx);
2039	break;
2040
2041	#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2042	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
2043	/*
2044	* This occurs for various TM-related instructions that
2045	* we need to emulate on POWER9 DD2.2. We have already
2046	* handled the cases where the guest was in real-suspend
2047	* mode and was transitioning to transactional state.
2048	*/
2049	r = kvmhv_p9_tm_emulation(vcpu);
2050	if (r != -`1`)
2051	break;
2052	fallthrough; / go to facility unavailable handler /
2053	#endif
2054
2055	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
2056	u64 cause = vcpu->arch.hfscr >> `56`;
2057
2058	/*
2059	* Only pass HFU interrupts to the L1 if the facility is
2060	* permitted but disabled by the L1's HFSCR, otherwise
2061	* the interrupt does not make sense to the L1 so turn
2062	* it into a HEAI.
2063	*/
2064	if (!(vcpu->arch.hfscr_permitted & (`1UL` << cause)) \|\|
2065	(vcpu->arch.nested_hfscr & (`1UL` << cause))) {
2066	ppc_inst_t pinst;
2067	vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST;
2068
2069	/*
2070	* If the fetch failed, return to guest and
2071	* try executing it again.
2072	*/
2073	r = kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst);
2074	vcpu->arch.emul_inst = ppc_inst_val(pinst);
2075	if (r != EMULATE_DONE)
2076	r = RESUME_GUEST;
2077	else
2078	r = RESUME_HOST;
2079	} else {
2080	r = RESUME_HOST;
2081	}
2082
2083	break;
2084	}
2085
2086	case BOOK3S_INTERRUPT_HV_RM_HARD:
2087	vcpu->arch.trap = `0`;
2088	r = RESUME_GUEST;
2089	if (!xics_on_xive())
2090	kvmppc_xics_rm_complete(vcpu, `0`);
2091	break;
2092	case BOOK3S_INTERRUPT_SYSCALL:
2093	{
2094	unsigned long req = kvmppc_get_gpr(vcpu, `3`);
2095
2096	/*
2097	* The H_RPT_INVALIDATE hcalls issued by nested
2098	* guests for process-scoped invalidations when
2099	* GTSE=0, are handled here in L0.
2100	*/
2101	if (req == H_RPT_INVALIDATE) {
2102	r = kvmppc_nested_h_rpt_invalidate(vcpu);
2103	break;
2104	}
2105
2106	r = RESUME_HOST;
2107	break;
2108	}
2109	default:
2110	r = RESUME_HOST;
2111	break;
2112	}
2113
2114	return r;
2115	}
2116
2117	static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
2118	struct kvm_sregs *sregs)
2119	{
2120	int i;
2121
2122	memset(sregs, `0`, sizeof(struct kvm_sregs));
2123	sregs->pvr = vcpu->arch.pvr;
2124	for (i = `0`; i < vcpu->arch.slb_max; i++) {
2125	sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
2126	sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
2127	}
2128
2129	return `0`;
2130	}
2131
2132	static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
2133	struct kvm_sregs *sregs)
2134	{
2135	int i, j;
2136
2137	/ Only accept the same PVR as the host's, since we can't spoof it /
2138	if (sregs->pvr != vcpu->arch.pvr)
2139	return -EINVAL;
2140
2141	j = `0`;
2142	for (i = `0`; i < vcpu->arch.slb_nr; i++) {
2143	if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
2144	vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
2145	vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
2146	++j;
2147	}
2148	}
2149	vcpu->arch.slb_max = j;
2150
2151	return `0`;
2152	}
2153
2154	/*
2155	* Enforce limits on guest LPCR values based on hardware availability,
2156	* guest configuration, and possibly hypervisor support and security
2157	* concerns.
2158	*/
2159	unsigned long kvmppc_filter_lpcr_hv(struct kvm kvm, unsigned* long lpcr)
2160	{
2161	/ LPCR_TC only applies to HPT guests /
2162	if (kvm_is_radix(kvm))
2163	lpcr &= ~LPCR_TC;
2164
2165	/ On POWER8 and above, userspace can modify AIL /
2166	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
2167	lpcr &= ~LPCR_AIL;
2168	if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
2169	lpcr &= ~LPCR_AIL; / LPCR[AIL]=1/2 is disallowed /
2170	/*
2171	* On some POWER9s we force AIL off for radix guests to prevent
2172	* executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
2173	* guest, which can result in Q0 translations with LPID=0 PID=PIDR to
2174	* be cached, which the host TLB management does not expect.
2175	*/
2176	if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
2177	lpcr &= ~LPCR_AIL;
2178
2179	/*
2180	* On POWER9, allow userspace to enable large decrementer for the
2181	* guest, whether or not the host has it enabled.
2182	*/
2183	if (!cpu_has_feature(CPU_FTR_ARCH_300))
2184	lpcr &= ~LPCR_LD;
2185
2186	return lpcr;
2187	}
2188
/*
 * Warn (once) if @lpcr contains bits that kvmppc_filter_lpcr_hv() would
 * have removed for @kvm.  The value itself is not modified.
 */
static void verify_lpcr(struct kvm *kvm, unsigned long lpcr)
{
	/* Filter once and reuse the result for both the check and the message. */
	unsigned long filtered = kvmppc_filter_lpcr_hv(kvm, lpcr);

	WARN_ONCE(lpcr != filtered,
		  "lpcr 0x%lx differs from filtered 0x%lx\n",
		  lpcr, filtered);
}
2196
2197	static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
2198	bool preserve_top32)
2199	{
2200	struct kvm *kvm = vcpu->kvm;
2201	struct kvmppc_vcore *vc = vcpu->arch.vcore;
2202	u64 mask;
2203
2204	spin_lock(lock: &vc->lock);
2205
2206	/*
2207	* Userspace can only modify
2208	* DPFD (default prefetch depth), ILE (interrupt little-endian),
2209	* TC (translation control), AIL (alternate interrupt location),
2210	* LD (large decrementer).
2211	* These are subject to restrictions from kvmppc_filter_lcpr_hv().
2212	*/
2213	mask = LPCR_DPFD \| LPCR_ILE \| LPCR_TC \| LPCR_AIL \| LPCR_LD;
2214
2215	/ Broken 32-bit version of LPCR must not clear top bits /
2216	if (preserve_top32)
2217	mask &= `0xFFFFFFFF`;
2218
2219	new_lpcr = kvmppc_filter_lpcr_hv(kvm,
2220	lpcr: (vc->lpcr & ~mask) \| (new_lpcr & mask));
2221
2222	/*
2223	* If ILE (interrupt little-endian) has changed, update the
2224	* MSR_LE bit in the intr_msr for each vcpu in this vcore.
2225	*/
2226	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
2227	struct kvm_vcpu *vcpu;
2228	unsigned long i;
2229
2230	kvm_for_each_vcpu(i, vcpu, kvm) {
2231	if (vcpu->arch.vcore != vc)
2232	continue;
2233	if (new_lpcr & LPCR_ILE)
2234	vcpu->arch.intr_msr \|= MSR_LE;
2235	else
2236	vcpu->arch.intr_msr &= ~MSR_LE;
2237	}
2238	}
2239
2240	vc->lpcr = new_lpcr;
2241	kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LPCR);
2242
2243	spin_unlock(lock: &vc->lock);
2244	}
2245
2246	static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
2247	union kvmppc_one_reg *val)
2248	{
2249	int r = `0`;
2250	long int i;
2251
2252	switch (id) {
2253	case KVM_REG_PPC_DEBUG_INST:
2254	*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
2255	break;
2256	case KVM_REG_PPC_HIOR:
2257	*val = get_reg_val(id, `0`);
2258	break;
2259	case KVM_REG_PPC_DABR:
2260	*val = get_reg_val(id, vcpu->arch.dabr);
2261	break;
2262	case KVM_REG_PPC_DABRX:
2263	*val = get_reg_val(id, vcpu->arch.dabrx);
2264	break;
2265	case KVM_REG_PPC_DSCR:
2266	*val = get_reg_val(id, kvmppc_get_dscr_hv(vcpu));
2267	break;
2268	case KVM_REG_PPC_PURR:
2269	*val = get_reg_val(id, kvmppc_get_purr_hv(vcpu));
2270	break;
2271	case KVM_REG_PPC_SPURR:
2272	*val = get_reg_val(id, kvmppc_get_spurr_hv(vcpu));
2273	break;
2274	case KVM_REG_PPC_AMR:
2275	*val = get_reg_val(id, kvmppc_get_amr_hv(vcpu));
2276	break;
2277	case KVM_REG_PPC_UAMOR:
2278	*val = get_reg_val(id, kvmppc_get_uamor_hv(vcpu));
2279	break;
2280	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
2281	i = id - KVM_REG_PPC_MMCR0;
2282	*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, i));
2283	break;
2284	case KVM_REG_PPC_MMCR2:
2285	*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, i: `2`));
2286	break;
2287	case KVM_REG_PPC_MMCRA:
2288	*val = get_reg_val(id, kvmppc_get_mmcra_hv(vcpu));
2289	break;
2290	case KVM_REG_PPC_MMCRS:
2291	*val = get_reg_val(id, vcpu->arch.mmcrs);
2292	break;
2293	case KVM_REG_PPC_MMCR3:
2294	*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, i: `3`));
2295	break;
2296	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
2297	i = id - KVM_REG_PPC_PMC1;
2298	*val = get_reg_val(id, kvmppc_get_pmc_hv(vcpu, i));
2299	break;
2300	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
2301	i = id - KVM_REG_PPC_SPMC1;
2302	*val = get_reg_val(id, vcpu->arch.spmc[i]);
2303	break;
2304	case KVM_REG_PPC_SIAR:
2305	*val = get_reg_val(id, kvmppc_get_siar_hv(vcpu));
2306	break;
2307	case KVM_REG_PPC_SDAR:
2308	*val = get_reg_val(id, kvmppc_get_siar_hv(vcpu));
2309	break;
2310	case KVM_REG_PPC_SIER:
2311	*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, i: `0`));
2312	break;
2313	case KVM_REG_PPC_SIER2:
2314	*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, i: `1`));
2315	break;
2316	case KVM_REG_PPC_SIER3:
2317	*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, i: `2`));
2318	break;
2319	case KVM_REG_PPC_IAMR:
2320	*val = get_reg_val(id, kvmppc_get_iamr_hv(vcpu));
2321	break;
2322	case KVM_REG_PPC_PSPB:
2323	*val = get_reg_val(id, kvmppc_get_pspb_hv(vcpu));
2324	break;
2325	case KVM_REG_PPC_DPDES:
2326	/*
2327	* On POWER9, where we are emulating msgsndp etc.,
2328	* we return 1 bit for each vcpu, which can come from
2329	* either vcore->dpdes or doorbell_request.
2330	* On POWER8, doorbell_request is 0.
2331	*/
2332	if (cpu_has_feature(CPU_FTR_ARCH_300))
2333	*val = get_reg_val(id, vcpu->arch.doorbell_request);
2334	else
2335	*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
2336	break;
2337	case KVM_REG_PPC_VTB:
2338	*val = get_reg_val(id, kvmppc_get_vtb(vcpu));
2339	break;
2340	case KVM_REG_PPC_DAWR:
2341	*val = get_reg_val(id, kvmppc_get_dawr0_hv(vcpu));
2342	break;
2343	case KVM_REG_PPC_DAWRX:
2344	*val = get_reg_val(id, kvmppc_get_dawrx0_hv(vcpu));
2345	break;
2346	case KVM_REG_PPC_DAWR1:
2347	*val = get_reg_val(id, kvmppc_get_dawr1_hv(vcpu));
2348	break;
2349	case KVM_REG_PPC_DAWRX1:
2350	*val = get_reg_val(id, kvmppc_get_dawrx1_hv(vcpu));
2351	break;
2352	case KVM_REG_PPC_CIABR:
2353	*val = get_reg_val(id, kvmppc_get_ciabr_hv(vcpu));
2354	break;
2355	case KVM_REG_PPC_CSIGR:
2356	*val = get_reg_val(id, vcpu->arch.csigr);
2357	break;
2358	case KVM_REG_PPC_TACR:
2359	*val = get_reg_val(id, vcpu->arch.tacr);
2360	break;
2361	case KVM_REG_PPC_TCSCR:
2362	*val = get_reg_val(id, vcpu->arch.tcscr);
2363	break;
2364	case KVM_REG_PPC_PID:
2365	*val = get_reg_val(id, kvmppc_get_pid(vcpu));
2366	break;
2367	case KVM_REG_PPC_ACOP:
2368	*val = get_reg_val(id, vcpu->arch.acop);
2369	break;
2370	case KVM_REG_PPC_WORT:
2371	*val = get_reg_val(id, kvmppc_get_wort_hv(vcpu));
2372	break;
2373	case KVM_REG_PPC_TIDR:
2374	*val = get_reg_val(id, vcpu->arch.tid);
2375	break;
2376	case KVM_REG_PPC_PSSCR:
2377	*val = get_reg_val(id, vcpu->arch.psscr);
2378	break;
2379	case KVM_REG_PPC_VPA_ADDR:
2380	spin_lock(lock: &vcpu->arch.vpa_update_lock);
2381	*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
2382	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
2383	break;
2384	case KVM_REG_PPC_VPA_SLB:
2385	spin_lock(lock: &vcpu->arch.vpa_update_lock);
2386	val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
2387	val->vpaval.length = vcpu->arch.slb_shadow.len;
2388	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
2389	break;
2390	case KVM_REG_PPC_VPA_DTL:
2391	spin_lock(lock: &vcpu->arch.vpa_update_lock);
2392	val->vpaval.addr = vcpu->arch.dtl.next_gpa;
2393	val->vpaval.length = vcpu->arch.dtl.len;
2394	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
2395	break;
2396	case KVM_REG_PPC_TB_OFFSET:
2397	*val = get_reg_val(id, kvmppc_get_tb_offset(vcpu));
2398	break;
2399	case KVM_REG_PPC_LPCR:
2400	case KVM_REG_PPC_LPCR_64:
2401	*val = get_reg_val(id, kvmppc_get_lpcr(vcpu));
2402	break;
2403	case KVM_REG_PPC_PPR:
2404	*val = get_reg_val(id, kvmppc_get_ppr_hv(vcpu));
2405	break;
2406	#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2407	case KVM_REG_PPC_TFHAR:
2408	*val = get_reg_val(id, vcpu->arch.tfhar);
2409	break;
2410	case KVM_REG_PPC_TFIAR:
2411	*val = get_reg_val(id, vcpu->arch.tfiar);
2412	break;
2413	case KVM_REG_PPC_TEXASR:
2414	*val = get_reg_val(id, vcpu->arch.texasr);
2415	break;
2416	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
2417	i = id - KVM_REG_PPC_TM_GPR0;
2418	*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
2419	break;
2420	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
2421	{
2422	int j;
2423	i = id - KVM_REG_PPC_TM_VSR0;
2424	if (i < `32`)
2425	for (j = `0`; j < TS_FPRWIDTH; j++)
2426	val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
2427	else {
2428	if (cpu_has_feature(CPU_FTR_ALTIVEC))
2429	val->vval = vcpu->arch.vr_tm.vr[i-`32`];
2430	else
2431	r = -ENXIO;
2432	}
2433	break;
2434	}
2435	case KVM_REG_PPC_TM_CR:
2436	*val = get_reg_val(id, vcpu->arch.cr_tm);
2437	break;
2438	case KVM_REG_PPC_TM_XER:
2439	*val = get_reg_val(id, vcpu->arch.xer_tm);
2440	break;
2441	case KVM_REG_PPC_TM_LR:
2442	*val = get_reg_val(id, vcpu->arch.lr_tm);
2443	break;
2444	case KVM_REG_PPC_TM_CTR:
2445	*val = get_reg_val(id, vcpu->arch.ctr_tm);
2446	break;
2447	case KVM_REG_PPC_TM_FPSCR:
2448	*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
2449	break;
2450	case KVM_REG_PPC_TM_AMR:
2451	*val = get_reg_val(id, vcpu->arch.amr_tm);
2452	break;
2453	case KVM_REG_PPC_TM_PPR:
2454	*val = get_reg_val(id, vcpu->arch.ppr_tm);
2455	break;
2456	case KVM_REG_PPC_TM_VRSAVE:
2457	*val = get_reg_val(id, vcpu->arch.vrsave_tm);
2458	break;
2459	case KVM_REG_PPC_TM_VSCR:
2460	if (cpu_has_feature(CPU_FTR_ALTIVEC))
2461	*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[`3`]);
2462	else
2463	r = -ENXIO;
2464	break;
2465	case KVM_REG_PPC_TM_DSCR:
2466	*val = get_reg_val(id, vcpu->arch.dscr_tm);
2467	break;
2468	case KVM_REG_PPC_TM_TAR:
2469	*val = get_reg_val(id, vcpu->arch.tar_tm);
2470	break;
2471	#endif
2472	case KVM_REG_PPC_ARCH_COMPAT:
2473	*val = get_reg_val(id, kvmppc_get_arch_compat(vcpu));
2474	break;
2475	case KVM_REG_PPC_DEC_EXPIRY:
2476	*val = get_reg_val(id, kvmppc_get_dec_expires(vcpu));
2477	break;
2478	case KVM_REG_PPC_ONLINE:
2479	*val = get_reg_val(id, vcpu->arch.online);
2480	break;
2481	case KVM_REG_PPC_PTCR:
2482	*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
2483	break;
2484	case KVM_REG_PPC_FSCR:
2485	*val = get_reg_val(id, kvmppc_get_fscr_hv(vcpu));
2486	break;
2487	default:
2488	r = -EINVAL;
2489	break;
2490	}
2491
2492	return r;
2493	}
2494
2495	static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
2496	union kvmppc_one_reg *val)
2497	{
2498	int r = `0`;
2499	long int i;
2500	unsigned long addr, len;
2501
2502	switch (id) {
2503	case KVM_REG_PPC_HIOR:
2504	/ Only allow this to be set to zero /
2505	if (set_reg_val(id, *val))
2506	r = -EINVAL;
2507	break;
2508	case KVM_REG_PPC_DABR:
2509	vcpu->arch.dabr = set_reg_val(id, *val);
2510	break;
2511	case KVM_REG_PPC_DABRX:
2512	vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
2513	break;
2514	case KVM_REG_PPC_DSCR:
2515	kvmppc_set_dscr_hv(vcpu, val: set_reg_val(id, *val));
2516	break;
2517	case KVM_REG_PPC_PURR:
2518	kvmppc_set_purr_hv(vcpu, val: set_reg_val(id, *val));
2519	break;
2520	case KVM_REG_PPC_SPURR:
2521	kvmppc_set_spurr_hv(vcpu, val: set_reg_val(id, *val));
2522	break;
2523	case KVM_REG_PPC_AMR:
2524	kvmppc_set_amr_hv(vcpu, val: set_reg_val(id, *val));
2525	break;
2526	case KVM_REG_PPC_UAMOR:
2527	kvmppc_set_uamor_hv(vcpu, val: set_reg_val(id, *val));
2528	break;
2529	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
2530	i = id - KVM_REG_PPC_MMCR0;
2531	kvmppc_set_mmcr_hv(vcpu, i, val: set_reg_val(id, *val));
2532	break;
2533	case KVM_REG_PPC_MMCR2:
2534	kvmppc_set_mmcr_hv(vcpu, i: `2`, val: set_reg_val(id, *val));
2535	break;
2536	case KVM_REG_PPC_MMCRA:
2537	kvmppc_set_mmcra_hv(vcpu, val: set_reg_val(id, *val));
2538	break;
2539	case KVM_REG_PPC_MMCRS:
2540	vcpu->arch.mmcrs = set_reg_val(id, *val);
2541	break;
2542	case KVM_REG_PPC_MMCR3:
2543	*val = get_reg_val(id, vcpu->arch.mmcr[`3`]);
2544	break;
2545	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
2546	i = id - KVM_REG_PPC_PMC1;
2547	kvmppc_set_pmc_hv(vcpu, i, val: set_reg_val(id, *val));
2548	break;
2549	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
2550	i = id - KVM_REG_PPC_SPMC1;
2551	vcpu->arch.spmc[i] = set_reg_val(id, *val);
2552	break;
2553	case KVM_REG_PPC_SIAR:
2554	kvmppc_set_siar_hv(vcpu, val: set_reg_val(id, *val));
2555	break;
2556	case KVM_REG_PPC_SDAR:
2557	kvmppc_set_sdar_hv(vcpu, val: set_reg_val(id, *val));
2558	break;
2559	case KVM_REG_PPC_SIER:
2560	kvmppc_set_sier_hv(vcpu, i: `0`, val: set_reg_val(id, *val));
2561	break;
2562	case KVM_REG_PPC_SIER2:
2563	kvmppc_set_sier_hv(vcpu, i: `1`, val: set_reg_val(id, *val));
2564	break;
2565	case KVM_REG_PPC_SIER3:
2566	kvmppc_set_sier_hv(vcpu, i: `2`, val: set_reg_val(id, *val));
2567	break;
2568	case KVM_REG_PPC_IAMR:
2569	kvmppc_set_iamr_hv(vcpu, val: set_reg_val(id, *val));
2570	break;
2571	case KVM_REG_PPC_PSPB:
2572	kvmppc_set_pspb_hv(vcpu, val: set_reg_val(id, *val));
2573	break;
2574	case KVM_REG_PPC_DPDES:
2575	if (cpu_has_feature(CPU_FTR_ARCH_300))
2576	vcpu->arch.doorbell_request = set_reg_val(id, *val) & `1`;
2577	else
2578	vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
2579	break;
2580	case KVM_REG_PPC_VTB:
2581	kvmppc_set_vtb(vcpu, set_reg_val(id, *val));
2582	break;
2583	case KVM_REG_PPC_DAWR:
2584	kvmppc_set_dawr0_hv(vcpu, val: set_reg_val(id, *val));
2585	break;
2586	case KVM_REG_PPC_DAWRX:
2587	kvmppc_set_dawrx0_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
2588	break;
2589	case KVM_REG_PPC_DAWR1:
2590	kvmppc_set_dawr1_hv(vcpu, val: set_reg_val(id, *val));
2591	break;
2592	case KVM_REG_PPC_DAWRX1:
2593	kvmppc_set_dawrx1_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
2594	break;
2595	case KVM_REG_PPC_CIABR:
2596	kvmppc_set_ciabr_hv(vcpu, val: set_reg_val(id, *val));
2597	/ Don't allow setting breakpoints in hypervisor code /
2598	if ((kvmppc_get_ciabr_hv(vcpu) & CIABR_PRIV) == CIABR_PRIV_HYPER)
2599	kvmppc_set_ciabr_hv(vcpu, kvmppc_get_ciabr_hv(vcpu) & ~CIABR_PRIV);
2600	break;
2601	case KVM_REG_PPC_CSIGR:
2602	vcpu->arch.csigr = set_reg_val(id, *val);
2603	break;
2604	case KVM_REG_PPC_TACR:
2605	vcpu->arch.tacr = set_reg_val(id, *val);
2606	break;
2607	case KVM_REG_PPC_TCSCR:
2608	vcpu->arch.tcscr = set_reg_val(id, *val);
2609	break;
2610	case KVM_REG_PPC_PID:
2611	kvmppc_set_pid(vcpu, set_reg_val(id, *val));
2612	break;
2613	case KVM_REG_PPC_ACOP:
2614	vcpu->arch.acop = set_reg_val(id, *val);
2615	break;
2616	case KVM_REG_PPC_WORT:
2617	kvmppc_set_wort_hv(vcpu, val: set_reg_val(id, *val));
2618	break;
2619	case KVM_REG_PPC_TIDR:
2620	vcpu->arch.tid = set_reg_val(id, *val);
2621	break;
2622	case KVM_REG_PPC_PSSCR:
2623	vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
2624	break;
2625	case KVM_REG_PPC_VPA_ADDR:
2626	addr = set_reg_val(id, *val);
2627	r = -EINVAL;
2628	if (!addr && (vcpu->arch.slb_shadow.next_gpa \|\|
2629	vcpu->arch.dtl.next_gpa))
2630	break;
2631	r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
2632	break;
2633	case KVM_REG_PPC_VPA_SLB:
2634	addr = val->vpaval.addr;
2635	len = val->vpaval.length;
2636	r = -EINVAL;
2637	if (addr && !vcpu->arch.vpa.next_gpa)
2638	break;
2639	r = set_vpa(vcpu, v: &vcpu->arch.slb_shadow, addr, len);
2640	break;
2641	case KVM_REG_PPC_VPA_DTL:
2642	addr = val->vpaval.addr;
2643	len = val->vpaval.length;
2644	r = -EINVAL;
2645	if (addr && (len < sizeof(struct dtl_entry) \|\|
2646	!vcpu->arch.vpa.next_gpa))
2647	break;
2648	len -= len % sizeof(struct dtl_entry);
2649	r = set_vpa(vcpu, v: &vcpu->arch.dtl, addr, len);
2650	break;
2651	case KVM_REG_PPC_TB_OFFSET:
2652	{
2653	/ round up to multiple of 2^24 /
2654	u64 tb_offset = ALIGN(set_reg_val(id, *val), `1UL` << `24`);
2655
2656	/*
2657	* Now that we know the timebase offset, update the
2658	* decrementer expiry with a guest timebase value. If
2659	* the userspace does not set DEC_EXPIRY, this ensures
2660	* a migrated vcpu at least starts with an expired
2661	* decrementer, which is better than a large one that
2662	* causes a hang.
2663	*/
2664	kvmppc_set_tb_offset(vcpu, tb_offset);
2665	if (!kvmppc_get_dec_expires(vcpu) && tb_offset)
2666	kvmppc_set_dec_expires(vcpu, get_tb() + tb_offset);
2667
2668	kvmppc_set_tb_offset(vcpu, tb_offset);
2669	break;
2670	}
2671	case KVM_REG_PPC_LPCR:
2672	kvmppc_set_lpcr(vcpu, new_lpcr: set_reg_val(id, *val), preserve_top32: true);
2673	break;
2674	case KVM_REG_PPC_LPCR_64:
2675	kvmppc_set_lpcr(vcpu, new_lpcr: set_reg_val(id, *val), preserve_top32: false);
2676	break;
2677	case KVM_REG_PPC_PPR:
2678	kvmppc_set_ppr_hv(vcpu, val: set_reg_val(id, *val));
2679	break;
2680	#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2681	case KVM_REG_PPC_TFHAR:
2682	vcpu->arch.tfhar = set_reg_val(id, *val);
2683	break;
2684	case KVM_REG_PPC_TFIAR:
2685	vcpu->arch.tfiar = set_reg_val(id, *val);
2686	break;
2687	case KVM_REG_PPC_TEXASR:
2688	vcpu->arch.texasr = set_reg_val(id, *val);
2689	break;
2690	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
2691	i = id - KVM_REG_PPC_TM_GPR0;
2692	vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
2693	break;
2694	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
2695	{
2696	int j;
2697	i = id - KVM_REG_PPC_TM_VSR0;
2698	if (i < `32`)
2699	for (j = `0`; j < TS_FPRWIDTH; j++)
2700	vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
2701	else
2702	if (cpu_has_feature(CPU_FTR_ALTIVEC))
2703	vcpu->arch.vr_tm.vr[i-`32`] = val->vval;
2704	else
2705	r = -ENXIO;
2706	break;
2707	}
2708	case KVM_REG_PPC_TM_CR:
2709	vcpu->arch.cr_tm = set_reg_val(id, *val);
2710	break;
2711	case KVM_REG_PPC_TM_XER:
2712	vcpu->arch.xer_tm = set_reg_val(id, *val);
2713	break;
2714	case KVM_REG_PPC_TM_LR:
2715	vcpu->arch.lr_tm = set_reg_val(id, *val);
2716	break;
2717	case KVM_REG_PPC_TM_CTR:
2718	vcpu->arch.ctr_tm = set_reg_val(id, *val);
2719	break;
2720	case KVM_REG_PPC_TM_FPSCR:
2721	vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
2722	break;
2723	case KVM_REG_PPC_TM_AMR:
2724	vcpu->arch.amr_tm = set_reg_val(id, *val);
2725	break;
2726	case KVM_REG_PPC_TM_PPR:
2727	vcpu->arch.ppr_tm = set_reg_val(id, *val);
2728	break;
2729	case KVM_REG_PPC_TM_VRSAVE:
2730	vcpu->arch.vrsave_tm = set_reg_val(id, *val);
2731	break;
2732	case KVM_REG_PPC_TM_VSCR:
2733	if (cpu_has_feature(CPU_FTR_ALTIVEC))
2734	vcpu->arch.vr.vscr.u[`3`] = set_reg_val(id, *val);
2735	else
2736	r = - ENXIO;
2737	break;
2738	case KVM_REG_PPC_TM_DSCR:
2739	vcpu->arch.dscr_tm = set_reg_val(id, *val);
2740	break;
2741	case KVM_REG_PPC_TM_TAR:
2742	vcpu->arch.tar_tm = set_reg_val(id, *val);
2743	break;
2744	#endif
2745	case KVM_REG_PPC_ARCH_COMPAT:
2746	r = kvmppc_set_arch_compat(vcpu, arch_compat: set_reg_val(id, *val));
2747	break;
2748	case KVM_REG_PPC_DEC_EXPIRY:
2749	kvmppc_set_dec_expires(vcpu, set_reg_val(id, *val));
2750	break;
2751	case KVM_REG_PPC_ONLINE:
2752	i = set_reg_val(id, *val);
2753	if (i && !vcpu->arch.online)
2754	atomic_inc(v: &vcpu->arch.vcore->online_count);
2755	else if (!i && vcpu->arch.online)
2756	atomic_dec(v: &vcpu->arch.vcore->online_count);
2757	vcpu->arch.online = i;
2758	break;
2759	case KVM_REG_PPC_PTCR:
2760	vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
2761	break;
2762	case KVM_REG_PPC_FSCR:
2763	kvmppc_set_fscr_hv(vcpu, val: set_reg_val(id, *val));
2764	break;
2765	default:
2766	r = -EINVAL;
2767	break;
2768	}
2769
2770	return r;
2771	}
2772
2773	/*
2774	* On POWER9, threads are independent and can be in different partitions.
2775	* Therefore we consider each thread to be a subcore.
2776	* There is a restriction that all threads have to be in the same
2777	* MMU mode (radix or HPT), unfortunately, but since we only support
2778	* HPT guests on a HPT host so far, that isn't an impediment yet.
2779	*/
2780	static int threads_per_vcore(struct kvm *kvm)
2781	{
2782	if (cpu_has_feature(CPU_FTR_ARCH_300))
2783	return `1`;
2784	return threads_per_subcore;
2785	}
2786
2787	static struct kvmppc_vcore kvmppc_vcore_create(struct* kvm kvm, int* id)
2788	{
2789	struct kvmppc_vcore *vcore;
2790
2791	vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
2792
2793	if (vcore == NULL)
2794	return NULL;
2795
2796	spin_lock_init(&vcore->lock);
2797	spin_lock_init(&vcore->stoltb_lock);
2798	rcuwait_init(w: &vcore->wait);
2799	vcore->preempt_tb = TB_NIL;
2800	vcore->lpcr = kvm->arch.lpcr;
2801	vcore->first_vcpuid = id;
2802	vcore->kvm = kvm;
2803	INIT_LIST_HEAD(list: &vcore->preempt_list);
2804
2805	return vcore;
2806	}
2807
2808	#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
2809	static struct debugfs_timings_element {
2810	const char *name;
2811	size_t offset;
2812	} timings[] = {
2813	#ifdef CONFIG_KVM_BOOK3S_HV_P9_TIMING
2814	{"vcpu_entry", offsetof(struct kvm_vcpu, arch.vcpu_entry)},
2815	{"guest_entry", offsetof(struct kvm_vcpu, arch.guest_entry)},
2816	{"in_guest", offsetof(struct kvm_vcpu, arch.in_guest)},
2817	{"guest_exit", offsetof(struct kvm_vcpu, arch.guest_exit)},
2818	{"vcpu_exit", offsetof(struct kvm_vcpu, arch.vcpu_exit)},
2819	{"hypercall", offsetof(struct kvm_vcpu, arch.hcall)},
2820	{"page_fault", offsetof(struct kvm_vcpu, arch.pg_fault)},
2821	#else
2822	{"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)},
2823	{"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)},
2824	{"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)},
2825	{"guest", offsetof(struct kvm_vcpu, arch.guest_time)},
2826	{"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
2827	#endif
2828	};
2829
2830	#define N_TIMINGS (ARRAY_SIZE(timings))
2831
2832	struct debugfs_timings_state {
2833	struct kvm_vcpu *vcpu;
2834	unsigned int buflen;
2835	char buf[N_TIMINGS * `100`];
2836	};
2837
2838	static int debugfs_timings_open(struct inode inode, struct* file *file)
2839	{
2840	struct kvm_vcpu *vcpu = inode->i_private;
2841	struct debugfs_timings_state *p;
2842
2843	p = kzalloc(sizeof(*p), GFP_KERNEL);
2844	if (!p)
2845	return -ENOMEM;
2846
2847	kvm_get_kvm(vcpu->kvm);
2848	p->vcpu = vcpu;
2849	file->private_data = p;
2850
2851	return nonseekable_open(inode, file);
2852	}
2853
2854	static int debugfs_timings_release(struct inode inode, struct* file *file)
2855	{
2856	struct debugfs_timings_state *p = file->private_data;
2857
2858	kvm_put_kvm(p->vcpu->kvm);
2859	kfree(p);
2860	return `0`;
2861	}
2862
2863	static ssize_t debugfs_timings_read(struct file file, char* __user *buf,
2864	size_t len, loff_t *ppos)
2865	{
2866	struct debugfs_timings_state *p = file->private_data;
2867	struct kvm_vcpu *vcpu = p->vcpu;
2868	char s, buf_end;
2869	struct kvmhv_tb_accumulator tb;
2870	u64 count;
2871	loff_t pos;
2872	ssize_t n;
2873	int i, loops;
2874	bool ok;
2875
2876	if (!p->buflen) {
2877	s = p->buf;
2878	buf_end = s + sizeof(p->buf);
2879	for (i = `0`; i < N_TIMINGS; ++i) {
2880	struct kvmhv_tb_accumulator *acc;
2881
2882	acc = (struct kvmhv_tb_accumulator *)
2883	((unsigned long)vcpu + timings[i].offset);
2884	ok = false;
2885	for (loops = `0`; loops < `1000`; ++loops) {
2886	count = acc->seqcount;
2887	if (!(count & `1`)) {
2888	smp_rmb();
2889	tb = *acc;
2890	smp_rmb();
2891	if (count == acc->seqcount) {
2892	ok = true;
2893	break;
2894	}
2895	}
2896	udelay(`1`);
2897	}
2898	if (!ok)
2899	snprintf(s, buf_end - s, "%s: stuck\n",
2900	timings[i].name);
2901	else
2902	snprintf(s, buf_end - s,
2903	"%s: %llu %llu %llu %llu\n",
2904	timings[i].name, count / `2`,
2905	tb_to_ns(tb.tb_total),
2906	tb_to_ns(tb.tb_min),
2907	tb_to_ns(tb.tb_max));
2908	s += strlen(s);
2909	}
2910	p->buflen = s - p->buf;
2911	}
2912
2913	pos = *ppos;
2914	if (pos >= p->buflen)
2915	return `0`;
2916	if (len > p->buflen - pos)
2917	len = p->buflen - pos;
2918	n = copy_to_user(buf, p->buf + pos, len);
2919	if (n) {
2920	if (n == len)
2921	return -EFAULT;
2922	len -= n;
2923	}
2924	*ppos = pos + len;
2925	return len;
2926	}
2927
2928	static ssize_t debugfs_timings_write(struct file file, const* char __user *buf,
2929	size_t len, loff_t *ppos)
2930	{
2931	return -EACCES;
2932	}
2933
2934	static const struct file_operations debugfs_timings_ops = {
2935	.owner = THIS_MODULE,
2936	.open = debugfs_timings_open,
2937	.release = debugfs_timings_release,
2938	.read = debugfs_timings_read,
2939	.write = debugfs_timings_write,
2940	.llseek = generic_file_llseek,
2941	};
2942
2943	/ Create a debugfs directory for the vcpu /
2944	static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu vcpu, struct* dentry *debugfs_dentry)
2945	{
2946	if (cpu_has_feature(CPU_FTR_ARCH_300) == IS_ENABLED(CONFIG_KVM_BOOK3S_HV_P9_TIMING))
2947	debugfs_create_file("timings", `0444`, debugfs_dentry, vcpu,
2948	&debugfs_timings_ops);
2949	return `0`;
2950	}
2951
2952	#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
/* Exit timing is not configured: nothing to create, always returns 0. */
static int kvmppc_arch_create_vcpu_debugfs_hv(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
{
	return 0;
}
2957	#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
2958
2959	static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
2960	{
2961	int err;
2962	int core;
2963	struct kvmppc_vcore *vcore;
2964	struct kvm *kvm;
2965	unsigned int id;
2966
2967	kvm = vcpu->kvm;
2968	id = vcpu->vcpu_id;
2969
2970	vcpu->arch.shared = &vcpu->arch.shregs;
2971	#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
2972	/*
2973	* The shared struct is never shared on HV,
2974	* so we can always use host endianness
2975	*/
2976	#ifdef __BIG_ENDIAN__
2977	vcpu->arch.shared_big_endian = true;
2978	#else
2979	vcpu->arch.shared_big_endian = false;
2980	#endif
2981	#endif
2982
2983	if (kvmhv_is_nestedv2()) {
2984	err = kvmhv_nestedv2_vcpu_create(vcpu, &vcpu->arch.nestedv2_io);
2985	if (err < `0`)
2986	return err;
2987	}
2988
2989	kvmppc_set_mmcr_hv(vcpu, `0`, MMCR0_FC);
2990	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
2991	kvmppc_set_mmcr_hv(vcpu, `0`, kvmppc_get_mmcr_hv(vcpu, `0`) \| MMCR0_PMCCEXT);
2992	kvmppc_set_mmcra_hv(vcpu, MMCRA_BHRB_DISABLE);
2993	}
2994
2995	kvmppc_set_ctrl_hv(vcpu, CTRL_RUNLATCH);
2996	/ default to host PVR, since we can't spoof it /
2997	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
2998	spin_lock_init(&vcpu->arch.vpa_update_lock);
2999	spin_lock_init(&vcpu->arch.tbacct_lock);
3000	vcpu->arch.busy_preempt = TB_NIL;
3001	__kvmppc_set_msr_hv(vcpu, MSR_ME);
3002	vcpu->arch.intr_msr = MSR_SF \| MSR_ME;
3003
3004	/*
3005	* Set the default HFSCR for the guest from the host value.
3006	* This value is only used on POWER9 and later.
3007	* On >= POWER9, we want to virtualize the doorbell facility, so we
3008	* don't set the HFSCR_MSGP bit, and that causes those instructions
3009	* to trap and then we emulate them.
3010	*/
3011	kvmppc_set_hfscr_hv(vcpu, HFSCR_TAR \| HFSCR_EBB \| HFSCR_PM \| HFSCR_BHRB \|
3012	HFSCR_DSCR \| HFSCR_VECVSX \| HFSCR_FP);
3013
3014	/ On POWER10 and later, allow prefixed instructions /
3015	if (cpu_has_feature(CPU_FTR_ARCH_31))
3016	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) \| HFSCR_PREFIX);
3017
3018	if (cpu_has_feature(CPU_FTR_HVMODE)) {
3019	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & mfspr(SPRN_HFSCR));
3020
3021	#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
3022	if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3023	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) \| HFSCR_TM);
3024	#endif
3025	}
3026	if (cpu_has_feature(CPU_FTR_TM_COMP))
3027	vcpu->arch.hfscr \|= HFSCR_TM;
3028
3029	vcpu->arch.hfscr_permitted = kvmppc_get_hfscr_hv(vcpu);
3030
3031	/*
3032	* PM, EBB, TM are demand-faulted so start with it clear.
3033	*/
3034	kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & ~(HFSCR_PM \| HFSCR_EBB \| HFSCR_TM));
3035
3036	kvmppc_mmu_book3s_hv_init(vcpu);
3037
3038	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
3039
3040	init_waitqueue_head(&vcpu->arch.cpu_run);
3041
3042	mutex_lock(&kvm->lock);
3043	vcore = NULL;
3044	err = -EINVAL;
3045	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
3046	if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) {
3047	pr_devel("KVM: VCPU ID too high\n");
3048	core = KVM_MAX_VCORES;
3049	} else {
3050	BUG_ON(kvm->arch.smt_mode != `1`);
3051	core = kvmppc_pack_vcpu_id(kvm, id);
3052	}
3053	} else {
3054	core = id / kvm->arch.smt_mode;
3055	}
3056	if (core < KVM_MAX_VCORES) {
3057	vcore = kvm->arch.vcores[core];
3058	if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) {
3059	pr_devel("KVM: collision on id %u", id);
3060	vcore = NULL;
3061	} else if (!vcore) {
3062	/*
3063	* Take mmu_setup_lock for mutual exclusion
3064	* with kvmppc_update_lpcr().
3065	*/
3066	err = -ENOMEM;
3067	vcore = kvmppc_vcore_create(kvm,
3068	id: id & ~(kvm->arch.smt_mode - `1`));
3069	mutex_lock(&kvm->arch.mmu_setup_lock);
3070	kvm->arch.vcores[core] = vcore;
3071	kvm->arch.online_vcores++;
3072	mutex_unlock(lock: &kvm->arch.mmu_setup_lock);
3073	}
3074	}
3075	mutex_unlock(lock: &kvm->lock);
3076
3077	if (!vcore)
3078	return err;
3079
3080	spin_lock(lock: &vcore->lock);
3081	++vcore->num_threads;
3082	spin_unlock(lock: &vcore->lock);
3083	vcpu->arch.vcore = vcore;
3084	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
3085	vcpu->arch.thread_cpu = -`1`;
3086	vcpu->arch.prev_cpu = -`1`;
3087
3088	vcpu->arch.cpu_type = KVM_CPU_3S_64;
3089	kvmppc_sanity_check(vcpu);
3090
3091	return `0`;
3092	}
3093
3094	static int kvmhv_set_smt_mode(struct kvm kvm, unsigned* long smt_mode,
3095	unsigned long flags)
3096	{
3097	int err;
3098	int esmt = `0`;
3099
3100	if (flags)
3101	return -EINVAL;
3102	if (smt_mode > MAX_SMT_THREADS \|\| !is_power_of_2(smt_mode))
3103	return -EINVAL;
3104	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
3105	/*
3106	* On POWER8 (or POWER7), the threading mode is "strict",
3107	* so we pack smt_mode vcpus per vcore.
3108	*/
3109	if (smt_mode > threads_per_subcore)
3110	return -EINVAL;
3111	} else {
3112	/*
3113	* On POWER9, the threading mode is "loose",
3114	* so each vcpu gets its own vcore.
3115	*/
3116	esmt = smt_mode;
3117	smt_mode = `1`;
3118	}
3119	mutex_lock(&kvm->lock);
3120	err = -EBUSY;
3121	if (!kvm->arch.online_vcores) {
3122	kvm->arch.smt_mode = smt_mode;
3123	kvm->arch.emul_smt_mode = esmt;
3124	err = `0`;
3125	}
3126	mutex_unlock(lock: &kvm->lock);
3127
3128	return err;
3129	}
3130
3131	static void unpin_vpa(struct kvm kvm, struct* kvmppc_vpa *vpa)
3132	{
3133	if (vpa->pinned_addr)
3134	kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
3135	vpa->dirty);
3136	}
3137
3138	static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
3139	{
3140	spin_lock(lock: &vcpu->arch.vpa_update_lock);
3141	unpin_vpa(kvm: vcpu->kvm, vpa: &vcpu->arch.dtl);
3142	unpin_vpa(kvm: vcpu->kvm, vpa: &vcpu->arch.slb_shadow);
3143	unpin_vpa(kvm: vcpu->kvm, vpa: &vcpu->arch.vpa);
3144	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
3145	if (kvmhv_is_nestedv2())
3146	kvmhv_nestedv2_vcpu_free(vcpu, &vcpu->arch.nestedv2_io);
3147	}
3148
/*
 * Request-check hook for HV KVM.  @vcpu is not examined; the function
 * always returns 1.
 */
static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
{
	/* Indicate we want to get back into the guest */
	return 1;
}
3154
3155	static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
3156	{
3157	unsigned long dec_nsec, now;
3158
3159	now = get_tb();
3160	if (now > kvmppc_dec_expires_host_tb(vcpu)) {
3161	/ decrementer has already gone negative /
3162	kvmppc_core_queue_dec(vcpu);
3163	kvmppc_core_prepare_to_enter(vcpu);
3164	return;
3165	}
3166	dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
3167	hrtimer_start(timer: &vcpu->arch.dec_timer, tim: dec_nsec, mode: HRTIMER_MODE_REL);
3168	vcpu->arch.timer_running = `1`;
3169	}
3170
/*
 * Guest entry routine defined outside this part of the file.
 * kvmppc_run_core() calls it and treats the return value as the trap
 * number passed to set_irq_happened().
 */
extern int __kvmppc_vcore_entry(void);
3172
3173	static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
3174	struct kvm_vcpu *vcpu, u64 tb)
3175	{
3176	u64 now;
3177
3178	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
3179	return;
3180	spin_lock_irq(lock: &vcpu->arch.tbacct_lock);
3181	now = tb;
3182	vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
3183	vcpu->arch.stolen_logged;
3184	vcpu->arch.busy_preempt = now;
3185	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
3186	spin_unlock_irq(lock: &vcpu->arch.tbacct_lock);
3187	--vc->n_runnable;
3188	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
3189	}
3190
3191	static int kvmppc_grab_hwthread(int cpu)
3192	{
3193	struct paca_struct *tpaca;
3194	long timeout = `10000`;
3195
3196	tpaca = paca_ptrs[cpu];
3197
3198	/ Ensure the thread won't go into the kernel if it wakes /
3199	tpaca->kvm_hstate.kvm_vcpu = NULL;
3200	tpaca->kvm_hstate.kvm_vcore = NULL;
3201	tpaca->kvm_hstate.napping = `0`;
3202	smp_wmb();
3203	tpaca->kvm_hstate.hwthread_req = `1`;
3204
3205	/*
3206	* If the thread is already executing in the kernel (e.g. handling
3207	* a stray interrupt), wait for it to get back to nap mode.
3208	* The smp_mb() is to ensure that our setting of hwthread_req
3209	* is visible before we look at hwthread_state, so if this
3210	* races with the code at system_reset_pSeries and the thread
3211	* misses our setting of hwthread_req, we are sure to see its
3212	* setting of hwthread_state, and vice versa.
3213	*/
3214	smp_mb();
3215	while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
3216	if (--timeout <= `0`) {
3217	pr_err("KVM: couldn't grab cpu %d\n", cpu);
3218	return -EBUSY;
3219	}
3220	udelay(`1`);
3221	}
3222	return `0`;
3223	}
3224
3225	static void kvmppc_release_hwthread(int cpu)
3226	{
3227	struct paca_struct *tpaca;
3228
3229	tpaca = paca_ptrs[cpu];
3230	tpaca->kvm_hstate.hwthread_req = `0`;
3231	tpaca->kvm_hstate.kvm_vcpu = NULL;
3232	tpaca->kvm_hstate.kvm_vcore = NULL;
3233	tpaca->kvm_hstate.kvm_split_mode = NULL;
3234	}
3235
/*
 * Per-CPU struct kvm pointer.  radix_flush_cpu() compares it against a VM
 * to decide whether a CPU needs to be interrupted after its
 * need_tlb_flush bit has been set.
 */
static DEFINE_PER_CPU(struct kvm *, cpu_in_guest);
3237
3238	static void radix_flush_cpu(struct kvm kvm, int* cpu, struct kvm_vcpu *vcpu)
3239	{
3240	struct kvm_nested_guest *nested = vcpu->arch.nested;
3241	cpumask_t *need_tlb_flush;
3242	int i;
3243
3244	if (nested)
3245	need_tlb_flush = &nested->need_tlb_flush;
3246	else
3247	need_tlb_flush = &kvm->arch.need_tlb_flush;
3248
3249	cpu = cpu_first_tlb_thread_sibling(cpu);
3250	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
3251	i += cpu_tlb_thread_sibling_step())
3252	cpumask_set_cpu(cpu: i, dstp: need_tlb_flush);
3253
3254	/*
3255	* Make sure setting of bit in need_tlb_flush precedes testing of
3256	* cpu_in_guest. The matching barrier on the other side is hwsync
3257	* when switching to guest MMU mode, which happens between
3258	* cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
3259	* being tested.
3260	*/
3261	smp_mb();
3262
3263	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
3264	i += cpu_tlb_thread_sibling_step()) {
3265	struct kvm running = per_cpu_ptr(&cpu_in_guest, i);
3266
3267	if (running == kvm)
3268	smp_call_function_single(cpuid: i, func: do_nothing, NULL, wait: `1`);
3269	}
3270	}
3271
3272	static void do_migrate_away_vcpu(void *arg)
3273	{
3274	struct kvm_vcpu *vcpu = arg;
3275	struct kvm *kvm = vcpu->kvm;
3276
3277	/*
3278	* If the guest has GTSE, it may execute tlbie, so do a eieio; tlbsync;
3279	* ptesync sequence on the old CPU before migrating to a new one, in
3280	* case we interrupted the guest between a tlbie ; eieio ;
3281	* tlbsync; ptesync sequence.
3282	*
3283	* Otherwise, ptesync is sufficient for ordering tlbiel sequences.
3284	*/
3285	if (kvm->arch.lpcr & LPCR_GTSE)
3286	asm volatile("eieio; tlbsync; ptesync");
3287	else
3288	asm volatile("ptesync");
3289	}
3290
3291	static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu vcpu, int* pcpu)
3292	{
3293	struct kvm_nested_guest *nested = vcpu->arch.nested;
3294	struct kvm *kvm = vcpu->kvm;
3295	int prev_cpu;
3296
3297	if (!cpu_has_feature(CPU_FTR_HVMODE))
3298	return;
3299
3300	if (nested)
3301	prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
3302	else
3303	prev_cpu = vcpu->arch.prev_cpu;
3304
3305	/*
3306	* With radix, the guest can do TLB invalidations itself,
3307	* and it could choose to use the local form (tlbiel) if
3308	* it is invalidating a translation that has only ever been
3309	* used on one vcpu. However, that doesn't mean it has
3310	* only ever been used on one physical cpu, since vcpus
3311	* can move around between pcpus. To cope with this, when
3312	* a vcpu moves from one pcpu to another, we need to tell
3313	* any vcpus running on the same core as this vcpu previously
3314	* ran to flush the TLB.
3315	*/
3316	if (prev_cpu != pcpu) {
3317	if (prev_cpu >= `0`) {
3318	if (cpu_first_tlb_thread_sibling(prev_cpu) !=
3319	cpu_first_tlb_thread_sibling(pcpu))
3320	radix_flush_cpu(kvm, cpu: prev_cpu, vcpu);
3321
3322	smp_call_function_single(cpuid: prev_cpu,
3323	func: do_migrate_away_vcpu, info: vcpu, wait: `1`);
3324	}
3325	if (nested)
3326	nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
3327	else
3328	vcpu->arch.prev_cpu = pcpu;
3329	}
3330	}
3331
3332	static void kvmppc_start_thread(struct kvm_vcpu vcpu, struct* kvmppc_vcore *vc)
3333	{
3334	int cpu;
3335	struct paca_struct *tpaca;
3336
3337	cpu = vc->pcpu;
3338	if (vcpu) {
3339	if (vcpu->arch.timer_running) {
3340	hrtimer_try_to_cancel(timer: &vcpu->arch.dec_timer);
3341	vcpu->arch.timer_running = `0`;
3342	}
3343	cpu += vcpu->arch.ptid;
3344	vcpu->cpu = vc->pcpu;
3345	vcpu->arch.thread_cpu = cpu;
3346	}
3347	tpaca = paca_ptrs[cpu];
3348	tpaca->kvm_hstate.kvm_vcpu = vcpu;
3349	tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
3350	tpaca->kvm_hstate.fake_suspend = `0`;
3351	/ Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore /
3352	smp_wmb();
3353	tpaca->kvm_hstate.kvm_vcore = vc;
3354	if (cpu != smp_processor_id())
3355	kvmppc_ipi_thread(cpu);
3356	}
3357
3358	static void kvmppc_wait_for_nap(int n_threads)
3359	{
3360	int cpu = smp_processor_id();
3361	int i, loops;
3362
3363	if (n_threads <= `1`)
3364	return;
3365	for (loops = `0`; loops < `1000000`; ++loops) {
3366	/*
3367	* Check if all threads are finished.
3368	* We set the vcore pointer when starting a thread
3369	* and the thread clears it when finished, so we look
3370	* for any threads that still have a non-NULL vcore ptr.
3371	*/
3372	for (i = `1`; i < n_threads; ++i)
3373	if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
3374	break;
3375	if (i == n_threads) {
3376	HMT_medium();
3377	return;
3378	}
3379	HMT_low();
3380	}
3381	HMT_medium();
3382	for (i = `1`; i < n_threads; ++i)
3383	if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
3384	pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
3385	}
3386
3387	/*
3388	* Check that we are on thread 0 and that any other threads in
3389	* this core are off-line. Then grab the threads so they can't
3390	* enter the kernel.
3391	*/
3392	static int on_primary_thread(void)
3393	{
3394	int cpu = smp_processor_id();
3395	int thr;
3396
3397	/ Are we on a primary subcore? /
3398	if (cpu_thread_in_subcore(cpu))
3399	return `0`;
3400
3401	thr = `0`;
3402	while (++thr < threads_per_subcore)
3403	if (cpu_online(cpu: cpu + thr))
3404	return `0`;
3405
3406	/ Grab all hw threads so they can't go into the kernel /
3407	for (thr = `1`; thr < threads_per_subcore; ++thr) {
3408	if (kvmppc_grab_hwthread(cpu + thr)) {
3409	/ Couldn't grab one; let the others go /
3410	do {
3411	kvmppc_release_hwthread(cpu + thr);
3412	} while (--thr > `0`);
3413	return `0`;
3414	}
3415	}
3416	return `1`;
3417	}
3418
/*
 * A list of virtual cores for each physical CPU.
 * These are vcores that could run but their runner VCPU tasks are
 * (or may be) preempted.
 */
struct preempted_vcore_list {
	struct list_head	list;	/* vcores, linked via their preempt_list */
	spinlock_t		lock;	/* taken around additions to and removals from list */
};
3428
/* One preempted-vcore list per CPU; set up by init_vcore_lists(). */
static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
3430
3431	static void init_vcore_lists(void)
3432	{
3433	int cpu;
3434
3435	for_each_possible_cpu(cpu) {
3436	struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
3437	spin_lock_init(&lp->lock);
3438	INIT_LIST_HEAD(list: &lp->list);
3439	}
3440	}
3441
3442	static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
3443	{
3444	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
3445
3446	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
3447
3448	vc->vcore_state = VCORE_PREEMPT;
3449	vc->pcpu = smp_processor_id();
3450	if (vc->num_threads < threads_per_vcore(kvm: vc->kvm)) {
3451	spin_lock(lock: &lp->lock);
3452	list_add_tail(new: &vc->preempt_list, head: &lp->list);
3453	spin_unlock(lock: &lp->lock);
3454	}
3455
3456	/ Start accumulating stolen time /
3457	kvmppc_core_start_stolen(vc, tb: mftb());
3458	}
3459
3460	static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
3461	{
3462	struct preempted_vcore_list *lp;
3463
3464	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
3465
3466	kvmppc_core_end_stolen(vc, tb: mftb());
3467	if (!list_empty(head: &vc->preempt_list)) {
3468	lp = &per_cpu(preempted_vcores, vc->pcpu);
3469	spin_lock(lock: &lp->lock);
3470	list_del_init(entry: &vc->preempt_list);
3471	spin_unlock(lock: &lp->lock);
3472	}
3473	vc->vcore_state = VCORE_INACTIVE;
3474	}
3475
/*
 * This stores information about the virtual cores currently
 * assigned to a physical core.
 */
struct core_info {
	int		n_subcores;		/* number of entries used in vc[] */
	int		max_subcore_threads;	/* largest num_threads among the vcores */
	int		total_threads;		/* sum of num_threads over the vcores */
	int		subcore_threads[MAX_SUBCORES];	/* num_threads of each vcore */
	struct kvmppc_vcore *vc[MAX_SUBCORES];	/* vc[0] is the vcore passed to init_core_info() */
};
3487
3488	/*
3489	* This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
3490	* respectively in 2-way micro-threading (split-core) mode on POWER8.
3491	*/
3492	static int subcore_thread_map[MAX_SUBCORES] = { `0`, `4`, `2`, `6` };
3493
3494	static void init_core_info(struct core_info cip, struct* kvmppc_vcore *vc)
3495	{
3496	memset(cip, `0`, sizeof(*cip));
3497	cip->n_subcores = `1`;
3498	cip->max_subcore_threads = vc->num_threads;
3499	cip->total_threads = vc->num_threads;
3500	cip->subcore_threads[`0`] = vc->num_threads;
3501	cip->vc[`0`] = vc;
3502	}
3503
3504	static bool subcore_config_ok(int n_subcores, int n_threads)
3505	{
3506	/*
3507	* POWER9 "SMT4" cores are permanently in what is effectively a 4-way
3508	* split-core mode, with one thread per subcore.
3509	*/
3510	if (cpu_has_feature(CPU_FTR_ARCH_300))
3511	return n_subcores <= `4` && n_threads == `1`;
3512
3513	/ On POWER8, can only dynamically split if unsplit to begin with /
3514	if (n_subcores > `1` && threads_per_subcore < MAX_SMT_THREADS)
3515	return false;
3516	if (n_subcores > MAX_SUBCORES)
3517	return false;
3518	if (n_subcores > `1`) {
3519	if (!(dynamic_mt_modes & `2`))
3520	n_subcores = `4`;
3521	if (n_subcores > `2` && !(dynamic_mt_modes & `4`))
3522	return false;
3523	}
3524
3525	return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
3526	}
3527
3528	static void init_vcore_to_run(struct kvmppc_vcore *vc)
3529	{
3530	vc->entry_exit_map = `0`;
3531	vc->in_guest = `0`;
3532	vc->napping_threads = `0`;
3533	vc->conferring_threads = `0`;
3534	vc->tb_offset_applied = `0`;
3535	}
3536
3537	static bool can_dynamic_split(struct kvmppc_vcore vc, struct* core_info *cip)
3538	{
3539	int n_threads = vc->num_threads;
3540	int sub;
3541
3542	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
3543	return false;
3544
3545	/ In one_vm_per_core mode, require all vcores to be from the same vm /
3546	if (one_vm_per_core && vc->kvm != cip->vc[`0`]->kvm)
3547	return false;
3548
3549	if (n_threads < cip->max_subcore_threads)
3550	n_threads = cip->max_subcore_threads;
3551	if (!subcore_config_ok(n_subcores: cip->n_subcores + `1`, n_threads))
3552	return false;
3553	cip->max_subcore_threads = n_threads;
3554
3555	sub = cip->n_subcores;
3556	++cip->n_subcores;
3557	cip->total_threads += vc->num_threads;
3558	cip->subcore_threads[sub] = vc->num_threads;
3559	cip->vc[sub] = vc;
3560	init_vcore_to_run(vc);
3561	list_del_init(entry: &vc->preempt_list);
3562
3563	return true;
3564	}
3565
3566	/*
3567	* Work out whether it is possible to piggyback the execution of
3568	* vcore pvc onto the execution of the other vcores described in cip.
3569	*/
3570	static bool can_piggyback(struct kvmppc_vcore pvc, struct* core_info *cip,
3571	int target_threads)
3572	{
3573	if (cip->total_threads + pvc->num_threads > target_threads)
3574	return false;
3575
3576	return can_dynamic_split(vc: pvc, cip);
3577	}
3578
3579	static void prepare_threads(struct kvmppc_vcore *vc)
3580	{
3581	int i;
3582	struct kvm_vcpu *vcpu;
3583
3584	for_each_runnable_thread(i, vcpu, vc) {
3585	if (signal_pending(p: vcpu->arch.run_task))
3586	vcpu->arch.ret = -EINTR;
3587	else if (vcpu->arch.vpa.update_pending \|\|
3588	vcpu->arch.slb_shadow.update_pending \|\|
3589	vcpu->arch.dtl.update_pending)
3590	vcpu->arch.ret = RESUME_GUEST;
3591	else
3592	continue;
3593	kvmppc_remove_runnable(vc, vcpu, tb: mftb());
3594	wake_up(&vcpu->arch.cpu_run);
3595	}
3596	}
3597
3598	static void collect_piggybacks(struct core_info cip, int* target_threads)
3599	{
3600	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
3601	struct kvmppc_vcore pvc, vcnext;
3602
3603	spin_lock(lock: &lp->lock);
3604	list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
3605	if (!spin_trylock(lock: &pvc->lock))
3606	continue;
3607	prepare_threads(vc: pvc);
3608	if (!pvc->n_runnable \|\| !pvc->kvm->arch.mmu_ready) {
3609	list_del_init(entry: &pvc->preempt_list);
3610	if (pvc->runner == NULL) {
3611	pvc->vcore_state = VCORE_INACTIVE;
3612	kvmppc_core_end_stolen(vc: pvc, tb: mftb());
3613	}
3614	spin_unlock(lock: &pvc->lock);
3615	continue;
3616	}
3617	if (!can_piggyback(pvc, cip, target_threads)) {
3618	spin_unlock(lock: &pvc->lock);
3619	continue;
3620	}
3621	kvmppc_core_end_stolen(vc: pvc, tb: mftb());
3622	pvc->vcore_state = VCORE_PIGGYBACK;
3623	if (cip->total_threads >= target_threads)
3624	break;
3625	}
3626	spin_unlock(lock: &lp->lock);
3627	}
3628
3629	static bool recheck_signals_and_mmu(struct core_info *cip)
3630	{
3631	int sub, i;
3632	struct kvm_vcpu *vcpu;
3633	struct kvmppc_vcore *vc;
3634
3635	for (sub = `0`; sub < cip->n_subcores; ++sub) {
3636	vc = cip->vc[sub];
3637	if (!vc->kvm->arch.mmu_ready)
3638	return true;
3639	for_each_runnable_thread(i, vcpu, vc)
3640	if (signal_pending(p: vcpu->arch.run_task))
3641	return true;
3642	}
3643	return false;
3644	}
3645
3646	static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
3647	{
3648	int still_running = `0`, i;
3649	u64 now;
3650	long ret;
3651	struct kvm_vcpu *vcpu;
3652
3653	spin_lock(lock: &vc->lock);
3654	now = get_tb();
3655	for_each_runnable_thread(i, vcpu, vc) {
3656	/*
3657	* It's safe to unlock the vcore in the loop here, because
3658	* for_each_runnable_thread() is safe against removal of
3659	* the vcpu, and the vcore state is VCORE_EXITING here,
3660	* so any vcpus becoming runnable will have their arch.trap
3661	* set to zero and can't actually run in the guest.
3662	*/
3663	spin_unlock(lock: &vc->lock);
3664	/ cancel pending dec exception if dec is positive /
3665	if (now < kvmppc_dec_expires_host_tb(vcpu) &&
3666	kvmppc_core_pending_dec(vcpu))
3667	kvmppc_core_dequeue_dec(vcpu);
3668
3669	trace_kvm_guest_exit(vcpu);
3670
3671	ret = RESUME_GUEST;
3672	if (vcpu->arch.trap)
3673	ret = kvmppc_handle_exit_hv(vcpu,
3674	tsk: vcpu->arch.run_task);
3675
3676	vcpu->arch.ret = ret;
3677	vcpu->arch.trap = `0`;
3678
3679	spin_lock(lock: &vc->lock);
3680	if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
3681	if (vcpu->arch.pending_exceptions)
3682	kvmppc_core_prepare_to_enter(vcpu);
3683	if (vcpu->arch.ceded)
3684	kvmppc_set_timer(vcpu);
3685	else
3686	++still_running;
3687	} else {
3688	kvmppc_remove_runnable(vc, vcpu, tb: mftb());
3689	wake_up(&vcpu->arch.cpu_run);
3690	}
3691	}
3692	if (!is_master) {
3693	if (still_running > `0`) {
3694	kvmppc_vcore_preempt(vc);
3695	} else if (vc->runner) {
3696	vc->vcore_state = VCORE_PREEMPT;
3697	kvmppc_core_start_stolen(vc, tb: mftb());
3698	} else {
3699	vc->vcore_state = VCORE_INACTIVE;
3700	}
3701	if (vc->n_runnable > `0` && vc->runner == NULL) {
3702	/ make sure there's a candidate runner awake /
3703	i = -`1`;
3704	vcpu = next_runnable_thread(vc, ip: &i);
3705	wake_up(&vcpu->arch.cpu_run);
3706	}
3707	}
3708	spin_unlock(lock: &vc->lock);
3709	}
3710
3711	/*
3712	* Clear core from the list of active host cores as we are about to
3713	* enter the guest. Only do this if it is the primary thread of the
3714	* core (not if a subcore) that is entering the guest.
3715	*/
3716	static inline int kvmppc_clear_host_core(unsigned int cpu)
3717	{
3718	int core;
3719
3720	if (!kvmppc_host_rm_ops_hv \|\| cpu_thread_in_core(cpu))
3721	return `0`;
3722	/*
3723	* Memory barrier can be omitted here as we will do a smp_wmb()
3724	* later in kvmppc_start_thread and we need ensure that state is
3725	* visible to other CPUs only after we enter guest.
3726	*/
3727	core = cpu >> threads_shift;
3728	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = `0`;
3729	return `0`;
3730	}
3731
3732	/*
3733	* Advertise this core as an active host core since we exited the guest
3734	* Only need to do this if it is the primary thread of the core that is
3735	* exiting.
3736	*/
3737	static inline int kvmppc_set_host_core(unsigned int cpu)
3738	{
3739	int core;
3740
3741	if (!kvmppc_host_rm_ops_hv \|\| cpu_thread_in_core(cpu))
3742	return `0`;
3743
3744	/*
3745	* Memory barrier can be omitted here because we do a spin_unlock
3746	* immediately after this which provides the memory barrier.
3747	*/
3748	core = cpu >> threads_shift;
3749	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = `1`;
3750	return `0`;
3751	}
3752
3753	static void set_irq_happened(int trap)
3754	{
3755	switch (trap) {
3756	case BOOK3S_INTERRUPT_EXTERNAL:
3757	local_paca->irq_happened \|= PACA_IRQ_EE;
3758	break;
3759	case BOOK3S_INTERRUPT_H_DOORBELL:
3760	local_paca->irq_happened \|= PACA_IRQ_DBELL;
3761	break;
3762	case BOOK3S_INTERRUPT_HMI:
3763	local_paca->irq_happened \|= PACA_IRQ_HMI;
3764	break;
3765	case BOOK3S_INTERRUPT_SYSTEM_RESET:
3766	replay_system_reset();
3767	break;
3768	}
3769	}
3770
3771	/*
3772	* Run a set of guest threads on a physical core.
3773	* Called with vc->lock held.
3774	*/
3775	static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3776	{
3777	struct kvm_vcpu *vcpu;
3778	int i;
3779	int srcu_idx;
3780	struct core_info core_info;
3781	struct kvmppc_vcore *pvc;
3782	struct kvm_split_mode split_info, *sip;
3783	int split, subcore_size, active;
3784	int sub;
3785	bool thr0_done;
3786	unsigned long cmd_bit, stat_bit;
3787	int pcpu, thr;
3788	int target_threads;
3789	int controlled_threads;
3790	int trap;
3791	bool is_power8;
3792
3793	if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
3794	return;
3795
3796	/*
3797	* Remove from the list any threads that have a signal pending
3798	* or need a VPA update done
3799	*/
3800	prepare_threads(vc);
3801
3802	/ if the runner is no longer runnable, let the caller pick a new one /
3803	if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
3804	return;
3805
3806	/*
3807	* Initialize *vc.
3808	*/
3809	init_vcore_to_run(vc);
3810	vc->preempt_tb = TB_NIL;
3811
3812	/*
3813	* Number of threads that we will be controlling: the same as
3814	* the number of threads per subcore, except on POWER9,
3815	* where it's 1 because the threads are (mostly) independent.
3816	*/
3817	controlled_threads = threads_per_vcore(kvm: vc->kvm);
3818
3819	/*
3820	* Make sure we are running on primary threads, and that secondary
3821	* threads are offline. Also check if the number of threads in this
3822	* guest are greater than the current system threads per guest.
3823	*/
3824	if ((controlled_threads > `1`) &&
3825	((vc->num_threads > threads_per_subcore) \|\| !on_primary_thread())) {
3826	for_each_runnable_thread(i, vcpu, vc) {
3827	vcpu->arch.ret = -EBUSY;
3828	kvmppc_remove_runnable(vc, vcpu, tb: mftb());
3829	wake_up(&vcpu->arch.cpu_run);
3830	}
3831	goto out;
3832	}
3833
3834	/*
3835	* See if we could run any other vcores on the physical core
3836	* along with this one.
3837	*/
3838	init_core_info(cip: &core_info, vc);
3839	pcpu = smp_processor_id();
3840	target_threads = controlled_threads;
3841	if (target_smt_mode && target_smt_mode < target_threads)
3842	target_threads = target_smt_mode;
3843	if (vc->num_threads < target_threads)
3844	collect_piggybacks(cip: &core_info, target_threads);
3845
3846	/*
3847	* Hard-disable interrupts, and check resched flag and signals.
3848	* If we need to reschedule or deliver a signal, clean up
3849	* and return without going into the guest(s).
3850	* If the mmu_ready flag has been cleared, don't go into the
3851	* guest because that means a HPT resize operation is in progress.
3852	*/
3853	local_irq_disable();
3854	hard_irq_disable();
3855	if (lazy_irq_pending() \|\| need_resched() \|\|
3856	recheck_signals_and_mmu(cip: &core_info)) {
3857	local_irq_enable();
3858	vc->vcore_state = VCORE_INACTIVE;
3859	/ Unlock all except the primary vcore /
3860	for (sub = `1`; sub < core_info.n_subcores; ++sub) {
3861	pvc = core_info.vc[sub];
3862	/ Put back on to the preempted vcores list /
3863	kvmppc_vcore_preempt(vc: pvc);
3864	spin_unlock(lock: &pvc->lock);
3865	}
3866	for (i = `0`; i < controlled_threads; ++i)
3867	kvmppc_release_hwthread(cpu: pcpu + i);
3868	return;
3869	}
3870
3871	kvmppc_clear_host_core(cpu: pcpu);
3872
3873	/ Decide on micro-threading (split-core) mode /
3874	subcore_size = threads_per_subcore;
3875	cmd_bit = stat_bit = `0`;
3876	split = core_info.n_subcores;
3877	sip = NULL;
3878	is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);
3879
3880	if (split > `1`) {
3881	sip = &split_info;
3882	memset(&split_info, `0`, sizeof(split_info));
3883	for (sub = `0`; sub < core_info.n_subcores; ++sub)
3884	split_info.vc[sub] = core_info.vc[sub];
3885
3886	if (is_power8) {
3887	if (split == `2` && (dynamic_mt_modes & `2`)) {
3888	cmd_bit = HID0_POWER8_1TO2LPAR;
3889	stat_bit = HID0_POWER8_2LPARMODE;
3890	} else {
3891	split = `4`;
3892	cmd_bit = HID0_POWER8_1TO4LPAR;
3893	stat_bit = HID0_POWER8_4LPARMODE;
3894	}
3895	subcore_size = MAX_SMT_THREADS / split;
3896	split_info.rpr = mfspr(SPRN_RPR);
3897	split_info.pmmar = mfspr(SPRN_PMMAR);
3898	split_info.ldbar = mfspr(SPRN_LDBAR);
3899	split_info.subcore_size = subcore_size;
3900	} else {
3901	split_info.subcore_size = `1`;
3902	}
3903
3904	/ order writes to split_info before kvm_split_mode pointer /
3905	smp_wmb();
3906	}
3907
3908	for (thr = `0`; thr < controlled_threads; ++thr) {
3909	struct paca_struct *paca = paca_ptrs[pcpu + thr];
3910
3911	paca->kvm_hstate.napping = `0`;
3912	paca->kvm_hstate.kvm_split_mode = sip;
3913	}
3914
3915	/ Initiate micro-threading (split-core) on POWER8 if required /
3916	if (cmd_bit) {
3917	unsigned long hid0 = mfspr(SPRN_HID0);
3918
3919	hid0 \|= cmd_bit \| HID0_POWER8_DYNLPARDIS;
3920	mb();
3921	mtspr(SPRN_HID0, hid0);
3922	isync();
3923	for (;;) {
3924	hid0 = mfspr(SPRN_HID0);
3925	if (hid0 & stat_bit)
3926	break;
3927	cpu_relax();
3928	}
3929	}
3930
3931	/*
3932	* On POWER8, set RWMR register.
3933	* Since it only affects PURR and SPURR, it doesn't affect
3934	* the host, so we don't save/restore the host value.
3935	*/
3936	if (is_power8) {
3937	unsigned long rwmr_val = RWMR_RPA_P8_8THREAD;
3938	int n_online = atomic_read(v: &vc->online_count);
3939
3940	/*
3941	* Use the 8-thread value if we're doing split-core
3942	* or if the vcore's online count looks bogus.
3943	*/
3944	if (split == `1` && threads_per_subcore == MAX_SMT_THREADS &&
3945	n_online >= `1` && n_online <= MAX_SMT_THREADS)
3946	rwmr_val = p8_rwmr_values[n_online];
3947	mtspr(SPRN_RWMR, rwmr_val);
3948	}
3949
3950	/ Start all the threads /
3951	active = `0`;
3952	for (sub = `0`; sub < core_info.n_subcores; ++sub) {
3953	thr = is_power8 ? subcore_thread_map[sub] : sub;
3954	thr0_done = false;
3955	active \|= `1` << thr;
3956	pvc = core_info.vc[sub];
3957	pvc->pcpu = pcpu + thr;
3958	for_each_runnable_thread(i, vcpu, pvc) {
3959	/*
3960	* XXX: is kvmppc_start_thread called too late here?
3961	* It updates vcpu->cpu and vcpu->arch.thread_cpu
3962	* which are used by kvmppc_fast_vcpu_kick_hv(), but
3963	* kick is called after new exceptions become available
3964	* and exceptions are checked earlier than here, by
3965	* kvmppc_core_prepare_to_enter.
3966	*/
3967	kvmppc_start_thread(vcpu, vc: pvc);
3968	kvmppc_update_vpa_dispatch(vcpu, vc: pvc);
3969	trace_kvm_guest_enter(vcpu);
3970	if (!vcpu->arch.ptid)
3971	thr0_done = true;
3972	active \|= `1` << (thr + vcpu->arch.ptid);
3973	}
3974	/*
3975	* We need to start the first thread of each subcore
3976	* even if it doesn't have a vcpu.
3977	*/
3978	if (!thr0_done)
3979	kvmppc_start_thread(NULL, vc: pvc);
3980	}
3981
3982	/*
3983	* Ensure that split_info.do_nap is set after setting
3984	* the vcore pointer in the PACA of the secondaries.
3985	*/
3986	smp_mb();
3987
3988	/*
3989	* When doing micro-threading, poke the inactive threads as well.
3990	* This gets them to the nap instruction after kvm_do_nap,
3991	* which reduces the time taken to unsplit later.
3992	*/
3993	if (cmd_bit) {
3994	split_info.do_nap = `1`; / ask secondaries to nap when done /
3995	for (thr = `1`; thr < threads_per_subcore; ++thr)
3996	if (!(active & (`1` << thr)))
3997	kvmppc_ipi_thread(pcpu + thr);
3998	}
3999
4000	vc->vcore_state = VCORE_RUNNING;
4001	preempt_disable();
4002
4003	trace_kvmppc_run_core(vc, where: `0`);
4004
4005	for (sub = `0`; sub < core_info.n_subcores; ++sub)
4006	spin_unlock(lock: &core_info.vc[sub]->lock);
4007
4008	guest_timing_enter_irqoff();
4009
4010	srcu_idx = srcu_read_lock(ssp: &vc->kvm->srcu);
4011
4012	guest_state_enter_irqoff();
4013	this_cpu_disable_ftrace();
4014
4015	trap = __kvmppc_vcore_entry();
4016
4017	this_cpu_enable_ftrace();
4018	guest_state_exit_irqoff();
4019
4020	srcu_read_unlock(ssp: &vc->kvm->srcu, idx: srcu_idx);
4021
4022	set_irq_happened(trap);
4023
4024	spin_lock(lock: &vc->lock);
4025	/ prevent other vcpu threads from doing kvmppc_start_thread() now /
4026	vc->vcore_state = VCORE_EXITING;
4027
4028	/ wait for secondary threads to finish writing their state to memory /
4029	kvmppc_wait_for_nap(n_threads: controlled_threads);
4030
4031	/ Return to whole-core mode if we split the core earlier /
4032	if (cmd_bit) {
4033	unsigned long hid0 = mfspr(SPRN_HID0);
4034	unsigned long loops = `0`;
4035
4036	hid0 &= ~HID0_POWER8_DYNLPARDIS;
4037	stat_bit = HID0_POWER8_2LPARMODE \| HID0_POWER8_4LPARMODE;
4038	mb();
4039	mtspr(SPRN_HID0, hid0);
4040	isync();
4041	for (;;) {
4042	hid0 = mfspr(SPRN_HID0);
4043	if (!(hid0 & stat_bit))
4044	break;
4045	cpu_relax();
4046	++loops;
4047	}
4048	split_info.do_nap = `0`;
4049	}
4050
4051	kvmppc_set_host_core(cpu: pcpu);
4052
4053	if (!vtime_accounting_enabled_this_cpu()) {
4054	local_irq_enable();
4055	/*
4056	* Service IRQs here before guest_timing_exit_irqoff() so any
4057	* ticks that occurred while running the guest are accounted to
4058	* the guest. If vtime accounting is enabled, accounting uses
4059	* TB rather than ticks, so it can be done without enabling
4060	* interrupts here, which has the problem that it accounts
4061	* interrupt processing overhead to the host.
4062	*/
4063	local_irq_disable();
4064	}
4065	guest_timing_exit_irqoff();
4066
4067	local_irq_enable();
4068
4069	/ Let secondaries go back to the offline loop /
4070	for (i = `0`; i < controlled_threads; ++i) {
4071	kvmppc_release_hwthread(cpu: pcpu + i);
4072	if (sip && sip->napped[i])
4073	kvmppc_ipi_thread(cpu: pcpu + i);
4074	}
4075
4076	spin_unlock(lock: &vc->lock);
4077
4078	/ make sure updates to secondary vcpu structs are visible now /
4079	smp_mb();
4080
4081	preempt_enable();
4082
4083	for (sub = `0`; sub < core_info.n_subcores; ++sub) {
4084	pvc = core_info.vc[sub];
4085	post_guest_process(vc: pvc, is_master: pvc == vc);
4086	}
4087
4088	spin_lock(lock: &vc->lock);
4089
4090	out:
4091	vc->vcore_state = VCORE_INACTIVE;
4092	trace_kvmppc_run_core(vc, where: `1`);
4093	}
4094
4095	static inline bool hcall_is_xics(unsigned long req)
4096	{
4097	return req == H_EOI \|\| req == H_CPPR \|\| req == H_IPI \|\|
4098	req == H_IPOLL \|\| req == H_XIRR \|\| req == H_XIRR_X;
4099	}
4100
4101	static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu)
4102	{
4103	struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
4104	if (lp) {
4105	u32 yield_count = be32_to_cpu(lp->yield_count) + `1`;
4106	lp->yield_count = cpu_to_be32(yield_count);
4107	vcpu->arch.vpa.dirty = `1`;
4108	}
4109	}
4110
4111	static int kvmhv_vcpu_entry_nestedv2(struct kvm_vcpu *vcpu, u64 time_limit,
4112	unsigned long lpcr, u64 *tb)
4113	{
4114	struct kvmhv_nestedv2_io *io;
4115	unsigned long msr, i;
4116	int trap;
4117	long rc;
4118
4119	io = &vcpu->arch.nestedv2_io;
4120
4121	msr = mfmsr();
4122	kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
4123	if (lazy_irq_pending())
4124	return `0`;
4125
4126	rc = kvmhv_nestedv2_flush_vcpu(vcpu, time_limit);
4127	if (rc < `0`)
4128	return -EINVAL;
4129
4130	kvmppc_gse_put_u64(io->vcpu_run_input, KVMPPC_GSID_LPCR, lpcr);
4131
4132	accumulate_time(vcpu, &vcpu->arch.in_guest);
4133	rc = plpar_guest_run_vcpu(`0`, vcpu->kvm->arch.lpid, vcpu->vcpu_id,
4134	&trap, &i);
4135
4136	if (rc != H_SUCCESS) {
4137	pr_err("KVM Guest Run VCPU hcall failed\n");
4138	if (rc == H_INVALID_ELEMENT_ID)
4139	pr_err("KVM: Guest Run VCPU invalid element id at %ld\n", i);
4140	else if (rc == H_INVALID_ELEMENT_SIZE)
4141	pr_err("KVM: Guest Run VCPU invalid element size at %ld\n", i);
4142	else if (rc == H_INVALID_ELEMENT_VALUE)
4143	pr_err("KVM: Guest Run VCPU invalid element value at %ld\n", i);
4144	return -EINVAL;
4145	}
4146	accumulate_time(vcpu, &vcpu->arch.guest_exit);
4147
4148	*tb = mftb();
4149	kvmppc_gsm_reset(io->vcpu_message);
4150	kvmppc_gsm_reset(io->vcore_message);
4151	kvmppc_gsbm_zero(&io->valids);
4152
4153	rc = kvmhv_nestedv2_parse_output(vcpu);
4154	if (rc < `0`)
4155	return -EINVAL;
4156
4157	timer_rearm_host_dec(*tb);
4158
4159	return trap;
4160	}
4161
/* call our hypervisor to load up HV regs and go */
4163	static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu vcpu, u64 time_limit, unsigned* long lpcr, u64 *tb)
4164	{
4165	unsigned long host_psscr;
4166	unsigned long msr;
4167	struct hv_guest_state hvregs;
4168	struct p9_host_os_sprs host_os_sprs;
4169	s64 dec;
4170	int trap;
4171
4172	msr = mfmsr();
4173
4174	save_p9_host_os_sprs(host_os_sprs: &host_os_sprs);
4175
4176	/*
4177	* We need to save and restore the guest visible part of the
4178	* psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
4179	* doesn't do this for us. Note only required if pseries since
4180	* this is done in kvmhv_vcpu_entry_p9() below otherwise.
4181	*/
4182	host_psscr = mfspr(SPRN_PSSCR_PR);
4183
4184	kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
4185	if (lazy_irq_pending())
4186	return `0`;
4187
4188	if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
4189	msr = mfmsr(); / TM restore can update msr /
4190
4191	if (vcpu->arch.psscr != host_psscr)
4192	mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
4193
4194	kvmhv_save_hv_regs(vcpu, &hvregs);
4195	hvregs.lpcr = lpcr;
4196	hvregs.amor = ~`0`;
4197	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
4198	hvregs.version = HV_GUEST_STATE_VERSION;
4199	if (vcpu->arch.nested) {
4200	hvregs.lpid = vcpu->arch.nested->shadow_lpid;
4201	hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
4202	} else {
4203	hvregs.lpid = vcpu->kvm->arch.lpid;
4204	hvregs.vcpu_token = vcpu->vcpu_id;
4205	}
4206	hvregs.hdec_expiry = time_limit;
4207
4208	/*
4209	* When setting DEC, we must always deal with irq_work_raise
4210	* via NMI vs setting DEC. The problem occurs right as we
4211	* switch into guest mode if a NMI hits and sets pending work
4212	* and sets DEC, then that will apply to the guest and not
4213	* bring us back to the host.
4214	*
4215	* irq_work_raise could check a flag (or possibly LPCR[HDICE]
4216	* for example) and set HDEC to 1? That wouldn't solve the
4217	* nested hv case which needs to abort the hcall or zero the
4218	* time limit.
4219	*
4220	* XXX: Another day's problem.
4221	*/
4222	mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb);
4223
4224	mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
4225	mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
4226	switch_pmu_to_guest(vcpu, host_os_sprs: &host_os_sprs);
4227	accumulate_time(vcpu, &vcpu->arch.in_guest);
4228	trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
4229	__pa(&vcpu->arch.regs));
4230	accumulate_time(vcpu, &vcpu->arch.guest_exit);
4231	kvmhv_restore_hv_return_state(vcpu, &hvregs);
4232	switch_pmu_to_host(vcpu, host_os_sprs: &host_os_sprs);
4233	vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
4234	vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
4235	vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
4236	vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
4237
4238	store_vcpu_state(vcpu);
4239
4240	dec = mfspr(SPRN_DEC);
4241	if (!(lpcr & LPCR_LD)) / Sign extend if not using large decrementer /
4242	dec = (s32) dec;
4243	*tb = mftb();
4244	vcpu->arch.dec_expires = dec + (*tb + kvmppc_get_tb_offset(vcpu));
4245
4246	timer_rearm_host_dec(*tb);
4247
4248	restore_p9_host_os_sprs(vcpu, host_os_sprs: &host_os_sprs);
4249	if (vcpu->arch.psscr != host_psscr)
4250	mtspr(SPRN_PSSCR_PR, host_psscr);
4251
4252	return trap;
4253	}
4254
4255	/*
4256	* Guest entry for POWER9 and later CPUs.
4257	*/
4258	static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
4259	unsigned long lpcr, u64 *tb)
4260	{
4261	struct kvm *kvm = vcpu->kvm;
4262	struct kvm_nested_guest *nested = vcpu->arch.nested;
4263	u64 next_timer;
4264	int trap;
4265
4266	next_timer = timer_get_next_tb();
4267	if (*tb >= next_timer)
4268	return BOOK3S_INTERRUPT_HV_DECREMENTER;
4269	if (next_timer < time_limit)
4270	time_limit = next_timer;
4271	else if (tb >= time_limit) /* nested time limit /
4272	return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
4273
4274	vcpu->arch.ceded = `0`;
4275
4276	vcpu_vpa_increment_dispatch(vcpu);
4277
4278	if (kvmhv_on_pseries()) {
4279	if (kvmhv_is_nestedv1())
4280	trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
4281	else
4282	trap = kvmhv_vcpu_entry_nestedv2(vcpu, time_limit, lpcr, tb);
4283
4284	/ H_CEDE has to be handled now, not later /
4285	if (trap == BOOK3S_INTERRUPT_SYSCALL && !nested &&
4286	kvmppc_get_gpr(vcpu, `3`) == H_CEDE) {
4287	kvmppc_cede(vcpu);
4288	kvmppc_set_gpr(vcpu, `3`, `0`);
4289	trap = `0`;
4290	}
4291
4292	} else if (nested) {
4293	__this_cpu_write(cpu_in_guest, kvm);
4294	trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
4295	__this_cpu_write(cpu_in_guest, NULL);
4296
4297	} else {
4298	kvmppc_xive_push_vcpu(vcpu);
4299
4300	__this_cpu_write(cpu_in_guest, kvm);
4301	trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
4302	__this_cpu_write(cpu_in_guest, NULL);
4303
4304	if (trap == BOOK3S_INTERRUPT_SYSCALL &&
4305	!(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
4306	unsigned long req = kvmppc_get_gpr(vcpu, `3`);
4307
4308	/*
4309	* XIVE rearm and XICS hcalls must be handled
4310	* before xive context is pulled (is this
4311	* true?)
4312	*/
4313	if (req == H_CEDE) {
4314	/ H_CEDE has to be handled now /
4315	kvmppc_cede(vcpu);
4316	if (!kvmppc_xive_rearm_escalation(vcpu)) {
4317	/*
4318	* Pending escalation so abort
4319	* the cede.
4320	*/
4321	vcpu->arch.ceded = `0`;
4322	}
4323	kvmppc_set_gpr(vcpu, `3`, `0`);
4324	trap = `0`;
4325
4326	} else if (req == H_ENTER_NESTED) {
4327	/*
4328	* L2 should not run with the L1
4329	* context so rearm and pull it.
4330	*/
4331	if (!kvmppc_xive_rearm_escalation(vcpu)) {
4332	/*
4333	* Pending escalation so abort
4334	* H_ENTER_NESTED.
4335	*/
4336	kvmppc_set_gpr(vcpu, `3`, `0`);
4337	trap = `0`;
4338	}
4339
4340	} else if (hcall_is_xics(req)) {
4341	int ret;
4342
4343	ret = kvmppc_xive_xics_hcall(vcpu, req);
4344	if (ret != H_TOO_HARD) {
4345	kvmppc_set_gpr(vcpu, `3`, ret);
4346	trap = `0`;
4347	}
4348	}
4349	}
4350	kvmppc_xive_pull_vcpu(vcpu);
4351
4352	if (kvm_is_radix(kvm))
4353	vcpu->arch.slb_max = `0`;
4354	}
4355
4356	vcpu_vpa_increment_dispatch(vcpu);
4357
4358	return trap;
4359	}
4360
4361	/*
4362	* Wait for some other vcpu thread to execute us, and
4363	* wake us up when we need to handle something in the host.
4364	*/
4365	static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
4366	struct kvm_vcpu vcpu, int* wait_state)
4367	{
4368	DEFINE_WAIT(wait);
4369
4370	prepare_to_wait(wq_head: &vcpu->arch.cpu_run, wq_entry: &wait, state: wait_state);
4371	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
4372	spin_unlock(lock: &vc->lock);
4373	schedule();
4374	spin_lock(lock: &vc->lock);
4375	}
4376	finish_wait(wq_head: &vcpu->arch.cpu_run, wq_entry: &wait);
4377	}
4378
4379	static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
4380	{
4381	if (!halt_poll_ns_grow)
4382	return;
4383
4384	vc->halt_poll_ns *= halt_poll_ns_grow;
4385	if (vc->halt_poll_ns < halt_poll_ns_grow_start)
4386	vc->halt_poll_ns = halt_poll_ns_grow_start;
4387	}
4388
4389	static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
4390	{
4391	if (halt_poll_ns_shrink == `0`)
4392	vc->halt_poll_ns = `0`;
4393	else
4394	vc->halt_poll_ns /= halt_poll_ns_shrink;
4395	}
4396
4397	#ifdef CONFIG_KVM_XICS
4398	static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
4399	{
4400	if (!xics_on_xive())
4401	return false;
4402	return vcpu->arch.irq_pending \|\| vcpu->arch.xive_saved_state.pipr <
4403	vcpu->arch.xive_saved_state.cppr;
4404	}
4405	#else
4406	static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
4407	{
4408	return false;
4409	}
4410	#endif /* CONFIG_KVM_XICS */
4411
4412	static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
4413	{
4414	if (vcpu->arch.pending_exceptions \|\| vcpu->arch.prodded \|\|
4415	kvmppc_doorbell_pending(vcpu) \|\| xive_interrupt_pending(vcpu))
4416	return true;
4417
4418	return false;
4419	}
4420
4421	static bool kvmppc_vcpu_check_block(struct kvm_vcpu *vcpu)
4422	{
4423	if (!vcpu->arch.ceded \|\| kvmppc_vcpu_woken(vcpu))
4424	return true;
4425	return false;
4426	}
4427
4428	/*
4429	* Check to see if any of the runnable vcpus on the vcore have pending
4430	* exceptions or are no longer ceded
4431	*/
4432	static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
4433	{
4434	struct kvm_vcpu *vcpu;
4435	int i;
4436
4437	for_each_runnable_thread(i, vcpu, vc) {
4438	if (kvmppc_vcpu_check_block(vcpu))
4439	return `1`;
4440	}
4441
4442	return `0`;
4443	}
4444
4445	/*
4446	* All the vcpus in this vcore are idle, so wait for a decrementer
4447	* or external interrupt to one of the vcpus. vc->lock is held.
4448	*/
4449	static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
4450	{
4451	ktime_t cur, start_poll, start_wait;
4452	int do_sleep = `1`;
4453	u64 block_ns;
4454
4455	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
4456
4457	/ Poll for pending exceptions and ceded state /
4458	cur = start_poll = ktime_get();
4459	if (vc->halt_poll_ns) {
4460	ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
4461	++vc->runner->stat.generic.halt_attempted_poll;
4462
4463	vc->vcore_state = VCORE_POLLING;
4464	spin_unlock(lock: &vc->lock);
4465
4466	do {
4467	if (kvmppc_vcore_check_block(vc)) {
4468	do_sleep = `0`;
4469	break;
4470	}
4471	cur = ktime_get();
4472	} while (kvm_vcpu_can_poll(cur, stop));
4473
4474	spin_lock(lock: &vc->lock);
4475	vc->vcore_state = VCORE_INACTIVE;
4476
4477	if (!do_sleep) {
4478	++vc->runner->stat.generic.halt_successful_poll;
4479	goto out;
4480	}
4481	}
4482
4483	prepare_to_rcuwait(w: &vc->wait);
4484	set_current_state(TASK_INTERRUPTIBLE);
4485	if (kvmppc_vcore_check_block(vc)) {
4486	finish_rcuwait(w: &vc->wait);
4487	do_sleep = `0`;
4488	/ If we polled, count this as a successful poll /
4489	if (vc->halt_poll_ns)
4490	++vc->runner->stat.generic.halt_successful_poll;
4491	goto out;
4492	}
4493
4494	start_wait = ktime_get();
4495
4496	vc->vcore_state = VCORE_SLEEPING;
4497	trace_kvmppc_vcore_blocked(vcpu: vc->runner, where: `0`);
4498	spin_unlock(lock: &vc->lock);
4499	schedule();
4500	finish_rcuwait(w: &vc->wait);
4501	spin_lock(lock: &vc->lock);
4502	vc->vcore_state = VCORE_INACTIVE;
4503	trace_kvmppc_vcore_blocked(vcpu: vc->runner, where: `1`);
4504	++vc->runner->stat.halt_successful_wait;
4505
4506	cur = ktime_get();
4507
4508	out:
4509	block_ns = ktime_to_ns(kt: cur) - ktime_to_ns(kt: start_poll);
4510
4511	/ Attribute wait time /
4512	if (do_sleep) {
4513	vc->runner->stat.generic.halt_wait_ns +=
4514	ktime_to_ns(kt: cur) - ktime_to_ns(kt: start_wait);
4515	KVM_STATS_LOG_HIST_UPDATE(
4516	vc->runner->stat.generic.halt_wait_hist,
4517	ktime_to_ns(cur) - ktime_to_ns(start_wait));
4518	/ Attribute failed poll time /
4519	if (vc->halt_poll_ns) {
4520	vc->runner->stat.generic.halt_poll_fail_ns +=
4521	ktime_to_ns(kt: start_wait) -
4522	ktime_to_ns(kt: start_poll);
4523	KVM_STATS_LOG_HIST_UPDATE(
4524	vc->runner->stat.generic.halt_poll_fail_hist,
4525	ktime_to_ns(start_wait) -
4526	ktime_to_ns(start_poll));
4527	}
4528	} else {
4529	/ Attribute successful poll time /
4530	if (vc->halt_poll_ns) {
4531	vc->runner->stat.generic.halt_poll_success_ns +=
4532	ktime_to_ns(kt: cur) -
4533	ktime_to_ns(kt: start_poll);
4534	KVM_STATS_LOG_HIST_UPDATE(
4535	vc->runner->stat.generic.halt_poll_success_hist,
4536	ktime_to_ns(cur) - ktime_to_ns(start_poll));
4537	}
4538	}
4539
4540	/ Adjust poll time /
4541	if (halt_poll_ns) {
4542	if (block_ns <= vc->halt_poll_ns)
4543	;
4544	/ We slept and blocked for longer than the max halt time /
4545	else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
4546	shrink_halt_poll_ns(vc);
4547	/ We slept and our poll time is too small /
4548	else if (vc->halt_poll_ns < halt_poll_ns &&
4549	block_ns < halt_poll_ns)
4550	grow_halt_poll_ns(vc);
4551	if (vc->halt_poll_ns > halt_poll_ns)
4552	vc->halt_poll_ns = halt_poll_ns;
4553	} else
4554	vc->halt_poll_ns = `0`;
4555
4556	trace_kvmppc_vcore_wakeup(do_sleep, ns: block_ns);
4557	}
4558
4559	/*
4560	* This never fails for a radix guest, as none of the operations it does
4561	* for a radix guest can fail or have a way to report failure.
4562	*/
4563	static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
4564	{
4565	int r = `0`;
4566	struct kvm *kvm = vcpu->kvm;
4567
4568	mutex_lock(&kvm->arch.mmu_setup_lock);
4569	if (!kvm->arch.mmu_ready) {
4570	if (!kvm_is_radix(kvm))
4571	r = kvmppc_hv_setup_htab_rma(vcpu);
4572	if (!r) {
4573	if (cpu_has_feature(CPU_FTR_ARCH_300))
4574	kvmppc_setup_partition_table(kvm);
4575	kvm->arch.mmu_ready = `1`;
4576	}
4577	}
4578	mutex_unlock(lock: &kvm->arch.mmu_setup_lock);
4579	return r;
4580	}
4581
4582	static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
4583	{
4584	struct kvm_run *run = vcpu->run;
4585	int n_ceded, i, r;
4586	struct kvmppc_vcore *vc;
4587	struct kvm_vcpu *v;
4588
4589	trace_kvmppc_run_vcpu_enter(vcpu);
4590
4591	run->exit_reason = `0`;
4592	vcpu->arch.ret = RESUME_GUEST;
4593	vcpu->arch.trap = `0`;
4594	kvmppc_update_vpas(vcpu);
4595
4596	/*
4597	* Synchronize with other threads in this virtual core
4598	*/
4599	vc = vcpu->arch.vcore;
4600	spin_lock(lock: &vc->lock);
4601	vcpu->arch.ceded = `0`;
4602	vcpu->arch.run_task = current;
4603	vcpu->arch.stolen_logged = vcore_stolen_time(vc, now: mftb());
4604	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4605	vcpu->arch.busy_preempt = TB_NIL;
4606	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
4607	++vc->n_runnable;
4608
4609	/*
4610	* This happens the first time this is called for a vcpu.
4611	* If the vcore is already running, we may be able to start
4612	* this thread straight away and have it join in.
4613	*/
4614	if (!signal_pending(current)) {
4615	if ((vc->vcore_state == VCORE_PIGGYBACK \|\|
4616	vc->vcore_state == VCORE_RUNNING) &&
4617	!VCORE_IS_EXITING(vc)) {
4618	kvmppc_update_vpa_dispatch(vcpu, vc);
4619	kvmppc_start_thread(vcpu, vc);
4620	trace_kvm_guest_enter(vcpu);
4621	} else if (vc->vcore_state == VCORE_SLEEPING) {
4622	rcuwait_wake_up(w: &vc->wait);
4623	}
4624
4625	}
4626
4627	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
4628	!signal_pending(current)) {
4629	/ See if the MMU is ready to go /
4630	if (!vcpu->kvm->arch.mmu_ready) {
4631	spin_unlock(lock: &vc->lock);
4632	r = kvmhv_setup_mmu(vcpu);
4633	spin_lock(lock: &vc->lock);
4634	if (r) {
4635	run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4636	run->fail_entry.
4637	hardware_entry_failure_reason = `0`;
4638	vcpu->arch.ret = r;
4639	break;
4640	}
4641	}
4642
4643	if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
4644	kvmppc_vcore_end_preempt(vc);
4645
4646	if (vc->vcore_state != VCORE_INACTIVE) {
4647	kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
4648	continue;
4649	}
4650	for_each_runnable_thread(i, v, vc) {
4651	kvmppc_core_prepare_to_enter(v);
4652	if (signal_pending(p: v->arch.run_task)) {
4653	kvmppc_remove_runnable(vc, vcpu: v, tb: mftb());
4654	v->stat.signal_exits++;
4655	v->run->exit_reason = KVM_EXIT_INTR;
4656	v->arch.ret = -EINTR;
4657	wake_up(&v->arch.cpu_run);
4658	}
4659	}
4660	if (!vc->n_runnable \|\| vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
4661	break;
4662	n_ceded = `0`;
4663	for_each_runnable_thread(i, v, vc) {
4664	if (!kvmppc_vcpu_woken(vcpu: v))
4665	n_ceded += v->arch.ceded;
4666	else
4667	v->arch.ceded = `0`;
4668	}
4669	vc->runner = vcpu;
4670	if (n_ceded == vc->n_runnable) {
4671	kvmppc_vcore_blocked(vc);
4672	} else if (need_resched()) {
4673	kvmppc_vcore_preempt(vc);
4674	/ Let something else run /
4675	cond_resched_lock(&vc->lock);
4676	if (vc->vcore_state == VCORE_PREEMPT)
4677	kvmppc_vcore_end_preempt(vc);
4678	} else {
4679	kvmppc_run_core(vc);
4680	}
4681	vc->runner = NULL;
4682	}
4683
4684	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
4685	(vc->vcore_state == VCORE_RUNNING \|\|
4686	vc->vcore_state == VCORE_EXITING \|\|
4687	vc->vcore_state == VCORE_PIGGYBACK))
4688	kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
4689
4690	if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
4691	kvmppc_vcore_end_preempt(vc);
4692
4693	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
4694	kvmppc_remove_runnable(vc, vcpu, tb: mftb());
4695	vcpu->stat.signal_exits++;
4696	run->exit_reason = KVM_EXIT_INTR;
4697	vcpu->arch.ret = -EINTR;
4698	}
4699
4700	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
4701	/ Wake up some vcpu to run the core /
4702	i = -`1`;
4703	v = next_runnable_thread(vc, ip: &i);
4704	wake_up(&v->arch.cpu_run);
4705	}
4706
4707	trace_kvmppc_run_vcpu_exit(vcpu);
4708	spin_unlock(lock: &vc->lock);
4709	return vcpu->arch.ret;
4710	}
4711
4712	int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
4713	unsigned long lpcr)
4714	{
4715	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
4716	struct kvm_run *run = vcpu->run;
4717	int trap, r, pcpu;
4718	int srcu_idx;
4719	struct kvmppc_vcore *vc;
4720	struct kvm *kvm = vcpu->kvm;
4721	struct kvm_nested_guest *nested = vcpu->arch.nested;
4722	unsigned long flags;
4723	u64 tb;
4724
4725	trace_kvmppc_run_vcpu_enter(vcpu);
4726
4727	run->exit_reason = `0`;
4728	vcpu->arch.ret = RESUME_GUEST;
4729	vcpu->arch.trap = `0`;
4730
4731	vc = vcpu->arch.vcore;
4732	vcpu->arch.ceded = `0`;
4733	vcpu->arch.run_task = current;
4734	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
4735
4736	/ See if the MMU is ready to go /
4737	if (unlikely(!kvm->arch.mmu_ready)) {
4738	r = kvmhv_setup_mmu(vcpu);
4739	if (r) {
4740	run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4741	run->fail_entry.hardware_entry_failure_reason = `0`;
4742	vcpu->arch.ret = r;
4743	return r;
4744	}
4745	}
4746
4747	if (need_resched())
4748	cond_resched();
4749
4750	kvmppc_update_vpas(vcpu);
4751
4752	preempt_disable();
4753	pcpu = smp_processor_id();
4754	if (kvm_is_radix(kvm))
4755	kvmppc_prepare_radix_vcpu(vcpu, pcpu);
4756
4757	/ flags save not required, but irq_pmu has no disable/enable API /
4758	powerpc_local_irq_pmu_save(flags);
4759
4760	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4761
4762	if (signal_pending(current))
4763	goto sigpend;
4764	if (need_resched() \|\| !kvm->arch.mmu_ready)
4765	goto out;
4766
4767	vcpu->cpu = pcpu;
4768	vcpu->arch.thread_cpu = pcpu;
4769	vc->pcpu = pcpu;
4770	local_paca->kvm_hstate.kvm_vcpu = vcpu;
4771	local_paca->kvm_hstate.ptid = `0`;
4772	local_paca->kvm_hstate.fake_suspend = `0`;
4773
4774	/*
4775	* Orders set cpu/thread_cpu vs testing for pending interrupts and
4776	* doorbells below. The other side is when these fields are set vs
4777	* kvmppc_fast_vcpu_kick_hv reading the cpu/thread_cpu fields to
4778	* kick a vCPU to notice the pending interrupt.
4779	*/
4780	smp_mb();
4781
4782	if (!nested) {
4783	kvmppc_core_prepare_to_enter(vcpu);
4784	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
4785	&vcpu->arch.pending_exceptions) \|\|
4786	xive_interrupt_pending(vcpu)) {
4787	/*
4788	* For nested HV, don't synthesize but always pass MER,
4789	* the L0 will be able to optimise that more
4790	* effectively than manipulating registers directly.
4791	*/
4792	if (!kvmhv_on_pseries() && (__kvmppc_get_msr_hv(vcpu) & MSR_EE))
4793	kvmppc_inject_interrupt_hv(vcpu,
4794	BOOK3S_INTERRUPT_EXTERNAL, `0`);
4795	else
4796	lpcr \|= LPCR_MER;
4797	}
4798	} else if (vcpu->arch.pending_exceptions \|\|
4799	vcpu->arch.doorbell_request \|\|
4800	xive_interrupt_pending(vcpu)) {
4801	vcpu->arch.ret = RESUME_HOST;
4802	goto out;
4803	}
4804
4805	if (vcpu->arch.timer_running) {
4806	hrtimer_try_to_cancel(timer: &vcpu->arch.dec_timer);
4807	vcpu->arch.timer_running = `0`;
4808	}
4809
4810	tb = mftb();
4811
4812	kvmppc_update_vpa_dispatch_p9(vcpu, vc, now: tb + kvmppc_get_tb_offset(vcpu));
4813
4814	trace_kvm_guest_enter(vcpu);
4815
4816	guest_timing_enter_irqoff();
4817
4818	srcu_idx = srcu_read_lock(ssp: &kvm->srcu);
4819
4820	guest_state_enter_irqoff();
4821	this_cpu_disable_ftrace();
4822
4823	trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, tb: &tb);
4824	vcpu->arch.trap = trap;
4825
4826	this_cpu_enable_ftrace();
4827	guest_state_exit_irqoff();
4828
4829	srcu_read_unlock(ssp: &kvm->srcu, idx: srcu_idx);
4830
4831	set_irq_happened(trap);
4832
4833	vcpu->cpu = -`1`;
4834	vcpu->arch.thread_cpu = -`1`;
4835	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
4836
4837	if (!vtime_accounting_enabled_this_cpu()) {
4838	powerpc_local_irq_pmu_restore(flags);
4839	/*
4840	* Service IRQs here before guest_timing_exit_irqoff() so any
4841	* ticks that occurred while running the guest are accounted to
4842	* the guest. If vtime accounting is enabled, accounting uses
4843	* TB rather than ticks, so it can be done without enabling
4844	* interrupts here, which has the problem that it accounts
4845	* interrupt processing overhead to the host.
4846	*/
4847	powerpc_local_irq_pmu_save(flags);
4848	}
4849	guest_timing_exit_irqoff();
4850
4851	powerpc_local_irq_pmu_restore(flags);
4852
4853	preempt_enable();
4854
4855	/*
4856	* cancel pending decrementer exception if DEC is now positive, or if
4857	* entering a nested guest in which case the decrementer is now owned
4858	* by L2 and the L1 decrementer is provided in hdec_expires
4859	*/
4860	if (!kvmhv_is_nestedv2() && kvmppc_core_pending_dec(vcpu) &&
4861	((tb < kvmppc_dec_expires_host_tb(vcpu)) \|\|
4862	(trap == BOOK3S_INTERRUPT_SYSCALL &&
4863	kvmppc_get_gpr(vcpu, `3`) == H_ENTER_NESTED)))
4864	kvmppc_core_dequeue_dec(vcpu);
4865
4866	trace_kvm_guest_exit(vcpu);
4867	r = RESUME_GUEST;
4868	if (trap) {
4869	if (!nested)
4870	r = kvmppc_handle_exit_hv(vcpu, current);
4871	else
4872	r = kvmppc_handle_nested_exit(vcpu);
4873	}
4874	vcpu->arch.ret = r;
4875
4876	if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
4877	kvmppc_set_timer(vcpu);
4878
4879	prepare_to_rcuwait(w: wait);
4880	for (;;) {
4881	set_current_state(TASK_INTERRUPTIBLE);
4882	if (signal_pending(current)) {
4883	vcpu->stat.signal_exits++;
4884	run->exit_reason = KVM_EXIT_INTR;
4885	vcpu->arch.ret = -EINTR;
4886	break;
4887	}
4888
4889	if (kvmppc_vcpu_check_block(vcpu))
4890	break;
4891
4892	trace_kvmppc_vcore_blocked(vcpu, where: `0`);
4893	schedule();
4894	trace_kvmppc_vcore_blocked(vcpu, where: `1`);
4895	}
4896	finish_rcuwait(w: wait);
4897	}
4898	vcpu->arch.ceded = `0`;
4899
4900	done:
4901	trace_kvmppc_run_vcpu_exit(vcpu);
4902
4903	return vcpu->arch.ret;
4904
4905	sigpend:
4906	vcpu->stat.signal_exits++;
4907	run->exit_reason = KVM_EXIT_INTR;
4908	vcpu->arch.ret = -EINTR;
4909	out:
4910	vcpu->cpu = -`1`;
4911	vcpu->arch.thread_cpu = -`1`;
4912	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
4913	powerpc_local_irq_pmu_restore(flags);
4914	preempt_enable();
4915	goto done;
4916	}
4917
4918	static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
4919	{
4920	struct kvm_run *run = vcpu->run;
4921	int r;
4922	int srcu_idx;
4923	struct kvm *kvm;
4924	unsigned long msr;
4925
4926	start_timing(vcpu, &vcpu->arch.vcpu_entry);
4927
4928	if (!vcpu->arch.sane) {
4929	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4930	return -EINVAL;
4931	}
4932
4933	/ No need to go into the guest when all we'll do is come back out /
4934	if (signal_pending(current)) {
4935	run->exit_reason = KVM_EXIT_INTR;
4936	return -EINTR;
4937	}
4938
4939	#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
4940	/*
4941	* Don't allow entry with a suspended transaction, because
4942	* the guest entry/exit code will lose it.
4943	*/
4944	if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
4945	(current->thread.regs->msr & MSR_TM)) {
4946	if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
4947	run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4948	run->fail_entry.hardware_entry_failure_reason = `0`;
4949	return -EINVAL;
4950	}
4951	}
4952	#endif
4953
4954	/*
4955	* Force online to 1 for the sake of old userspace which doesn't
4956	* set it.
4957	*/
4958	if (!vcpu->arch.online) {
4959	atomic_inc(v: &vcpu->arch.vcore->online_count);
4960	vcpu->arch.online = `1`;
4961	}
4962
4963	kvmppc_core_prepare_to_enter(vcpu);
4964
4965	kvm = vcpu->kvm;
4966	atomic_inc(v: &kvm->arch.vcpus_running);
4967	/ Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt /
4968	smp_mb();
4969
4970	msr = `0`;
4971	if (IS_ENABLED(CONFIG_PPC_FPU))
4972	msr \|= MSR_FP;
4973	if (cpu_has_feature(CPU_FTR_ALTIVEC))
4974	msr \|= MSR_VEC;
4975	if (cpu_has_feature(CPU_FTR_VSX))
4976	msr \|= MSR_VSX;
4977	if ((cpu_has_feature(CPU_FTR_TM) \|\|
4978	cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
4979	(kvmppc_get_hfscr_hv(vcpu) & HFSCR_TM))
4980	msr \|= MSR_TM;
4981	msr = msr_check_and_set(msr);
4982
4983	kvmppc_save_user_regs();
4984
4985	kvmppc_save_current_sprs();
4986
4987	if (!cpu_has_feature(CPU_FTR_ARCH_300))
4988	vcpu->arch.waitp = &vcpu->arch.vcore->wait;
4989	vcpu->arch.pgdir = kvm->mm->pgd;
4990	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
4991
4992	do {
4993	accumulate_time(vcpu, &vcpu->arch.guest_entry);
4994	if (cpu_has_feature(CPU_FTR_ARCH_300))
4995	r = kvmhv_run_single_vcpu(vcpu, time_limit: ~(u64)`0`,
4996	lpcr: vcpu->arch.vcore->lpcr);
4997	else
4998	r = kvmppc_run_vcpu(vcpu);
4999
5000	if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
5001	accumulate_time(vcpu, &vcpu->arch.hcall);
5002
5003	if (!kvmhv_is_nestedv2() && WARN_ON_ONCE(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) {
5004	/*
5005	* These should have been caught reflected
5006	* into the guest by now. Final sanity check:
5007	* don't allow userspace to execute hcalls in
5008	* the hypervisor.
5009	*/
5010	r = RESUME_GUEST;
5011	continue;
5012	}
5013	trace_kvm_hcall_enter(vcpu);
5014	r = kvmppc_pseries_do_hcall(vcpu);
5015	trace_kvm_hcall_exit(vcpu, ret: r);
5016	kvmppc_core_prepare_to_enter(vcpu);
5017	} else if (r == RESUME_PAGE_FAULT) {
5018	accumulate_time(vcpu, &vcpu->arch.pg_fault);
5019	srcu_idx = srcu_read_lock(ssp: &kvm->srcu);
5020	r = kvmppc_book3s_hv_page_fault(vcpu,
5021	vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
5022	srcu_read_unlock(ssp: &kvm->srcu, idx: srcu_idx);
5023	} else if (r == RESUME_PASSTHROUGH) {
5024	if (WARN_ON(xics_on_xive()))
5025	r = H_SUCCESS;
5026	else
5027	r = kvmppc_xics_rm_complete(vcpu, `0`);
5028	}
5029	} while (is_kvmppc_resume_guest(r));
5030	accumulate_time(vcpu, &vcpu->arch.vcpu_exit);
5031
5032	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
5033	atomic_dec(v: &kvm->arch.vcpus_running);
5034
5035	srr_regs_clobbered();
5036
5037	end_timing(vcpu);
5038
5039	return r;
5040	}
5041
5042	static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
5043	int shift, int sllp)
5044	{
5045	(*sps)->page_shift = shift;
5046	(*sps)->slb_enc = sllp;
5047	(*sps)->enc[`0`].page_shift = shift;
5048	(*sps)->enc[`0`].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
5049	/*
5050	* Add 16MB MPSS support (may get filtered out by userspace)
5051	*/
5052	if (shift != `24`) {
5053	int penc = kvmppc_pgsize_lp_encoding(shift, `24`);
5054	if (penc != -`1`) {
5055	(*sps)->enc[`1`].page_shift = `24`;
5056	(*sps)->enc[`1`].pte_enc = penc;
5057	}
5058	}
5059	(*sps)++;
5060	}
5061
5062	static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
5063	struct kvm_ppc_smmu_info *info)
5064	{
5065	struct kvm_ppc_one_seg_page_size *sps;
5066
5067	/*
5068	* POWER7, POWER8 and POWER9 all support 32 storage keys for data.
5069	* POWER7 doesn't support keys for instruction accesses,
5070	* POWER8 and POWER9 do.
5071	*/
5072	info->data_keys = `32`;
5073	info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? `32` : `0`;
5074
5075	/ POWER7, 8 and 9 all have 1T segments and 32-entry SLB /
5076	info->flags = KVM_PPC_PAGE_SIZES_REAL \| KVM_PPC_1T_SEGMENTS;
5077	info->slb_size = `32`;
5078
5079	/ We only support these sizes for now, and no muti-size segments /
5080	sps = &info->sps[`0`];
5081	kvmppc_add_seg_page_size(sps: &sps, shift: `12`, sllp: `0`);
5082	kvmppc_add_seg_page_size(&sps, `16`, SLB_VSID_L \| SLB_VSID_LP_01);
5083	kvmppc_add_seg_page_size(&sps, `24`, SLB_VSID_L);
5084
5085	/ If running as a nested hypervisor, we don't support HPT guests /
5086	if (kvmhv_on_pseries())
5087	info->flags \|= KVM_PPC_NO_HASH;
5088
5089	return `0`;
5090	}
5091
5092	/*
5093	* Get (and clear) the dirty memory log for a memory slot.
5094	*/
5095	static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
5096	struct kvm_dirty_log *log)
5097	{
5098	struct kvm_memslots *slots;
5099	struct kvm_memory_slot *memslot;
5100	int r;
5101	unsigned long n, i;
5102	unsigned long buf, p;
5103	struct kvm_vcpu *vcpu;
5104
5105	mutex_lock(&kvm->slots_lock);
5106
5107	r = -EINVAL;
5108	if (log->slot >= KVM_USER_MEM_SLOTS)
5109	goto out;
5110
5111	slots = kvm_memslots(kvm);
5112	memslot = id_to_memslot(slots, id: log->slot);
5113	r = -ENOENT;
5114	if (!memslot \|\| !memslot->dirty_bitmap)
5115	goto out;
5116
5117	/*
5118	* Use second half of bitmap area because both HPT and radix
5119	* accumulate bits in the first half.
5120	*/
5121	n = kvm_dirty_bitmap_bytes(memslot);
5122	buf = memslot->dirty_bitmap + n / sizeof(long);
5123	memset(buf, `0`, n);
5124
5125	if (kvm_is_radix(kvm))
5126	r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
5127	else
5128	r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
5129	if (r)
5130	goto out;
5131
5132	/*
5133	* We accumulate dirty bits in the first half of the
5134	* memslot's dirty_bitmap area, for when pages are paged
5135	* out or modified by the host directly. Pick up these
5136	* bits and add them to the map.
5137	*/
5138	p = memslot->dirty_bitmap;
5139	for (i = `0`; i < n / sizeof(long); ++i)
5140	buf[i] \|= xchg(&p[i], `0`);
5141
5142	/ Harvest dirty bits from VPA and DTL updates /
5143	/ Note: we never modify the SLB shadow buffer areas /
5144	kvm_for_each_vcpu(i, vcpu, kvm) {
5145	spin_lock(lock: &vcpu->arch.vpa_update_lock);
5146	kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
5147	kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
5148	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
5149	}
5150
5151	r = -EFAULT;
5152	if (copy_to_user(to: log->dirty_bitmap, from: buf, n))
5153	goto out;
5154
5155	r = `0`;
5156	out:
5157	mutex_unlock(lock: &kvm->slots_lock);
5158	return r;
5159	}
5160
5161	static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
5162	{
5163	vfree(addr: slot->arch.rmap);
5164	slot->arch.rmap = NULL;
5165	}
5166
5167	static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
5168	const struct kvm_memory_slot *old,
5169	struct kvm_memory_slot *new,
5170	enum kvm_mr_change change)
5171	{
5172	if (change == KVM_MR_CREATE) {
5173	unsigned long size = array_size(new->npages, sizeof(*new->arch.rmap));
5174
5175	if ((size >> PAGE_SHIFT) > totalram_pages())
5176	return -ENOMEM;
5177
5178	new->arch.rmap = vzalloc(size);
5179	if (!new->arch.rmap)
5180	return -ENOMEM;
5181	} else if (change != KVM_MR_DELETE) {
5182	new->arch.rmap = old->arch.rmap;
5183	}
5184
5185	return `0`;
5186	}
5187
5188	static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
5189	struct kvm_memory_slot *old,
5190	const struct kvm_memory_slot *new,
5191	enum kvm_mr_change change)
5192	{
5193	/*
5194	* If we are creating or modifying a memslot, it might make
5195	* some address that was previously cached as emulated
5196	* MMIO be no longer emulated MMIO, so invalidate
5197	* all the caches of emulated MMIO translations.
5198	*/
5199	if (change != KVM_MR_DELETE)
5200	atomic64_inc(v: &kvm->arch.mmio_update);
5201
5202	/*
5203	* For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
5204	* have already called kvm_arch_flush_shadow_memslot() to
5205	* flush shadow mappings. For KVM_MR_CREATE we have no
5206	* previous mappings. So the only case to handle is
5207	* KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
5208	* has been changed.
5209	* For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
5210	* to get rid of any THP PTEs in the partition-scoped page tables
5211	* so we can track dirtiness at the page level; we flush when
5212	* clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
5213	* using THP PTEs.
5214	*/
5215	if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
5216	((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
5217	kvmppc_radix_flush_memslot(kvm, old);
5218	/*
5219	* If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
5220	*/
5221	if (!kvm->arch.secure_guest)
5222	return;
5223
5224	switch (change) {
5225	case KVM_MR_CREATE:
5226	/*
5227	* @TODO kvmppc_uvmem_memslot_create() can fail and
5228	* return error. Fix this.
5229	*/
5230	kvmppc_uvmem_memslot_create(kvm, new);
5231	break;
5232	case KVM_MR_DELETE:
5233	kvmppc_uvmem_memslot_delete(kvm, old);
5234	break;
5235	default:
5236	/ TODO: Handle KVM_MR_MOVE /
5237	break;
5238	}
5239	}
5240
5241	/*
5242	* Update LPCR values in kvm->arch and in vcores.
5243	* Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
5244	* of kvm->arch.lpcr update).
5245	*/
5246	void kvmppc_update_lpcr(struct kvm kvm, unsigned* long lpcr, unsigned long mask)
5247	{
5248	long int i;
5249	u32 cores_done = `0`;
5250
5251	if ((kvm->arch.lpcr & mask) == lpcr)
5252	return;
5253
5254	kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) \| lpcr;
5255
5256	for (i = `0`; i < KVM_MAX_VCORES; ++i) {
5257	struct kvmppc_vcore *vc = kvm->arch.vcores[i];
5258	if (!vc)
5259	continue;
5260
5261	spin_lock(&vc->lock);
5262	vc->lpcr = (vc->lpcr & ~mask) \| lpcr;
5263	verify_lpcr(kvm, vc->lpcr);
5264	spin_unlock(&vc->lock);
5265	if (++cores_done >= kvm->arch.online_vcores)
5266	break;
5267	}
5268
5269	if (kvmhv_is_nestedv2()) {
5270	struct kvm_vcpu *vcpu;
5271
5272	kvm_for_each_vcpu(i, vcpu, kvm) {
5273	kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LPCR);
5274	}
5275	}
5276	}
5277
5278	void kvmppc_setup_partition_table(struct kvm *kvm)
5279	{
5280	unsigned long dw0, dw1;
5281
5282	if (!kvm_is_radix(kvm)) {
5283	/ PS field - page size for VRMA /
5284	dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> `1`) \|
5285	((kvm->arch.vrma_slb_v & SLB_VSID_LP) << `1`);
5286	/ HTABSIZE and HTABORG fields /
5287	dw0 \|= kvm->arch.sdr1;
5288
5289	/ Second dword as set by userspace /
5290	dw1 = kvm->arch.process_table;
5291	} else {
5292	dw0 = PATB_HR \| radix__get_tree_size() \|
5293	__pa(kvm->arch.pgtable) \| RADIX_PGD_INDEX_SIZE;
5294	dw1 = PATB_GR \| kvm->arch.process_table;
5295	}
5296	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
5297	}
5298
5299	/*
5300	* Set up HPT (hashed page table) and RMA (real-mode area).
5301	* Must be called with kvm->arch.mmu_setup_lock held.
5302	*/
5303	static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
5304	{
5305	int err = `0`;
5306	struct kvm *kvm = vcpu->kvm;
5307	unsigned long hva;
5308	struct kvm_memory_slot *memslot;
5309	struct vm_area_struct *vma;
5310	unsigned long lpcr = `0`, senc;
5311	unsigned long psize, porder;
5312	int srcu_idx;
5313
5314	/ Allocate hashed page table (if not done already) and reset it /
5315	if (!kvm->arch.hpt.virt) {
5316	int order = KVM_DEFAULT_HPT_ORDER;
5317	struct kvm_hpt_info info;
5318
5319	err = kvmppc_allocate_hpt(&info, order);
5320	/ If we get here, it means userspace didn't specify a*
5321	* size explicitly. So, try successively smaller
5322	* sizes if the default failed. */
5323	while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
5324	err = kvmppc_allocate_hpt(&info, order);
5325
5326	if (err < `0`) {
5327	pr_err("KVM: Couldn't alloc HPT\n");
5328	goto out;
5329	}
5330
5331	kvmppc_set_hpt(kvm, &info);
5332	}
5333
5334	/ Look up the memslot for guest physical address 0 /
5335	srcu_idx = srcu_read_lock(ssp: &kvm->srcu);
5336	memslot = gfn_to_memslot(kvm, gfn: `0`);
5337
5338	/ We must have some memory at 0 by now /
5339	err = -EINVAL;
5340	if (!memslot \|\| (memslot->flags & KVM_MEMSLOT_INVALID))
5341	goto out_srcu;
5342
5343	/ Look up the VMA for the start of this memory slot /
5344	hva = memslot->userspace_addr;
5345	mmap_read_lock(mm: kvm->mm);
5346	vma = vma_lookup(mm: kvm->mm, addr: hva);
5347	if (!vma \|\| (vma->vm_flags & VM_IO))
5348	goto up_out;
5349
5350	psize = vma_kernel_pagesize(vma);
5351
5352	mmap_read_unlock(mm: kvm->mm);
5353
5354	/ We can handle 4k, 64k or 16M pages in the VRMA /
5355	if (psize >= `0x1000000`)
5356	psize = `0x1000000`;
5357	else if (psize >= `0x10000`)
5358	psize = `0x10000`;
5359	else
5360	psize = `0x1000`;
5361	porder = __ilog2(psize);
5362
5363	senc = slb_pgsize_encoding(psize);
5364	kvm->arch.vrma_slb_v = senc \| SLB_VSID_B_1T \|
5365	(VRMA_VSID << SLB_VSID_SHIFT_1T);
5366	/ Create HPTEs in the hash page table for the VRMA /
5367	kvmppc_map_vrma(vcpu, memslot, porder);
5368
5369	/ Update VRMASD field in the LPCR /
5370	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
5371	/ the -4 is to account for senc values starting at 0x10 /
5372	lpcr = senc << (LPCR_VRMASD_SH - `4`);
5373	kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
5374	}
5375
5376	/ Order updates to kvm->arch.lpcr etc. vs. mmu_ready /
5377	smp_wmb();
5378	err = `0`;
5379	out_srcu:
5380	srcu_read_unlock(ssp: &kvm->srcu, idx: srcu_idx);
5381	out:
5382	return err;
5383
5384	up_out:
5385	mmap_read_unlock(mm: kvm->mm);
5386	goto out_srcu;
5387	}
5388
5389	/*
5390	* Must be called with kvm->arch.mmu_setup_lock held and
5391	* mmu_ready = 0 and no vcpus running.
5392	*/
5393	int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
5394	{
5395	unsigned long lpcr, lpcr_mask;
5396
5397	if (nesting_enabled(kvm))
5398	kvmhv_release_all_nested(kvm);
5399	kvmppc_rmap_reset(kvm);
5400	kvm->arch.process_table = `0`;
5401	/ Mutual exclusion with kvm_unmap_gfn_range etc. /
5402	spin_lock(lock: &kvm->mmu_lock);
5403	kvm->arch.radix = `0`;
5404	spin_unlock(lock: &kvm->mmu_lock);
5405	kvmppc_free_radix(kvm);
5406
5407	lpcr = LPCR_VPM1;
5408	lpcr_mask = LPCR_VPM1 \| LPCR_UPRT \| LPCR_GTSE \| LPCR_HR;
5409	if (cpu_has_feature(CPU_FTR_ARCH_31))
5410	lpcr_mask \|= LPCR_HAIL;
5411	kvmppc_update_lpcr(kvm, lpcr, mask: lpcr_mask);
5412
5413	return `0`;
5414	}
5415
5416	/*
5417	* Must be called with kvm->arch.mmu_setup_lock held and
5418	* mmu_ready = 0 and no vcpus running.
5419	*/
5420	int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
5421	{
5422	unsigned long lpcr, lpcr_mask;
5423	int err;
5424
5425	err = kvmppc_init_vm_radix(kvm);
5426	if (err)
5427	return err;
5428	kvmppc_rmap_reset(kvm);
5429	/ Mutual exclusion with kvm_unmap_gfn_range etc. /
5430	spin_lock(lock: &kvm->mmu_lock);
5431	kvm->arch.radix = `1`;
5432	spin_unlock(lock: &kvm->mmu_lock);
5433	kvmppc_free_hpt(&kvm->arch.hpt);
5434
5435	lpcr = LPCR_UPRT \| LPCR_GTSE \| LPCR_HR;
5436	lpcr_mask = LPCR_VPM1 \| LPCR_UPRT \| LPCR_GTSE \| LPCR_HR;
5437	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
5438	lpcr_mask \|= LPCR_HAIL;
5439	if (cpu_has_feature(CPU_FTR_HVMODE) &&
5440	(kvm->arch.host_lpcr & LPCR_HAIL))
5441	lpcr \|= LPCR_HAIL;
5442	}
5443	kvmppc_update_lpcr(kvm, lpcr, mask: lpcr_mask);
5444
5445	return `0`;
5446	}
5447
5448	#ifdef CONFIG_KVM_XICS
5449	/*
5450	* Allocate a per-core structure for managing state about which cores are
5451	* running in the host versus the guest and for exchanging data between
5452	* real mode KVM and CPU running in the host.
5453	* This is only done for the first VM.
5454	* The allocated structure stays even if all VMs have stopped.
5455	* It is only freed when the kvm-hv module is unloaded.
5456	* It's OK for this routine to fail, we just don't support host
5457	* core operations like redirecting H_IPI wakeups.
5458	*/
5459	void kvmppc_alloc_host_rm_ops(void)
5460	{
5461	struct kvmppc_host_rm_ops *ops;
5462	unsigned long l_ops;
5463	int cpu, core;
5464	int size;
5465
5466	if (cpu_has_feature(CPU_FTR_ARCH_300))
5467	return;
5468
5469	/ Not the first time here ? /
5470	if (kvmppc_host_rm_ops_hv != NULL)
5471	return;
5472
5473	ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
5474	if (!ops)
5475	return;
5476
5477	size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
5478	ops->rm_core = kzalloc(size, GFP_KERNEL);
5479
5480	if (!ops->rm_core) {
5481	kfree(ops);
5482	return;
5483	}
5484
5485	cpus_read_lock();
5486
5487	for (cpu = `0`; cpu < nr_cpu_ids; cpu += threads_per_core) {
5488	if (!cpu_online(cpu))
5489	continue;
5490
5491	core = cpu >> threads_shift;
5492	ops->rm_core[core].rm_state.in_host = `1`;
5493	}
5494
5495	ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
5496
5497	/*
5498	* Make the contents of the kvmppc_host_rm_ops structure visible
5499	* to other CPUs before we assign it to the global variable.
5500	* Do an atomic assignment (no locks used here), but if someone
5501	* beats us to it, just free our copy and return.
5502	*/
5503	smp_wmb();
5504	l_ops = (unsigned long) ops;
5505
5506	if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, `0`, l_ops)) {
5507	cpus_read_unlock();
5508	kfree(ops->rm_core);
5509	kfree(ops);
5510	return;
5511	}
5512
5513	cpuhp_setup_state_nocalls_cpuslocked(CPUHP_KVM_PPC_BOOK3S_PREPARE,
5514	"ppc/kvm_book3s:prepare",
5515	kvmppc_set_host_core,
5516	kvmppc_clear_host_core);
5517	cpus_read_unlock();
5518	}
5519
5520	void kvmppc_free_host_rm_ops(void)
5521	{
5522	if (kvmppc_host_rm_ops_hv) {
5523	cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE);
5524	kfree(kvmppc_host_rm_ops_hv->rm_core);
5525	kfree(kvmppc_host_rm_ops_hv);
5526	kvmppc_host_rm_ops_hv = NULL;
5527	}
5528	}
5529	#endif
5530
5531	static int kvmppc_core_init_vm_hv(struct kvm *kvm)
5532	{
5533	unsigned long lpcr, lpid;
5534	int ret;
5535
5536	mutex_init(&kvm->arch.uvmem_lock);
5537	INIT_LIST_HEAD(list: &kvm->arch.uvmem_pfns);
5538	mutex_init(&kvm->arch.mmu_setup_lock);
5539
5540	/ Allocate the guest's logical partition ID /
5541
5542	if (!kvmhv_is_nestedv2()) {
5543	lpid = kvmppc_alloc_lpid();
5544	if ((long)lpid < `0`)
5545	return -ENOMEM;
5546	kvm->arch.lpid = lpid;
5547	}
5548
5549	kvmppc_alloc_host_rm_ops();
5550
5551	kvmhv_vm_nested_init(kvm);
5552
5553	if (kvmhv_is_nestedv2()) {
5554	long rc;
5555	unsigned long guest_id;
5556
5557	rc = plpar_guest_create(`0`, &guest_id);
5558
5559	if (rc != H_SUCCESS)
5560	pr_err("KVM: Create Guest hcall failed, rc=%ld\n", rc);
5561
5562	switch (rc) {
5563	case H_PARAMETER:
5564	case H_FUNCTION:
5565	case H_STATE:
5566	return -EINVAL;
5567	case H_NOT_ENOUGH_RESOURCES:
5568	case H_ABORTED:
5569	return -ENOMEM;
5570	case H_AUTHORITY:
5571	return -EPERM;
5572	case H_NOT_AVAILABLE:
5573	return -EBUSY;
5574	}
5575	kvm->arch.lpid = guest_id;
5576	}
5577
5578
5579	/*
5580	* Since we don't flush the TLB when tearing down a VM,
5581	* and this lpid might have previously been used,
5582	* make sure we flush on each core before running the new VM.
5583	* On POWER9, the tlbie in mmu_partition_table_set_entry()
5584	* does this flush for us.
5585	*/
5586	if (!cpu_has_feature(CPU_FTR_ARCH_300))
5587	cpumask_setall(dstp: &kvm->arch.need_tlb_flush);
5588
5589	/ Start out with the default set of hcalls enabled /
5590	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
5591	sizeof(kvm->arch.enabled_hcalls));
5592
5593	if (!cpu_has_feature(CPU_FTR_ARCH_300))
5594	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
5595
5596	/ Init LPCR for virtual RMA mode /
5597	if (cpu_has_feature(CPU_FTR_HVMODE)) {
5598	kvm->arch.host_lpid = mfspr(SPRN_LPID);
5599	kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
5600	lpcr &= LPCR_PECE \| LPCR_LPES;
5601	} else {
5602	/*
5603	* The L2 LPES mode will be set by the L0 according to whether
5604	* or not it needs to take external interrupts in HV mode.
5605	*/
5606	lpcr = `0`;
5607	}
5608	lpcr \|= (`4UL` << LPCR_DPFD_SH) \| LPCR_HDICE \|
5609	LPCR_VPM0 \| LPCR_VPM1;
5610	kvm->arch.vrma_slb_v = SLB_VSID_B_1T \|
5611	(VRMA_VSID << SLB_VSID_SHIFT_1T);
5612	/ On POWER8 turn on online bit to enable PURR/SPURR /
5613	if (cpu_has_feature(CPU_FTR_ARCH_207S))
5614	lpcr \|= LPCR_ONL;
5615	/*
5616	* On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
5617	* Set HVICE bit to enable hypervisor virtualization interrupts.
5618	* Set HEIC to prevent OS interrupts to go to hypervisor (should
5619	* be unnecessary but better safe than sorry in case we re-enable
5620	* EE in HV mode with this LPCR still set)
5621	*/
5622	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
5623	lpcr &= ~LPCR_VPM0;
5624	lpcr \|= LPCR_HVICE \| LPCR_HEIC;
5625
5626	/*
5627	* If xive is enabled, we route 0x500 interrupts directly
5628	* to the guest.
5629	*/
5630	if (xics_on_xive())
5631	lpcr \|= LPCR_LPES;
5632	}
5633
5634	/*
5635	* If the host uses radix, the guest starts out as radix.
5636	*/
5637	if (radix_enabled()) {
5638	kvm->arch.radix = `1`;
5639	kvm->arch.mmu_ready = `1`;
5640	lpcr &= ~LPCR_VPM1;
5641	lpcr \|= LPCR_UPRT \| LPCR_GTSE \| LPCR_HR;
5642	if (cpu_has_feature(CPU_FTR_HVMODE) &&
5643	cpu_has_feature(CPU_FTR_ARCH_31) &&
5644	(kvm->arch.host_lpcr & LPCR_HAIL))
5645	lpcr \|= LPCR_HAIL;
5646	ret = kvmppc_init_vm_radix(kvm);
5647	if (ret) {
5648	if (kvmhv_is_nestedv2())
5649	plpar_guest_delete(`0`, kvm->arch.lpid);
5650	else
5651	kvmppc_free_lpid(kvm->arch.lpid);
5652	return ret;
5653	}
5654	kvmppc_setup_partition_table(kvm);
5655	}
5656
5657	verify_lpcr(kvm, lpcr);
5658	kvm->arch.lpcr = lpcr;
5659
5660	/ Initialization for future HPT resizes /
5661	kvm->arch.resize_hpt = NULL;
5662
5663	/*
5664	* Work out how many sets the TLB has, for the use of
5665	* the TLB invalidation loop in book3s_hv_rmhandlers.S.
5666	*/
5667	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
5668	/*
5669	* P10 will flush all the congruence class with a single tlbiel
5670	*/
5671	kvm->arch.tlb_sets = `1`;
5672	} else if (radix_enabled())
5673	kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; / 128 /
5674	else if (cpu_has_feature(CPU_FTR_ARCH_300))
5675	kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; / 256 /
5676	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
5677	kvm->arch.tlb_sets = POWER8_TLB_SETS; / 512 /
5678	else
5679	kvm->arch.tlb_sets = POWER7_TLB_SETS; / 128 /
5680
5681	/*
5682	* Track that we now have a HV mode VM active. This blocks secondary
5683	* CPU threads from coming online.
5684	*/
5685	if (!cpu_has_feature(CPU_FTR_ARCH_300))
5686	kvm_hv_vm_activated();
5687
5688	/*
5689	* Initialize smt_mode depending on processor.
5690	* POWER8 and earlier have to use "strict" threading, where
5691	* all vCPUs in a vcore have to run on the same (sub)core,
5692	* whereas on POWER9 the threads can each run a different
5693	* guest.
5694	*/
5695	if (!cpu_has_feature(CPU_FTR_ARCH_300))
5696	kvm->arch.smt_mode = threads_per_subcore;
5697	else
5698	kvm->arch.smt_mode = `1`;
5699	kvm->arch.emul_smt_mode = `1`;
5700
5701	return `0`;
5702	}
5703
/*
 * Create the per-VM debugfs entries: the MMU ones always, and the radix
 * ones as well when the host runs in radix mode.  Always returns 0.
 */
static int kvmppc_arch_create_vm_debugfs_hv(struct kvm *kvm)
{
	kvmppc_mmu_debugfs_init(kvm);
	if (radix_enabled())
		kvmhv_radix_debugfs_init(kvm);
	return 0;
}
5711
5712	static void kvmppc_free_vcores(struct kvm *kvm)
5713	{
5714	long int i;
5715
5716	for (i = `0`; i < KVM_MAX_VCORES; ++i)
5717	kfree(kvm->arch.vcores[i]);
5718	kvm->arch.online_vcores = `0`;
5719	}
5720
5721	static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
5722	{
5723	if (!cpu_has_feature(CPU_FTR_ARCH_300))
5724	kvm_hv_vm_deactivated();
5725
5726	kvmppc_free_vcores(kvm);
5727
5728
5729	if (kvm_is_radix(kvm))
5730	kvmppc_free_radix(kvm);
5731	else
5732	kvmppc_free_hpt(&kvm->arch.hpt);
5733
5734	/ Perform global invalidation and return lpid to the pool /
5735	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
5736	if (nesting_enabled(kvm))
5737	kvmhv_release_all_nested(kvm);
5738	kvm->arch.process_table = `0`;
5739	if (kvm->arch.secure_guest)
5740	uv_svm_terminate(kvm->arch.lpid);
5741	if (!kvmhv_is_nestedv2())
5742	kvmhv_set_ptbl_entry(kvm->arch.lpid, `0`, `0`);
5743	}
5744
5745	if (kvmhv_is_nestedv2()) {
5746	kvmhv_flush_lpid(kvm->arch.lpid);
5747	plpar_guest_delete(`0`, kvm->arch.lpid);
5748	} else {
5749	kvmppc_free_lpid(kvm->arch.lpid);
5750	}
5751
5752	kvmppc_free_pimap(kvm);
5753	}
5754
5755	/ We don't need to emulate any privileged instructions or dcbz /
5756	static int kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu,
5757	unsigned int inst, int *advance)
5758	{
5759	return EMULATE_FAIL;
5760	}
5761
5762	static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu vcpu, int* sprn,
5763	ulong spr_val)
5764	{
5765	return EMULATE_FAIL;
5766	}
5767
5768	static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu vcpu, int* sprn,
5769	ulong *spr_val)
5770	{
5771	return EMULATE_FAIL;
5772	}
5773
5774	static int kvmppc_core_check_processor_compat_hv(void)
5775	{
5776	if (cpu_has_feature(CPU_FTR_HVMODE) &&
5777	cpu_has_feature(CPU_FTR_ARCH_206))
5778	return `0`;
5779
5780	/ POWER9 in radix mode is capable of being a nested hypervisor. /
5781	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
5782	return `0`;
5783
5784	return -EIO;
5785	}
5786
5787	#ifdef CONFIG_KVM_XICS
5788
5789	void kvmppc_free_pimap(struct kvm *kvm)
5790	{
5791	kfree(kvm->arch.pimap);
5792	}
5793
5794	static struct kvmppc_passthru_irqmap kvmppc_alloc_pimap(void*)
5795	{
5796	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
5797	}
5798
5799	static int kvmppc_set_passthru_irq(struct kvm kvm, int* host_irq, int guest_gsi)
5800	{
5801	struct irq_desc *desc;
5802	struct kvmppc_irq_map *irq_map;
5803	struct kvmppc_passthru_irqmap *pimap;
5804	struct irq_chip *chip;
5805	int i, rc = `0`;
5806	struct irq_data *host_data;
5807
5808	if (!kvm_irq_bypass)
5809	return `1`;
5810
5811	desc = irq_to_desc(host_irq);
5812	if (!desc)
5813	return -EIO;
5814
5815	mutex_lock(&kvm->lock);
5816
5817	pimap = kvm->arch.pimap;
5818	if (pimap == NULL) {
5819	/ First call, allocate structure to hold IRQ map /
5820	pimap = kvmppc_alloc_pimap();
5821	if (pimap == NULL) {
5822	mutex_unlock(&kvm->lock);
5823	return -ENOMEM;
5824	}
5825	kvm->arch.pimap = pimap;
5826	}
5827
5828	/*
5829	* For now, we only support interrupts for which the EOI operation
5830	* is an OPAL call followed by a write to XIRR, since that's
5831	* what our real-mode EOI code does, or a XIVE interrupt
5832	*/
5833	chip = irq_data_get_irq_chip(&desc->irq_data);
5834	if (!chip \|\| !is_pnv_opal_msi(chip)) {
5835	pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
5836	host_irq, guest_gsi);
5837	mutex_unlock(&kvm->lock);
5838	return -ENOENT;
5839	}
5840
5841	/*
5842	* See if we already have an entry for this guest IRQ number.
5843	* If it's mapped to a hardware IRQ number, that's an error,
5844	* otherwise re-use this entry.
5845	*/
5846	for (i = `0`; i < pimap->n_mapped; i++) {
5847	if (guest_gsi == pimap->mapped[i].v_hwirq) {
5848	if (pimap->mapped[i].r_hwirq) {
5849	mutex_unlock(&kvm->lock);
5850	return -EINVAL;
5851	}
5852	break;
5853	}
5854	}
5855
5856	if (i == KVMPPC_PIRQ_MAPPED) {
5857	mutex_unlock(&kvm->lock);
5858	return -EAGAIN; / table is full /
5859	}
5860
5861	irq_map = &pimap->mapped[i];
5862
5863	irq_map->v_hwirq = guest_gsi;
5864	irq_map->desc = desc;
5865
5866	/*
5867	* Order the above two stores before the next to serialize with
5868	* the KVM real mode handler.
5869	*/
5870	smp_wmb();
5871
5872	/*
5873	* The 'host_irq' number is mapped in the PCI-MSI domain but
5874	* the underlying calls, which will EOI the interrupt in real
5875	* mode, need an HW IRQ number mapped in the XICS IRQ domain.
5876	*/
5877	host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq);
5878	irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);
5879
5880	if (i == pimap->n_mapped)
5881	pimap->n_mapped++;
5882
5883	if (xics_on_xive())
5884	rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
5885	else
5886	kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
5887	if (rc)
5888	irq_map->r_hwirq = `0`;
5889
5890	mutex_unlock(&kvm->lock);
5891
5892	return `0`;
5893	}
5894
5895	static int kvmppc_clr_passthru_irq(struct kvm kvm, int* host_irq, int guest_gsi)
5896	{
5897	struct irq_desc *desc;
5898	struct kvmppc_passthru_irqmap *pimap;
5899	int i, rc = `0`;
5900
5901	if (!kvm_irq_bypass)
5902	return `0`;
5903
5904	desc = irq_to_desc(host_irq);
5905	if (!desc)
5906	return -EIO;
5907
5908	mutex_lock(&kvm->lock);
5909	if (!kvm->arch.pimap)
5910	goto unlock;
5911
5912	pimap = kvm->arch.pimap;
5913
5914	for (i = `0`; i < pimap->n_mapped; i++) {
5915	if (guest_gsi == pimap->mapped[i].v_hwirq)
5916	break;
5917	}
5918
5919	if (i == pimap->n_mapped) {
5920	mutex_unlock(&kvm->lock);
5921	return -ENODEV;
5922	}
5923
5924	if (xics_on_xive())
5925	rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
5926	else
5927	kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
5928
5929	/ invalidate the entry (what to do on error from the above ?) /
5930	pimap->mapped[i].r_hwirq = `0`;
5931
5932	/*
5933	* We don't free this structure even when the count goes to
5934	* zero. The structure is freed when we destroy the VM.
5935	*/
5936	unlock:
5937	mutex_unlock(&kvm->lock);
5938	return rc;
5939	}
5940
5941	static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
5942	struct irq_bypass_producer *prod)
5943	{
5944	int ret = `0`;
5945	struct kvm_kernel_irqfd *irqfd =
5946	container_of(cons, struct kvm_kernel_irqfd, consumer);
5947
5948	irqfd->producer = prod;
5949
5950	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
5951	if (ret)
5952	pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
5953	prod->irq, irqfd->gsi, ret);
5954
5955	return ret;
5956	}
5957
5958	static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
5959	struct irq_bypass_producer *prod)
5960	{
5961	int ret;
5962	struct kvm_kernel_irqfd *irqfd =
5963	container_of(cons, struct kvm_kernel_irqfd, consumer);
5964
5965	irqfd->producer = NULL;
5966
5967	/*
5968	* When producer of consumer is unregistered, we change back to
5969	* default external interrupt handling mode - KVM real mode
5970	* will switch back to host.
5971	*/
5972	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
5973	if (ret)
5974	pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
5975	prod->irq, irqfd->gsi, ret);
5976	}
5977	#endif
5978
5979	static int kvm_arch_vm_ioctl_hv(struct file *filp,
5980	unsigned int ioctl, unsigned long arg)
5981	{
5982	struct kvm *kvm __maybe_unused = filp->private_data;
5983	void __user argp = (void* __user *)arg;
5984	int r;
5985
5986	switch (ioctl) {
5987
5988	case KVM_PPC_ALLOCATE_HTAB: {
5989	u32 htab_order;
5990
5991	/ If we're a nested hypervisor, we currently only support radix /
5992	if (kvmhv_on_pseries()) {
5993	r = -EOPNOTSUPP;
5994	break;
5995	}
5996
5997	r = -EFAULT;
5998	if (get_user(htab_order, (u32 __user *)argp))
5999	break;
6000	r = kvmppc_alloc_reset_hpt(kvm, htab_order);
6001	if (r)
6002	break;
6003	r = `0`;
6004	break;
6005	}
6006
6007	case KVM_PPC_GET_HTAB_FD: {
6008	struct kvm_get_htab_fd ghf;
6009
6010	r = -EFAULT;
6011	if (copy_from_user(to: &ghf, from: argp, n: sizeof(ghf)))
6012	break;
6013	r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
6014	break;
6015	}
6016
6017	case KVM_PPC_RESIZE_HPT_PREPARE: {
6018	struct kvm_ppc_resize_hpt rhpt;
6019
6020	r = -EFAULT;
6021	if (copy_from_user(to: &rhpt, from: argp, n: sizeof(rhpt)))
6022	break;
6023
6024	r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
6025	break;
6026	}
6027
6028	case KVM_PPC_RESIZE_HPT_COMMIT: {
6029	struct kvm_ppc_resize_hpt rhpt;
6030
6031	r = -EFAULT;
6032	if (copy_from_user(to: &rhpt, from: argp, n: sizeof(rhpt)))
6033	break;
6034
6035	r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
6036	break;
6037	}
6038
6039	default:
6040	r = -ENOTTY;
6041	}
6042
6043	return r;
6044	}
6045
6046	/*
6047	* List of hcall numbers to enable by default.
6048	* For compatibility with old userspace, we enable by default
6049	* all hcalls that were implemented before the hcall-enabling
6050	* facility was added. Note this list should not include H_RTAS.
6051	*/
6052	static unsigned int default_hcall_list[] = {
6053	H_REMOVE,
6054	H_ENTER,
6055	H_READ,
6056	H_PROTECT,
6057	H_BULK_REMOVE,
6058	#ifdef CONFIG_SPAPR_TCE_IOMMU
6059	H_GET_TCE,
6060	H_PUT_TCE,
6061	#endif
6062	H_SET_DABR,
6063	H_SET_XDABR,
6064	H_CEDE,
6065	H_PROD,
6066	H_CONFER,
6067	H_REGISTER_VPA,
6068	#ifdef CONFIG_KVM_XICS
6069	H_EOI,
6070	H_CPPR,
6071	H_IPI,
6072	H_IPOLL,
6073	H_XIRR,
6074	H_XIRR_X,
6075	#endif
6076	`0`
6077	};
6078
6079	static void init_default_hcalls(void)
6080	{
6081	int i;
6082	unsigned int hcall;
6083
6084	for (i = `0`; default_hcall_list[i]; ++i) {
6085	hcall = default_hcall_list[i];
6086	WARN_ON(!kvmppc_hcall_impl_hv(hcall));
6087	__set_bit(hcall / `4`, default_enabled_hcalls);
6088	}
6089	}
6090
6091	static int kvmhv_configure_mmu(struct kvm kvm, struct* kvm_ppc_mmuv3_cfg *cfg)
6092	{
6093	unsigned long lpcr;
6094	int radix;
6095	int err;
6096
6097	/ If not on a POWER9, reject it /
6098	if (!cpu_has_feature(CPU_FTR_ARCH_300))
6099	return -ENODEV;
6100
6101	/ If any unknown flags set, reject it /
6102	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX \| KVM_PPC_MMUV3_GTSE))
6103	return -EINVAL;
6104
6105	/ GR (guest radix) bit in process_table field must match /
6106	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
6107	if (!!(cfg->process_table & PATB_GR) != radix)
6108	return -EINVAL;
6109
6110	/ Process table size field must be reasonable, i.e. <= 24 /
6111	if ((cfg->process_table & PRTS_MASK) > `24`)
6112	return -EINVAL;
6113
6114	/ We can change a guest to/from radix now, if the host is radix /
6115	if (radix && !radix_enabled())
6116	return -EINVAL;
6117
6118	/ If we're a nested hypervisor, we currently only support radix /
6119	if (kvmhv_on_pseries() && !radix)
6120	return -EINVAL;
6121
6122	mutex_lock(&kvm->arch.mmu_setup_lock);
6123	if (radix != kvm_is_radix(kvm)) {
6124	if (kvm->arch.mmu_ready) {
6125	kvm->arch.mmu_ready = `0`;
6126	/ order mmu_ready vs. vcpus_running /
6127	smp_mb();
6128	if (atomic_read(v: &kvm->arch.vcpus_running)) {
6129	kvm->arch.mmu_ready = `1`;
6130	err = -EBUSY;
6131	goto out_unlock;
6132	}
6133	}
6134	if (radix)
6135	err = kvmppc_switch_mmu_to_radix(kvm);
6136	else
6137	err = kvmppc_switch_mmu_to_hpt(kvm);
6138	if (err)
6139	goto out_unlock;
6140	}
6141
6142	kvm->arch.process_table = cfg->process_table;
6143	kvmppc_setup_partition_table(kvm);
6144
6145	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : `0`;
6146	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
6147	err = `0`;
6148
6149	out_unlock:
6150	mutex_unlock(lock: &kvm->arch.mmu_setup_lock);
6151	return err;
6152	}
6153
6154	static int kvmhv_enable_nested(struct kvm *kvm)
6155	{
6156	if (!nested)
6157	return -EPERM;
6158	if (!cpu_has_feature(CPU_FTR_ARCH_300))
6159	return -ENODEV;
6160	if (!radix_enabled())
6161	return -ENODEV;
6162	if (kvmhv_is_nestedv2())
6163	return -ENODEV;
6164
6165	/ kvm == NULL means the caller is testing if the capability exists /
6166	if (kvm)
6167	kvm->arch.nested_enable = true;
6168	return `0`;
6169	}
6170
6171	static int kvmhv_load_from_eaddr(struct kvm_vcpu vcpu, ulong eaddr, void *ptr,
6172	int size)
6173	{
6174	int rc = -EINVAL;
6175
6176	if (kvmhv_vcpu_is_radix(vcpu)) {
6177	rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
6178
6179	if (rc > `0`)
6180	rc = -EINVAL;
6181	}
6182
6183	/ For now quadrants are the only way to access nested guest memory /
6184	if (rc && vcpu->arch.nested)
6185	rc = -EAGAIN;
6186
6187	return rc;
6188	}
6189
6190	static int kvmhv_store_to_eaddr(struct kvm_vcpu vcpu, ulong eaddr, void *ptr,
6191	int size)
6192	{
6193	int rc = -EINVAL;
6194
6195	if (kvmhv_vcpu_is_radix(vcpu)) {
6196	rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
6197
6198	if (rc > `0`)
6199	rc = -EINVAL;
6200	}
6201
6202	/ For now quadrants are the only way to access nested guest memory /
6203	if (rc && vcpu->arch.nested)
6204	rc = -EAGAIN;
6205
6206	return rc;
6207	}
6208
6209	static void unpin_vpa_reset(struct kvm kvm, struct* kvmppc_vpa *vpa)
6210	{
6211	unpin_vpa(kvm, vpa);
6212	vpa->gpa = `0`;
6213	vpa->pinned_addr = NULL;
6214	vpa->dirty = false;
6215	vpa->update_pending = `0`;
6216	}
6217
6218	/*
6219	* Enable a guest to become a secure VM, or test whether
6220	* that could be enabled.
6221	* Called when the KVM_CAP_PPC_SECURE_GUEST capability is
6222	* tested (kvm == NULL) or enabled (kvm != NULL).
6223	*/
6224	static int kvmhv_enable_svm(struct kvm *kvm)
6225	{
6226	if (!kvmppc_uvmem_available())
6227	return -EINVAL;
6228	if (kvm)
6229	kvm->arch.svm_enabled = `1`;
6230	return `0`;
6231	}
6232
6233	/*
6234	* IOCTL handler to turn off secure mode of guest
6235	*
6236	* - Release all device pages
6237	* - Issue ucall to terminate the guest on the UV side
6238	* - Unpin the VPA pages.
6239	* - Reinit the partition scoped page tables
6240	*/
6241	static int kvmhv_svm_off(struct kvm *kvm)
6242	{
6243	struct kvm_vcpu *vcpu;
6244	int mmu_was_ready;
6245	int srcu_idx;
6246	int ret = `0`;
6247	unsigned long i;
6248
6249	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
6250	return ret;
6251
6252	mutex_lock(&kvm->arch.mmu_setup_lock);
6253	mmu_was_ready = kvm->arch.mmu_ready;
6254	if (kvm->arch.mmu_ready) {
6255	kvm->arch.mmu_ready = `0`;
6256	/ order mmu_ready vs. vcpus_running /
6257	smp_mb();
6258	if (atomic_read(v: &kvm->arch.vcpus_running)) {
6259	kvm->arch.mmu_ready = `1`;
6260	ret = -EBUSY;
6261	goto out;
6262	}
6263	}
6264
6265	srcu_idx = srcu_read_lock(ssp: &kvm->srcu);
6266	for (i = `0`; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
6267	struct kvm_memory_slot *memslot;
6268	struct kvm_memslots *slots = __kvm_memslots(kvm, as_id: i);
6269	int bkt;
6270
6271	if (!slots)
6272	continue;
6273
6274	kvm_for_each_memslot(memslot, bkt, slots) {
6275	kvmppc_uvmem_drop_pages(memslot, kvm, true);
6276	uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
6277	}
6278	}
6279	srcu_read_unlock(ssp: &kvm->srcu, idx: srcu_idx);
6280
6281	ret = uv_svm_terminate(kvm->arch.lpid);
6282	if (ret != U_SUCCESS) {
6283	ret = -EINVAL;
6284	goto out;
6285	}
6286
6287	/*
6288	* When secure guest is reset, all the guest pages are sent
6289	* to UV via UV_PAGE_IN before the non-boot vcpus get a
6290	* chance to run and unpin their VPA pages. Unpinning of all
6291	* VPA pages is done here explicitly so that VPA pages
6292	* can be migrated to the secure side.
6293	*
6294	* This is required to for the secure SMP guest to reboot
6295	* correctly.
6296	*/
6297	kvm_for_each_vcpu(i, vcpu, kvm) {
6298	spin_lock(lock: &vcpu->arch.vpa_update_lock);
6299	unpin_vpa_reset(kvm, vpa: &vcpu->arch.dtl);
6300	unpin_vpa_reset(kvm, vpa: &vcpu->arch.slb_shadow);
6301	unpin_vpa_reset(kvm, vpa: &vcpu->arch.vpa);
6302	spin_unlock(lock: &vcpu->arch.vpa_update_lock);
6303	}
6304
6305	kvmppc_setup_partition_table(kvm);
6306	kvm->arch.secure_guest = `0`;
6307	kvm->arch.mmu_ready = mmu_was_ready;
6308	out:
6309	mutex_unlock(lock: &kvm->arch.mmu_setup_lock);
6310	return ret;
6311	}
6312
6313	static int kvmhv_enable_dawr1(struct kvm *kvm)
6314	{
6315	if (!cpu_has_feature(CPU_FTR_DAWR1))
6316	return -ENODEV;
6317
6318	/ kvm == NULL means the caller is testing if the capability exists /
6319	if (kvm)
6320	kvm->arch.dawr1_enabled = true;
6321	return `0`;
6322	}
6323
6324	static bool kvmppc_hash_v3_possible(void)
6325	{
6326	if (!cpu_has_feature(CPU_FTR_ARCH_300))
6327	return false;
6328
6329	if (!cpu_has_feature(CPU_FTR_HVMODE))
6330	return false;
6331
6332	/*
6333	* POWER9 chips before version 2.02 can't have some threads in
6334	* HPT mode and some in radix mode on the same core.
6335	*/
6336	if (radix_enabled()) {
6337	unsigned int pvr = mfspr(SPRN_PVR);
6338	if ((pvr >> `16`) == PVR_POWER9 &&
6339	(((pvr & `0xe000`) == `0` && (pvr & `0xfff`) < `0x202`) \|\|
6340	((pvr & `0xe000`) == `0x2000` && (pvr & `0xfff`) < `0x101`)))
6341	return false;
6342	}
6343
6344	return true;
6345	}
6346
6347	static struct kvmppc_ops kvm_ops_hv = {
6348	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
6349	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
6350	.get_one_reg = kvmppc_get_one_reg_hv,
6351	.set_one_reg = kvmppc_set_one_reg_hv,
6352	.vcpu_load = kvmppc_core_vcpu_load_hv,
6353	.vcpu_put = kvmppc_core_vcpu_put_hv,
6354	.inject_interrupt = kvmppc_inject_interrupt_hv,
6355	.set_msr = kvmppc_set_msr_hv,
6356	.vcpu_run = kvmppc_vcpu_run_hv,
6357	.vcpu_create = kvmppc_core_vcpu_create_hv,
6358	.vcpu_free = kvmppc_core_vcpu_free_hv,
6359	.check_requests = kvmppc_core_check_requests_hv,
6360	.get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv,
6361	.flush_memslot = kvmppc_core_flush_memslot_hv,
6362	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
6363	.commit_memory_region = kvmppc_core_commit_memory_region_hv,
6364	.unmap_gfn_range = kvm_unmap_gfn_range_hv,
6365	.age_gfn = kvm_age_gfn_hv,
6366	.test_age_gfn = kvm_test_age_gfn_hv,
6367	.set_spte_gfn = kvm_set_spte_gfn_hv,
6368	.free_memslot = kvmppc_core_free_memslot_hv,
6369	.init_vm = kvmppc_core_init_vm_hv,
6370	.destroy_vm = kvmppc_core_destroy_vm_hv,
6371	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
6372	.emulate_op = kvmppc_core_emulate_op_hv,
6373	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
6374	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
6375	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
6376	.arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
6377	.hcall_implemented = kvmppc_hcall_impl_hv,
6378	#ifdef CONFIG_KVM_XICS
6379	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
6380	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
6381	#endif
6382	.configure_mmu = kvmhv_configure_mmu,
6383	.get_rmmu_info = kvmhv_get_rmmu_info,
6384	.set_smt_mode = kvmhv_set_smt_mode,
6385	.enable_nested = kvmhv_enable_nested,
6386	.load_from_eaddr = kvmhv_load_from_eaddr,
6387	.store_to_eaddr = kvmhv_store_to_eaddr,
6388	.enable_svm = kvmhv_enable_svm,
6389	.svm_off = kvmhv_svm_off,
6390	.enable_dawr1 = kvmhv_enable_dawr1,
6391	.hash_v3_possible = kvmppc_hash_v3_possible,
6392	.create_vcpu_debugfs = kvmppc_arch_create_vcpu_debugfs_hv,
6393	.create_vm_debugfs = kvmppc_arch_create_vm_debugfs_hv,
6394	};
6395
6396	static int kvm_init_subcore_bitmap(void)
6397	{
6398	int i, j;
6399	int nr_cores = cpu_nr_cores();
6400	struct sibling_subcore_state *sibling_subcore_state;
6401
6402	for (i = `0`; i < nr_cores; i++) {
6403	int first_cpu = i * threads_per_core;
6404	int node = cpu_to_node(cpu: first_cpu);
6405
6406	/ Ignore if it is already allocated. /
6407	if (paca_ptrs[first_cpu]->sibling_subcore_state)
6408	continue;
6409
6410	sibling_subcore_state =
6411	kzalloc_node(sizeof(struct sibling_subcore_state),
6412	GFP_KERNEL, node);
6413	if (!sibling_subcore_state)
6414	return -ENOMEM;
6415
6416
6417	for (j = `0`; j < threads_per_core; j++) {
6418	int cpu = first_cpu + j;
6419
6420	paca_ptrs[cpu]->sibling_subcore_state =
6421	sibling_subcore_state;
6422	}
6423	}
6424	return `0`;
6425	}
6426
6427	static int kvmppc_radix_possible(void)
6428	{
6429	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
6430	}
6431
6432	static int kvmppc_book3s_init_hv(void)
6433	{
6434	int r;
6435
6436	if (!tlbie_capable) {
6437	pr_err("KVM-HV: Host does not support TLBIE\n");
6438	return -ENODEV;
6439	}
6440
6441	/*
6442	* FIXME!! Do we need to check on all cpus ?
6443	*/
6444	r = kvmppc_core_check_processor_compat_hv();
6445	if (r < `0`)
6446	return -ENODEV;
6447
6448	r = kvmhv_nested_init();
6449	if (r)
6450	return r;
6451
6452	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
6453	r = kvm_init_subcore_bitmap();
6454	if (r)
6455	goto err;
6456	}
6457
6458	/*
6459	* We need a way of accessing the XICS interrupt controller,
6460	* either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or
6461	* indirectly, via OPAL.
6462	*/
6463	#ifdef CONFIG_SMP
6464	if (!xics_on_xive() && !kvmhv_on_pseries() &&
6465	!local_paca->kvm_hstate.xics_phys) {
6466	struct device_node *np;
6467
6468	np = of_find_compatible_node(NULL, NULL, compat: "ibm,opal-intc");
6469	if (!np) {
6470	pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
6471	r = -ENODEV;
6472	goto err;
6473	}
6474	/ presence of intc confirmed - node can be dropped again /
6475	of_node_put(node: np);
6476	}
6477	#endif
6478
6479	init_default_hcalls();
6480
6481	init_vcore_lists();
6482
6483	r = kvmppc_mmu_hv_init();
6484	if (r)
6485	goto err;
6486
6487	if (kvmppc_radix_possible()) {
6488	r = kvmppc_radix_init();
6489	if (r)
6490	goto err;
6491	}
6492
6493	r = kvmppc_uvmem_init();
6494	if (r < `0`) {
6495	pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
6496	return r;
6497	}
6498
6499	kvm_ops_hv.owner = THIS_MODULE;
6500	kvmppc_hv_ops = &kvm_ops_hv;
6501
6502	return `0`;
6503
6504	err:
6505	kvmhv_nested_exit();
6506	kvmppc_radix_exit();
6507
6508	return r;
6509	}
6510
6511	static void kvmppc_book3s_exit_hv(void)
6512	{
6513	kvmppc_uvmem_free();
6514	kvmppc_free_host_rm_ops();
6515	if (kvmppc_radix_possible())
6516	kvmppc_radix_exit();
6517	kvmppc_hv_ops = NULL;
6518	kvmhv_nested_exit();
6519	}
6520
6521	module_init(kvmppc_book3s_init_hv);
6522	module_exit(kvmppc_book3s_exit_hv);
6523	MODULE_LICENSE("GPL");
6524	MODULE_ALIAS_MISCDEV(KVM_MINOR);
6525	MODULE_ALIAS("devname:kvm");
6526

source code of linux/arch/powerpc/kvm/book3s_hv.c