1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2020 - Google LLC
4 * Author: Quentin Perret <qperret@google.com>
5 */
6
7#include <linux/init.h>
8#include <linux/interval_tree_generic.h>
9#include <linux/kmemleak.h>
10#include <linux/kvm_host.h>
11#include <asm/kvm_mmu.h>
12#include <linux/memblock.h>
13#include <linux/mutex.h>
14
15#include <asm/kvm_pkvm.h>
16
17#include "hyp_constants.h"
18
/* Flipped once the host has been deprivileged (see pkvm_drop_host_privileges). */
DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);

/*
 * Host-side aliases of the hypervisor's memblock array and its element
 * count (nVHE symbols), filled in by register_memblock_regions().
 */
static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);

/* Base and size of the carveout reserved by kvm_hyp_reserve() at boot. */
phys_addr_t hyp_mem_base;
phys_addr_t hyp_mem_size;
26
/*
 * Copy the kernel's memblock regions into the array shared with the
 * hypervisor, bumping the shared element count for each one.
 *
 * Return: 0 on success, -ENOMEM if there are more regions than the
 * HYP_MEMBLOCK_REGIONS slots available.
 */
static int __init register_memblock_regions(void)
{
	struct memblock_region *reg;

	for_each_mem_region(reg) {
		if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
			return -ENOMEM;

		hyp_memory[*hyp_memblock_nr_ptr] = *reg;
		(*hyp_memblock_nr_ptr)++;
	}

	return 0;
}
41
42void __init kvm_hyp_reserve(void)
43{
44 u64 hyp_mem_pages = 0;
45 int ret;
46
47 if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
48 return;
49
50 if (kvm_get_mode() != KVM_MODE_PROTECTED)
51 return;
52
53 ret = register_memblock_regions();
54 if (ret) {
55 *hyp_memblock_nr_ptr = 0;
56 kvm_err("Failed to register hyp memblocks: %d\n", ret);
57 return;
58 }
59
60 hyp_mem_pages += hyp_s1_pgtable_pages();
61 hyp_mem_pages += host_s2_pgtable_pages();
62 hyp_mem_pages += hyp_vm_table_pages();
63 hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
64 hyp_mem_pages += pkvm_selftest_pages();
65 hyp_mem_pages += hyp_ffa_proxy_pages();
66
67 /*
68 * Try to allocate a PMD-aligned region to reduce TLB pressure once
69 * this is unmapped from the host stage-2, and fallback to PAGE_SIZE.
70 */
71 hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
72 hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
73 PMD_SIZE);
74 if (!hyp_mem_base)
75 hyp_mem_base = memblock_phys_alloc(size: hyp_mem_size, PAGE_SIZE);
76 else
77 hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
78
79 if (!hyp_mem_base) {
80 kvm_err("Failed to reserve hyp memory\n");
81 return;
82 }
83
84 kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
85 hyp_mem_base);
86}
87
/*
 * Tear down the hyp-side view of @kvm: fully-created VMs are torn down by
 * hyp, merely-reserved ones are unreserved. The handle and created flag
 * are cleared and both teardown memcaches freed in all cases.
 * Caller holds the config_lock (see pkvm_destroy_hyp_vm()).
 */
static void __pkvm_destroy_hyp_vm(struct kvm *kvm)
{
	if (pkvm_hyp_vm_is_created(kvm)) {
		WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
					  kvm->arch.pkvm.handle));
	} else if (kvm->arch.pkvm.handle) {
		/*
		 * The VM could have been reserved but hyp initialization has
		 * failed. Make sure to unreserve it.
		 */
		kvm_call_hyp_nvhe(__pkvm_unreserve_vm, kvm->arch.pkvm.handle);
	}

	kvm->arch.pkvm.handle = 0;
	kvm->arch.pkvm.is_created = false;
	free_hyp_memcache(&kvm->arch.pkvm.teardown_mc);
	free_hyp_memcache(&kvm->arch.pkvm.stage2_teardown_mc);
}
106
107static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
108{
109 size_t hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
110 pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle;
111 void *hyp_vcpu;
112 int ret;
113
114 vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
115
116 hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
117 if (!hyp_vcpu)
118 return -ENOMEM;
119
120 ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, hyp_vcpu);
121 if (!ret)
122 vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED);
123 else
124 free_pages_exact(virt: hyp_vcpu, size: hyp_vcpu_sz);
125
126 return ret;
127}
128
129/*
130 * Allocates and donates memory for hypervisor VM structs at EL2.
131 *
132 * Allocates space for the VM state, which includes the hyp vm as well as
133 * the hyp vcpus.
134 *
135 * Stores an opaque handler in the kvm struct for future reference.
136 *
137 * Return 0 on success, negative error code on failure.
138 */
139static int __pkvm_create_hyp_vm(struct kvm *kvm)
140{
141 size_t pgd_sz, hyp_vm_sz;
142 void *pgd, *hyp_vm;
143 int ret;
144
145 if (kvm->created_vcpus < 1)
146 return -EINVAL;
147
148 pgd_sz = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr);
149
150 /*
151 * The PGD pages will be reclaimed using a hyp_memcache which implies
152 * page granularity. So, use alloc_pages_exact() to get individual
153 * refcounts.
154 */
155 pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT);
156 if (!pgd)
157 return -ENOMEM;
158
159 /* Allocate memory to donate to hyp for vm and vcpu pointers. */
160 hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE,
161 size_mul(sizeof(void *),
162 kvm->created_vcpus)));
163 hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT);
164 if (!hyp_vm) {
165 ret = -ENOMEM;
166 goto free_pgd;
167 }
168
169 /* Donate the VM memory to hyp and let hyp initialize it. */
170 ret = kvm_call_hyp_nvhe(__pkvm_init_vm, kvm, hyp_vm, pgd);
171 if (ret)
172 goto free_vm;
173
174 kvm->arch.pkvm.is_created = true;
175 kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
176 kvm_account_pgtable_pages(virt: pgd, nr: pgd_sz / PAGE_SIZE);
177
178 return 0;
179free_vm:
180 free_pages_exact(virt: hyp_vm, size: hyp_vm_sz);
181free_pgd:
182 free_pages_exact(virt: pgd, size: pgd_sz);
183 return ret;
184}
185
/* Has the hyp-side VM for @kvm been fully created (not just reserved)? */
bool pkvm_hyp_vm_is_created(struct kvm *kvm)
{
	return READ_ONCE(kvm->arch.pkvm.is_created);
}
190
191int pkvm_create_hyp_vm(struct kvm *kvm)
192{
193 int ret = 0;
194
195 mutex_lock(&kvm->arch.config_lock);
196 if (!pkvm_hyp_vm_is_created(kvm))
197 ret = __pkvm_create_hyp_vm(kvm);
198 mutex_unlock(lock: &kvm->arch.config_lock);
199
200 return ret;
201}
202
203int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
204{
205 int ret = 0;
206
207 mutex_lock(&vcpu->kvm->arch.config_lock);
208 if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED))
209 ret = __pkvm_create_hyp_vcpu(vcpu);
210 mutex_unlock(lock: &vcpu->kvm->arch.config_lock);
211
212 return ret;
213}
214
215void pkvm_destroy_hyp_vm(struct kvm *kvm)
216{
217 mutex_lock(&kvm->arch.config_lock);
218 __pkvm_destroy_hyp_vm(kvm);
219 mutex_unlock(lock: &kvm->arch.config_lock);
220}
221
/*
 * Reserve a VM slot in hyp for @kvm and record the returned handle.
 * Idempotent: returns 0 immediately if a handle already exists.
 *
 * Return: 0 on success, -EINVAL if the hyp VM was already fully created,
 * or a negative error from the __pkvm_reserve_vm hypercall.
 */
int pkvm_init_host_vm(struct kvm *kvm)
{
	int ret;

	if (pkvm_hyp_vm_is_created(kvm))
		return -EINVAL;

	/* VM is already reserved, no need to proceed. */
	if (kvm->arch.pkvm.handle)
		return 0;

	/* Reserve the VM in hyp and obtain a hyp handle for the VM. */
	ret = kvm_call_hyp_nvhe(__pkvm_reserve_vm);
	if (ret < 0)
		return ret;

	/* A non-negative return value from hyp is the handle itself. */
	kvm->arch.pkvm.handle = ret;

	return 0;
}
242
/*
 * Per-CPU callback invoked via on_each_cpu(): issue the
 * __pkvm_prot_finalize hypercall on this CPU. On failure, report -EINVAL
 * through the shared *err; concurrent writers all store the same value,
 * so the race is benign.
 */
static void __init _kvm_host_prot_finalize(void *arg)
{
	int *err = arg;

	if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
		WRITE_ONCE(*err, -EINVAL);
}
250
251static int __init pkvm_drop_host_privileges(void)
252{
253 int ret = 0;
254
255 /*
256 * Flip the static key upfront as that may no longer be possible
257 * once the host stage 2 is installed.
258 */
259 static_branch_enable(&kvm_protected_mode_initialized);
260 on_each_cpu(func: _kvm_host_prot_finalize, info: &ret, wait: 1);
261 return ret;
262}
263
264static int __init finalize_pkvm(void)
265{
266 int ret;
267
268 if (!is_protected_kvm_enabled() || !is_kvm_arm_initialised())
269 return 0;
270
271 /*
272 * Exclude HYP sections from kmemleak so that they don't get peeked
273 * at, which would end badly once inaccessible.
274 */
275 kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
276 kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start);
277 kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start);
278 kmemleak_free_part_phys(phys: hyp_mem_base, size: hyp_mem_size);
279
280 ret = pkvm_drop_host_privileges();
281 if (ret)
282 pr_err("Failed to finalize Hyp protection: %d\n", ret);
283
284 return ret;
285}
286device_initcall_sync(finalize_pkvm);
287
288static u64 __pkvm_mapping_start(struct pkvm_mapping *m)
289{
290 return m->gfn * PAGE_SIZE;
291}
292
293static u64 __pkvm_mapping_end(struct pkvm_mapping *m)
294{
295 return (m->gfn + m->nr_pages) * PAGE_SIZE - 1;
296}
297
/*
 * Instantiate the interval-tree helpers (pkvm_mapping_insert/remove/
 * iter_first/iter_next) over pkvm_mapping nodes, keyed by the inclusive
 * byte range [__pkvm_mapping_start, __pkvm_mapping_end].
 */
INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last,
		     __pkvm_mapping_start, __pkvm_mapping_end, static,
		     pkvm_mapping);
301
/*
 * Iterate over every pkvm_mapping of __pgt overlapping [__start, __end).
 * __tmp is updated to iter_first(pkvm_mappings) *before* entering the body
 * of the loop to allow freeing of __map inline (removal-safe iteration).
 */
#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map)		\
	for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings, \
								  __start, __end - 1); \
	     __tmp && ({							\
				__map = __tmp;					\
				__tmp = pkvm_mapping_iter_next(__map, __start, __end - 1); \
				true;						\
		       });							\
	     )
315
/*
 * Initialize the pKVM stage-2 "page-table": under pKVM the host only
 * tracks guest mappings in an interval tree, the real page-tables live at
 * EL2. @mm_ops is accepted for interface compatibility but unused here.
 *
 * Return: always 0.
 */
int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			     struct kvm_pgtable_mm_ops *mm_ops)
{
	pgt->pkvm_mappings	= RB_ROOT_CACHED;
	pgt->mmu		= mmu;

	return 0;
}
324
325static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end)
326{
327 struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
328 pkvm_handle_t handle = kvm->arch.pkvm.handle;
329 struct pkvm_mapping *mapping;
330 int ret;
331
332 if (!handle)
333 return 0;
334
335 for_each_mapping_in_range_safe(pgt, start, end, mapping) {
336 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn,
337 mapping->nr_pages);
338 if (WARN_ON(ret))
339 return ret;
340 pkvm_mapping_remove(node: mapping, root: &pgt->pkvm_mappings);
341 kfree(objp: mapping);
342 }
343
344 return 0;
345}
346
347void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
348 u64 addr, u64 size)
349{
350 __pkvm_pgtable_stage2_unmap(pgt, start: addr, end: addr + size);
351}
352
/*
 * Final teardown of the stage-2 tracking structure. There is no host-side
 * PGD to free under pKVM; just sanity-check the tree is empty.
 */
void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
{
	/* Expected to be called after all pKVM mappings have been released. */
	WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
}
358
359int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
360 u64 phys, enum kvm_pgtable_prot prot,
361 void *mc, enum kvm_pgtable_walk_flags flags)
362{
363 struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
364 struct pkvm_mapping *mapping = NULL;
365 struct kvm_hyp_memcache *cache = mc;
366 u64 gfn = addr >> PAGE_SHIFT;
367 u64 pfn = phys >> PAGE_SHIFT;
368 int ret;
369
370 if (size != PAGE_SIZE && size != PMD_SIZE)
371 return -EINVAL;
372
373 lockdep_assert_held_write(&kvm->mmu_lock);
374
375 /*
376 * Calling stage2_map() on top of existing mappings is either happening because of a race
377 * with another vCPU, or because we're changing between page and block mappings. As per
378 * user_mem_abort(), same-size permission faults are handled in the relax_perms() path.
379 */
380 mapping = pkvm_mapping_iter_first(root: &pgt->pkvm_mappings, start: addr, last: addr + size - 1);
381 if (mapping) {
382 if (size == (mapping->nr_pages * PAGE_SIZE))
383 return -EAGAIN;
384
385 /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */
386 ret = __pkvm_pgtable_stage2_unmap(pgt, start: addr, end: addr + size);
387 if (ret)
388 return ret;
389 mapping = NULL;
390 }
391
392 ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot);
393 if (WARN_ON(ret))
394 return ret;
395
396 swap(mapping, cache->mapping);
397 mapping->gfn = gfn;
398 mapping->pfn = pfn;
399 mapping->nr_pages = size / PAGE_SIZE;
400 pkvm_mapping_insert(node: mapping, root: &pgt->pkvm_mappings);
401
402 return ret;
403}
404
405int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
406{
407 lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock);
408
409 return __pkvm_pgtable_stage2_unmap(pgt, start: addr, end: addr + size);
410}
411
/*
 * Write-protect every guest mapping overlapping [addr, addr + size) via
 * hyp. Iteration stops at the first hypercall failure.
 *
 * Return: 0 on success, or the first hypercall error.
 */
int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	int ret = 0;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
		ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn,
					mapping->nr_pages);
		if (WARN_ON(ret))
			break;
	}

	return ret;
}
429
430int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
431{
432 struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
433 struct pkvm_mapping *mapping;
434
435 lockdep_assert_held(&kvm->mmu_lock);
436 for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
437 __clean_dcache_guest_page(pfn_to_kaddr(pfn: mapping->pfn),
438 PAGE_SIZE * mapping->nr_pages);
439
440 return 0;
441}
442
/*
 * Test (and, if @mkold, clear) the access flag of every guest mapping
 * overlapping [addr, addr + size).
 *
 * Return: true if any of the mappings was young.
 */
bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
	pkvm_handle_t handle = kvm->arch.pkvm.handle;
	struct pkvm_mapping *mapping;
	bool young = false;

	lockdep_assert_held(&kvm->mmu_lock);
	for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
		young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
					   mapping->nr_pages, mkold);

	return young;
}
457
/*
 * Relax the permissions of the guest mapping containing @addr to @prot
 * via hyp. @pgt and @flags are unused; hyp resolves the handle from the
 * calling context.
 */
int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
				    enum kvm_pgtable_walk_flags flags)
{
	return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot);
}
463
/*
 * Mark the guest mapping containing @addr as young (set the access flag)
 * via hyp. Failure is unexpected and only warned about.
 */
void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
				 enum kvm_pgtable_walk_flags flags)
{
	WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
}
469
/* Unlinked tables don't exist under pKVM: reaching here is a bug. */
void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
	WARN_ON_ONCE(1);
}
474
/* Unsupported under pKVM: warn and return NULL if ever called. */
kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
					       enum kvm_pgtable_prot prot, void *mc, bool force_pte)
{
	WARN_ON_ONCE(1);
	return NULL;
}
481
/* Block splitting is not supported under pKVM: warn and fail if called. */
int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			      struct kvm_mmu_memory_cache *mc)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}
488

/* source: linux/arch/arm64/kvm/pkvm.c */