// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
 *
 * Authors:
 *	Anup Patel <anup.patel@wdc.com>
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/hugetlb.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kvm_host.h>
#include <linux/sched/signal.h>
#include <asm/csr.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#ifdef CONFIG_64BIT
static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
static unsigned long gstage_pgd_levels __ro_after_init = 3;
#define gstage_index_bits	9
#else
static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
static unsigned long gstage_pgd_levels __ro_after_init = 2;
#define gstage_index_bits	10
#endif

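/*
 * Per the RISC-V privileged specification, the G-stage root page table is
 * four times the size of a regular page table (16 KiB) and guest physical
 * addresses carry two extra bits compared to the equivalent VS-stage
 * scheme; gstage_pgd_xbits below accounts for those two extra bits.
 */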
#define gstage_pgd_xbits	2
#define gstage_pgd_size	(1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits))
#define gstage_gpa_bits	(HGATP_PAGE_SHIFT + \
			 (gstage_pgd_levels * gstage_index_bits) + \
			 gstage_pgd_xbits)
#define gstage_gpa_size	((gpa_t)(1ULL << gstage_gpa_bits))

#define gstage_pte_leaf(__ptep)	\
	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))

static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
{
	unsigned long mask;
	unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level);

	if (level == (gstage_pgd_levels - 1))
		mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1;
	else
		mask = PTRS_PER_PTE - 1;

	return (addr >> shift) & mask;
}

static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
{
	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
}

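/*
 * Map a mapping size to a page-table level: level 0 is a 4 KiB page and
 * each level above it multiplies the size by 2^gstage_index_bits
 * (e.g. 2 MiB and 1 GiB huge pages with Sv39x4).
 */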
static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
{
	u32 i;
	unsigned long psz = 1UL << 12;

	for (i = 0; i < gstage_pgd_levels; i++) {
		if (page_size == (psz << (i * gstage_index_bits))) {
			*out_level = i;
			return 0;
		}
	}

	return -EINVAL;
}

static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
{
	if (gstage_pgd_levels < level)
		return -EINVAL;

	*out_pgorder = 12 + (level * gstage_index_bits);
	return 0;
}

static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
{
	int rc;
	unsigned long page_order = PAGE_SHIFT;

	rc = gstage_level_to_page_order(level, &page_order);
	if (rc)
		return rc;

	*out_pgsize = BIT(page_order);
	return 0;
}

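/*
 * Walk the G-stage page table for @addr starting from the root and return
 * the leaf PTE (and its level) if a present leaf mapping exists.
 */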
static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr,
				  pte_t **ptepp, u32 *ptep_level)
{
	pte_t *ptep;
	u32 current_level = gstage_pgd_levels - 1;

	*ptep_level = current_level;
	ptep = (pte_t *)kvm->arch.pgd;
	ptep = &ptep[gstage_pte_index(addr, current_level)];
	while (ptep && pte_val(ptep_get(ptep))) {
		if (gstage_pte_leaf(ptep)) {
			*ptep_level = current_level;
			*ptepp = ptep;
			return true;
		}

		if (current_level) {
			current_level--;
			*ptep_level = current_level;
			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
			ptep = &ptep[gstage_pte_index(addr, current_level)];
		} else {
			ptep = NULL;
		}
	}

	return false;
}

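/*
 * Flush the G-stage TLB entries for @addr: align the address down to the
 * mapping size implied by @level and issue a remote HFENCE.GVMA for that
 * GPA range.
 */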
static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
{
	unsigned long order = PAGE_SHIFT;

	if (gstage_level_to_page_order(level, &order))
		return;
	addr &= ~(BIT(order) - 1);

	kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order);
}

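/*
 * Install @new_pte at @level for @addr, allocating intermediate page
 * tables from @pcache as needed. Fails with -EEXIST if a huge-page leaf
 * already covers the address and with -ENOMEM when no cache is available;
 * a remote TLB flush is issued once a leaf PTE has been written.
 */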
static int gstage_set_pte(struct kvm *kvm, u32 level,
			  struct kvm_mmu_memory_cache *pcache,
			  gpa_t addr, const pte_t *new_pte)
{
	u32 current_level = gstage_pgd_levels - 1;
	pte_t *next_ptep = (pte_t *)kvm->arch.pgd;
	pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)];

	if (current_level < level)
		return -EINVAL;

	while (current_level != level) {
		if (gstage_pte_leaf(ptep))
			return -EEXIST;

		if (!pte_val(ptep_get(ptep))) {
			if (!pcache)
				return -ENOMEM;
			next_ptep = kvm_mmu_memory_cache_alloc(pcache);
			if (!next_ptep)
				return -ENOMEM;
			set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
					      __pgprot(_PAGE_TABLE)));
		} else {
			if (gstage_pte_leaf(ptep))
				return -EEXIST;
			next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
		}

		current_level--;
		ptep = &next_ptep[gstage_pte_index(addr, current_level)];
	}

	set_pte(ptep, *new_pte);
	if (gstage_pte_leaf(ptep))
		gstage_remote_tlb_flush(kvm, current_level, addr);

	return 0;
}

static int gstage_map_page(struct kvm *kvm,
			   struct kvm_mmu_memory_cache *pcache,
			   gpa_t gpa, phys_addr_t hpa,
			   unsigned long page_size,
			   bool page_rdonly, bool page_exec)
{
	int ret;
	u32 level = 0;
	pte_t new_pte;
	pgprot_t prot;

	ret = gstage_page_size_to_level(page_size, &level);
	if (ret)
		return ret;

	/*
	 * A RISC-V implementation can choose to either:
	 * 1) Update the 'A' and 'D' PTE bits in hardware
	 * 2) Generate a page fault when the 'A' and/or 'D' bits are not
	 *    set in the PTE, so that software can update these bits.
	 *
	 * We support both options mentioned above. To achieve this, we
	 * always set the 'A' and 'D' PTE bits at the time of creating a
	 * G-stage mapping. To support KVM dirty page logging with both
	 * options mentioned above, we write-protect G-stage PTEs to
	 * track dirty pages.
	 */

	if (page_exec) {
		if (page_rdonly)
			prot = PAGE_READ_EXEC;
		else
			prot = PAGE_WRITE_EXEC;
	} else {
		if (page_rdonly)
			prot = PAGE_READ;
		else
			prot = PAGE_WRITE;
	}
	new_pte = pfn_pte(PFN_DOWN(hpa), prot);
	new_pte = pte_mkdirty(new_pte);

	return gstage_set_pte(kvm, level, pcache, gpa, &new_pte);
}

enum gstage_op {
	GSTAGE_OP_NOP = 0,	/* Nothing */
	GSTAGE_OP_CLEAR,	/* Clear/Unmap */
	GSTAGE_OP_WP,		/* Write-protect */
};

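/*
 * Apply @op to the PTE at @ptep covering @addr. For non-leaf entries the
 * operation recurses into the next level; GSTAGE_OP_CLEAR additionally
 * drops the reference on the now-unused intermediate page table. Leaf
 * entries are cleared or write-protected and then flushed remotely.
 */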
static void gstage_op_pte(struct kvm *kvm, gpa_t addr,
			  pte_t *ptep, u32 ptep_level, enum gstage_op op)
{
	int i, ret;
	pte_t *next_ptep;
	u32 next_ptep_level;
	unsigned long next_page_size, page_size;

	ret = gstage_level_to_page_size(ptep_level, &page_size);
	if (ret)
		return;

	BUG_ON(addr & (page_size - 1));

	if (!pte_val(ptep_get(ptep)))
		return;

	if (ptep_level && !gstage_pte_leaf(ptep)) {
		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
		next_ptep_level = ptep_level - 1;
		ret = gstage_level_to_page_size(next_ptep_level,
						&next_page_size);
		if (ret)
			return;

		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		for (i = 0; i < PTRS_PER_PTE; i++)
			gstage_op_pte(kvm, addr + i * next_page_size,
				      &next_ptep[i], next_ptep_level, op);
		if (op == GSTAGE_OP_CLEAR)
			put_page(virt_to_page(next_ptep));
	} else {
		if (op == GSTAGE_OP_CLEAR)
			set_pte(ptep, __pte(0));
		else if (op == GSTAGE_OP_WP)
			set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
		gstage_remote_tlb_flush(kvm, ptep_level, addr);
	}
}

static void gstage_unmap_range(struct kvm *kvm, gpa_t start,
			       gpa_t size, bool may_block)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	unsigned long page_size;
	gpa_t addr = start, end = start + size;

	while (addr < end) {
		found_leaf = gstage_get_leaf_entry(kvm, addr,
						   &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			gstage_op_pte(kvm, addr, ptep,
				      ptep_level, GSTAGE_OP_CLEAR);

next:
		addr += page_size;

		/*
		 * If the range is too large, release the kvm->mmu_lock
		 * to prevent starvation and lockup detector warnings.
		 */
		if (may_block && addr < end)
			cond_resched_lock(&kvm->mmu_lock);
	}
}

static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
{
	int ret;
	pte_t *ptep;
	u32 ptep_level;
	bool found_leaf;
	gpa_t addr = start;
	unsigned long page_size;

	while (addr < end) {
		found_leaf = gstage_get_leaf_entry(kvm, addr,
						   &ptep, &ptep_level);
		ret = gstage_level_to_page_size(ptep_level, &page_size);
		if (ret)
			break;

		if (!found_leaf)
			goto next;

		if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
			gstage_op_pte(kvm, addr, ptep,
				      ptep_level, GSTAGE_OP_WP);

next:
		addr += page_size;
	}
}

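/* Write-protect an entire memslot so that dirty pages can be tracked. */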
static void gstage_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	gstage_wp_range(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}

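/*
 * Map a host physical range into the guest physical address space using
 * 4 KiB PAGE_KERNEL_IO mappings (optionally write-protected). When
 * @in_atomic is set, the page-table cache is topped up with GFP_ATOMIC.
 */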
int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
			     phys_addr_t hpa, unsigned long size,
			     bool writable, bool in_atomic)
{
	pte_t pte;
	int ret = 0;
	unsigned long pfn;
	phys_addr_t addr, end;
	struct kvm_mmu_memory_cache pcache = {
		.gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0,
		.gfp_zero = __GFP_ZERO,
	};

	end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(hpa);

	for (addr = gpa; addr < end; addr += PAGE_SIZE) {
		pte = pfn_pte(pfn, PAGE_KERNEL_IO);

		if (!writable)
			pte = pte_wrprotect(pte);

		ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels);
		if (ret)
			goto out;

		spin_lock(&kvm->mmu_lock);
		ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	kvm_mmu_free_memory_cache(&pcache);
	return ret;
}

void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
{
	spin_lock(&kvm->mmu_lock);
	gstage_unmap_range(kvm, gpa, size, false);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset,
					     unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	gstage_wp_range(kvm, start, end);
}

void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_riscv_gstage_free_pgd(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	gstage_unmap_range(kvm, gpa, size, false);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[]; dirty pages will be tracked while
	 * the memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES)
		gstage_wp_memory_region(kvm, new->id);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	hva_t hva, reg_end, size;
	gpa_t base_gpa;
	bool writable;
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the
	 * GPA space addressable by the KVM guest.
	 */
	if ((new->base_gfn + new->npages) >=
	    (gstage_gpa_size >> PAGE_SHIFT))
		return -EFAULT;

	hva = new->userspace_addr;
	size = new->npages << PAGE_SHIFT;
	reg_end = hva + size;
	base_gpa = new->base_gfn << PAGE_SHIFT;
	writable = !(new->flags & KVM_MEM_READONLY);

	mmap_read_lock(current->mm);

	/*
	 * A memory region could potentially cover multiple VMAs, and
	 * any holes between them, so iterate over all of them to find
	 * out if we can map any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Mapping a read-only VMA is only allowed if the
		 * memory region is configured as read-only.
		 */
		if (writable && !(vma->vm_flags & VM_WRITE)) {
			ret = -EPERM;
			break;
		}

		/* Take the intersection of this VMA with the memory region */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = base_gpa + (vm_start - hva);
			phys_addr_t pa;

			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
			pa += vm_start - vma->vm_start;

			/* IO region dirty page logging not allowed */
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				goto out;
			}

			ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa,
						       vm_end - vm_start,
						       writable, false);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	if (change == KVM_MR_FLAGS_ONLY)
		goto out;

	if (ret)
		kvm_riscv_gstage_iounmap(kvm, base_gpa, size);

out:
	mmap_read_unlock(current->mm);
	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	if (!kvm->arch.pgd)
		return false;

	gstage_unmap_range(kvm, range->start << PAGE_SHIFT,
			   (range->end - range->start) << PAGE_SHIFT,
			   range->may_block);
	return false;
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	int ret;
	kvm_pfn_t pfn = pte_pfn(range->arg.pte);

	if (!kvm->arch.pgd)
		return false;

	WARN_ON(range->end - range->start != 1);

	ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT,
			      __pfn_to_phys(pfn), PAGE_SIZE, true, true);
	if (ret) {
		kvm_debug("Failed to map G-stage page (error %d)\n", ret);
		return true;
	}

	return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	pte_t *ptep;
	u32 ptep_level = 0;
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.pgd)
		return false;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);

	if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
				   &ptep, &ptep_level))
		return false;

	return ptep_test_and_clear_young(NULL, 0, ptep);
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	pte_t *ptep;
	u32 ptep_level = 0;
	u64 size = (range->end - range->start) << PAGE_SHIFT;

	if (!kvm->arch.pgd)
		return false;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);

	if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
				   &ptep, &ptep_level))
		return false;

	return pte_young(ptep_get(ptep));
}

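/*
 * Handle a G-stage translation fault: resolve the faulting GPA to a host
 * page via gfn_to_pfn_prot() and install a G-stage mapping of the largest
 * size the backing VMA allows (PAGE_SIZE when dirty logging or VM_PFNMAP
 * is involved), bailing out so the access is retried if an MMU
 * invalidation raced with the lookup.
 */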
int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
			 struct kvm_memory_slot *memslot,
			 gpa_t gpa, unsigned long hva, bool is_write)
{
	int ret;
	kvm_pfn_t hfn;
	bool writable;
	short vma_pageshift;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache;
	bool logging = (memslot->dirty_bitmap &&
			!(memslot->flags & KVM_MEM_READONLY)) ? true : false;
	unsigned long vma_pagesize, mmu_seq;

	/* We need minimum second+third level pages */
	ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels);
	if (ret) {
		kvm_err("Failed to topup G-stage cache\n");
		return ret;
	}

	mmap_read_lock(current->mm);

	vma = vma_lookup(current->mm, hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma))
		vma_pageshift = huge_page_shift(hstate_vma(vma));
	else
		vma_pageshift = PAGE_SHIFT;
	vma_pagesize = 1ULL << vma_pageshift;
	if (logging || (vma->vm_flags & VM_PFNMAP))
		vma_pagesize = PAGE_SIZE;

	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
		gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or gfn_to_pfn_prot() become stale prior to acquiring
	 * kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	if (vma_pagesize != PUD_SIZE &&
	    vma_pagesize != PMD_SIZE &&
	    vma_pagesize != PAGE_SIZE) {
		kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize);
		return -EFAULT;
	}

	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable);
	if (hfn == KVM_PFN_ERR_HWPOISON) {
		send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
				vma_pageshift, current);
		return 0;
	}
	if (is_error_noslot_pfn(hfn))
		return -EFAULT;

	/*
	 * If logging is active then we allow writable pages only
	 * for write faults.
	 */
	if (logging && !is_write)
		writable = false;

	spin_lock(&kvm->mmu_lock);

	if (mmu_invalidate_retry(kvm, mmu_seq))
		goto out_unlock;

	if (writable) {
		kvm_set_pfn_dirty(hfn);
		mark_page_dirty(kvm, gfn);
		ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
				      vma_pagesize, false, true);
	} else {
		ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
				      vma_pagesize, true, true);
	}

	if (ret)
		kvm_err("Failed to map in G-stage\n");

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(hfn);
	kvm_release_pfn_clean(hfn);
	return ret;
}

int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
{
	struct page *pgd_page;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
			       get_order(gstage_pgd_size));
	if (!pgd_page)
		return -ENOMEM;
	kvm->arch.pgd = page_to_virt(pgd_page);
	kvm->arch.pgd_phys = page_to_phys(pgd_page);

	return 0;
}

void kvm_riscv_gstage_free_pgd(struct kvm *kvm)
{
	void *pgd = NULL;

	spin_lock(&kvm->mmu_lock);
	if (kvm->arch.pgd) {
		gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false);
		pgd = READ_ONCE(kvm->arch.pgd);
		kvm->arch.pgd = NULL;
		kvm->arch.pgd_phys = 0;
	}
	spin_unlock(&kvm->mmu_lock);

	if (pgd)
		free_pages((unsigned long)pgd, get_order(gstage_pgd_size));
}

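/*
 * Program the hgatp CSR for this VCPU: translation mode, the VM's VMID
 * and the PPN of the G-stage root page table. If the implementation has
 * no VMID bits, translations cannot be tagged per VM, so flush the local
 * G-stage TLB instead.
 */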
void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
{
	unsigned long hgatp = gstage_mode;
	struct kvm_arch *k = &vcpu->kvm->arch;

	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;

	csr_write(CSR_HGATP, hgatp);

	if (!kvm_riscv_gstage_vmid_bits())
		kvm_riscv_local_hfence_gvma_all();
}

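/*
 * Probe the widest supported G-stage mode by writing it to hgatp and
 * checking whether the MODE field reads back unchanged (Sv57x4, then
 * Sv48x4, falling back to the default Sv39x4 on 64-bit hosts).
 */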
void __init kvm_riscv_gstage_mode_detect(void)
{
#ifdef CONFIG_64BIT
	/* Try Sv57x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
		gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
		gstage_pgd_levels = 5;
		goto skip_sv48x4_test;
	}

	/* Try Sv48x4 G-stage mode */
	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
		gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
		gstage_pgd_levels = 4;
	}
skip_sv48x4_test:

	csr_write(CSR_HGATP, 0);
	kvm_riscv_local_hfence_gvma_all();
#endif
}

unsigned long __init kvm_riscv_gstage_mode(void)
{
	return gstage_mode >> HGATP_MODE_SHIFT;
}

int kvm_riscv_gstage_gpa_bits(void)
{
	return gstage_gpa_bits;
}