// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
 */

#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kvm_host.h>
#include <linux/page-flags.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/kvm_mmu.h>

static inline bool kvm_hugepage_capable(struct kvm_memory_slot *slot)
{
	return slot->arch.flags & KVM_MEM_HUGEPAGE_CAPABLE;
}

static inline bool kvm_hugepage_incapable(struct kvm_memory_slot *slot)
{
	return slot->arch.flags & KVM_MEM_HUGEPAGE_INCAPABLE;
}

static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx)
{
	ctx->level = kvm->arch.root_level;
	/* pte table */
	ctx->invalid_ptes = kvm->arch.invalid_ptes;
	ctx->pte_shifts = kvm->arch.pte_shifts;
	ctx->pgtable_shift = ctx->pte_shifts[ctx->level];
	ctx->invalid_entry = ctx->invalid_ptes[ctx->level];
	ctx->opaque = kvm;
}

/*
 * Mark a range of guest physical address space old (all accesses fault) in the
 * VM's GPA page table to allow detection of commonly used pages.
 */
static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	if (kvm_pte_young(*pte)) {
		*pte = kvm_pte_mkold(*pte);
		return 1;
	}

	return 0;
}

/*
 * Mark a range of guest physical address space clean (writes fault) in the VM's
 * GPA page table to allow dirty page tracking.
 */
static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	gfn_t offset;
	kvm_pte_t val;

	val = *pte;
	/*
	 * For kvm_arch_mmu_enable_log_dirty_pt_masked(), the range given by
	 * mask, start and end may cross huge page boundaries. For the first
	 * huge page the parameter addr equals start, but for subsequent huge
	 * pages addr is the base address of that huge page rather than start
	 * or end.
	 */
	if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) {
		offset = (addr >> PAGE_SHIFT) - ctx->gfn;
		if (!(BIT(offset) & ctx->mask))
			return 0;
	}

	/*
	 * There is no need to split the huge page now, just set the
	 * write-protect pte bit. The huge page is split on the next
	 * write fault.
	 */
	if (kvm_pte_dirty(val)) {
		*pte = kvm_pte_mkclean(val);
		return 1;
	}

	return 0;
}
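
/*
 * Worked example for the mask check above (illustrative only): with
 * ctx->gfn = 0x100 and ctx->mask = 0b0110, a small pte at addr = 0x102000
 * (4 KiB pages assumed) gives offset = 0x102 - 0x100 = 2, and
 * BIT(2) & 0b0110 is non-zero, so the pte is cleaned; a pte at 0x103000
 * (offset 3) is skipped because BIT(3) is clear in the mask.
 */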

/*
 * Clear pte entry
 */
static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	struct kvm *kvm;

	kvm = ctx->opaque;
	if (ctx->level)
		kvm->stat.hugepages--;
	else
		kvm->stat.pages--;

	*pte = ctx->invalid_entry;

	return 1;
}

/*
 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
 *
 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
 * to host physical page mappings.
 *
 * Returns:	Pointer to new KVM GPA page directory.
 *		NULL on allocation failure.
 */
kvm_pte_t *kvm_pgd_alloc(void)
{
	kvm_pte_t *pgd;

	pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0);
	if (pgd)
		pgd_init((void *)pgd);

	return pgd;
}
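
/*
 * Usage sketch (illustrative only, not called from this file): the root
 * table would typically be installed at VM creation and freed on teardown,
 * assuming kvm->arch.pgd holds the only reference:
 *
 *	kvm->arch.pgd = kvm_pgd_alloc();
 *	if (!kvm->arch.pgd)
 *		return -ENOMEM;
 *	...
 *	free_page((unsigned long)kvm->arch.pgd);
 */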

static void _kvm_pte_init(void *addr, unsigned long val)
{
	unsigned long *p, *end;

	/* Fill one page of ptes with val, unrolled to 8 entries per pass */
	p = (unsigned long *)addr;
	end = p + PTRS_PER_PTE;
	do {
		p[0] = val;
		p[1] = val;
		p[2] = val;
		p[3] = val;
		p[4] = val;
		p += 8;
		p[-3] = val;
		p[-2] = val;
		p[-1] = val;
	} while (p != end);
}
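
/*
 * The unrolled loop above is behaviourally equivalent to this plain form
 * (sketch for clarity; the unrolled version is what gets compiled):
 *
 *	int i;
 *
 *	for (i = 0; i < PTRS_PER_PTE; i++)
 *		((unsigned long *)addr)[i] = val;
 */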

/*
 * Caller must hold kvm->mm_lock
 *
 * Walk the page tables of kvm to find the PTE corresponding to the
 * address @addr. If page tables don't exist for @addr, they will be created
 * from the MMU cache if @cache is not NULL.
 */
static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm,
				struct kvm_mmu_memory_cache *cache,
				unsigned long addr, int level)
{
	kvm_ptw_ctx ctx;
	kvm_pte_t *entry, *child;

	kvm_ptw_prepare(kvm, &ctx);
	child = kvm->arch.pgd;
	while (ctx.level > level) {
		entry = kvm_pgtable_offset(&ctx, child, addr);
		if (kvm_pte_none(&ctx, entry)) {
			if (!cache)
				return NULL;

			child = kvm_mmu_memory_cache_alloc(cache);
			_kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);
			kvm_set_pte(entry, __pa(child));
		} else if (kvm_pte_huge(*entry)) {
			return entry;
		} else
			child = (kvm_pte_t *)__va(PHYSADDR(*entry));
		kvm_ptw_enter(&ctx);
	}

	entry = kvm_pgtable_offset(&ctx, child, addr);

	return entry;
}
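
/*
 * Usage sketch (illustrative only): a pure lookup passes a NULL cache and
 * may get NULL back, while a fault handler passes a topped-up cache so
 * missing intermediate tables are allocated on the way down:
 *
 *	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);	/* lookup only *\/
 *	ptep = kvm_populate_gpa(kvm, memcache, gpa, 0);	/* create as needed *\/
 */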

/*
 * Page walker for VM shadow mmu at the last level.
 * The last level is a small pte page or a huge pmd page.
 */
static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = addr + (0x1UL << ctx->pgtable_shift);
		if (!kvm_pte_present(ctx, entry))
			continue;

		ret |= ctx->ops(entry, addr, ctx);
	} while (entry++, addr = next, addr < end);

	if (kvm_need_flush(ctx)) {
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}

/*
 * Page walker for VM shadow mmu at the page table dir level
 */
static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		if (!kvm_pte_present(ctx, entry))
			continue;

		if (kvm_pte_huge(*entry)) {
			ret |= ctx->ops(entry, addr, ctx);
			continue;
		}

		kvm_ptw_enter(ctx);
		if (ctx->level == 0)
			ret |= kvm_ptw_leaf(entry, addr, next, ctx);
		else
			ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	if (kvm_need_flush(ctx)) {
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}

/*
 * Page walker for VM shadow mmu at page root table
 */
static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next;
	kvm_pte_t *entry;

	ret = 0;
	entry = kvm_pgtable_offset(ctx, dir, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		if (!kvm_pte_present(ctx, entry))
			continue;

		kvm_ptw_enter(ctx);
		ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	return ret;
}
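
/*
 * Driving a walk (illustrative sketch): callers fill a kvm_ptw_ctx with an
 * ops callback plus flags, then hand the root table and a GPA range to
 * kvm_ptw_top(), e.g. to age a range (mirroring kvm_age_gfn() below):
 *
 *	kvm_ptw_ctx ctx;
 *
 *	ctx.ops = kvm_mkold_pte;
 *	ctx.flag = 0;
 *	kvm_ptw_prepare(kvm, &ctx);
 *	ret = kvm_ptw_top(kvm->arch.pgd, start, end, &ctx);
 */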

/*
 * kvm_flush_range() - Flush a range of guest physical addresses.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 * @lock:	Whether to hold mmu_lock or not
 *
 * Flushes a range of GPA mappings from the GPA page tables.
 */
static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock)
{
	int ret;
	kvm_ptw_ctx ctx;
	struct list_head *pos, *temp;

	ctx.ops = kvm_flush_pte;
	ctx.flag = _KVM_FLUSH_PGTABLE;
	kvm_ptw_prepare(kvm, &ctx);
	INIT_LIST_HEAD(&ctx.list);

	if (lock) {
		spin_lock(&kvm->mmu_lock);
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
				end_gfn << PAGE_SHIFT, &ctx);
		spin_unlock(&kvm->mmu_lock);
	} else
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
				end_gfn << PAGE_SHIFT, &ctx);

	/* Flush vpid for each vCPU individually */
	if (ret)
		kvm_flush_remote_tlbs(kvm);

	/*
	 * Free the pte table pages after dropping mmu_lock; the freed
	 * pages were linked together on ctx.list during the walk.
	 */
	list_for_each_safe(pos, temp, &ctx.list) {
		list_del(pos);
		free_page((unsigned long)pos);
	}
}

/*
 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 * @kvm:	KVM pointer.
 * @start_gfn:	Guest frame number of first page in GPA range to flush.
 * @end_gfn:	Guest frame number of last page in GPA range to flush.
 *
 * Make a range of GPA mappings clean so that guest writes will fault and
 * trigger dirty page logging.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether any GPA mappings were modified, which would require
 *		derived mappings (GVA page tables & TLB entries) to be
 *		invalidated.
 */
static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
{
	kvm_ptw_ctx ctx;

	ctx.ops = kvm_mkclean_pte;
	ctx.flag = 0;
	kvm_ptw_prepare(kvm, &ctx);
	return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write-protects the associated PTEs.
 * The caller must hold @kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask)
{
	kvm_ptw_ctx ctx;
	gfn_t base_gfn = slot->base_gfn + gfn_offset;
	gfn_t start = base_gfn + __ffs(mask);
	gfn_t end = base_gfn + __fls(mask) + 1;

	ctx.ops = kvm_mkclean_pte;
	ctx.flag = _KVM_HAS_PGMASK;
	ctx.mask = mask;
	ctx.gfn = base_gfn;
	kvm_ptw_prepare(kvm, &ctx);

	kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx);
}
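
/*
 * Worked example (illustrative only): with gfn_offset = 0 and
 * mask = 0b0110, __ffs(mask) = 1 and __fls(mask) = 2, so the walk spans
 * [base_gfn + 1, base_gfn + 3) and kvm_mkclean_pte() additionally skips
 * any small pte whose bit is clear in the mask.
 */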

int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,
				struct kvm_memory_slot *new, enum kvm_mr_change change)
{
	gpa_t gpa_start;
	hva_t hva_start;
	size_t size, gpa_offset, hva_offset;

	if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE))
		return 0;
	/*
	 * Prevent userspace from creating a memory region outside of the
	 * VM GPA address space
	 */
	if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> PAGE_SHIFT))
		return -ENOMEM;

	new->arch.flags = 0;
	size = new->npages * PAGE_SIZE;
	gpa_start = new->base_gfn << PAGE_SHIFT;
	hva_start = new->userspace_addr;
	if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE)
			&& IS_ALIGNED(hva_start, PMD_SIZE))
		new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE;
	else {
		/*
		 * Pages belonging to memslots that don't have the same
		 * alignment within a PMD for userspace and GPA cannot be
		 * mapped with PMD entries, because we'll end up mapping
		 * the wrong pages.
		 *
		 * Consider a layout like the following:
		 *
		 * memslot->userspace_addr:
		 * +-----+--------------------+--------------------+---+
		 * |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
		 * +-----+--------------------+--------------------+---+
		 *
		 * memslot->base_gfn << PAGE_SHIFT:
		 *   +---+--------------------+--------------------+-----+
		 *   |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
		 *   +---+--------------------+--------------------+-----+
		 *
		 * If we create those stage-2 blocks, we'll end up with this
		 * incorrect mapping:
		 *   d -> f
		 *   e -> g
		 *   f -> h
		 */
		gpa_offset = gpa_start & (PMD_SIZE - 1);
		hva_offset = hva_start & (PMD_SIZE - 1);
		if (gpa_offset != hva_offset) {
			new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
		} else {
			if (gpa_offset == 0)
				gpa_offset = PMD_SIZE;
			if ((size + gpa_offset) < (PMD_SIZE * 2))
				new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
		}
	}

	return 0;
}
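
/*
 * Worked example for the check above (illustrative only, 2 MiB PMD_SIZE
 * assumed): gpa_start = 0x200000 with hva_start = 0x7f0000200000 gives
 * gpa_offset == hva_offset == 0, so huge mappings stay possible provided
 * the slot spans at least one full PMD; with hva_start = 0x7f0000300000
 * the offsets differ (0x0 vs 0x100000) and the slot is marked
 * KVM_MEM_HUGEPAGE_INCAPABLE.
 */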

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	int needs_flush;

	/*
	 * If dirty page logging is enabled, write protect all pages in the slot
	 * ready for dirty logging.
	 *
	 * There is no need to do this in any of the following cases:
	 * CREATE:	No dirty mappings will already exist.
	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot()
	 */
	if (change == KVM_MR_FLAGS_ONLY &&
	    (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
	     new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
		spin_lock(&kvm->mmu_lock);
		/* Write protect GPA page table entries */
		needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
					new->base_gfn + new->npages);
		spin_unlock(&kvm->mmu_lock);
		if (needs_flush)
			kvm_flush_remote_tlbs(kvm);
	}
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	/*
	 * The slot has been made invalid (ready for moving or deletion), so we
	 * need to ensure that it can no longer be accessed by any guest vCPUs.
	 */
	kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1);
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_ptw_ctx ctx;

	ctx.flag = 0;
	ctx.ops = kvm_flush_pte;
	kvm_ptw_prepare(kvm, &ctx);
	INIT_LIST_HEAD(&ctx.list);

	return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
			range->end << PAGE_SHIFT, &ctx);
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	unsigned long prot_bits;
	kvm_pte_t *ptep;
	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
	gpa_t gpa = range->start << PAGE_SHIFT;

	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
	if (!ptep)
		return false;

	/* Replacing an absent or old page doesn't need flushes */
	if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) {
		kvm_set_pte(ptep, 0);
		return false;
	}

	/* Fill new pte if write protected or page migrated */
	prot_bits = _PAGE_PRESENT | __READABLE;
	prot_bits |= _CACHE_MASK & pte_val(range->arg.pte);

	/*
	 * Set _PAGE_WRITE or _PAGE_DIRTY iff both the old and the new pte
	 * carry the bit:
	 *  _PAGE_WRITE so kvm_map_page_fast() can handle the next write fault
	 *  _PAGE_DIRTY since the gpa has already been recorded as dirty
	 */
	prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte);
	kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits)));

	return true;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_ptw_ctx ctx;

	ctx.flag = 0;
	ctx.ops = kvm_mkold_pte;
	kvm_ptw_prepare(kvm, &ctx);

	return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
			range->end << PAGE_SHIFT, &ctx);
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	gpa_t gpa = range->start << PAGE_SHIFT;
	kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);

	if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep))
		return true;

	return false;
}

/*
 * kvm_map_page_fast() - Fast path GPA fault handler.
 * @vcpu:	vCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Perform fast path GPA fault handling, doing all that can be done without
 * calling into KVM. This handles marking old pages young (for idle page
 * tracking), and dirtying of clean pages (for dirty page logging).
 *
 * Returns:	0 on success, in which case we can update derived mappings and
 *		resume guest execution.
 *		-EFAULT on failure due to absent GPA mapping or write to
 *		read-only page, in which case KVM must be consulted.
 */
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret = 0;
	kvm_pfn_t pfn = 0;
	kvm_pte_t *ptep, changed, new;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *slot;

	spin_lock(&kvm->mmu_lock);

	/* Fast path - just check GPA page table for an existing entry */
	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
	if (!ptep || !kvm_pte_present(NULL, ptep)) {
		ret = -EFAULT;
		goto out;
	}

	/* Track access to pages marked old */
	new = *ptep;
	if (!kvm_pte_young(new))
		new = kvm_pte_mkyoung(new);
	/* call kvm_set_pfn_accessed() after unlock */

	if (write && !kvm_pte_dirty(new)) {
		if (!kvm_pte_write(new)) {
			ret = -EFAULT;
			goto out;
		}

		if (kvm_pte_huge(new)) {
			/*
			 * Do not set write permission when dirty logging is
			 * enabled for HugePages
			 */
			slot = gfn_to_memslot(kvm, gfn);
			if (kvm_slot_dirty_track_enabled(slot)) {
				ret = -EFAULT;
				goto out;
			}
		}

		/* Track dirtying of writeable pages */
		new = kvm_pte_mkdirty(new);
	}

	changed = new ^ (*ptep);
	if (changed) {
		kvm_set_pte(ptep, new);
		pfn = kvm_pte_pfn(new);
	}
	spin_unlock(&kvm->mmu_lock);

	/*
	 * Fixme: pfn may be freed after mmu_lock
	 * kvm_try_get_pfn(pfn)/kvm_release_pfn pair to prevent this?
	 */
	if (kvm_pte_young(changed))
		kvm_set_pfn_accessed(pfn);

	if (kvm_pte_dirty(changed)) {
		mark_page_dirty(kvm, gfn);
		kvm_set_pfn_dirty(pfn);
	}
	return ret;
out:
	spin_unlock(&kvm->mmu_lock);
	return ret;
}

static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
				unsigned long hva, bool write)
{
	hva_t start, end;

	/* Disable dirty logging on HugePages */
	if (kvm_slot_dirty_track_enabled(memslot) && write)
		return false;

	if (kvm_hugepage_capable(memslot))
		return true;

	if (kvm_hugepage_incapable(memslot))
		return false;

	start = memslot->userspace_addr;
	end = start + memslot->npages * PAGE_SIZE;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva >= ALIGN(start, PMD_SIZE)) && (hva < ALIGN_DOWN(end, PMD_SIZE));
}
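
/*
 * Boundary example (illustrative only, 2 MiB PMD_SIZE assumed): for a
 * slot with start = 0x7f0000100000 and end = 0x7f0000500000,
 * ALIGN(start, PMD_SIZE) is 0x7f0000200000 and ALIGN_DOWN(end, PMD_SIZE)
 * is 0x7f0000400000, so only faults with hva in that middle window may be
 * mapped huge; the unaligned head and tail fall back to small pages.
 */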

/*
 * Lookup the mapping level for @gfn in the current mm.
 *
 * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
 * consumer to be tied into KVM's handlers for MMU notifier events!
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
 *   consuming it. In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva. This can be done by explicitly checking the MMU notifier
 *   or by ensuring that KVM already has a valid mapping that covers the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry. In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note! The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 */
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
				const struct kvm_memory_slot *slot)
{
	int level = 0;
	unsigned long hva;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once. As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk. Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = READ_ONCE(*pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;

	pmd = READ_ONCE(*pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	if (kvm_pte_huge(pmd_val(pmd)))
		level = 1;

out:
	local_irq_restore(flags);
	return level;
}

/*
 * Split huge page
 */
static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn)
{
	int i;
	kvm_pte_t val, *child;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache;

	memcache = &vcpu->arch.mmu_page_cache;
	child = kvm_mmu_memory_cache_alloc(memcache);
	val = kvm_pte_mksmall(*ptep);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		kvm_set_pte(child + i, val);
		val += PAGE_SIZE;
	}

	/* The later kvm_flush_tlb_gpa() will flush hugepage tlb */
	kvm_set_pte(ptep, __pa(child));

	kvm->stat.hugepages--;
	kvm->stat.pages += PTRS_PER_PTE;

	return child + (gfn & (PTRS_PER_PTE - 1));
}
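
/*
 * Worked example (illustrative only, assuming 4 KiB pages for the
 * arithmetic): splitting a huge pte that maps physical 0x80000000 fills
 * the child table with small ptes for 0x80000000, 0x80001000, 0x80002000,
 * ... (stepping by PAGE_SIZE), and the returned pointer indexes that
 * table with the low bits of @gfn.
 */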

/*
 * kvm_map_page() - Map a guest physical page.
 * @vcpu:	vCPU pointer.
 * @gpa:	Guest physical address of fault.
 * @write:	Whether the fault was due to a write.
 *
 * Handle GPA faults by creating a new GPA mapping (or updating an existing
 * one).
 *
 * This takes care of marking pages young or dirty (idle/dirty page tracking),
 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 * caller.
 *
 * Returns:	0 on success
 *		-EFAULT if there is no memory region at @gpa or a write was
 *		attempted to a read-only memory region. This is usually handled
 *		as an MMIO access.
 */
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	bool writeable;
	int srcu_idx, err, retry_no = 0, level;
	unsigned long hva, mmu_seq, prot_bits;
	kvm_pfn_t pfn;
	kvm_pte_t *ptep, new_pte;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

	/* Try the fast path to handle old / clean pages */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	err = kvm_map_page_fast(vcpu, gpa, write);
	if (!err)
		goto out;

	memslot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
	if (kvm_is_error_hva(hva) || (write && !writeable)) {
		err = -EFAULT;
		goto out;
	}

	/* We need a minimum of cached pages ready for page table creation */
	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
	if (err)
		goto out;

retry:
	/*
	 * Used to check for invalidations in progress, of the pfn that is
	 * returned by gfn_to_pfn_prot() below.
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	/*
	 * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in
	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
	 * risk the page we get a reference to getting unmapped before we have a
	 * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
	 *
	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
	 * of the pte_unmap_unlock() after the PTE is zapped, and the
	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
	 * mmu_invalidate_seq is incremented.
	 */
	smp_rmb();

	/* Slow path - ask KVM core whether we can access this GPA */
	pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
	if (is_error_noslot_pfn(pfn)) {
		err = -EFAULT;
		goto out;
	}

	/* Check if an invalidation has taken place since we got pfn */
	spin_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
		/*
		 * This can happen when mappings are changed asynchronously, but
		 * also synchronously if a COW is triggered by
		 * gfn_to_pfn_prot().
		 */
		spin_unlock(&kvm->mmu_lock);
		kvm_release_pfn_clean(pfn);
		if (retry_no > 100) {
			retry_no = 0;
			schedule();
		}
		retry_no++;
		goto retry;
	}
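
	/*
	 * Race sketch (illustrative only): if an MMU notifier invalidation
	 * runs between the mmu_invalidate_seq read above and the
	 * spin_lock() here, mmu_invalidate_retry_gfn() sees the bumped
	 * sequence count, so the stale pfn is dropped and the lookup is
	 * retried.
	 */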

	/*
	 * For emulated devices such as virtio devices, the actual cache
	 * attribute is determined by the physical machine.
	 * For a pass-through physical device, it should be uncachable.
	 */
	prot_bits = _PAGE_PRESENT | __READABLE;
	if (pfn_valid(pfn))
		prot_bits |= _CACHE_CC;
	else
		prot_bits |= _CACHE_SUC;

	if (writeable) {
		prot_bits |= _PAGE_WRITE;
		if (write)
			prot_bits |= __WRITEABLE;
	}

	/* Disable dirty logging on HugePages */
	level = 0;
	if (fault_supports_huge_mapping(memslot, hva, write)) {
		level = host_pfn_mapping_level(kvm, gfn, memslot);
		if (level == 1) {
			gfn = gfn & ~(PTRS_PER_PTE - 1);
			pfn = pfn & ~(PTRS_PER_PTE - 1);
		}
	}

	/* Ensure page tables are allocated */
	ptep = kvm_populate_gpa(kvm, memcache, gpa, level);
	new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits));
	if (level == 1) {
		new_pte = kvm_pte_mkhuge(new_pte);
		/*
		 * The previous pmd entry was invalid_pte_table, so there may
		 * be stale small-page TLB entries for this range; flush them
		 * for the current vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		++kvm->stat.hugepages;
	} else if (kvm_pte_huge(*ptep) && write)
		ptep = kvm_split_huge(vcpu, ptep, gfn);
	else
		++kvm->stat.pages;
	kvm_set_pte(ptep, new_pte);
	spin_unlock(&kvm->mmu_lock);

	if (prot_bits & _PAGE_DIRTY) {
		mark_page_dirty_in_slot(kvm, memslot, gfn);
		kvm_set_pfn_dirty(pfn);
	}

	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
out:
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	return err;
}

int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret;

	ret = kvm_map_page(vcpu, gpa, write);
	if (ret)
		return ret;

	/* Invalidate this entry in the TLB */
	kvm_flush_tlb_gpa(vcpu, gpa);

	return 0;
}

void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}

void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	kvm_flush_remote_tlbs(kvm);
}