// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this. We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped. The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory. We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

static unsigned int dma_entry_limit __read_mostly = U16_MAX;
module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
MODULE_PARM_DESC(dma_entry_limit,
		 "Maximum number of user DMA mappings per container (default 65535).");

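/*
 * Per-container state: the IOMMU domains and valid IOVA ranges for the
 * container, the rb-tree of userspace DMA mappings, and dirty tracking
 * bookkeeping.  Serialized by @lock, except @device_list which has its
 * own @device_list_lock.
 */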
struct vfio_iommu {
	struct list_head domain_list;
	struct list_head iova_list;
	struct mutex lock;
	struct rb_root dma_list;
	struct list_head device_list;
	struct mutex device_list_lock;
	unsigned int dma_avail;
	unsigned int vaddr_invalid_count;
	uint64_t pgsize_bitmap;
	uint64_t num_non_pinned_groups;
	bool v2;
	bool nesting;
	bool dirty_page_tracking;
	struct list_head emulated_iommu_groups;
};

struct vfio_domain {
	struct iommu_domain *domain;
	struct list_head next;
	struct list_head group_list;
	bool fgsp : 1;			/* Fine-grained super pages */
	bool enforce_cache_coherency : 1;
};

struct vfio_dma {
	struct rb_node node;
	dma_addr_t iova;		/* Device address */
	unsigned long vaddr;		/* Process virtual addr */
	size_t size;			/* Map size (bytes) */
	int prot;			/* IOMMU_READ/WRITE */
	bool iommu_mapped;
	bool lock_cap;			/* capable(CAP_IPC_LOCK) */
	bool vaddr_invalid;
	struct task_struct *task;
	struct rb_root pfn_list;	/* Ex-user pinned pfn list */
	unsigned long *bitmap;
	struct mm_struct *mm;
	size_t locked_vm;
};

struct vfio_batch {
	struct page **pages;		/* for pin_user_pages_remote */
	struct page *fallback_page;	/* if pages alloc fails */
	int capacity;			/* length of pages array */
	int size;			/* of batch currently */
	int offset;			/* of next entry in pages */
};

struct vfio_iommu_group {
	struct iommu_group *iommu_group;
	struct list_head next;
	bool pinned_page_dirty_scope;
};

struct vfio_iova {
	struct list_head list;
	dma_addr_t start;
	dma_addr_t end;
};

/*
 * Guest RAM pinning working set or DMA target
 */
struct vfio_pfn {
	struct rb_node node;
	dma_addr_t iova;		/* Device address */
	unsigned long pfn;		/* Host pfn */
	unsigned int ref_count;
};

struct vfio_regions {
	struct list_head list;
	dma_addr_t iova;
	phys_addr_t phys;
	size_t len;
};

#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

/*
 * The number-of-bits argument to bitmap_set() is an unsigned int, which
 * __bitmap_set() then treats as a signed int for the unaligned multi-bit
 * case.  The maximum bitmap size supported is therefore 2^31 bits, i.e.
 * 2^31 / 2^3 = 2^28 bytes (256 MB) of bitmap, covering 2^31 * 2^12 =
 * 2^43 (8 TB) of memory on a 4K page system.  For example, a 1 GB (2^18
 * page) mapping needs a 32 KB bitmap.
 */
#define DIRTY_BITMAP_PAGES_MAX	((u64)INT_MAX)
#define DIRTY_BITMAP_SIZE_MAX	DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)

static int put_pfn(unsigned long pfn, int prot);

static struct vfio_iommu_group*
vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
			    struct iommu_group *iommu_group);

/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

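/*
 * Return any vfio_dma overlapping [start, start + size), or NULL.  The
 * dma_list rb-tree is ordered by iova and mappings never overlap, so a
 * standard binary search suffices.
 */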
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + dma->size)
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}

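/*
 * Return the lowest-iova node whose mapping ends beyond @start, i.e. the
 * first mapping that could intersect [start, start + size); NULL if that
 * candidate begins at or after start + size.
 */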
static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
						dma_addr_t start, u64 size)
{
	struct rb_node *res = NULL;
	struct rb_node *node = iommu->dma_list.rb_node;
	struct vfio_dma *dma_res = NULL;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start < dma->iova + dma->size) {
			res = node;
			dma_res = dma;
			if (start >= dma->iova)
				break;
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	if (res && size && dma_res->iova >= start + size)
		res = NULL;
	return res;
}

static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + new->size <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}

static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
{
	uint64_t npages = dma->size / pgsize;

	if (npages > DIRTY_BITMAP_PAGES_MAX)
		return -EINVAL;

	/*
	 * Allocate an extra 64 bits of scratch space so that
	 * bitmap_shift_left() can merge an unaligned number of pages from
	 * an adjacent vfio_dma range.
	 */
	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
			       GFP_KERNEL);
	if (!dma->bitmap)
		return -ENOMEM;

	return 0;
}

static void vfio_dma_bitmap_free(struct vfio_dma *dma)
{
	kvfree(dma->bitmap);
	dma->bitmap = NULL;
}

static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
{
	struct rb_node *p;
	unsigned long pgshift = __ffs(pgsize);

	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);

		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
	}
}

static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
{
	struct rb_node *n;
	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
	}
}

static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
{
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
		int ret;

		ret = vfio_dma_bitmap_alloc(dma, pgsize);
		if (ret) {
			struct rb_node *p;

			for (p = rb_prev(n); p; p = rb_prev(p)) {
				struct vfio_dma *dma = rb_entry(p,
							struct vfio_dma, node);

				vfio_dma_bitmap_free(dma);
			}
			return ret;
		}
		vfio_dma_populate_bitmap(dma, pgsize);
	}
	return 0;
}

static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
{
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		vfio_dma_bitmap_free(dma);
	}
}

/*
 * Helper Functions for host iova-pfn list
 */
static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
{
	struct vfio_pfn *vpfn;
	struct rb_node *node = dma->pfn_list.rb_node;

	while (node) {
		vpfn = rb_entry(node, struct vfio_pfn, node);

		if (iova < vpfn->iova)
			node = node->rb_left;
		else if (iova > vpfn->iova)
			node = node->rb_right;
		else
			return vpfn;
	}
	return NULL;
}

static void vfio_link_pfn(struct vfio_dma *dma,
			  struct vfio_pfn *new)
{
	struct rb_node **link, *parent = NULL;
	struct vfio_pfn *vpfn;

	link = &dma->pfn_list.rb_node;
	while (*link) {
		parent = *link;
		vpfn = rb_entry(parent, struct vfio_pfn, node);

		if (new->iova < vpfn->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &dma->pfn_list);
}

static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
{
	rb_erase(&old->node, &dma->pfn_list);
}

static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
				unsigned long pfn)
{
	struct vfio_pfn *vpfn;

	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
	if (!vpfn)
		return -ENOMEM;

	vpfn->iova = iova;
	vpfn->pfn = pfn;
	vpfn->ref_count = 1;
	vfio_link_pfn(dma, vpfn);
	return 0;
}

static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
				      struct vfio_pfn *vpfn)
{
	vfio_unlink_pfn(dma, vpfn);
	kfree(vpfn);
}

static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
					       unsigned long iova)
{
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);

	if (vpfn)
		vpfn->ref_count++;
	return vpfn;
}

static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
{
	int ret = 0;

	vpfn->ref_count--;
	if (!vpfn->ref_count) {
		ret = put_pfn(vpfn->pfn, dma->prot);
		vfio_remove_from_pfn_list(dma, vpfn);
	}
	return ret;
}

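/* Adjust @mm's locked_vm by @npage pages, holding mmap_lock for write. */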
static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
			bool lock_cap, long npage)
{
	int ret = mmap_write_lock_killable(mm);

	if (ret)
		return ret;

	ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
	mmap_write_unlock(mm);
	return ret;
}

static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
	struct mm_struct *mm;
	int ret;

	if (!npage)
		return 0;

	mm = dma->mm;
	if (async && !mmget_not_zero(mm))
		return -ESRCH; /* process exited */

	ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
	if (!ret)
		dma->locked_vm += npage;

	if (async)
		mmput(mm);

	return ret;
}

/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 * For compound pages, any driver that sets the reserved bit in head
 * page needs to set the reserved bit in all subpages to be safe.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn));

	return true;
}

static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);

		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
		return 1;
	}
	return 0;
}

#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))

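/*
 * Use a full page of struct page pointers for pinning batches; if that
 * allocation fails (or hugepages are disabled), fall back to a one-entry
 * batch embedded in the vfio_batch itself.
 */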
static void vfio_batch_init(struct vfio_batch *batch)
{
	batch->size = 0;
	batch->offset = 0;

	if (unlikely(disable_hugepages))
		goto fallback;

	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
	if (!batch->pages)
		goto fallback;

	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
	return;

fallback:
	batch->pages = &batch->fallback_page;
	batch->capacity = 1;
}

static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
{
	while (batch->size) {
		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);

		put_pfn(pfn, dma->prot);
		batch->offset++;
		batch->size--;
	}
}

static void vfio_batch_fini(struct vfio_batch *batch)
{
	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
		free_page((unsigned long)batch->pages);
}

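/*
 * Look up the pfn backing @vaddr in a VM_PFNMAP vma.  If no pte is present
 * yet, fault the mapping in and retry; -EAGAIN tells the caller the mmap
 * lock was dropped and the vma must be revalidated.
 */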
static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
			    unsigned long vaddr, unsigned long *pfn,
			    bool write_fault)
{
	pte_t *ptep;
	pte_t pte;
	spinlock_t *ptl;
	int ret;

	ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
	if (ret) {
		bool unlocked = false;

		ret = fixup_user_fault(mm, vaddr,
				       FAULT_FLAG_REMOTE |
				       (write_fault ? FAULT_FLAG_WRITE : 0),
				       &unlocked);
		if (unlocked)
			return -EAGAIN;

		if (ret)
			return ret;

		ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
		if (ret)
			return ret;
	}

	pte = ptep_get(ptep);

	if (write_fault && !pte_write(pte))
		ret = -EFAULT;
	else
		*pfn = pte_pfn(pte);

	pte_unmap_unlock(ptep, ptl);
	return ret;
}

/*
 * Returns the positive number of pfns successfully obtained or a negative
 * error code.
 */
static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
			  long npages, int prot, unsigned long *pfn,
			  struct page **pages)
{
	struct vm_area_struct *vma;
	unsigned int flags = 0;
	int ret;

	if (prot & IOMMU_WRITE)
		flags |= FOLL_WRITE;

	mmap_read_lock(mm);
	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
				    pages, NULL);
	if (ret > 0) {
		*pfn = page_to_pfn(pages[0]);
		goto done;
	}

	vaddr = untagged_addr_remote(mm, vaddr);

retry:
	vma = vma_lookup(mm, vaddr);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
		if (ret == -EAGAIN)
			goto retry;

		if (!ret) {
			if (is_invalid_reserved_pfn(*pfn))
				ret = 1;
			else
				ret = -EFAULT;
		}
	}
done:
	mmap_read_unlock(mm);
	return ret;
}

/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
				  long npage, unsigned long *pfn_base,
				  unsigned long limit, struct vfio_batch *batch)
{
	unsigned long pfn;
	struct mm_struct *mm = current->mm;
	long ret, pinned = 0, lock_acct = 0;
	bool rsvd;
	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

	/* This code path is only user initiated */
	if (!mm)
		return -ENODEV;

	if (batch->size) {
		/* Leftover pages in batch from an earlier call. */
		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
		pfn = *pfn_base;
		rsvd = is_invalid_reserved_pfn(*pfn_base);
	} else {
		*pfn_base = 0;
	}

	while (npage) {
		if (!batch->size) {
			/* Empty batch, so refill it. */
			long req_pages = min_t(long, npage, batch->capacity);

			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
					     &pfn, batch->pages);
			if (ret < 0)
				goto unpin_out;

			batch->size = ret;
			batch->offset = 0;

			if (!*pfn_base) {
				*pfn_base = pfn;
				rsvd = is_invalid_reserved_pfn(*pfn_base);
			}
		}

		/*
		 * pfn is preset for the first iteration of this inner loop and
		 * updated at the end to handle a VM_PFNMAP pfn.  In that case,
		 * batch->pages isn't valid (there's no struct page), so allow
		 * batch->pages to be touched only when there's more than one
		 * pfn to check, which guarantees the pfns are from a
		 * !VM_PFNMAP vma.
		 */
		while (true) {
			if (pfn != *pfn_base + pinned ||
			    rsvd != is_invalid_reserved_pfn(pfn))
				goto out;

			/*
			 * Reserved pages aren't counted against the user,
			 * externally pinned pages are already counted against
			 * the user.
			 */
			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
				if (!dma->lock_cap &&
				    mm->locked_vm + lock_acct + 1 > limit) {
					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
						__func__, limit << PAGE_SHIFT);
					ret = -ENOMEM;
					goto unpin_out;
				}
				lock_acct++;
			}

			pinned++;
			npage--;
			vaddr += PAGE_SIZE;
			iova += PAGE_SIZE;
			batch->offset++;
			batch->size--;

			if (!batch->size)
				break;

			pfn = page_to_pfn(batch->pages[batch->offset]);
		}

		if (unlikely(disable_hugepages))
			break;
	}

out:
	ret = vfio_lock_acct(dma, lock_acct, false);

unpin_out:
	if (batch->size == 1 && !batch->offset) {
		/* May be a VM_PFNMAP pfn, which the batch can't remember. */
		put_pfn(pfn, dma->prot);
		batch->size = 0;
	}

	if (ret < 0) {
		if (pinned && !rsvd) {
			for (pfn = *pfn_base; pinned; pfn++, pinned--)
				put_pfn(pfn, dma->prot);
		}
		vfio_batch_unpin(batch, dma);

		return ret;
	}

	return pinned;
}

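/*
 * Unpin a contiguous range of pfns, returning the number of pages no
 * longer accounted as locked.  Pages still held via the external pfn_list
 * remain accounted against the user.
 */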
static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
				    unsigned long pfn, long npage,
				    bool do_accounting)
{
	long unlocked = 0, locked = 0;
	long i;

	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
		if (put_pfn(pfn++, dma->prot)) {
			unlocked++;
			if (vfio_find_vpfn(dma, iova))
				locked++;
		}
	}

	if (do_accounting)
		vfio_lock_acct(dma, locked - unlocked, true);

	return unlocked;
}

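/*
 * Pin a single page on behalf of an emulated (mdev) device.  Unlike the
 * remote path, this may run asynchronously to the task that created the
 * mapping, so take a reference on the saved mm rather than current->mm.
 */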
static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
				  unsigned long *pfn_base, bool do_accounting)
{
	struct page *pages[1];
	struct mm_struct *mm;
	int ret;

	mm = dma->mm;
	if (!mmget_not_zero(mm))
		return -ENODEV;

	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
	if (ret != 1)
		goto out;

	ret = 0;

	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
		ret = vfio_lock_acct(dma, 1, false);
		if (ret) {
			put_pfn(*pfn_base, dma->prot);
			if (ret == -ENOMEM)
				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
					"(%ld) exceeded\n", __func__,
					dma->task->comm, task_pid_nr(dma->task),
					task_rlimit(dma->task, RLIMIT_MEMLOCK));
		}
	}

out:
	mmput(mm);
	return ret;
}

static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
				    bool do_accounting)
{
	int unlocked;
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);

	if (!vpfn)
		return 0;

	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);

	if (do_accounting)
		vfio_lock_acct(dma, -unlocked, true);

	return unlocked;
}

static int vfio_iommu_type1_pin_pages(void *iommu_data,
				      struct iommu_group *iommu_group,
				      dma_addr_t user_iova,
				      int npage, int prot,
				      struct page **pages)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_iommu_group *group;
	int i, j, ret;
	unsigned long remote_vaddr;
	struct vfio_dma *dma;
	bool do_accounting;

	if (!iommu || !pages)
		return -EINVAL;

	/* Supported for v2 version only */
	if (!iommu->v2)
		return -EACCES;

	mutex_lock(&iommu->lock);

	if (WARN_ONCE(iommu->vaddr_invalid_count,
		      "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
		ret = -EBUSY;
		goto pin_done;
	}

	/* Fail if no dma_unmap notifier is registered */
	if (list_empty(&iommu->device_list)) {
		ret = -EINVAL;
		goto pin_done;
	}

	/*
	 * If an iommu-capable domain exists in the container then all pages
	 * are already pinned and accounted.  Accounting is only needed when
	 * there is no iommu-capable domain in the container.
	 */
	do_accounting = list_empty(&iommu->domain_list);

	for (i = 0; i < npage; i++) {
		unsigned long phys_pfn;
		dma_addr_t iova;
		struct vfio_pfn *vpfn;

		iova = user_iova + PAGE_SIZE * i;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		if (!dma) {
			ret = -EINVAL;
			goto pin_unwind;
		}

		if ((dma->prot & prot) != prot) {
			ret = -EPERM;
			goto pin_unwind;
		}

		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
		if (vpfn) {
			pages[i] = pfn_to_page(vpfn->pfn);
			continue;
		}

		remote_vaddr = dma->vaddr + (iova - dma->iova);
		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
					     do_accounting);
		if (ret)
			goto pin_unwind;

		if (!pfn_valid(phys_pfn)) {
			ret = -EINVAL;
			goto pin_unwind;
		}

		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
		if (ret) {
			if (put_pfn(phys_pfn, dma->prot) && do_accounting)
				vfio_lock_acct(dma, -1, true);
			goto pin_unwind;
		}

		pages[i] = pfn_to_page(phys_pfn);

		if (iommu->dirty_page_tracking) {
			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

			/*
			 * Bitmap populated with the smallest supported page
			 * size
			 */
			bitmap_set(dma->bitmap,
				   (iova - dma->iova) >> pgshift, 1);
		}
	}
	ret = i;

	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
	if (!group->pinned_page_dirty_scope) {
		group->pinned_page_dirty_scope = true;
		iommu->num_non_pinned_groups--;
	}

	goto pin_done;

pin_unwind:
	pages[i] = NULL;
	for (j = 0; j < i; j++) {
		dma_addr_t iova;

		iova = user_iova + PAGE_SIZE * j;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		vfio_unpin_page_external(dma, iova, do_accounting);
		pages[j] = NULL;
	}
pin_done:
	mutex_unlock(&iommu->lock);
	return ret;
}


static void vfio_iommu_type1_unpin_pages(void *iommu_data,
					 dma_addr_t user_iova, int npage)
{
	struct vfio_iommu *iommu = iommu_data;
	bool do_accounting;
	int i;

	/* Supported for v2 version only */
	if (WARN_ON(!iommu->v2))
		return;

	mutex_lock(&iommu->lock);

	do_accounting = list_empty(&iommu->domain_list);
	for (i = 0; i < npage; i++) {
		dma_addr_t iova = user_iova + PAGE_SIZE * i;
		struct vfio_dma *dma;

		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		if (!dma)
			break;

		vfio_unpin_page_external(dma, iova, do_accounting);
	}

	mutex_unlock(&iommu->lock);

	WARN_ON(i != npage);
}

static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
			    struct list_head *regions,
			    struct iommu_iotlb_gather *iotlb_gather)
{
	long unlocked = 0;
	struct vfio_regions *entry, *next;

	iommu_iotlb_sync(domain->domain, iotlb_gather);

	list_for_each_entry_safe(entry, next, regions, list) {
		unlocked += vfio_unpin_pages_remote(dma,
						    entry->iova,
						    entry->phys >> PAGE_SHIFT,
						    entry->len >> PAGE_SHIFT,
						    false);
		list_del(&entry->list);
		kfree(entry);
	}

	cond_resched();

	return unlocked;
}

/*
 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
 * track of these regions (currently using a list).
 *
 * This value specifies the maximum number of regions for each IOTLB flush
 * sync.
 */
#define VFIO_IOMMU_TLB_SYNC_MAX		512

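/*
 * Unmap a chunk without waiting for the IOTLB flush: queue the region on
 * @unmapped_list and only sync (and unpin) once VFIO_IOMMU_TLB_SYNC_MAX
 * regions have accumulated or the fast path fails.
 */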
static size_t unmap_unpin_fast(struct vfio_domain *domain,
			       struct vfio_dma *dma, dma_addr_t *iova,
			       size_t len, phys_addr_t phys, long *unlocked,
			       struct list_head *unmapped_list,
			       int *unmapped_cnt,
			       struct iommu_iotlb_gather *iotlb_gather)
{
	size_t unmapped = 0;
	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

	if (entry) {
		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
					    iotlb_gather);

		if (!unmapped) {
			kfree(entry);
		} else {
			entry->iova = *iova;
			entry->phys = phys;
			entry->len = unmapped;
			list_add_tail(&entry->list, unmapped_list);

			*iova += unmapped;
			(*unmapped_cnt)++;
		}
	}

	/*
	 * Sync if the number of fast-unmap regions hits the limit
	 * or in case of errors.
	 */
	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
					     iotlb_gather);
		*unmapped_cnt = 0;
	}

	return unmapped;
}

static size_t unmap_unpin_slow(struct vfio_domain *domain,
			       struct vfio_dma *dma, dma_addr_t *iova,
			       size_t len, phys_addr_t phys,
			       long *unlocked)
{
	size_t unmapped = iommu_unmap(domain->domain, *iova, len);

	if (unmapped) {
		*unlocked += vfio_unpin_pages_remote(dma, *iova,
						     phys >> PAGE_SHIFT,
						     unmapped >> PAGE_SHIFT,
						     false);
		*iova += unmapped;
		cond_resched();
	}
	return unmapped;
}

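/*
 * Tear down the IOMMU mappings for @dma across all domains and unpin the
 * backing pages, returning the number of pages to un-account (or zero if
 * accounting was handled here).
 */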
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
			     bool do_accounting)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	LIST_HEAD(unmapped_region_list);
	struct iommu_iotlb_gather iotlb_gather;
	int unmapped_region_cnt = 0;
	long unlocked = 0;

	if (!dma->size)
		return 0;

	if (list_empty(&iommu->domain_list))
		return 0;

	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system.  Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin.  The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, dma->iova, dma->size);
		cond_resched();
	}

	iommu_iotlb_gather_init(&iotlb_gather);
	while (iova < end) {
		size_t unmapped, len;
		phys_addr_t phys, next;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		/*
		 * To optimize for fewer iommu_unmap() calls, each of which
		 * may require hardware cache flushing, try to find the
		 * largest contiguous physical memory chunk to unmap.
		 */
		for (len = PAGE_SIZE;
		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
			next = iommu_iova_to_phys(domain->domain, iova + len);
			if (next != phys + len)
				break;
		}

		/*
		 * First, try to use fast unmap/unpin. In case of failure,
		 * switch to slow unmap/unpin path.
		 */
		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
					    &unlocked, &unmapped_region_list,
					    &unmapped_region_cnt,
					    &iotlb_gather);
		if (!unmapped) {
			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
						    phys, &unlocked);
			if (WARN_ON(!unmapped))
				break;
		}
	}

	dma->iommu_mapped = false;

	if (unmapped_region_cnt) {
		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
					    &iotlb_gather);
	}

	if (do_accounting) {
		vfio_lock_acct(dma, -unlocked, true);
		return 0;
	}
	return unlocked;
}

static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
	vfio_unmap_unpin(iommu, dma, true);
	vfio_unlink_dma(iommu, dma);
	put_task_struct(dma->task);
	mmdrop(dma->mm);
	vfio_dma_bitmap_free(dma);
	if (dma->vaddr_invalid)
		iommu->vaddr_invalid_count--;
	kfree(dma);
	iommu->dma_avail++;
}

static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;

	iommu->pgsize_bitmap = ULONG_MAX;

	list_for_each_entry(domain, &iommu->domain_list, next)
		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;

	/*
	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
	 * That way the user will be able to map/unmap buffers whose size/
	 * start address is aligned with PAGE_SIZE.  Pinning code uses that
	 * granularity while iommu driver can use the sub-PAGE_SIZE size
	 * to map the buffer.
	 */
	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
		iommu->pgsize_bitmap &= PAGE_MASK;
		iommu->pgsize_bitmap |= PAGE_SIZE;
	}
}

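/*
 * Copy @dma's dirty bitmap into the user-supplied bitmap that starts at
 * @base_iova.  When the mapping doesn't begin on a u64 boundary of the
 * user bitmap, shift our bits left and OR in the user's existing partial
 * word so adjacent vfio_dma ranges can share that word.
 */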
static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
			      struct vfio_dma *dma, dma_addr_t base_iova,
			      size_t pgsize)
{
	unsigned long pgshift = __ffs(pgsize);
	unsigned long nbits = dma->size >> pgshift;
	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
	unsigned long shift = bit_offset % BITS_PER_LONG;
	unsigned long leftover;

	/*
	 * Mark all pages dirty if any IOMMU-capable device is not able to
	 * report dirty pages and all pages are pinned and mapped.
	 */
	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
		bitmap_set(dma->bitmap, 0, nbits);

	if (shift) {
		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
				  nbits + shift);

		if (copy_from_user(&leftover,
				   (void __user *)(bitmap + copy_offset),
				   sizeof(leftover)))
			return -EFAULT;

		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
	}

	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
			 DIRTY_BITMAP_BYTES(nbits + shift)))
		return -EFAULT;

	return 0;
}

static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
				  dma_addr_t iova, size_t size, size_t pgsize)
{
	struct vfio_dma *dma;
	struct rb_node *n;
	unsigned long pgshift = __ffs(pgsize);
	int ret;

	/*
	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
	 * vfio_dma mappings may be combined by specifying large ranges, but
	 * there must not be any previous mappings bisected by the range.
	 * An error will be returned if these conditions are not met.
	 */
	dma = vfio_find_dma(iommu, iova, 1);
	if (dma && dma->iova != iova)
		return -EINVAL;

	dma = vfio_find_dma(iommu, iova + size - 1, 0);
	if (dma && dma->iova + dma->size != iova + size)
		return -EINVAL;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		if (dma->iova < iova)
			continue;

		if (dma->iova > iova + size - 1)
			break;

		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
		if (ret)
			return ret;

		/*
		 * Re-populate bitmap to include all pinned pages which are
		 * considered as dirty but exclude pages which are unpinned and
		 * pages which are marked dirty by vfio_dma_rw()
		 */
		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
		vfio_dma_populate_bitmap(dma, pgsize);
	}
	return 0;
}

static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
{
	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
		return -EINVAL;

	return 0;
}

/*
 * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
 * and unmap iovas within the range we're about to unmap.  Drivers MUST unpin
 * pages in response to an invalidation.
 */
static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
				  struct vfio_dma *dma)
{
	struct vfio_device *device;

	if (list_empty(&iommu->device_list))
		return;

	/*
	 * The device is expected to call vfio_unpin_pages() for any IOVA it
	 * has pinned within the range.  Since vfio_unpin_pages() will
	 * eventually call back down to this code and try to obtain the
	 * iommu->lock we must drop it.
	 */
	mutex_lock(&iommu->device_list_lock);
	mutex_unlock(&iommu->lock);

	list_for_each_entry(device, &iommu->device_list, iommu_entry)
		device->ops->dma_unmap(device, dma->iova, dma->size);

	mutex_unlock(&iommu->device_list_lock);
	mutex_lock(&iommu->lock);
}

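/*
 * Illustrative userspace sketch of the matching VFIO_IOMMU_UNMAP_DMA call
 * (an assumption for illustration, not code from this file; "container"
 * names a VFIO container fd using this backend):
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova = iova,
 *		.size = size,
 *	};
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 *
 * On return, unmap.size reports how many bytes were actually unmapped.
 */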
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap,
			     struct vfio_bitmap *bitmap)
{
	struct vfio_dma *dma, *dma_last = NULL;
	size_t unmapped = 0, pgsize;
	int ret = -EINVAL, retries = 0;
	unsigned long pgshift;
	dma_addr_t iova = unmap->iova;
	u64 size = unmap->size;
	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
	struct rb_node *n, *first_n;

	mutex_lock(&iommu->lock);

	/* Cannot update vaddr if mdev is present. */
	if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
		ret = -EBUSY;
		goto unlock;
	}

	pgshift = __ffs(iommu->pgsize_bitmap);
	pgsize = (size_t)1 << pgshift;

	if (iova & (pgsize - 1))
		goto unlock;

	if (unmap_all) {
		if (iova || size)
			goto unlock;
		size = U64_MAX;
	} else if (!size || size & (pgsize - 1) ||
		   iova + size - 1 < iova || size > SIZE_MAX) {
		goto unlock;
	}

	/* When dirty tracking is enabled, allow only min supported pgsize */
	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
		goto unlock;
	}

	WARN_ON((pgsize - 1) & PAGE_MASK);
again:
	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings.  This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range.  Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked.  We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings.  This
	 * resulted in a couple of unusual behaviors.  First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap.  Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with zero sized unmap.  And an unmap request covering
	 * the first iova of mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings.  Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range.  An error
	 * will be returned if these conditions are not met.  The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
	if (iommu->v2 && !unmap_all) {
		dma = vfio_find_dma(iommu, iova, 1);
		if (dma && dma->iova != iova)
			goto unlock;

		dma = vfio_find_dma(iommu, iova + size - 1, 0);
		if (dma && dma->iova + dma->size != iova + size)
			goto unlock;
	}

	ret = 0;
	n = first_n = vfio_find_dma_first_node(iommu, iova, size);

	while (n) {
		dma = rb_entry(n, struct vfio_dma, node);
		if (dma->iova >= iova + size)
			break;

		if (!iommu->v2 && iova > dma->iova)
			break;

		if (invalidate_vaddr) {
			if (dma->vaddr_invalid) {
				struct rb_node *last_n = n;

				for (n = first_n; n != last_n; n = rb_next(n)) {
					dma = rb_entry(n,
						       struct vfio_dma, node);
					dma->vaddr_invalid = false;
					iommu->vaddr_invalid_count--;
				}
				ret = -EINVAL;
				unmapped = 0;
				break;
			}
			dma->vaddr_invalid = true;
			iommu->vaddr_invalid_count++;
			unmapped += dma->size;
			n = rb_next(n);
			continue;
		}

		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
			if (dma_last == dma) {
				BUG_ON(++retries > 10);
			} else {
				dma_last = dma;
				retries = 0;
			}

			vfio_notify_dma_unmap(iommu, dma);
			goto again;
		}

		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
			ret = update_user_bitmap(bitmap->data, iommu, dma,
						 iova, pgsize);
			if (ret)
				break;
		}

		unmapped += dma->size;
		n = rb_next(n);
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}

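/*
 * Map a pinned, physically contiguous range of pfns into every IOMMU
 * domain in the container, unwinding earlier domains on failure.
 */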
static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | IOMMU_CACHE,
				GFP_KERNEL_ACCOUNT);
		if (ret)
			goto unwind;

		cond_resched();
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
		cond_resched();
	}

	return ret;
}

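/*
 * Pin the user memory backing @dma in contiguous chunks and map each chunk
 * as it is pinned, growing dma->size as we go.  On error the whole
 * vfio_dma is torn down again.
 */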
static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
			    size_t map_size)
{
	dma_addr_t iova = dma->iova;
	unsigned long vaddr = dma->vaddr;
	struct vfio_batch batch;
	size_t size = map_size;
	long npage;
	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	int ret = 0;

	vfio_batch_init(&batch);

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
					      size >> PAGE_SHIFT, &pfn, limit,
					      &batch);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
				     dma->prot);
		if (ret) {
			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
						npage, true);
			vfio_batch_unpin(&batch, dma);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	vfio_batch_fini(&batch);
	dma->iommu_mapped = true;

	if (ret)
		vfio_remove_dma(iommu, dma);

	return ret;
}

/*
 * Check dma map request is within a valid iova range
 */
static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
				      dma_addr_t start, dma_addr_t end)
{
	struct list_head *iova = &iommu->iova_list;
	struct vfio_iova *node;

	list_for_each_entry(node, iova, list) {
		if (start >= node->start && end <= node->end)
			return true;
	}

	/*
	 * Check for list_empty() as well since a container with
	 * a single mdev device will have an empty list.
	 */
	return list_empty(iova);
}

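/*
 * On VFIO_DMA_MAP_FLAG_VADDR the new vaddr may belong to a different
 * process than the one that created the mapping; move the locked-page
 * accounting and the task/mm references over to the caller.
 */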
static int vfio_change_dma_owner(struct vfio_dma *dma)
{
	struct task_struct *task = current->group_leader;
	struct mm_struct *mm = current->mm;
	long npage = dma->locked_vm;
	bool lock_cap;
	int ret;

	if (mm == dma->mm)
		return 0;

	lock_cap = capable(CAP_IPC_LOCK);
	ret = mm_lock_acct(task, mm, lock_cap, npage);
	if (ret)
		return ret;

	if (mmget_not_zero(dma->mm)) {
		mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
		mmput(dma->mm);
	}

	if (dma->task != task) {
		put_task_struct(dma->task);
		dma->task = get_task_struct(task);
	}
	mmdrop(dma->mm);
	dma->mm = mm;
	mmgrab(dma->mm);
	dma->lock_cap = lock_cap;
	return 0;
}

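/*
 * Illustrative userspace sketch of the matching VFIO_IOMMU_MAP_DMA call
 * (an assumption for illustration, not code from this file; "container"
 * names a VFIO container fd and "buf"/"len" a page-aligned buffer):
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova = 0,
 *		.size = len,
 *	};
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */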
static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
	int ret = 0, prot = 0;
	size_t pgsize;
	struct vfio_dma *dma;

	/* Verify that none of our __u64 fields overflow */
	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
		return -EINVAL;

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if ((prot && set_vaddr) || (!prot && !set_vaddr))
		return -EINVAL;

	mutex_lock(&iommu->lock);

	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);

	WARN_ON((pgsize - 1) & PAGE_MASK);

	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Don't allow IOVA or virtual address wrap */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
		ret = -EINVAL;
		goto out_unlock;
	}

	dma = vfio_find_dma(iommu, iova, size);
	if (set_vaddr) {
		if (!dma) {
			ret = -ENOENT;
		} else if (!dma->vaddr_invalid || dma->iova != iova ||
			   dma->size != size) {
			ret = -EINVAL;
		} else {
			ret = vfio_change_dma_owner(dma);
			if (ret)
				goto out_unlock;
			dma->vaddr = vaddr;
			dma->vaddr_invalid = false;
			iommu->vaddr_invalid_count--;
		}
		goto out_unlock;
	} else if (dma) {
		ret = -EEXIST;
		goto out_unlock;
	}

	if (!iommu->dma_avail) {
		ret = -ENOSPC;
		goto out_unlock;
	}

	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	iommu->dma_avail--;
	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/*
	 * We need to be able to both add to a task's locked memory and test
	 * against the locked memory limit and we need to be able to do both
	 * outside of this call path as pinning can be asynchronous via the
	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
	 * task_struct.  Save the group_leader so that all DMA tracking uses
	 * the same task, to make debugging easier.  Tracking VM locked pages
	 * requires an mm_struct, so grab the mm in case the task dies.
	 */
	get_task_struct(current->group_leader);
	dma->task = current->group_leader;
	dma->lock_cap = capable(CAP_IPC_LOCK);
	dma->mm = current->mm;
	mmgrab(dma->mm);

	dma->pfn_list = RB_ROOT;

	/* Insert zero-sized and grow as we map chunks of it */
	vfio_link_dma(iommu, dma);

	/* Don't pin and map if container doesn't contain IOMMU capable domain */
	if (list_empty(&iommu->domain_list))
		dma->size = size;
	else
		ret = vfio_pin_map_dma(iommu, dma, size);

	if (!ret && iommu->dirty_page_tracking) {
		ret = vfio_dma_bitmap_alloc(dma, pgsize);
		if (ret)
			vfio_remove_dma(iommu, dma);
	}

out_unlock:
	mutex_unlock(&iommu->lock);
	return ret;
}

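/*
 * Replay every existing mapping into a newly attached @domain.  Already
 * iommu-mapped ranges are copied by walking the first domain's page
 * tables; others are pinned here for the first time.
 */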
static int vfio_iommu_replay(struct vfio_iommu *iommu,
			     struct vfio_domain *domain)
{
	struct vfio_batch batch;
	struct vfio_domain *d = NULL;
	struct rb_node *n;
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	int ret;

	/* Arbitrarily pick the first domain in the list for lookups */
	if (!list_empty(&iommu->domain_list))
		d = list_first_entry(&iommu->domain_list,
				     struct vfio_domain, next);

	vfio_batch_init(&batch);

	n = rb_first(&iommu->dma_list);

	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		dma = rb_entry(n, struct vfio_dma, node);
		iova = dma->iova;

		while (iova < dma->iova + dma->size) {
			phys_addr_t phys;
			size_t size;

			if (dma->iommu_mapped) {
				phys_addr_t p;
				dma_addr_t i;

				if (WARN_ON(!d)) { /* mapped w/o a domain?! */
					ret = -EINVAL;
					goto unwind;
				}

				phys = iommu_iova_to_phys(d->domain, iova);

				if (WARN_ON(!phys)) {
					iova += PAGE_SIZE;
					continue;
				}

				size = PAGE_SIZE;
				p = phys + size;
				i = iova + size;
				while (i < dma->iova + dma->size &&
				       p == iommu_iova_to_phys(d->domain, i)) {
					size += PAGE_SIZE;
					p += PAGE_SIZE;
					i += PAGE_SIZE;
				}
			} else {
				unsigned long pfn;
				unsigned long vaddr = dma->vaddr +
						      (iova - dma->iova);
				size_t n = dma->iova + dma->size - iova;
				long npage;

				npage = vfio_pin_pages_remote(dma, vaddr,
							      n >> PAGE_SHIFT,
							      &pfn, limit,
							      &batch);
				if (npage <= 0) {
					WARN_ON(!npage);
					ret = (int)npage;
					goto unwind;
				}

				phys = pfn << PAGE_SHIFT;
				size = npage << PAGE_SHIFT;
			}

			ret = iommu_map(domain->domain, iova, phys, size,
					dma->prot | IOMMU_CACHE,
					GFP_KERNEL_ACCOUNT);
			if (ret) {
				if (!dma->iommu_mapped) {
					vfio_unpin_pages_remote(dma, iova,
							phys >> PAGE_SHIFT,
							size >> PAGE_SHIFT,
							true);
					vfio_batch_unpin(&batch, dma);
				}
				goto unwind;
			}

			iova += size;
		}
	}

	/* All dmas are now mapped, defer to second tree walk for unwind */
	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		dma->iommu_mapped = true;
	}

	vfio_batch_fini(&batch);
	return 0;

unwind:
	for (; n; n = rb_prev(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
		dma_addr_t iova;

		if (dma->iommu_mapped) {
			iommu_unmap(domain->domain, dma->iova, dma->size);
			continue;
		}

		iova = dma->iova;
		while (iova < dma->iova + dma->size) {
			phys_addr_t phys, p;
			size_t size;
			dma_addr_t i;

			phys = iommu_iova_to_phys(domain->domain, iova);
			if (!phys) {
				iova += PAGE_SIZE;
				continue;
			}

			size = PAGE_SIZE;
			p = phys + size;
			i = iova + size;
			while (i < dma->iova + dma->size &&
			       p == iommu_iova_to_phys(domain->domain, i)) {
				size += PAGE_SIZE;
				p += PAGE_SIZE;
				i += PAGE_SIZE;
			}

			iommu_unmap(domain->domain, iova, size);
			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
						size >> PAGE_SHIFT, true);
		}
	}

	vfio_batch_fini(&batch);
	return ret;
}

1810 | /* |
1811 | * We change our unmap behavior slightly depending on whether the IOMMU |
1812 | * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage |
1813 | * for practically any contiguous power-of-two mapping we give it. This means |
1814 | * we don't need to look for contiguous chunks ourselves to make unmapping |
1815 | * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d |
1816 | * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks |
1817 | * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when |
1818 | * hugetlbfs is in use. |
1819 | */ |
1820 | static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions) |
1821 | { |
1822 | int ret, order = get_order(PAGE_SIZE * 2); |
1823 | struct vfio_iova *region; |
1824 | struct page *pages; |
1825 | dma_addr_t start; |
1826 | |
1827 | pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); |
1828 | if (!pages) |
1829 | return; |
1830 | |
1831 | list_for_each_entry(region, regions, list) { |
1832 | start = ALIGN(region->start, PAGE_SIZE * 2); |
1833 | if (start >= region->end || (region->end - start < PAGE_SIZE * 2)) |
1834 | continue; |
1835 | |
1836 | ret = iommu_map(domain: domain->domain, iova: start, page_to_phys(pages), PAGE_SIZE * 2, |
1837 | IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE, |
1838 | GFP_KERNEL_ACCOUNT); |
1839 | if (!ret) { |
1840 | size_t unmapped = iommu_unmap(domain: domain->domain, iova: start, PAGE_SIZE); |
1841 | |
1842 | if (unmapped == PAGE_SIZE) |
1843 | iommu_unmap(domain: domain->domain, iova: start + PAGE_SIZE, PAGE_SIZE); |
1844 | else |
1845 | domain->fgsp = true; |
1846 | } |
1847 | break; |
1848 | } |
1849 | |
1850 | __free_pages(page: pages, order); |
1851 | } |
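
/*
 * Worked illustration of the probe above (commentary only, not driver code),
 * assuming a hypothetical IOMMU that backs any aligned two-page mapping with
 * a superpage:
 *
 *	iommu_map(d, start, phys, 2 * PAGE_SIZE, ...) installs a single
 *	8KiB superpage, so iommu_unmap(d, start, PAGE_SIZE) cannot split
 *	it and instead tears down the whole mapping, returning
 *	2 * PAGE_SIZE.
 *
 * Since the returned size differs from PAGE_SIZE, fgsp is set and the unmap
 * paths may issue large iommu_unmap() calls directly rather than first
 * searching for physically contiguous chunks.
 */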

static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
						 struct iommu_group *iommu_group)
{
	struct vfio_iommu_group *g;

	list_for_each_entry(g, &domain->group_list, next) {
		if (g->iommu_group == iommu_group)
			return g;
	}

	return NULL;
}

static struct vfio_iommu_group*
vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
			    struct iommu_group *iommu_group)
{
	struct vfio_iommu_group *group;
	struct vfio_domain *domain;

	list_for_each_entry(domain, &iommu->domain_list, next) {
		group = find_iommu_group(domain, iommu_group);
		if (group)
			return group;
	}

	list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
		if (group->iommu_group == iommu_group)
			return group;
	return NULL;
}

static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
				  phys_addr_t *base)
{
	struct iommu_resv_region *region;
	bool ret = false;

	list_for_each_entry(region, group_resv_regions, list) {
		/*
		 * The presence of any 'real' MSI regions should take
		 * precedence over the software-managed one if the
		 * IOMMU driver happens to advertise both types.
		 */
		if (region->type == IOMMU_RESV_MSI) {
			ret = false;
			break;
		}

		if (region->type == IOMMU_RESV_SW_MSI) {
			*base = region->start;
			ret = true;
		}
	}

	return ret;
}

/*
 * This is a helper function to insert an address range to iova list.
 * The list is initially created with a single entry corresponding to
 * the IOMMU domain geometry to which the device group is attached.
 * The list aperture gets modified when a new domain is added to the
 * container if the new aperture doesn't conflict with the current one
 * or with any existing dma mappings. The list is also modified to
 * exclude any reserved regions associated with the device group.
 */
static int vfio_iommu_iova_insert(struct list_head *head,
				  dma_addr_t start, dma_addr_t end)
{
	struct vfio_iova *region;

	region = kmalloc(sizeof(*region), GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	INIT_LIST_HEAD(&region->list);
	region->start = start;
	region->end = end;

	list_add_tail(&region->list, head);
	return 0;
}

/*
 * Check the new iommu aperture conflicts with existing aper or with any
 * existing dma mappings.
 */
static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
				     dma_addr_t start, dma_addr_t end)
{
	struct vfio_iova *first, *last;
	struct list_head *iova = &iommu->iova_list;

	if (list_empty(iova))
		return false;

	/* Disjoint sets, return conflict */
	first = list_first_entry(iova, struct vfio_iova, list);
	last = list_last_entry(iova, struct vfio_iova, list);
	if (start > last->end || end < first->start)
		return true;

	/* Check for any existing dma mappings below the new start */
	if (start > first->start) {
		if (vfio_find_dma(iommu, first->start, start - first->start))
			return true;
	}

	/* Check for any existing dma mappings beyond the new end */
	if (end < last->end) {
		if (vfio_find_dma(iommu, end + 1, last->end - end))
			return true;
	}

	return false;
}

/*
 * Resize iommu iova aperture window. This is called only if the new
 * aperture has no conflict with existing aperture and dma mappings.
 */
static int vfio_iommu_aper_resize(struct list_head *iova,
				  dma_addr_t start, dma_addr_t end)
{
	struct vfio_iova *node, *next;

	if (list_empty(iova))
		return vfio_iommu_iova_insert(iova, start, end);

	/* Adjust iova list start */
	list_for_each_entry_safe(node, next, iova, list) {
		if (start < node->start)
			break;
		if (start >= node->start && start < node->end) {
			node->start = start;
			break;
		}
		/* Delete nodes before new start */
		list_del(&node->list);
		kfree(node);
	}

	/* Adjust iova list end */
	list_for_each_entry_safe(node, next, iova, list) {
		if (end > node->end)
			continue;
		if (end > node->start && end <= node->end) {
			node->end = end;
			continue;
		}
		/* Delete nodes after new end */
		list_del(&node->list);
		kfree(node);
	}

	return 0;
}

/*
 * Check reserved region conflicts with existing dma mappings
 */
static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
				     struct list_head *resv_regions)
{
	struct iommu_resv_region *region;

	/* Check for conflict with existing dma mappings */
	list_for_each_entry(region, resv_regions, list) {
		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (vfio_find_dma(iommu, region->start, region->length))
			return true;
	}

	return false;
}

/*
 * Check iova region overlap with reserved regions and
 * exclude them from the iommu iova range
 */
static int vfio_iommu_resv_exclude(struct list_head *iova,
				   struct list_head *resv_regions)
{
	struct iommu_resv_region *resv;
	struct vfio_iova *n, *next;

	list_for_each_entry(resv, resv_regions, list) {
		phys_addr_t start, end;

		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		start = resv->start;
		end = resv->start + resv->length - 1;

		list_for_each_entry_safe(n, next, iova, list) {
			int ret = 0;

			/* No overlap */
			if (start > n->end || end < n->start)
				continue;
			/*
			 * Insert a new node if current node overlaps with the
			 * reserve region to exclude that from valid iova range.
			 * Note that, new node is inserted before the current
			 * node and finally the current node is deleted keeping
			 * the list updated and sorted.
			 */
			if (start > n->start)
				ret = vfio_iommu_iova_insert(&n->list, n->start,
							     start - 1);
			if (!ret && end < n->end)
				ret = vfio_iommu_iova_insert(&n->list, end + 1,
							     n->end);
			if (ret)
				return ret;

			list_del(&n->list);
			kfree(n);
		}
	}

	if (list_empty(iova))
		return -EINVAL;

	return 0;
}
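
/*
 * Worked example (commentary only): excluding the common x86 MSI window
 * [0xfee00000, 0xfeefffff] from a single aperture node [0x0, 0xffffffff]
 * leaves the list split in two:
 *
 *	{ start = 0x00000000, end = 0xfedfffff }
 *	{ start = 0xfef00000, end = 0xffffffff }
 *
 * Userspace sees exactly these ranges through the
 * VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability built further below.
 */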

static void vfio_iommu_resv_free(struct list_head *resv_regions)
{
	struct iommu_resv_region *n, *next;

	list_for_each_entry_safe(n, next, resv_regions, list) {
		list_del(&n->list);
		kfree(n);
	}
}

static void vfio_iommu_iova_free(struct list_head *iova)
{
	struct vfio_iova *n, *next;

	list_for_each_entry_safe(n, next, iova, list) {
		list_del(&n->list);
		kfree(n);
	}
}

static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
				    struct list_head *iova_copy)
{
	struct list_head *iova = &iommu->iova_list;
	struct vfio_iova *n;
	int ret;

	list_for_each_entry(n, iova, list) {
		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
		if (ret)
			goto out_free;
	}

	return 0;

out_free:
	vfio_iommu_iova_free(iova_copy);
	return ret;
}

static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
					struct list_head *iova_copy)
{
	struct list_head *iova = &iommu->iova_list;

	vfio_iommu_iova_free(iova);

	list_splice_tail(iova_copy, iova);
}

static int vfio_iommu_domain_alloc(struct device *dev, void *data)
{
	struct iommu_domain **domain = data;

	*domain = iommu_domain_alloc(dev->bus);
	return 1; /* Don't iterate */
}

static int vfio_iommu_type1_attach_group(void *iommu_data,
		struct iommu_group *iommu_group, enum vfio_group_type type)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_iommu_group *group;
	struct vfio_domain *domain, *d;
	bool resv_msi;
	phys_addr_t resv_msi_base = 0;
	struct iommu_domain_geometry *geo;
	LIST_HEAD(iova_copy);
	LIST_HEAD(group_resv_regions);
	int ret = -EBUSY;

	mutex_lock(&iommu->lock);

	/* Attach could require pinning, so disallow while vaddr is invalid. */
	if (iommu->vaddr_invalid_count)
		goto out_unlock;

	/* Check for duplicates */
	ret = -EINVAL;
	if (vfio_iommu_find_iommu_group(iommu, iommu_group))
		goto out_unlock;

	ret = -ENOMEM;
	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		goto out_unlock;
	group->iommu_group = iommu_group;

	if (type == VFIO_EMULATED_IOMMU) {
		list_add(&group->next, &iommu->emulated_iommu_groups);
		/*
		 * An emulated IOMMU group cannot dirty memory directly, it can
		 * only use interfaces that provide dirty tracking.
		 * The iommu scope can only be promoted with the addition of a
		 * dirty tracking group.
		 */
		group->pinned_page_dirty_scope = true;
		ret = 0;
		goto out_unlock;
	}

	ret = -ENOMEM;
	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		goto out_free_group;

	/*
	 * Going via the iommu_group iterator avoids races, and trivially gives
	 * us a representative device for the IOMMU API call. We don't actually
	 * want to iterate beyond the first device (if any).
	 */
	ret = -EIO;
	iommu_group_for_each_dev(iommu_group, &domain->domain,
				 vfio_iommu_domain_alloc);
	if (!domain->domain)
		goto out_free_domain;

	if (iommu->nesting) {
		ret = iommu_enable_nesting(domain->domain);
		if (ret)
			goto out_domain;
	}

	ret = iommu_attach_group(domain->domain, group->iommu_group);
	if (ret)
		goto out_domain;

	/* Get aperture info */
	geo = &domain->domain->geometry;
	if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
				     geo->aperture_end)) {
		ret = -EINVAL;
		goto out_detach;
	}

	ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
	if (ret)
		goto out_detach;

	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
		ret = -EINVAL;
		goto out_detach;
	}

	/*
	 * We don't want to work on the original iova list as the list
	 * gets modified and in case of failure we have to retain the
	 * original list. Get a copy here.
	 */
	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
	if (ret)
		goto out_detach;

	ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
				     geo->aperture_end);
	if (ret)
		goto out_detach;

	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
	if (ret)
		goto out_detach;

	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);

	INIT_LIST_HEAD(&domain->group_list);
	list_add(&group->next, &domain->group_list);

	if (!allow_unsafe_interrupts &&
	    !iommu_group_has_isolated_msi(iommu_group)) {
		pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
		       __func__);
		ret = -EPERM;
		goto out_detach;
	}

	/*
	 * If the IOMMU can block non-coherent operations (ie PCIe TLPs with
	 * no-snoop set) then VFIO always turns this feature on because on Intel
	 * platforms it optimizes KVM to disable wbinvd emulation.
	 */
	if (domain->domain->ops->enforce_cache_coherency)
		domain->enforce_cache_coherency =
			domain->domain->ops->enforce_cache_coherency(
				domain->domain);

	/*
	 * Try to match an existing compatible domain. We don't want to
	 * preclude an IOMMU driver supporting multiple bus_types and being
	 * able to include different bus_types in the same IOMMU domain, so
	 * we test whether the domains use the same iommu_ops rather than
	 * testing if they're on the same bus_type.
	 */
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (d->domain->ops == domain->domain->ops &&
		    d->enforce_cache_coherency ==
			    domain->enforce_cache_coherency) {
			iommu_detach_group(domain->domain, group->iommu_group);
			if (!iommu_attach_group(d->domain,
						group->iommu_group)) {
				list_add(&group->next, &d->group_list);
				iommu_domain_free(domain->domain);
				kfree(domain);
				goto done;
			}

			ret = iommu_attach_group(domain->domain,
						 group->iommu_group);
			if (ret)
				goto out_domain;
		}
	}

	vfio_test_domain_fgsp(domain, &iova_copy);

	/* replay mappings on new domains */
	ret = vfio_iommu_replay(iommu, domain);
	if (ret)
		goto out_detach;

	if (resv_msi) {
		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
		if (ret && ret != -ENODEV)
			goto out_detach;
	}

	list_add(&domain->next, &iommu->domain_list);
	vfio_update_pgsize_bitmap(iommu);
done:
	/* Delete the old one and insert new iova list */
	vfio_iommu_iova_insert_copy(iommu, &iova_copy);

	/*
	 * An iommu backed group can dirty memory directly and therefore
	 * demotes the iommu scope until it declares itself dirty tracking
	 * capable via the page pinning interface.
	 */
	iommu->num_non_pinned_groups++;
	mutex_unlock(&iommu->lock);
	vfio_iommu_resv_free(&group_resv_regions);

	return 0;

out_detach:
	iommu_detach_group(domain->domain, group->iommu_group);
out_domain:
	iommu_domain_free(domain->domain);
	vfio_iommu_iova_free(&iova_copy);
	vfio_iommu_resv_free(&group_resv_regions);
out_free_domain:
	kfree(domain);
out_free_group:
	kfree(group);
out_unlock:
	mutex_unlock(&iommu->lock);
	return ret;
}
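
/*
 * For context, a minimal userspace sequence that leads into the attach above
 * (sketch, following Documentation/driver-api/vfio.rst; the group number is
 * an example):
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *
 * Setting the IOMMU model (or adding another group to an already-configured
 * container) is what ends up calling vfio_iommu_type1_attach_group().
 */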

static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
{
	struct rb_node *node;

	while ((node = rb_first(&iommu->dma_list)))
		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
}

static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
{
	struct rb_node *n, *p;

	n = rb_first(&iommu->dma_list);
	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		long locked = 0, unlocked = 0;

		dma = rb_entry(n, struct vfio_dma, node);
		unlocked += vfio_unmap_unpin(iommu, dma, false);
		p = rb_first(&dma->pfn_list);
		for (; p; p = rb_next(p)) {
			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
							 node);

			if (!is_invalid_reserved_pfn(vpfn->pfn))
				locked++;
		}
		vfio_lock_acct(dma, locked - unlocked, true);
	}
}

/*
 * Called when a domain is removed in detach. It is possible that
 * the removed domain decided the iova aperture window. Modify the
 * iova aperture with the smallest window among existing domains.
 */
static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
				   struct list_head *iova_copy)
{
	struct vfio_domain *domain;
	struct vfio_iova *node;
	dma_addr_t start = 0;
	dma_addr_t end = (dma_addr_t)~0;

	if (list_empty(iova_copy))
		return;

	list_for_each_entry(domain, &iommu->domain_list, next) {
		struct iommu_domain_geometry *geo = &domain->domain->geometry;

		if (geo->aperture_start > start)
			start = geo->aperture_start;
		if (geo->aperture_end < end)
			end = geo->aperture_end;
	}

	/* Modify aperture limits. The new aper is either same or bigger */
	node = list_first_entry(iova_copy, struct vfio_iova, list);
	node->start = start;
	node = list_last_entry(iova_copy, struct vfio_iova, list);
	node->end = end;
}

/*
 * Called when a group is detached. The reserved regions for that
 * group can be part of valid iova now. But since reserved regions
 * may be duplicated among groups, populate the iova valid regions
 * list again.
 */
static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
				   struct list_head *iova_copy)
{
	struct vfio_domain *d;
	struct vfio_iommu_group *g;
	struct vfio_iova *node;
	dma_addr_t start, end;
	LIST_HEAD(resv_regions);
	int ret;

	if (list_empty(iova_copy))
		return -EINVAL;

	list_for_each_entry(d, &iommu->domain_list, next) {
		list_for_each_entry(g, &d->group_list, next) {
			ret = iommu_get_group_resv_regions(g->iommu_group,
							   &resv_regions);
			if (ret)
				goto done;
		}
	}

	node = list_first_entry(iova_copy, struct vfio_iova, list);
	start = node->start;
	node = list_last_entry(iova_copy, struct vfio_iova, list);
	end = node->end;

	/* purge the iova list and create new one */
	vfio_iommu_iova_free(iova_copy);

	ret = vfio_iommu_aper_resize(iova_copy, start, end);
	if (ret)
		goto done;

	/* Exclude current reserved regions from iova ranges */
	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
done:
	vfio_iommu_resv_free(&resv_regions);
	return ret;
}

static void vfio_iommu_type1_detach_group(void *iommu_data,
					  struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain;
	struct vfio_iommu_group *group;
	bool update_dirty_scope = false;
	LIST_HEAD(iova_copy);

	mutex_lock(&iommu->lock);
	list_for_each_entry(group, &iommu->emulated_iommu_groups, next) {
		if (group->iommu_group != iommu_group)
			continue;
		update_dirty_scope = !group->pinned_page_dirty_scope;
		list_del(&group->next);
		kfree(group);

		if (list_empty(&iommu->emulated_iommu_groups) &&
		    list_empty(&iommu->domain_list)) {
			WARN_ON(!list_empty(&iommu->device_list));
			vfio_iommu_unmap_unpin_all(iommu);
		}
		goto detach_group_done;
	}

	/*
	 * Get a copy of iova list. This will be used to update
	 * and to replace the current one later. Please note that
	 * we will leave the original list as it is if update fails.
	 */
	vfio_iommu_iova_get_copy(iommu, &iova_copy);

	list_for_each_entry(domain, &iommu->domain_list, next) {
		group = find_iommu_group(domain, iommu_group);
		if (!group)
			continue;

		iommu_detach_group(domain->domain, group->iommu_group);
		update_dirty_scope = !group->pinned_page_dirty_scope;
		list_del(&group->next);
		kfree(group);
		/*
		 * Group ownership provides privilege, if the group list is
		 * empty, the domain goes away. If it's the last domain with
		 * iommu and no external (emulated) domain exists, then all
		 * the mappings go away too. If it's the last domain with
		 * iommu and an external domain exists, update accounting.
		 */
		if (list_empty(&domain->group_list)) {
			if (list_is_singular(&iommu->domain_list)) {
				if (list_empty(&iommu->emulated_iommu_groups)) {
					WARN_ON(!list_empty(
						&iommu->device_list));
					vfio_iommu_unmap_unpin_all(iommu);
				} else {
					vfio_iommu_unmap_unpin_reaccount(iommu);
				}
			}
			iommu_domain_free(domain->domain);
			list_del(&domain->next);
			kfree(domain);
			vfio_iommu_aper_expand(iommu, &iova_copy);
			vfio_update_pgsize_bitmap(iommu);
		}
		break;
	}

	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
	else
		vfio_iommu_iova_free(&iova_copy);

detach_group_done:
	/*
	 * Removal of a group without dirty tracking may allow the iommu scope
	 * to be promoted.
	 */
	if (update_dirty_scope) {
		iommu->num_non_pinned_groups--;
		if (iommu->dirty_page_tracking)
			vfio_iommu_populate_bitmap_full(iommu);
	}
	mutex_unlock(&iommu->lock);
}

static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	switch (arg) {
	case VFIO_TYPE1_IOMMU:
		break;
	case VFIO_TYPE1_NESTING_IOMMU:
		iommu->nesting = true;
		fallthrough;
	case VFIO_TYPE1v2_IOMMU:
		iommu->v2 = true;
		break;
	default:
		kfree(iommu);
		return ERR_PTR(-EINVAL);
	}

	INIT_LIST_HEAD(&iommu->domain_list);
	INIT_LIST_HEAD(&iommu->iova_list);
	iommu->dma_list = RB_ROOT;
	iommu->dma_avail = dma_entry_limit;
	mutex_init(&iommu->lock);
	mutex_init(&iommu->device_list_lock);
	INIT_LIST_HEAD(&iommu->device_list);
	iommu->pgsize_bitmap = PAGE_MASK;
	INIT_LIST_HEAD(&iommu->emulated_iommu_groups);

	return iommu;
}
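
/*
 * The arg value accepted here is normally negotiated by userspace with
 * VFIO_CHECK_EXTENSION before VFIO_SET_IOMMU reaches this open. A short
 * sketch (illustrative only):
 *
 *	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
 *		ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *	else
 *		ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */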

static void vfio_release_domain(struct vfio_domain *domain)
{
	struct vfio_iommu_group *group, *group_tmp;

	list_for_each_entry_safe(group, group_tmp,
				 &domain->group_list, next) {
		iommu_detach_group(domain->domain, group->iommu_group);
		list_del(&group->next);
		kfree(group);
	}

	iommu_domain_free(domain->domain);
}

static void vfio_iommu_type1_release(void *iommu_data)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain, *domain_tmp;
	struct vfio_iommu_group *group, *next_group;

	list_for_each_entry_safe(group, next_group,
				 &iommu->emulated_iommu_groups, next) {
		list_del(&group->next);
		kfree(group);
	}

	vfio_iommu_unmap_unpin_all(iommu);

	list_for_each_entry_safe(domain, domain_tmp,
				 &iommu->domain_list, next) {
		vfio_release_domain(domain);
		list_del(&domain->next);
		kfree(domain);
	}

	vfio_iommu_iova_free(&iommu->iova_list);

	kfree(iommu);
}

static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	int ret = 1;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next) {
		if (!(domain->enforce_cache_coherency)) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return ret;
}

static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
{
	bool ret;

	mutex_lock(&iommu->lock);
	ret = !list_empty(&iommu->emulated_iommu_groups);
	mutex_unlock(&iommu->lock);
	return ret;
}

static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
					    unsigned long arg)
{
	switch (arg) {
	case VFIO_TYPE1_IOMMU:
	case VFIO_TYPE1v2_IOMMU:
	case VFIO_TYPE1_NESTING_IOMMU:
	case VFIO_UNMAP_ALL:
		return 1;
	case VFIO_UPDATE_VADDR:
		/*
		 * Disable this feature if mdevs are present. They cannot
		 * safely pin/unpin/rw while vaddrs are being updated.
		 */
		return iommu && !vfio_iommu_has_emulated(iommu);
	case VFIO_DMA_CC_IOMMU:
		if (!iommu)
			return 0;
		return vfio_domains_have_enforce_cache_coherency(iommu);
	default:
		return 0;
	}
}

static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
		struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
		size_t size)
{
	struct vfio_info_cap_header *header;
	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;

	header = vfio_info_cap_add(caps, size,
				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	iova_cap = container_of(header,
				struct vfio_iommu_type1_info_cap_iova_range,
				header);
	iova_cap->nr_iovas = cap_iovas->nr_iovas;
	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
	return 0;
}

static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
				      struct vfio_info_cap *caps)
{
	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
	struct vfio_iova *iova;
	size_t size;
	int iovas = 0, i = 0, ret;

	list_for_each_entry(iova, &iommu->iova_list, list)
		iovas++;

	if (!iovas) {
		/*
		 * Return 0 as a container with a single mdev device
		 * will have an empty list
		 */
		return 0;
	}

	size = struct_size(cap_iovas, iova_ranges, iovas);

	cap_iovas = kzalloc(size, GFP_KERNEL);
	if (!cap_iovas)
		return -ENOMEM;

	cap_iovas->nr_iovas = iovas;

	list_for_each_entry(iova, &iommu->iova_list, list) {
		cap_iovas->iova_ranges[i].start = iova->start;
		cap_iovas->iova_ranges[i].end = iova->end;
		i++;
	}

	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);

	kfree(cap_iovas);
	return ret;
}

static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
					   struct vfio_info_cap *caps)
{
	struct vfio_iommu_type1_info_cap_migration cap_mig = {};

	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
	cap_mig.header.version = 1;

	cap_mig.flags = 0;
	/* support minimum pgsize */
	cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;

	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
}

static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
					   struct vfio_info_cap *caps)
{
	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;

	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
	cap_dma_avail.header.version = 1;

	cap_dma_avail.avail = iommu->dma_avail;

	return vfio_info_add_capability(caps, &cap_dma_avail.header,
					sizeof(cap_dma_avail));
}

static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
				     unsigned long arg)
{
	struct vfio_iommu_type1_info info = {};
	unsigned long minsz;
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	int ret;

	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	minsz = min_t(size_t, info.argsz, sizeof(info));

	mutex_lock(&iommu->lock);
	info.flags = VFIO_IOMMU_INFO_PGSIZES;

	info.iova_pgsizes = iommu->pgsize_bitmap;

	ret = vfio_iommu_migration_build_caps(iommu, &caps);

	if (!ret)
		ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);

	if (!ret)
		ret = vfio_iommu_iova_build_caps(iommu, &caps);

	mutex_unlock(&iommu->lock);

	if (ret)
		return ret;

	if (caps.size) {
		info.flags |= VFIO_IOMMU_INFO_CAPS;

		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user((void __user *)arg +
					sizeof(info), caps.buf,
					caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info.cap_offset = sizeof(info);
		}

		kfree(caps.buf);
	}

	return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
}
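
/*
 * Illustrative caller pattern (sketch): userspace typically calls
 * VFIO_IOMMU_GET_INFO twice, once to learn the required argsz and once with
 * a buffer large enough for the capability chain:
 *
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(container, VFIO_IOMMU_GET_INFO, &info);
 *	if (info.argsz > sizeof(info)) {
 *		struct vfio_iommu_type1_info *bigger = malloc(info.argsz);
 *
 *		bigger->argsz = info.argsz;
 *		ioctl(container, VFIO_IOMMU_GET_INFO, bigger);
 *		(walk the chain starting at bigger->cap_offset, following
 *		 each vfio_info_cap_header.next offset)
 *	}
 */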

static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
				    unsigned long arg)
{
	struct vfio_iommu_type1_dma_map map;
	unsigned long minsz;
	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
			VFIO_DMA_MAP_FLAG_VADDR;

	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

	if (copy_from_user(&map, (void __user *)arg, minsz))
		return -EFAULT;

	if (map.argsz < minsz || map.flags & ~mask)
		return -EINVAL;

	return vfio_dma_do_map(iommu, &map);
}
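
/*
 * Illustrative userspace call (sketch, values are examples only): map 16MiB
 * of anonymous memory at IOVA 0 for read+write DMA, in the style of the
 * example in Documentation/driver-api/vfio.rst:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)mmap(NULL, 16 << 20, PROT_READ | PROT_WRITE,
 *				     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
 *		.iova = 0,
 *		.size = 16 << 20,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */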

static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
				      unsigned long arg)
{
	struct vfio_iommu_type1_dma_unmap unmap;
	struct vfio_bitmap bitmap = { 0 };
	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
			VFIO_DMA_UNMAP_FLAG_VADDR |
			VFIO_DMA_UNMAP_FLAG_ALL;
	unsigned long minsz;
	int ret;

	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

	if (copy_from_user(&unmap, (void __user *)arg, minsz))
		return -EFAULT;

	if (unmap.argsz < minsz || unmap.flags & ~mask)
		return -EINVAL;

	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
			    VFIO_DMA_UNMAP_FLAG_VADDR)))
		return -EINVAL;

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
		unsigned long pgshift;

		if (unmap.argsz < (minsz + sizeof(bitmap)))
			return -EINVAL;

		if (copy_from_user(&bitmap,
				   (void __user *)(arg + minsz),
				   sizeof(bitmap)))
			return -EFAULT;

		if (!access_ok((void __user *)bitmap.data, bitmap.size))
			return -EINVAL;

		pgshift = __ffs(bitmap.pgsize);
		ret = verify_bitmap_size(unmap.size >> pgshift,
					 bitmap.size);
		if (ret)
			return ret;
	}

	ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
	if (ret)
		return ret;

	return copy_to_user((void __user *)arg, &unmap, minsz) ?
			-EFAULT : 0;
}
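
/*
 * Illustrative userspace call (sketch, assuming dirty tracking has already
 * been started and a 4KiB tracking granule): unmap a 16MiB range and collect
 * its dirty bitmap in one step. The vfio_bitmap must directly follow the
 * unmap struct in memory, hence the wrapper; dirty_bits is a caller-provided
 * buffer of (16MiB / 4KiB) bits:
 *
 *	__u64 dirty_bits[(16 << 20) / 4096 / 64];
 *	struct {
 *		struct vfio_iommu_type1_dma_unmap unmap;
 *		struct vfio_bitmap bitmap;
 *	} req = {
 *		.unmap = {
 *			.argsz = sizeof(req),
 *			.flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP,
 *			.iova = 0,
 *			.size = 16 << 20,
 *		},
 *		.bitmap = {
 *			.pgsize = 4096,
 *			.size = sizeof(dirty_bits),
 *			.data = dirty_bits,
 *		},
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &req);
 */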

static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
					unsigned long arg)
{
	struct vfio_iommu_type1_dirty_bitmap dirty;
	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
	unsigned long minsz;
	int ret = 0;

	if (!iommu->v2)
		return -EACCES;

	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);

	if (copy_from_user(&dirty, (void __user *)arg, minsz))
		return -EFAULT;

	if (dirty.argsz < minsz || dirty.flags & ~mask)
		return -EINVAL;

	/* only one flag should be set at a time */
	if (__ffs(dirty.flags) != __fls(dirty.flags))
		return -EINVAL;

	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
		size_t pgsize;

		mutex_lock(&iommu->lock);
		pgsize = 1 << __ffs(iommu->pgsize_bitmap);
		if (!iommu->dirty_page_tracking) {
			ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
			if (!ret)
				iommu->dirty_page_tracking = true;
		}
		mutex_unlock(&iommu->lock);
		return ret;
	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
		mutex_lock(&iommu->lock);
		if (iommu->dirty_page_tracking) {
			iommu->dirty_page_tracking = false;
			vfio_dma_bitmap_free_all(iommu);
		}
		mutex_unlock(&iommu->lock);
		return 0;
	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
		struct vfio_iommu_type1_dirty_bitmap_get range;
		unsigned long pgshift;
		size_t data_size = dirty.argsz - minsz;
		size_t iommu_pgsize;

		if (!data_size || data_size < sizeof(range))
			return -EINVAL;

		if (copy_from_user(&range, (void __user *)(arg + minsz),
				   sizeof(range)))
			return -EFAULT;

		if (range.iova + range.size < range.iova)
			return -EINVAL;
		if (!access_ok((void __user *)range.bitmap.data,
			       range.bitmap.size))
			return -EINVAL;

		pgshift = __ffs(range.bitmap.pgsize);
		ret = verify_bitmap_size(range.size >> pgshift,
					 range.bitmap.size);
		if (ret)
			return ret;

		mutex_lock(&iommu->lock);

		iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);

		/* allow only smallest supported pgsize */
		if (range.bitmap.pgsize != iommu_pgsize) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (range.iova & (iommu_pgsize - 1)) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (!range.size || range.size & (iommu_pgsize - 1)) {
			ret = -EINVAL;
			goto out_unlock;
		}

		if (iommu->dirty_page_tracking)
			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
						     iommu, range.iova,
						     range.size,
						     range.bitmap.pgsize);
		else
			ret = -EINVAL;
out_unlock:
		mutex_unlock(&iommu->lock);

		return ret;
	}

	return -EINVAL;
}
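
/*
 * Illustrative tracking sequence (sketch): start tracking, query ranges,
 * then stop. A GET_BITMAP query embeds a vfio_iommu_type1_dirty_bitmap_get
 * after the header, analogous to the unmap example above:
 *
 *	struct vfio_iommu_type1_dirty_bitmap cmd = {
 *		.argsz = sizeof(cmd),
 *		.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &cmd);
 *	(... GET_BITMAP queries while the device runs ...)
 *	cmd.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
 *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &cmd);
 */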

static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		return vfio_iommu_type1_check_extension(iommu, arg);
	case VFIO_IOMMU_GET_INFO:
		return vfio_iommu_type1_get_info(iommu, arg);
	case VFIO_IOMMU_MAP_DMA:
		return vfio_iommu_type1_map_dma(iommu, arg);
	case VFIO_IOMMU_UNMAP_DMA:
		return vfio_iommu_type1_unmap_dma(iommu, arg);
	case VFIO_IOMMU_DIRTY_PAGES:
		return vfio_iommu_type1_dirty_pages(iommu, arg);
	default:
		return -ENOTTY;
	}
}

static void vfio_iommu_type1_register_device(void *iommu_data,
					     struct vfio_device *vdev)
{
	struct vfio_iommu *iommu = iommu_data;

	if (!vdev->ops->dma_unmap)
		return;

	/*
	 * list_empty(&iommu->device_list) is tested under the iommu->lock while
	 * iteration for dma_unmap must be done under the device_list_lock.
	 * Holding both locks here allows avoiding the device_list_lock in
	 * several fast paths. See vfio_notify_dma_unmap()
	 */
	mutex_lock(&iommu->lock);
	mutex_lock(&iommu->device_list_lock);
	list_add(&vdev->iommu_entry, &iommu->device_list);
	mutex_unlock(&iommu->device_list_lock);
	mutex_unlock(&iommu->lock);
}

static void vfio_iommu_type1_unregister_device(void *iommu_data,
					       struct vfio_device *vdev)
{
	struct vfio_iommu *iommu = iommu_data;

	if (!vdev->ops->dma_unmap)
		return;

	mutex_lock(&iommu->lock);
	mutex_lock(&iommu->device_list_lock);
	list_del(&vdev->iommu_entry);
	mutex_unlock(&iommu->device_list_lock);
	mutex_unlock(&iommu->lock);
}

static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
					 dma_addr_t user_iova, void *data,
					 size_t count, bool write,
					 size_t *copied)
{
	struct mm_struct *mm;
	unsigned long vaddr;
	struct vfio_dma *dma;
	bool kthread = current->mm == NULL;
	size_t offset;

	*copied = 0;

	dma = vfio_find_dma(iommu, user_iova, 1);
	if (!dma)
		return -EINVAL;

	if ((write && !(dma->prot & IOMMU_WRITE)) ||
	    !(dma->prot & IOMMU_READ))
		return -EPERM;

	mm = dma->mm;
	if (!mmget_not_zero(mm))
		return -EPERM;

	if (kthread)
		kthread_use_mm(mm);
	else if (current->mm != mm)
		goto out;

	offset = user_iova - dma->iova;

	if (count > dma->size - offset)
		count = dma->size - offset;

	vaddr = dma->vaddr + offset;

	if (write) {
		*copied = copy_to_user((void __user *)vaddr, data,
				       count) ? 0 : count;
		if (*copied && iommu->dirty_page_tracking) {
			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
			/*
			 * Bitmap populated with the smallest supported page
			 * size
			 */
			bitmap_set(dma->bitmap, offset >> pgshift,
				   ((offset + *copied - 1) >> pgshift) -
				   (offset >> pgshift) + 1);
		}
	} else
		*copied = copy_from_user(data, (void __user *)vaddr,
					 count) ? 0 : count;
	if (kthread)
		kthread_unuse_mm(mm);
out:
	mmput(mm);
	return *copied ? 0 : -EFAULT;
}

static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
				   void *data, size_t count, bool write)
{
	struct vfio_iommu *iommu = iommu_data;
	int ret = 0;
	size_t done;

	mutex_lock(&iommu->lock);

	if (WARN_ONCE(iommu->vaddr_invalid_count,
		      "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
		ret = -EBUSY;
		goto out;
	}

	while (count > 0) {
		ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
						    count, write, &done);
		if (ret)
			break;

		count -= done;
		data += done;
		user_iova += done;
	}

out:
	mutex_unlock(&iommu->lock);
	return ret;
}
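
/*
 * In-kernel consumers reach the above through vfio_dma_rw() on a registered
 * vfio_device. A minimal sketch, assuming an emulated (mdev-style) driver
 * that already holds a struct vfio_device *vdev and a mapped iova:
 *
 *	u32 val;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, iova, &val, sizeof(val), false); (read)
 *	...
 *	ret = vfio_dma_rw(vdev, iova, &val, sizeof(val), true);  (write)
 */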

static struct iommu_domain *
vfio_iommu_type1_group_iommu_domain(void *iommu_data,
				    struct iommu_group *iommu_group)
{
	struct iommu_domain *domain = ERR_PTR(-ENODEV);
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *d;

	if (!iommu || !iommu_group)
		return ERR_PTR(-EINVAL);

	mutex_lock(&iommu->lock);
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (find_iommu_group(d, iommu_group)) {
			domain = d->domain;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return domain;
}

static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name = "vfio-iommu-type1",
	.owner = THIS_MODULE,
	.open = vfio_iommu_type1_open,
	.release = vfio_iommu_type1_release,
	.ioctl = vfio_iommu_type1_ioctl,
	.attach_group = vfio_iommu_type1_attach_group,
	.detach_group = vfio_iommu_type1_detach_group,
	.pin_pages = vfio_iommu_type1_pin_pages,
	.unpin_pages = vfio_iommu_type1_unpin_pages,
	.register_device = vfio_iommu_type1_register_device,
	.unregister_device = vfio_iommu_type1_unregister_device,
	.dma_rw = vfio_iommu_type1_dma_rw,
	.group_iommu_domain = vfio_iommu_type1_group_iommu_domain,
};

static int __init vfio_iommu_type1_init(void)
{
	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}

static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}

module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);