1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * VFIO: IOMMU DMA mapping support for Type1 IOMMU |
4 | * |
5 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. |
6 | * Author: Alex Williamson <alex.williamson@redhat.com> |
7 | * |
8 | * Derived from original vfio: |
9 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. |
10 | * Author: Tom Lyon, pugs@cisco.com |
11 | * |
12 | * We arbitrarily define a Type1 IOMMU as one matching the below code. |
13 | * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel |
14 | * VT-d, but that makes it harder to re-use as theoretically anyone |
15 | * implementing a similar IOMMU could make use of this. We expect the |
16 | * IOMMU to support the IOMMU API and have few to no restrictions around |
17 | * the IOVA range that can be mapped. The Type1 IOMMU is currently |
18 | * optimized for relatively static mappings of a userspace process with |
19 | * userspace pages pinned into memory. We also assume devices and IOMMU |
20 | * domains are PCI based as the IOMMU API is still centered around a |
21 | * device/bus interface rather than a group interface. |
22 | */ |
23 | |
24 | #include <linux/compat.h> |
25 | #include <linux/device.h> |
26 | #include <linux/fs.h> |
27 | #include <linux/highmem.h> |
28 | #include <linux/iommu.h> |
29 | #include <linux/module.h> |
30 | #include <linux/mm.h> |
31 | #include <linux/kthread.h> |
32 | #include <linux/rbtree.h> |
33 | #include <linux/sched/signal.h> |
34 | #include <linux/sched/mm.h> |
35 | #include <linux/slab.h> |
36 | #include <linux/uaccess.h> |
37 | #include <linux/vfio.h> |
38 | #include <linux/workqueue.h> |
39 | #include <linux/notifier.h> |
40 | #include "vfio.h" |
41 | |
42 | #define DRIVER_VERSION "0.2" |
43 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" |
44 | #define DRIVER_DESC "Type1 IOMMU driver for VFIO" |
45 | |
46 | static bool allow_unsafe_interrupts; |
47 | module_param_named(allow_unsafe_interrupts, |
48 | allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); |
49 | MODULE_PARM_DESC(allow_unsafe_interrupts, |
50 | "Enable VFIO IOMMU support for on platforms without interrupt remapping support."); |
51 | |
52 | static bool disable_hugepages; |
53 | module_param_named(disable_hugepages, |
54 | disable_hugepages, bool, S_IRUGO | S_IWUSR); |
55 | MODULE_PARM_DESC(disable_hugepages, |
56 | "Disable VFIO IOMMU support for IOMMU hugepages."); |
57 | |
58 | static unsigned int dma_entry_limit __read_mostly = U16_MAX; |
59 | module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644); |
60 | MODULE_PARM_DESC(dma_entry_limit, |
61 | "Maximum number of user DMA mappings per container (65535)."); |
62 | |
63 | struct vfio_iommu { |
64 | struct list_head domain_list; |
65 | struct list_head iova_list; |
66 | struct mutex lock; |
67 | struct rb_root dma_list; |
68 | struct list_head device_list; |
69 | struct mutex device_list_lock; |
70 | unsigned int dma_avail; |
71 | unsigned int vaddr_invalid_count; |
72 | uint64_t pgsize_bitmap; |
73 | uint64_t num_non_pinned_groups; |
74 | bool v2; |
75 | bool dirty_page_tracking; |
76 | struct list_head emulated_iommu_groups; |
77 | }; |
78 | |
79 | struct vfio_domain { |
80 | struct iommu_domain *domain; |
81 | struct list_head next; |
82 | struct list_head group_list; |
83 | bool enforce_cache_coherency : 1; |
84 | }; |
85 | |
86 | struct vfio_dma { |
87 | struct rb_node node; |
88 | dma_addr_t iova; /* Device address */ |
89 | unsigned long vaddr; /* Process virtual addr */ |
90 | size_t size; /* Map size (bytes) */ |
91 | int prot; /* IOMMU_READ/WRITE */ |
92 | bool iommu_mapped; |
93 | bool lock_cap; /* capable(CAP_IPC_LOCK) */ |
94 | bool vaddr_invalid; |
95 | struct task_struct *task; |
96 | struct rb_root pfn_list; /* Ex-user pinned pfn list */ |
97 | unsigned long *bitmap; |
98 | struct mm_struct *mm; |
99 | size_t locked_vm; |
100 | }; |
101 | |
102 | struct vfio_batch { |
103 | struct page **pages; /* for pin_user_pages_remote */ |
104 | struct page *fallback_page; /* if pages alloc fails */ |
105 | unsigned int capacity; /* length of pages array */ |
106 | unsigned int size; /* of batch currently */ |
107 | unsigned int offset; /* of next entry in pages */ |
108 | }; |
109 | |
110 | struct vfio_iommu_group { |
111 | struct iommu_group *iommu_group; |
112 | struct list_head next; |
113 | bool pinned_page_dirty_scope; |
114 | }; |
115 | |
116 | struct vfio_iova { |
117 | struct list_head list; |
118 | dma_addr_t start; |
119 | dma_addr_t end; |
120 | }; |
121 | |
122 | /* |
123 | * Guest RAM pinning working set or DMA target |
124 | */ |
125 | struct vfio_pfn { |
126 | struct rb_node node; |
127 | dma_addr_t iova; /* Device address */ |
128 | unsigned long pfn; /* Host pfn */ |
129 | unsigned int ref_count; |
130 | }; |
131 | |
132 | struct vfio_regions { |
133 | struct list_head list; |
134 | dma_addr_t iova; |
135 | phys_addr_t phys; |
136 | size_t len; |
137 | }; |
138 | |
139 | #define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE) |
140 | |
141 | /* |
142 | * The number-of-bits argument to bitmap_set() is an unsigned int, which is |
143 | * cast to a signed int for the unaligned multi-bit operation, |
144 | * __bitmap_set(). |
145 | * The maximum supported bitmap size is therefore 2^31 bits, i.e. 2^31 / 2^3 |
146 | * bits per byte = 2^28 bytes (256 MB), which covers 2^31 * 2^12 = 2^43 bytes |
147 | * (8 TB) on a 4K page system. |
148 | */ |
149 | #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) |
150 | #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) |
151 | |
152 | static int put_pfn(unsigned long pfn, int prot); |
153 | |
154 | static struct vfio_iommu_group* |
155 | vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, |
156 | struct iommu_group *iommu_group); |
157 | |
158 | /* |
159 | * This code handles mapping and unmapping of user data buffers |
160 | * into DMA'ble space using the IOMMU |
161 | */ |
162 | |
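| /* |
| * Look up any vfio_dma whose [iova, iova + size) range overlaps the |
| * requested [start, start + size) window in the container's rb-tree. |
| */ |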
163 | static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, |
164 | dma_addr_t start, size_t size) |
165 | { |
166 | struct rb_node *node = iommu->dma_list.rb_node; |
167 | |
168 | while (node) { |
169 | struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); |
170 | |
171 | if (start + size <= dma->iova) |
172 | node = node->rb_left; |
173 | else if (start >= dma->iova + dma->size) |
174 | node = node->rb_right; |
175 | else |
176 | return dma; |
177 | } |
178 | |
179 | return NULL; |
180 | } |
181 | |
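| /* |
| * Find the vfio_dma node with the lowest iova that ends above @start; |
| * returns NULL if that mapping does not begin below @start + @size. |
| */ |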
182 | static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu, |
183 | dma_addr_t start, u64 size) |
184 | { |
185 | struct rb_node *res = NULL; |
186 | struct rb_node *node = iommu->dma_list.rb_node; |
187 | struct vfio_dma *dma_res = NULL; |
188 | |
189 | while (node) { |
190 | struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); |
191 | |
192 | if (start < dma->iova + dma->size) { |
193 | res = node; |
194 | dma_res = dma; |
195 | if (start >= dma->iova) |
196 | break; |
197 | node = node->rb_left; |
198 | } else { |
199 | node = node->rb_right; |
200 | } |
201 | } |
202 | if (res && size && dma_res->iova >= start + size) |
203 | res = NULL; |
204 | return res; |
205 | } |
206 | |
207 | static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new) |
208 | { |
209 | struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; |
210 | struct vfio_dma *dma; |
211 | |
212 | while (*link) { |
213 | parent = *link; |
214 | dma = rb_entry(parent, struct vfio_dma, node); |
215 | |
216 | if (new->iova + new->size <= dma->iova) |
217 | link = &(*link)->rb_left; |
218 | else |
219 | link = &(*link)->rb_right; |
220 | } |
221 | |
222 | rb_link_node(&new->node, parent, link); |
223 | rb_insert_color(&new->node, &iommu->dma_list); |
224 | } |
225 | |
226 | static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) |
227 | { |
228 | rb_erase(&old->node, &iommu->dma_list); |
229 | } |
230 | |
231 | |
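| /* |
| * Allocate the dirty bitmap for a mapping: one bit per @pgsize page of the |
| * vfio_dma, capped at DIRTY_BITMAP_PAGES_MAX pages. |
| */ |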
232 | static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize) |
233 | { |
234 | uint64_t npages = dma->size / pgsize; |
235 | |
236 | if (npages > DIRTY_BITMAP_PAGES_MAX) |
237 | return -EINVAL; |
238 | |
239 | /* |
240 | * Allocate extra 64 bits that are used to calculate shift required for |
241 | * bitmap_shift_left() to manipulate and club unaligned number of pages |
242 | * in adjacent vfio_dma ranges. |
243 | */ |
244 | dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64), |
245 | GFP_KERNEL); |
246 | if (!dma->bitmap) |
247 | return -ENOMEM; |
248 | |
249 | return 0; |
250 | } |
251 | |
252 | static void vfio_dma_bitmap_free(struct vfio_dma *dma) |
253 | { |
254 | kvfree(dma->bitmap); |
255 | dma->bitmap = NULL; |
256 | } |
257 | |
258 | static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize) |
259 | { |
260 | struct rb_node *p; |
261 | unsigned long pgshift = __ffs(pgsize); |
262 | |
263 | for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) { |
264 | struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node); |
265 | |
266 | bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1); |
267 | } |
268 | } |
269 | |
270 | static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu) |
271 | { |
272 | struct rb_node *n; |
273 | unsigned long pgshift = __ffs(iommu->pgsize_bitmap); |
274 | |
275 | for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { |
276 | struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); |
277 | |
278 | bitmap_set(dma->bitmap, 0, dma->size >> pgshift); |
279 | } |
280 | } |
281 | |
282 | static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize) |
283 | { |
284 | struct rb_node *n; |
285 | |
286 | for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { |
287 | struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); |
288 | int ret; |
289 | |
290 | ret = vfio_dma_bitmap_alloc(dma, pgsize); |
291 | if (ret) { |
292 | struct rb_node *p; |
293 | |
294 | for (p = rb_prev(n); p; p = rb_prev(p)) { |
295 | struct vfio_dma *dma = rb_entry(p, |
296 | struct vfio_dma, node); |
297 | |
298 | vfio_dma_bitmap_free(dma); |
299 | } |
300 | return ret; |
301 | } |
302 | vfio_dma_populate_bitmap(dma, pgsize); |
303 | } |
304 | return 0; |
305 | } |
306 | |
307 | static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu) |
308 | { |
309 | struct rb_node *n; |
310 | |
311 | for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { |
312 | struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); |
313 | |
314 | vfio_dma_bitmap_free(dma); |
315 | } |
316 | } |
317 | |
318 | /* |
319 | * Helper Functions for host iova-pfn list |
320 | */ |
321 | static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) |
322 | { |
323 | struct vfio_pfn *vpfn; |
324 | struct rb_node *node = dma->pfn_list.rb_node; |
325 | |
326 | while (node) { |
327 | vpfn = rb_entry(node, struct vfio_pfn, node); |
328 | |
329 | if (iova < vpfn->iova) |
330 | node = node->rb_left; |
331 | else if (iova > vpfn->iova) |
332 | node = node->rb_right; |
333 | else |
334 | return vpfn; |
335 | } |
336 | return NULL; |
337 | } |
338 | |
339 | static void vfio_link_pfn(struct vfio_dma *dma, |
340 | struct vfio_pfn *new) |
341 | { |
342 | struct rb_node **link, *parent = NULL; |
343 | struct vfio_pfn *vpfn; |
344 | |
345 | link = &dma->pfn_list.rb_node; |
346 | while (*link) { |
347 | parent = *link; |
348 | vpfn = rb_entry(parent, struct vfio_pfn, node); |
349 | |
350 | if (new->iova < vpfn->iova) |
351 | link = &(*link)->rb_left; |
352 | else |
353 | link = &(*link)->rb_right; |
354 | } |
355 | |
356 | rb_link_node(&new->node, parent, link); |
357 | rb_insert_color(&new->node, &dma->pfn_list); |
358 | } |
359 | |
360 | static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old) |
361 | { |
362 | rb_erase(&old->node, &dma->pfn_list); |
363 | } |
364 | |
365 | static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova, |
366 | unsigned long pfn) |
367 | { |
368 | struct vfio_pfn *vpfn; |
369 | |
370 | vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL); |
371 | if (!vpfn) |
372 | return -ENOMEM; |
373 | |
374 | vpfn->iova = iova; |
375 | vpfn->pfn = pfn; |
376 | vpfn->ref_count = 1; |
377 | vfio_link_pfn(dma, vpfn); |
378 | return 0; |
379 | } |
380 | |
381 | static void vfio_remove_from_pfn_list(struct vfio_dma *dma, |
382 | struct vfio_pfn *vpfn) |
383 | { |
384 | vfio_unlink_pfn(dma, vpfn); |
385 | kfree(vpfn); |
386 | } |
387 | |
388 | static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma, |
389 | unsigned long iova) |
390 | { |
391 | struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); |
392 | |
393 | if (vpfn) |
394 | vpfn->ref_count++; |
395 | return vpfn; |
396 | } |
397 | |
398 | static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) |
399 | { |
400 | int ret = 0; |
401 | |
402 | vpfn->ref_count--; |
403 | if (!vpfn->ref_count) { |
404 | ret = put_pfn(vpfn->pfn, dma->prot); |
405 | vfio_remove_from_pfn_list(dma, vpfn); |
406 | } |
407 | return ret; |
408 | } |
409 | |
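| /* |
| * Charge (or uncharge, for negative @npage) pinned pages against the task's |
| * RLIMIT_MEMLOCK, bypassing the limit when @lock_cap is set. |
| */ |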
410 | static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm, |
411 | bool lock_cap, long npage) |
412 | { |
413 | int ret = mmap_write_lock_killable(mm); |
414 | |
415 | if (ret) |
416 | return ret; |
417 | |
418 | ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap); |
419 | mmap_write_unlock(mm); |
420 | return ret; |
421 | } |
422 | |
423 | static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) |
424 | { |
425 | struct mm_struct *mm; |
426 | int ret; |
427 | |
428 | if (!npage) |
429 | return 0; |
430 | |
431 | mm = dma->mm; |
432 | if (async && !mmget_not_zero(mm)) |
433 | return -ESRCH; /* process exited */ |
434 | |
435 | ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage); |
436 | if (!ret) |
437 | dma->locked_vm += npage; |
438 | |
439 | if (async) |
440 | mmput(mm); |
441 | |
442 | return ret; |
443 | } |
444 | |
445 | /* |
446 | * Some mappings aren't backed by a struct page, for example an mmap'd |
447 | * MMIO range for our own or another device. These use a different |
448 | * pfn conversion and shouldn't be tracked as locked pages. |
449 | * For compound pages, any driver that sets the reserved bit in head |
450 | * page needs to set the reserved bit in all subpages to be safe. |
451 | */ |
452 | static bool is_invalid_reserved_pfn(unsigned long pfn) |
453 | { |
454 | if (pfn_valid(pfn)) |
455 | return PageReserved(pfn_to_page(pfn)); |
456 | |
457 | return true; |
458 | } |
459 | |
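| /* |
| * Release one pinned page reference; returns 1 if a page was unpinned, 0 |
| * for invalid or reserved pfns that were never accounted. |
| */ |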
460 | static int put_pfn(unsigned long pfn, int prot) |
461 | { |
462 | if (!is_invalid_reserved_pfn(pfn)) { |
463 | struct page *page = pfn_to_page(pfn); |
464 | |
465 | unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE); |
466 | return 1; |
467 | } |
468 | return 0; |
469 | } |
470 | |
471 | #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *)) |
472 | |
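| /* |
| * A batch caches pages returned by pin_user_pages_remote(). In single mode, |
| * or with hugepages disabled, it degrades to a one-entry fallback page. |
| */ |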
473 | static void __vfio_batch_init(struct vfio_batch *batch, bool single) |
474 | { |
475 | batch->size = 0; |
476 | batch->offset = 0; |
477 | |
478 | if (single || unlikely(disable_hugepages)) |
479 | goto fallback; |
480 | |
481 | batch->pages = (struct page **) __get_free_page(GFP_KERNEL); |
482 | if (!batch->pages) |
483 | goto fallback; |
484 | |
485 | batch->capacity = VFIO_BATCH_MAX_CAPACITY; |
486 | return; |
487 | |
488 | fallback: |
489 | batch->pages = &batch->fallback_page; |
490 | batch->capacity = 1; |
491 | } |
492 | |
493 | static void vfio_batch_init(struct vfio_batch *batch) |
494 | { |
495 | __vfio_batch_init(batch, false); |
496 | } |
497 | |
498 | static void vfio_batch_init_single(struct vfio_batch *batch) |
499 | { |
500 | __vfio_batch_init(batch, true); |
501 | } |
502 | |
503 | static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) |
504 | { |
505 | while (batch->size) { |
506 | unsigned long pfn = page_to_pfn(batch->pages[batch->offset]); |
507 | |
508 | put_pfn(pfn, dma->prot); |
509 | batch->offset++; |
510 | batch->size--; |
511 | } |
512 | } |
513 | |
514 | static void vfio_batch_fini(struct vfio_batch *batch) |
515 | { |
516 | if (batch->capacity == VFIO_BATCH_MAX_CAPACITY) |
517 | free_page((unsigned long)batch->pages); |
518 | } |
519 | |
520 | static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, |
521 | unsigned long vaddr, unsigned long *pfn, |
522 | unsigned long *addr_mask, bool write_fault) |
523 | { |
524 | struct follow_pfnmap_args args = { .vma = vma, .address = vaddr }; |
525 | int ret; |
526 | |
527 | ret = follow_pfnmap_start(&args); |
528 | if (ret) { |
529 | bool unlocked = false; |
530 | |
531 | ret = fixup_user_fault(mm, vaddr, |
532 | FAULT_FLAG_REMOTE | |
533 | (write_fault ? FAULT_FLAG_WRITE : 0), |
534 | &unlocked); |
535 | if (unlocked) |
536 | return -EAGAIN; |
537 | |
538 | if (ret) |
539 | return ret; |
540 | |
541 | ret = follow_pfnmap_start(&args); |
542 | if (ret) |
543 | return ret; |
544 | } |
545 | |
546 | if (write_fault && !args.writable) { |
547 | ret = -EFAULT; |
548 | } else { |
549 | *pfn = args.pfn; |
550 | *addr_mask = args.addr_mask; |
551 | } |
552 | |
553 | follow_pfnmap_end(&args); |
554 | return ret; |
555 | } |
556 | |
557 | /* |
558 | * Returns the positive number of pfns successfully obtained or a negative |
559 | * error code. The initial pfn is stored in the pfn arg. For page-backed |
560 | * pfns, the provided batch is also updated to indicate the filled pages and |
561 | * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and |
562 | * returned initial pfn are provided; subsequent pfns are contiguous. |
563 | */ |
564 | static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, |
565 | unsigned long npages, int prot, unsigned long *pfn, |
566 | struct vfio_batch *batch) |
567 | { |
568 | unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity); |
569 | struct vm_area_struct *vma; |
570 | unsigned int flags = 0; |
571 | long ret; |
572 | |
573 | if (prot & IOMMU_WRITE) |
574 | flags |= FOLL_WRITE; |
575 | |
576 | mmap_read_lock(mm); |
577 | ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM, |
578 | batch->pages, NULL); |
579 | if (ret > 0) { |
580 | *pfn = page_to_pfn(batch->pages[0]); |
581 | batch->size = ret; |
582 | batch->offset = 0; |
583 | goto done; |
584 | } else if (!ret) { |
585 | ret = -EFAULT; |
586 | } |
587 | |
588 | vaddr = untagged_addr_remote(mm, vaddr); |
589 | |
590 | retry: |
591 | vma = vma_lookup(mm, vaddr); |
592 | |
593 | if (vma && vma->vm_flags & VM_PFNMAP) { |
594 | unsigned long addr_mask; |
595 | |
596 | ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask, |
597 | prot & IOMMU_WRITE); |
598 | if (ret == -EAGAIN) |
599 | goto retry; |
600 | |
601 | if (!ret) { |
602 | if (is_invalid_reserved_pfn(*pfn)) { |
603 | unsigned long epfn; |
604 | |
605 | epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1; |
606 | ret = min_t(long, npages, epfn - *pfn); |
607 | } else { |
608 | ret = -EFAULT; |
609 | } |
610 | } |
611 | } |
612 | done: |
613 | mmap_read_unlock(mm); |
614 | return ret; |
615 | } |
616 | |
617 | /* |
618 | * Attempt to pin pages. We really don't want to track all the pfns and |
619 | * the iommu can only map chunks of consecutive pfns anyway, so get the |
620 | * first page and all consecutive pages with the same locking. |
621 | */ |
622 | static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, |
623 | unsigned long npage, unsigned long *pfn_base, |
624 | unsigned long limit, struct vfio_batch *batch) |
625 | { |
626 | unsigned long pfn; |
627 | struct mm_struct *mm = current->mm; |
628 | long ret, pinned = 0, lock_acct = 0; |
629 | bool rsvd; |
630 | dma_addr_t iova = vaddr - dma->vaddr + dma->iova; |
631 | |
632 | /* This code path is only user initiated */ |
633 | if (!mm) |
634 | return -ENODEV; |
635 | |
636 | if (batch->size) { |
637 | /* Leftover pages in batch from an earlier call. */ |
638 | *pfn_base = page_to_pfn(batch->pages[batch->offset]); |
639 | pfn = *pfn_base; |
640 | rsvd = is_invalid_reserved_pfn(*pfn_base); |
641 | } else { |
642 | *pfn_base = 0; |
643 | } |
644 | |
645 | if (unlikely(disable_hugepages)) |
646 | npage = 1; |
647 | |
648 | while (npage) { |
649 | if (!batch->size) { |
650 | /* Empty batch, so refill it. */ |
651 | ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot, |
652 | &pfn, batch); |
653 | if (ret < 0) |
654 | goto unpin_out; |
655 | |
656 | if (!*pfn_base) { |
657 | *pfn_base = pfn; |
658 | rsvd = is_invalid_reserved_pfn(*pfn_base); |
659 | } |
660 | |
661 | /* Handle pfnmap */ |
662 | if (!batch->size) { |
663 | if (pfn != *pfn_base + pinned || !rsvd) |
664 | goto out; |
665 | |
666 | pinned += ret; |
667 | npage -= ret; |
668 | vaddr += (PAGE_SIZE * ret); |
669 | iova += (PAGE_SIZE * ret); |
670 | continue; |
671 | } |
672 | } |
673 | |
674 | /* |
675 | * pfn is preset for the first iteration of this inner loop |
676 | * due to the fact that vaddr_get_pfns() needs to provide the |
677 | * initial pfn for pfnmaps. Therefore to reduce redundancy, |
678 | * the next pfn is fetched at the end of the loop. |
679 | * A PageReserved() page could still qualify as page backed |
680 | * and rsvd here, and therefore continues to use the batch. |
681 | */ |
682 | while (true) { |
683 | if (pfn != *pfn_base + pinned || |
684 | rsvd != is_invalid_reserved_pfn(pfn)) |
685 | goto out; |
686 | |
687 | /* |
688 | * Reserved pages aren't counted against the user, |
689 | * externally pinned pages are already counted against |
690 | * the user. |
691 | */ |
692 | if (!rsvd && !vfio_find_vpfn(dma, iova)) { |
693 | if (!dma->lock_cap && |
694 | mm->locked_vm + lock_acct + 1 > limit) { |
695 | pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", |
696 | __func__, limit << PAGE_SHIFT); |
697 | ret = -ENOMEM; |
698 | goto unpin_out; |
699 | } |
700 | lock_acct++; |
701 | } |
702 | |
703 | pinned++; |
704 | npage--; |
705 | vaddr += PAGE_SIZE; |
706 | iova += PAGE_SIZE; |
707 | batch->offset++; |
708 | batch->size--; |
709 | |
710 | if (!batch->size) |
711 | break; |
712 | |
713 | pfn = page_to_pfn(batch->pages[batch->offset]); |
714 | } |
715 | } |
716 | |
717 | out: |
718 | ret = vfio_lock_acct(dma, lock_acct, false); |
719 | |
720 | unpin_out: |
721 | if (ret < 0) { |
722 | if (pinned && !rsvd) { |
723 | for (pfn = *pfn_base ; pinned ; pfn++, pinned--) |
724 | put_pfn(pfn, dma->prot); |
725 | } |
726 | vfio_batch_unpin(batch, dma); |
727 | |
728 | return ret; |
729 | } |
730 | |
731 | return pinned; |
732 | } |
733 | |
734 | static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, |
735 | unsigned long pfn, unsigned long npage, |
736 | bool do_accounting) |
737 | { |
738 | long unlocked = 0, locked = 0; |
739 | long i; |
740 | |
741 | for (i = 0; i < npage; i++, iova += PAGE_SIZE) { |
742 | if (put_pfn(pfn++, dma->prot)) { |
743 | unlocked++; |
744 | if (vfio_find_vpfn(dma, iova)) |
745 | locked++; |
746 | } |
747 | } |
748 | |
749 | if (do_accounting) |
750 | vfio_lock_acct(dma, locked - unlocked, true); |
751 | |
752 | return unlocked; |
753 | } |
754 | |
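| /* |
| * Pin a single page on behalf of an external (e.g. mdev) driver, charging |
| * it to the vfio_dma owner's locked memory when accounting is requested. |
| */ |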
755 | static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, |
756 | unsigned long *pfn_base, bool do_accounting) |
757 | { |
758 | struct vfio_batch batch; |
759 | struct mm_struct *mm; |
760 | int ret; |
761 | |
762 | mm = dma->mm; |
763 | if (!mmget_not_zero(mm)) |
764 | return -ENODEV; |
765 | |
766 | vfio_batch_init_single(&batch); |
767 | |
768 | ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch); |
769 | if (ret != 1) |
770 | goto out; |
771 | |
772 | ret = 0; |
773 | |
774 | if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { |
775 | ret = vfio_lock_acct(dma, 1, false); |
776 | if (ret) { |
777 | put_pfn(*pfn_base, dma->prot); |
778 | if (ret == -ENOMEM) |
779 | pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK " |
780 | "(%ld) exceeded\n", __func__, |
781 | dma->task->comm, task_pid_nr(dma->task), |
782 | task_rlimit(dma->task, RLIMIT_MEMLOCK)); |
783 | } |
784 | } |
785 | |
786 | out: |
787 | vfio_batch_fini(&batch); |
788 | mmput(mm); |
789 | return ret; |
790 | } |
791 | |
792 | static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, |
793 | bool do_accounting) |
794 | { |
795 | int unlocked; |
796 | struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); |
797 | |
798 | if (!vpfn) |
799 | return 0; |
800 | |
801 | unlocked = vfio_iova_put_vfio_pfn(dma, vpfn); |
802 | |
803 | if (do_accounting) |
804 | vfio_lock_acct(dma, -unlocked, true); |
805 | |
806 | return unlocked; |
807 | } |
808 | |
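| /* |
| * Pin callback for emulated IOMMU devices: resolves each user IOVA to a |
| * struct page, tracks it in the per-dma pfn list, and marks it dirty when |
| * dirty page tracking is enabled. |
| */ |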
809 | static int vfio_iommu_type1_pin_pages(void *iommu_data, |
810 | struct iommu_group *iommu_group, |
811 | dma_addr_t user_iova, |
812 | int npage, int prot, |
813 | struct page **pages) |
814 | { |
815 | struct vfio_iommu *iommu = iommu_data; |
816 | struct vfio_iommu_group *group; |
817 | int i, j, ret; |
818 | unsigned long remote_vaddr; |
819 | struct vfio_dma *dma; |
820 | bool do_accounting; |
821 | |
822 | if (!iommu || !pages) |
823 | return -EINVAL; |
824 | |
825 | /* Supported for v2 version only */ |
826 | if (!iommu->v2) |
827 | return -EACCES; |
828 | |
829 | mutex_lock(&iommu->lock); |
830 | |
831 | if (WARN_ONCE(iommu->vaddr_invalid_count, |
832 | "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) { |
833 | ret = -EBUSY; |
834 | goto pin_done; |
835 | } |
836 | |
837 | /* Fail if no dma_unmap notifier is registered */ |
838 | if (list_empty(&iommu->device_list)) { |
839 | ret = -EINVAL; |
840 | goto pin_done; |
841 | } |
842 | |
843 | /* |
844 | * If an iommu capable domain exists in the container then all pages are |
845 | * already pinned and accounted. Accounting is only needed when there is |
846 | * no iommu capable domain in the container. |
847 | */ |
848 | do_accounting = list_empty(&iommu->domain_list); |
849 | |
850 | for (i = 0; i < npage; i++) { |
851 | unsigned long phys_pfn; |
852 | dma_addr_t iova; |
853 | struct vfio_pfn *vpfn; |
854 | |
855 | iova = user_iova + PAGE_SIZE * i; |
856 | dma = vfio_find_dma(iommu, iova, PAGE_SIZE); |
857 | if (!dma) { |
858 | ret = -EINVAL; |
859 | goto pin_unwind; |
860 | } |
861 | |
862 | if ((dma->prot & prot) != prot) { |
863 | ret = -EPERM; |
864 | goto pin_unwind; |
865 | } |
866 | |
867 | vpfn = vfio_iova_get_vfio_pfn(dma, iova); |
868 | if (vpfn) { |
869 | pages[i] = pfn_to_page(vpfn->pfn); |
870 | continue; |
871 | } |
872 | |
873 | remote_vaddr = dma->vaddr + (iova - dma->iova); |
874 | ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn, |
875 | do_accounting); |
876 | if (ret) |
877 | goto pin_unwind; |
878 | |
879 | if (!pfn_valid(phys_pfn)) { |
880 | ret = -EINVAL; |
881 | goto pin_unwind; |
882 | } |
883 | |
884 | ret = vfio_add_to_pfn_list(dma, iova, phys_pfn); |
885 | if (ret) { |
886 | if (put_pfn(phys_pfn, dma->prot) && do_accounting) |
887 | vfio_lock_acct(dma, -1, true); |
888 | goto pin_unwind; |
889 | } |
890 | |
891 | pages[i] = pfn_to_page(phys_pfn); |
892 | |
893 | if (iommu->dirty_page_tracking) { |
894 | unsigned long pgshift = __ffs(iommu->pgsize_bitmap); |
895 | |
896 | /* |
897 | * Bitmap populated with the smallest supported page |
898 | * size |
899 | */ |
900 | bitmap_set(dma->bitmap, |
901 | (iova - dma->iova) >> pgshift, 1); |
902 | } |
903 | } |
904 | ret = i; |
905 | |
906 | group = vfio_iommu_find_iommu_group(iommu, iommu_group); |
907 | if (!group->pinned_page_dirty_scope) { |
908 | group->pinned_page_dirty_scope = true; |
909 | iommu->num_non_pinned_groups--; |
910 | } |
911 | |
912 | goto pin_done; |
913 | |
914 | pin_unwind: |
915 | pages[i] = NULL; |
916 | for (j = 0; j < i; j++) { |
917 | dma_addr_t iova; |
918 | |
919 | iova = user_iova + PAGE_SIZE * j; |
920 | dma = vfio_find_dma(iommu, iova, PAGE_SIZE); |
921 | vfio_unpin_page_external(dma, iova, do_accounting); |
922 | pages[j] = NULL; |
923 | } |
924 | pin_done: |
925 | mutex_unlock(&iommu->lock); |
926 | return ret; |
927 | } |
928 | |
929 | static void vfio_iommu_type1_unpin_pages(void *iommu_data, |
930 | dma_addr_t user_iova, int npage) |
931 | { |
932 | struct vfio_iommu *iommu = iommu_data; |
933 | bool do_accounting; |
934 | int i; |
935 | |
936 | /* Supported for v2 version only */ |
937 | if (WARN_ON(!iommu->v2)) |
938 | return; |
939 | |
940 | mutex_lock(&iommu->lock); |
941 | |
942 | do_accounting = list_empty(&iommu->domain_list); |
943 | for (i = 0; i < npage; i++) { |
944 | dma_addr_t iova = user_iova + PAGE_SIZE * i; |
945 | struct vfio_dma *dma; |
946 | |
947 | dma = vfio_find_dma(iommu, iova, PAGE_SIZE); |
948 | if (!dma) |
949 | break; |
950 | |
951 | vfio_unpin_page_external(dma, iova, do_accounting); |
952 | } |
953 | |
954 | mutex_unlock(&iommu->lock); |
955 | |
956 | WARN_ON(i != npage); |
957 | } |
958 | |
959 | static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, |
960 | struct list_head *regions, |
961 | struct iommu_iotlb_gather *iotlb_gather) |
962 | { |
963 | long unlocked = 0; |
964 | struct vfio_regions *entry, *next; |
965 | |
966 | iommu_iotlb_sync(domain->domain, iotlb_gather); |
967 | |
968 | list_for_each_entry_safe(entry, next, regions, list) { |
969 | unlocked += vfio_unpin_pages_remote(dma, |
970 | entry->iova, |
971 | entry->phys >> PAGE_SHIFT, |
972 | entry->len >> PAGE_SHIFT, |
973 | false); |
974 | list_del(&entry->list); |
975 | kfree(entry); |
976 | } |
977 | |
978 | cond_resched(); |
979 | |
980 | return unlocked; |
981 | } |
982 | |
983 | /* |
984 | * Generally, VFIO needs to unpin remote pages after each IOTLB flush. |
985 | * Therefore, when using IOTLB flush sync interface, VFIO need to keep track |
986 | * of these regions (currently using a list). |
987 | * |
988 | * This value specifies maximum number of regions for each IOTLB flush sync. |
989 | */ |
990 | #define VFIO_IOMMU_TLB_SYNC_MAX 512 |
991 | |
992 | static size_t unmap_unpin_fast(struct vfio_domain *domain, |
993 | struct vfio_dma *dma, dma_addr_t *iova, |
994 | size_t len, phys_addr_t phys, long *unlocked, |
995 | struct list_head *unmapped_list, |
996 | int *unmapped_cnt, |
997 | struct iommu_iotlb_gather *iotlb_gather) |
998 | { |
999 | size_t unmapped = 0; |
1000 | struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL); |
1001 | |
1002 | if (entry) { |
1003 | unmapped = iommu_unmap_fast(domain->domain, *iova, len, |
1004 | iotlb_gather); |
1005 | |
1006 | if (!unmapped) { |
1007 | kfree(entry); |
1008 | } else { |
1009 | entry->iova = *iova; |
1010 | entry->phys = phys; |
1011 | entry->len = unmapped; |
1012 | list_add_tail(&entry->list, unmapped_list); |
1013 | |
1014 | *iova += unmapped; |
1015 | (*unmapped_cnt)++; |
1016 | } |
1017 | } |
1018 | |
1019 | /* |
1020 | * Sync if the number of fast-unmap regions hits the limit |
1021 | * or in case of errors. |
1022 | */ |
1023 | if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) { |
1024 | *unlocked += vfio_sync_unpin(dma, domain, unmapped_list, |
1025 | iotlb_gather); |
1026 | *unmapped_cnt = 0; |
1027 | } |
1028 | |
1029 | return unmapped; |
1030 | } |
1031 | |
1032 | static size_t unmap_unpin_slow(struct vfio_domain *domain, |
1033 | struct vfio_dma *dma, dma_addr_t *iova, |
1034 | size_t len, phys_addr_t phys, |
1035 | long *unlocked) |
1036 | { |
1037 | size_t unmapped = iommu_unmap(domain->domain, *iova, len); |
1038 | |
1039 | if (unmapped) { |
1040 | *unlocked += vfio_unpin_pages_remote(dma, *iova, |
1041 | phys >> PAGE_SHIFT, |
1042 | unmapped >> PAGE_SHIFT, |
1043 | false); |
1044 | *iova += unmapped; |
1045 | cond_resched(); |
1046 | } |
1047 | return unmapped; |
1048 | } |
1049 | |
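| /* |
| * Tear down the IOMMU mappings for a vfio_dma and unpin the backing pages. |
| * When @do_accounting is false, returns the number of pages whose locked |
| * memory accounting the caller must release; otherwise accounts them here. |
| */ |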
1050 | static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, |
1051 | bool do_accounting) |
1052 | { |
1053 | dma_addr_t iova = dma->iova, end = dma->iova + dma->size; |
1054 | struct vfio_domain *domain, *d; |
1055 | LIST_HEAD(unmapped_region_list); |
1056 | struct iommu_iotlb_gather iotlb_gather; |
1057 | int unmapped_region_cnt = 0; |
1058 | long unlocked = 0; |
1059 | |
1060 | if (!dma->size) |
1061 | return 0; |
1062 | |
1063 | if (list_empty(&iommu->domain_list)) |
1064 | return 0; |
1065 | |
1066 | /* |
1067 | * We use the IOMMU to track the physical addresses, otherwise we'd |
1068 | * need a much more complicated tracking system. Unfortunately that |
1069 | * means we need to use one of the iommu domains to figure out the |
1070 | * pfns to unpin. The rest need to be unmapped in advance so we have |
1071 | * no iommu translations remaining when the pages are unpinned. |
1072 | */ |
1073 | domain = d = list_first_entry(&iommu->domain_list, |
1074 | struct vfio_domain, next); |
1075 | |
1076 | list_for_each_entry_continue(d, &iommu->domain_list, next) { |
1077 | iommu_unmap(d->domain, dma->iova, dma->size); |
1078 | cond_resched(); |
1079 | } |
1080 | |
1081 | iommu_iotlb_gather_init(&iotlb_gather); |
1082 | while (iova < end) { |
1083 | size_t unmapped, len; |
1084 | phys_addr_t phys, next; |
1085 | |
1086 | phys = iommu_iova_to_phys(domain->domain, iova); |
1087 | if (WARN_ON(!phys)) { |
1088 | iova += PAGE_SIZE; |
1089 | continue; |
1090 | } |
1091 | |
1092 | /* |
1093 | * To optimize for fewer iommu_unmap() calls, each of which |
1094 | * may require hardware cache flushing, try to find the |
1095 | * largest contiguous physical memory chunk to unmap. |
1096 | */ |
1097 | for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) { |
1098 | next = iommu_iova_to_phys(domain->domain, iova + len); |
1099 | if (next != phys + len) |
1100 | break; |
1101 | } |
1102 | |
1103 | /* |
1104 | * First, try to use fast unmap/unpin. In case of failure, |
1105 | * switch to slow unmap/unpin path. |
1106 | */ |
1107 | unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys, |
1108 | &unlocked, &unmapped_region_list, |
1109 | &unmapped_region_cnt, |
1110 | &iotlb_gather); |
1111 | if (!unmapped) { |
1112 | unmapped = unmap_unpin_slow(domain, dma, &iova, len, |
1113 | phys, &unlocked); |
1114 | if (WARN_ON(!unmapped)) |
1115 | break; |
1116 | } |
1117 | } |
1118 | |
1119 | dma->iommu_mapped = false; |
1120 | |
1121 | if (unmapped_region_cnt) { |
1122 | unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list, |
1123 | &iotlb_gather); |
1124 | } |
1125 | |
1126 | if (do_accounting) { |
1127 | vfio_lock_acct(dma, -unlocked, true); |
1128 | return 0; |
1129 | } |
1130 | return unlocked; |
1131 | } |
1132 | |
1133 | static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) |
1134 | { |
1135 | WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)); |
1136 | vfio_unmap_unpin(iommu, dma, true); |
1137 | vfio_unlink_dma(iommu, dma); |
1138 | put_task_struct(dma->task); |
1139 | mmdrop(dma->mm); |
1140 | vfio_dma_bitmap_free(dma); |
1141 | if (dma->vaddr_invalid) |
1142 | iommu->vaddr_invalid_count--; |
1143 | kfree(dma); |
1144 | iommu->dma_avail++; |
1145 | } |
1146 | |
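| /* Recompute the intersection of page sizes supported by all domains in the container. */ |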
1147 | static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu) |
1148 | { |
1149 | struct vfio_domain *domain; |
1150 | |
1151 | iommu->pgsize_bitmap = ULONG_MAX; |
1152 | |
1153 | list_for_each_entry(domain, &iommu->domain_list, next) |
1154 | iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap; |
1155 | |
1156 | /* |
1157 | * In case the IOMMU supports page sizes smaller than PAGE_SIZE |
1158 | * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes. |
1159 | * That way the user will be able to map/unmap buffers whose size/ |
1160 | * start address is aligned with PAGE_SIZE. Pinning code uses that |
1161 | * granularity while iommu driver can use the sub-PAGE_SIZE size |
1162 | * to map the buffer. |
1163 | */ |
1164 | if (iommu->pgsize_bitmap & ~PAGE_MASK) { |
1165 | iommu->pgsize_bitmap &= PAGE_MASK; |
1166 | iommu->pgsize_bitmap |= PAGE_SIZE; |
1167 | } |
1168 | } |
1169 | |
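| /* |
| * Copy a vfio_dma's dirty bitmap into the user-supplied bitmap at the bit |
| * offset corresponding to its iova, merging with the user's existing bits |
| * when the destination offset is not u64 aligned. |
| */ |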
1170 | static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, |
1171 | struct vfio_dma *dma, dma_addr_t base_iova, |
1172 | size_t pgsize) |
1173 | { |
1174 | unsigned long pgshift = __ffs(pgsize); |
1175 | unsigned long nbits = dma->size >> pgshift; |
1176 | unsigned long bit_offset = (dma->iova - base_iova) >> pgshift; |
1177 | unsigned long copy_offset = bit_offset / BITS_PER_LONG; |
1178 | unsigned long shift = bit_offset % BITS_PER_LONG; |
1179 | unsigned long leftover; |
1180 | |
1181 | /* |
1182 | * mark all pages dirty if any IOMMU capable device is not able |
1183 | * to report dirty pages and all pages are pinned and mapped. |
1184 | */ |
1185 | if (iommu->num_non_pinned_groups && dma->iommu_mapped) |
1186 | bitmap_set(dma->bitmap, 0, nbits); |
1187 | |
1188 | if (shift) { |
1189 | bitmap_shift_left(dma->bitmap, dma->bitmap, shift, |
1190 | nbits + shift); |
1191 | |
1192 | if (copy_from_user(&leftover, |
1193 | (void __user *)(bitmap + copy_offset), |
1194 | sizeof(leftover))) |
1195 | return -EFAULT; |
1196 | |
1197 | bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift); |
1198 | } |
1199 | |
1200 | if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap, |
1201 | DIRTY_BITMAP_BYTES(nbits + shift))) |
1202 | return -EFAULT; |
1203 | |
1204 | return 0; |
1205 | } |
1206 | |
1207 | static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, |
1208 | dma_addr_t iova, size_t size, size_t pgsize) |
1209 | { |
1210 | struct vfio_dma *dma; |
1211 | struct rb_node *n; |
1212 | unsigned long pgshift = __ffs(pgsize); |
1213 | int ret; |
1214 | |
1215 | /* |
1216 | * GET_BITMAP request must fully cover vfio_dma mappings. Multiple |
1217 | * vfio_dma mappings may be clubbed by specifying large ranges, but |
1218 | * there must not be any previous mappings bisected by the range. |
1219 | * An error will be returned if these conditions are not met. |
1220 | */ |
1221 | dma = vfio_find_dma(iommu, iova, 1); |
1222 | if (dma && dma->iova != iova) |
1223 | return -EINVAL; |
1224 | |
1225 | dma = vfio_find_dma(iommu, iova + size - 1, 0); |
1226 | if (dma && dma->iova + dma->size != iova + size) |
1227 | return -EINVAL; |
1228 | |
1229 | for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { |
1230 | struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); |
1231 | |
1232 | if (dma->iova < iova) |
1233 | continue; |
1234 | |
1235 | if (dma->iova > iova + size - 1) |
1236 | break; |
1237 | |
1238 | ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize); |
1239 | if (ret) |
1240 | return ret; |
1241 | |
1242 | /* |
1243 | * Re-populate bitmap to include all pinned pages which are |
1244 | * considered as dirty but exclude pages which are unpinned and |
1245 | * pages which are marked dirty by vfio_dma_rw() |
1246 | */ |
1247 | bitmap_clear(dma->bitmap, 0, dma->size >> pgshift); |
1248 | vfio_dma_populate_bitmap(dma, pgsize); |
1249 | } |
1250 | return 0; |
1251 | } |
1252 | |
1253 | static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size) |
1254 | { |
1255 | if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) || |
1256 | (bitmap_size < DIRTY_BITMAP_BYTES(npages))) |
1257 | return -EINVAL; |
1258 | |
1259 | return 0; |
1260 | } |
1261 | |
1262 | /* |
1263 | * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate |
1264 | * and unmap iovas within the range we're about to unmap. Drivers MUST unpin |
1265 | * pages in response to an invalidation. |
1266 | */ |
1267 | static void vfio_notify_dma_unmap(struct vfio_iommu *iommu, |
1268 | struct vfio_dma *dma) |
1269 | { |
1270 | struct vfio_device *device; |
1271 | |
1272 | if (list_empty(&iommu->device_list)) |
1273 | return; |
1274 | |
1275 | /* |
1276 | * The device is expected to call vfio_unpin_pages() for any IOVA it has |
1277 | * pinned within the range. Since vfio_unpin_pages() will eventually |
1278 | * call back down to this code and try to obtain the iommu->lock we must |
1279 | * drop it. |
1280 | */ |
1281 | mutex_lock(&iommu->device_list_lock); |
1282 | mutex_unlock(&iommu->lock); |
1283 | |
1284 | list_for_each_entry(device, &iommu->device_list, iommu_entry) |
1285 | device->ops->dma_unmap(device, dma->iova, dma->size); |
1286 | |
1287 | mutex_unlock(&iommu->device_list_lock); |
1288 | mutex_lock(&iommu->lock); |
1289 | } |
1290 | |
1291 | static int vfio_dma_do_unmap(struct vfio_iommu *iommu, |
1292 | struct vfio_iommu_type1_dma_unmap *unmap, |
1293 | struct vfio_bitmap *bitmap) |
1294 | { |
1295 | struct vfio_dma *dma, *dma_last = NULL; |
1296 | size_t unmapped = 0, pgsize; |
1297 | int ret = -EINVAL, retries = 0; |
1298 | unsigned long pgshift; |
1299 | dma_addr_t iova = unmap->iova; |
1300 | u64 size = unmap->size; |
1301 | bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL; |
1302 | bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR; |
1303 | struct rb_node *n, *first_n; |
1304 | |
1305 | mutex_lock(&iommu->lock); |
1306 | |
1307 | /* Cannot update vaddr if mdev is present. */ |
1308 | if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) { |
1309 | ret = -EBUSY; |
1310 | goto unlock; |
1311 | } |
1312 | |
1313 | pgshift = __ffs(iommu->pgsize_bitmap); |
1314 | pgsize = (size_t)1 << pgshift; |
1315 | |
1316 | if (iova & (pgsize - 1)) |
1317 | goto unlock; |
1318 | |
1319 | if (unmap_all) { |
1320 | if (iova || size) |
1321 | goto unlock; |
1322 | size = U64_MAX; |
1323 | } else if (!size || size & (pgsize - 1) || |
1324 | iova + size - 1 < iova || size > SIZE_MAX) { |
1325 | goto unlock; |
1326 | } |
1327 | |
1328 | /* When dirty tracking is enabled, allow only min supported pgsize */ |
1329 | if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && |
1330 | (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) { |
1331 | goto unlock; |
1332 | } |
1333 | |
1334 | WARN_ON((pgsize - 1) & PAGE_MASK); |
1335 | again: |
1336 | /* |
1337 | * vfio-iommu-type1 (v1) - User mappings were coalesced together to |
1338 | * avoid tracking individual mappings. This means that the granularity |
1339 | * of the original mapping was lost and the user was allowed to attempt |
1340 | * to unmap any range. Depending on the contiguousness of physical |
1341 | * memory and page sizes supported by the IOMMU, arbitrary unmaps may |
1342 | * or may not have worked. We only guaranteed unmap granularity |
1343 | * matching the original mapping; even though it was untracked here, |
1344 | * the original mappings are reflected in IOMMU mappings. This |
1345 | * resulted in a couple unusual behaviors. First, if a range is not |
1346 | * able to be unmapped, ex. a set of 4k pages that was mapped as a |
1347 | * 2M hugepage into the IOMMU, the unmap ioctl returns success but with |
1348 | * a zero sized unmap. Also, if an unmap request overlaps the first |
1349 | * address of a hugepage, the IOMMU will unmap the entire hugepage. |
1350 | * This also returns success and the returned unmap size reflects the |
1351 | * actual size unmapped. |
1352 | * |
1353 | * We attempt to maintain compatibility with this "v1" interface, but |
1354 | * we take control out of the hands of the IOMMU. Therefore, an unmap |
1355 | * request offset from the beginning of the original mapping will |
1356 | * return success with zero sized unmap. And an unmap request covering |
1357 | * the first iova of mapping will unmap the entire range. |
1358 | * |
1359 | * The v2 version of this interface intends to be more deterministic. |
1360 | * Unmap requests must fully cover previous mappings. Multiple |
1361 | * mappings may still be unmapped by specifying large ranges, but there |
1362 | * must not be any previous mappings bisected by the range. An error |
1363 | * will be returned if these conditions are not met. The v2 interface |
1364 | * will only return success and a size of zero if there were no |
1365 | * mappings within the range. |
1366 | */ |
1367 | if (iommu->v2 && !unmap_all) { |
1368 | dma = vfio_find_dma(iommu, iova, 1); |
1369 | if (dma && dma->iova != iova) |
1370 | goto unlock; |
1371 | |
1372 | dma = vfio_find_dma(iommu, iova + size - 1, 0); |
1373 | if (dma && dma->iova + dma->size != iova + size) |
1374 | goto unlock; |
1375 | } |
1376 | |
1377 | ret = 0; |
1378 | n = first_n = vfio_find_dma_first_node(iommu, iova, size); |
1379 | |
1380 | while (n) { |
1381 | dma = rb_entry(n, struct vfio_dma, node); |
1382 | if (dma->iova >= iova + size) |
1383 | break; |
1384 | |
1385 | if (!iommu->v2 && iova > dma->iova) |
1386 | break; |
1387 | |
1388 | if (invalidate_vaddr) { |
1389 | if (dma->vaddr_invalid) { |
1390 | struct rb_node *last_n = n; |
1391 | |
1392 | for (n = first_n; n != last_n; n = rb_next(n)) { |
1393 | dma = rb_entry(n, |
1394 | struct vfio_dma, node); |
1395 | dma->vaddr_invalid = false; |
1396 | iommu->vaddr_invalid_count--; |
1397 | } |
1398 | ret = -EINVAL; |
1399 | unmapped = 0; |
1400 | break; |
1401 | } |
1402 | dma->vaddr_invalid = true; |
1403 | iommu->vaddr_invalid_count++; |
1404 | unmapped += dma->size; |
1405 | n = rb_next(n); |
1406 | continue; |
1407 | } |
1408 | |
1409 | if (!RB_EMPTY_ROOT(&dma->pfn_list)) { |
1410 | if (dma_last == dma) { |
1411 | BUG_ON(++retries > 10); |
1412 | } else { |
1413 | dma_last = dma; |
1414 | retries = 0; |
1415 | } |
1416 | |
1417 | vfio_notify_dma_unmap(iommu, dma); |
1418 | goto again; |
1419 | } |
1420 | |
1421 | if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { |
1422 | ret = update_user_bitmap(bitmap->data, iommu, dma, |
1423 | iova, pgsize); |
1424 | if (ret) |
1425 | break; |
1426 | } |
1427 | |
1428 | unmapped += dma->size; |
1429 | n = rb_next(n); |
1430 | vfio_remove_dma(iommu, dma); |
1431 | } |
1432 | |
1433 | unlock: |
1434 | mutex_unlock(&iommu->lock); |
1435 | |
1436 | /* Report how much was unmapped */ |
1437 | unmap->size = unmapped; |
1438 | |
1439 | return ret; |
1440 | } |
1441 | |
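| /* Map a pinned, physically contiguous pfn range into every domain in the container. */ |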
1442 | static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, |
1443 | unsigned long pfn, long npage, int prot) |
1444 | { |
1445 | struct vfio_domain *d; |
1446 | int ret; |
1447 | |
1448 | list_for_each_entry(d, &iommu->domain_list, next) { |
1449 | ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT, |
1450 | npage << PAGE_SHIFT, prot | IOMMU_CACHE, |
1451 | GFP_KERNEL_ACCOUNT); |
1452 | if (ret) |
1453 | goto unwind; |
1454 | |
1455 | cond_resched(); |
1456 | } |
1457 | |
1458 | return 0; |
1459 | |
1460 | unwind: |
1461 | list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) { |
1462 | iommu_unmap(d->domain, iova, npage << PAGE_SHIFT); |
1463 | cond_resched(); |
1464 | } |
1465 | |
1466 | return ret; |
1467 | } |
1468 | |
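| /* |
| * Pin the user memory backing a new vfio_dma in contiguous chunks and map |
| * each chunk into the container's IOMMU domains. |
| */ |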
1469 | static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, |
1470 | size_t map_size) |
1471 | { |
1472 | dma_addr_t iova = dma->iova; |
1473 | unsigned long vaddr = dma->vaddr; |
1474 | struct vfio_batch batch; |
1475 | size_t size = map_size; |
1476 | long npage; |
1477 | unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; |
1478 | int ret = 0; |
1479 | |
1480 | vfio_batch_init(&batch); |
1481 | |
1482 | while (size) { |
1483 | /* Pin a contiguous chunk of memory */ |
1484 | npage = vfio_pin_pages_remote(dma, vaddr + dma->size, |
1485 | size >> PAGE_SHIFT, &pfn, limit, |
1486 | &batch); |
1487 | if (npage <= 0) { |
1488 | WARN_ON(!npage); |
1489 | ret = (int)npage; |
1490 | break; |
1491 | } |
1492 | |
1493 | /* Map it! */ |
1494 | ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, |
1495 | dma->prot); |
1496 | if (ret) { |
1497 | vfio_unpin_pages_remote(dma, iova + dma->size, pfn, |
1498 | npage, true); |
1499 | vfio_batch_unpin(&batch, dma); |
1500 | break; |
1501 | } |
1502 | |
1503 | size -= npage << PAGE_SHIFT; |
1504 | dma->size += npage << PAGE_SHIFT; |
1505 | } |
1506 | |
1507 | vfio_batch_fini(&batch); |
1508 | dma->iommu_mapped = true; |
1509 | |
1510 | if (ret) |
1511 | vfio_remove_dma(iommu, dma); |
1512 | |
1513 | return ret; |
1514 | } |
1515 | |
1516 | /* |
1517 | * Check dma map request is within a valid iova range |
1518 | */ |
1519 | static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu, |
1520 | dma_addr_t start, dma_addr_t end) |
1521 | { |
1522 | struct list_head *iova = &iommu->iova_list; |
1523 | struct vfio_iova *node; |
1524 | |
1525 | list_for_each_entry(node, iova, list) { |
1526 | if (start >= node->start && end <= node->end) |
1527 | return true; |
1528 | } |
1529 | |
1530 | /* |
1531 | * Check for list_empty() as well since a container with |
1532 | * a single mdev device will have an empty list. |
1533 | */ |
1534 | return list_empty(iova); |
1535 | } |
1536 | |
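| /* |
| * Transfer locked-memory accounting for a mapping from its original mm to |
| * the current task's mm when the vaddr is updated by a new owner. |
| */ |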
1537 | static int vfio_change_dma_owner(struct vfio_dma *dma) |
1538 | { |
1539 | struct task_struct *task = current->group_leader; |
1540 | struct mm_struct *mm = current->mm; |
1541 | long npage = dma->locked_vm; |
1542 | bool lock_cap; |
1543 | int ret; |
1544 | |
1545 | if (mm == dma->mm) |
1546 | return 0; |
1547 | |
1548 | lock_cap = capable(CAP_IPC_LOCK); |
1549 | ret = mm_lock_acct(task, mm, lock_cap, npage); |
1550 | if (ret) |
1551 | return ret; |
1552 | |
1553 | if (mmget_not_zero(dma->mm)) { |
1554 | mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage); |
1555 | mmput(dma->mm); |
1556 | } |
1557 | |
1558 | if (dma->task != task) { |
1559 | put_task_struct(dma->task); |
1560 | dma->task = get_task_struct(task); |
1561 | } |
1562 | mmdrop(dma->mm); |
1563 | dma->mm = mm; |
1564 | mmgrab(dma->mm); |
1565 | dma->lock_cap = lock_cap; |
1566 | return 0; |
1567 | } |
1568 | |
1569 | static int vfio_dma_do_map(struct vfio_iommu *iommu, |
1570 | struct vfio_iommu_type1_dma_map *map) |
1571 | { |
1572 | bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR; |
1573 | dma_addr_t iova = map->iova; |
1574 | unsigned long vaddr = map->vaddr; |
1575 | size_t size = map->size; |
1576 | int ret = 0, prot = 0; |
1577 | size_t pgsize; |
1578 | struct vfio_dma *dma; |
1579 | |
1580 | /* Verify that none of our __u64 fields overflow */ |
1581 | if (map->size != size || map->vaddr != vaddr || map->iova != iova) |
1582 | return -EINVAL; |
1583 | |
1584 | /* READ/WRITE from device perspective */ |
1585 | if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) |
1586 | prot |= IOMMU_WRITE; |
1587 | if (map->flags & VFIO_DMA_MAP_FLAG_READ) |
1588 | prot |= IOMMU_READ; |
1589 | |
1590 | if ((prot && set_vaddr) || (!prot && !set_vaddr)) |
1591 | return -EINVAL; |
1592 | |
1593 | mutex_lock(&iommu->lock); |
1594 | |
1595 | pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); |
1596 | |
1597 | WARN_ON((pgsize - 1) & PAGE_MASK); |
1598 | |
1599 | if (!size || (size | iova | vaddr) & (pgsize - 1)) { |
1600 | ret = -EINVAL; |
1601 | goto out_unlock; |
1602 | } |
1603 | |
1604 | /* Don't allow IOVA or virtual address wrap */ |
1605 | if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) { |
1606 | ret = -EINVAL; |
1607 | goto out_unlock; |
1608 | } |
1609 | |
1610 | dma = vfio_find_dma(iommu, iova, size); |
1611 | if (set_vaddr) { |
1612 | if (!dma) { |
1613 | ret = -ENOENT; |
1614 | } else if (!dma->vaddr_invalid || dma->iova != iova || |
1615 | dma->size != size) { |
1616 | ret = -EINVAL; |
1617 | } else { |
1618 | ret = vfio_change_dma_owner(dma); |
1619 | if (ret) |
1620 | goto out_unlock; |
1621 | dma->vaddr = vaddr; |
1622 | dma->vaddr_invalid = false; |
1623 | iommu->vaddr_invalid_count--; |
1624 | } |
1625 | goto out_unlock; |
1626 | } else if (dma) { |
1627 | ret = -EEXIST; |
1628 | goto out_unlock; |
1629 | } |
1630 | |
1631 | if (!iommu->dma_avail) { |
1632 | ret = -ENOSPC; |
1633 | goto out_unlock; |
1634 | } |
1635 | |
1636 | if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) { |
1637 | ret = -EINVAL; |
1638 | goto out_unlock; |
1639 | } |
1640 | |
1641 | dma = kzalloc(sizeof(*dma), GFP_KERNEL); |
1642 | if (!dma) { |
1643 | ret = -ENOMEM; |
1644 | goto out_unlock; |
1645 | } |
1646 | |
1647 | iommu->dma_avail--; |
1648 | dma->iova = iova; |
1649 | dma->vaddr = vaddr; |
1650 | dma->prot = prot; |
1651 | |
1652 | /* |
1653 | * We need to be able to both add to a task's locked memory and test |
1654 | * against the locked memory limit and we need to be able to do both |
1655 | * outside of this call path as pinning can be asynchronous via the |
1656 | * external interfaces for mdev devices. RLIMIT_MEMLOCK requires a |
1657 | * task_struct. Save the group_leader so that all DMA tracking uses |
1658 | * the same task, to make debugging easier. VM locked pages requires |
1659 | * an mm_struct, so grab the mm in case the task dies. |
1660 | */ |
1661 | get_task_struct(current->group_leader); |
1662 | dma->task = current->group_leader; |
1663 | dma->lock_cap = capable(CAP_IPC_LOCK); |
1664 | dma->mm = current->mm; |
1665 | mmgrab(dma->mm); |
1666 | |
1667 | dma->pfn_list = RB_ROOT; |
1668 | |
1669 | /* Insert zero-sized and grow as we map chunks of it */ |
1670 | vfio_link_dma(iommu, dma); |
1671 | |
1672 | /* Don't pin and map if container doesn't contain IOMMU capable domain */ |
1673 | if (list_empty(&iommu->domain_list)) |
1674 | dma->size = size; |
1675 | else |
1676 | ret = vfio_pin_map_dma(iommu, dma, size); |
1677 | |
1678 | if (!ret && iommu->dirty_page_tracking) { |
1679 | ret = vfio_dma_bitmap_alloc(dma, pgsize); |
1680 | if (ret) |
1681 | vfio_remove_dma(iommu, dma); |
1682 | } |
1683 | |
1684 | out_unlock: |
1685 | mutex_unlock(&iommu->lock); |
1686 | return ret; |
1687 | } |
1688 | |
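| /* |
| * Replay all existing mappings into a newly attached IOMMU domain, reusing |
| * the physical addresses recorded in an existing domain where possible. |
| */ |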
1689 | static int vfio_iommu_replay(struct vfio_iommu *iommu, |
1690 | struct vfio_domain *domain) |
1691 | { |
1692 | struct vfio_batch batch; |
1693 | struct vfio_domain *d = NULL; |
1694 | struct rb_node *n; |
1695 | unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; |
1696 | int ret; |
1697 | |
1698 | /* Arbitrarily pick the first domain in the list for lookups */ |
1699 | if (!list_empty(&iommu->domain_list)) |
1700 | d = list_first_entry(&iommu->domain_list, |
1701 | struct vfio_domain, next); |
1702 | |
1703 | vfio_batch_init(&batch); |
1704 | |
1705 | n = rb_first(&iommu->dma_list); |
1706 | |
1707 | for (; n; n = rb_next(n)) { |
1708 | struct vfio_dma *dma; |
1709 | dma_addr_t iova; |
1710 | |
1711 | dma = rb_entry(n, struct vfio_dma, node); |
1712 | iova = dma->iova; |
1713 | |
1714 | while (iova < dma->iova + dma->size) { |
1715 | phys_addr_t phys; |
1716 | size_t size; |
1717 | |
1718 | if (dma->iommu_mapped) { |
1719 | phys_addr_t p; |
1720 | dma_addr_t i; |
1721 | |
1722 | if (WARN_ON(!d)) { /* mapped w/o a domain?! */ |
1723 | ret = -EINVAL; |
1724 | goto unwind; |
1725 | } |
1726 | |
1727 | phys = iommu_iova_to_phys(d->domain, iova); |
1728 | |
1729 | if (WARN_ON(!phys)) { |
1730 | iova += PAGE_SIZE; |
1731 | continue; |
1732 | } |
1733 | |
1734 | size = PAGE_SIZE; |
1735 | p = phys + size; |
1736 | i = iova + size; |
1737 | while (i < dma->iova + dma->size && |
1738 | p == iommu_iova_to_phys(d->domain, i)) { |
1739 | size += PAGE_SIZE; |
1740 | p += PAGE_SIZE; |
1741 | i += PAGE_SIZE; |
1742 | } |
1743 | } else { |
1744 | unsigned long pfn; |
1745 | unsigned long vaddr = dma->vaddr + |
1746 | (iova - dma->iova); |
1747 | size_t n = dma->iova + dma->size - iova; |
1748 | long npage; |
1749 | |
1750 | npage = vfio_pin_pages_remote(dma, vaddr, |
1751 | n >> PAGE_SHIFT, |
1752 | &pfn, limit, |
1753 | &batch); |
1754 | if (npage <= 0) { |
1755 | WARN_ON(!npage); |
1756 | ret = (int)npage; |
1757 | goto unwind; |
1758 | } |
1759 | |
1760 | phys = pfn << PAGE_SHIFT; |
1761 | size = npage << PAGE_SHIFT; |
1762 | } |
1763 | |
1764 | ret = iommu_map(domain->domain, iova, phys, size, |
1765 | dma->prot | IOMMU_CACHE, |
1766 | GFP_KERNEL_ACCOUNT); |
1767 | if (ret) { |
1768 | if (!dma->iommu_mapped) { |
1769 | vfio_unpin_pages_remote(dma, iova, |
1770 | pfn: phys >> PAGE_SHIFT, |
1771 | npage: size >> PAGE_SHIFT, |
1772 | do_accounting: true); |
1773 | vfio_batch_unpin(batch: &batch, dma); |
1774 | } |
1775 | goto unwind; |
1776 | } |
1777 | |
1778 | iova += size; |
1779 | } |
1780 | } |
1781 | |
1782 | /* All dmas are now mapped, defer to second tree walk for unwind */ |
1783 | for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { |
1784 | struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); |
1785 | |
1786 | dma->iommu_mapped = true; |
1787 | } |
1788 | |
1789 | vfio_batch_fini(&batch); |
1790 | return 0; |
1791 | |
1792 | unwind: |
1793 | for (; n; n = rb_prev(n)) { |
1794 | struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); |
1795 | dma_addr_t iova; |
1796 | |
1797 | if (dma->iommu_mapped) { |
1798 | iommu_unmap(domain->domain, dma->iova, dma->size); |
1799 | continue; |
1800 | } |
1801 | |
1802 | iova = dma->iova; |
1803 | while (iova < dma->iova + dma->size) { |
1804 | phys_addr_t phys, p; |
1805 | size_t size; |
1806 | dma_addr_t i; |
1807 | |
1808 | phys = iommu_iova_to_phys(domain->domain, iova); |
1809 | if (!phys) { |
1810 | iova += PAGE_SIZE; |
1811 | continue; |
1812 | } |
1813 | |
1814 | size = PAGE_SIZE; |
1815 | p = phys + size; |
1816 | i = iova + size; |
1817 | while (i < dma->iova + dma->size && |
1818 | p == iommu_iova_to_phys(domain->domain, i)) { |
1819 | size += PAGE_SIZE; |
1820 | p += PAGE_SIZE; |
1821 | i += PAGE_SIZE; |
1822 | } |
1823 | |
1824 | iommu_unmap(domain->domain, iova, size); |
1825 | vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT, |
1826 | size >> PAGE_SHIFT, true); |
1827 | } |
1828 | } |
1829 | |
1830 | vfio_batch_fini(&batch); |
1831 | return ret; |
1832 | } |
1833 | |
1834 | static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain, |
1835 | struct iommu_group *iommu_group) |
1836 | { |
1837 | struct vfio_iommu_group *g; |
1838 | |
1839 | list_for_each_entry(g, &domain->group_list, next) { |
1840 | if (g->iommu_group == iommu_group) |
1841 | return g; |
1842 | } |
1843 | |
1844 | return NULL; |
1845 | } |
1846 | |
1847 | static struct vfio_iommu_group* |
1848 | vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, |
1849 | struct iommu_group *iommu_group) |
1850 | { |
1851 | struct vfio_iommu_group *group; |
1852 | struct vfio_domain *domain; |
1853 | |
1854 | list_for_each_entry(domain, &iommu->domain_list, next) { |
1855 | group = find_iommu_group(domain, iommu_group); |
1856 | if (group) |
1857 | return group; |
1858 | } |
1859 | |
1860 | list_for_each_entry(group, &iommu->emulated_iommu_groups, next) |
1861 | if (group->iommu_group == iommu_group) |
1862 | return group; |
1863 | return NULL; |
1864 | } |
1865 | |
1866 | static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, |
1867 | phys_addr_t *base) |
1868 | { |
1869 | struct iommu_resv_region *region; |
1870 | bool ret = false; |
1871 | |
1872 | list_for_each_entry(region, group_resv_regions, list) { |
1873 | /* |
1874 | * The presence of any 'real' MSI regions should take |
1875 | * precedence over the software-managed one if the |
1876 | * IOMMU driver happens to advertise both types. |
1877 | */ |
1878 | if (region->type == IOMMU_RESV_MSI) { |
1879 | ret = false; |
1880 | break; |
1881 | } |
1882 | |
1883 | if (region->type == IOMMU_RESV_SW_MSI) { |
1884 | *base = region->start; |
1885 | ret = true; |
1886 | } |
1887 | } |
1888 | |
1889 | return ret; |
1890 | } |
1891 | |
1892 | /* |
1893 | * This is a helper function to insert an address range to iova list. |
1894 | * The list is initially created with a single entry corresponding to |
1895 | * the IOMMU domain geometry to which the device group is attached. |
1896 | * The list aperture gets modified when a new domain is added to the |
1897 | * container if the new aperture doesn't conflict with the current one |
1898 | * or with any existing dma mappings. The list is also modified to |
1899 | * exclude any reserved regions associated with the device group. |
1900 | */ |
1901 | static int vfio_iommu_iova_insert(struct list_head *head, |
1902 | dma_addr_t start, dma_addr_t end) |
1903 | { |
1904 | struct vfio_iova *region; |
1905 | |
1906 | region = kmalloc(sizeof(*region), GFP_KERNEL); |
1907 | if (!region) |
1908 | return -ENOMEM; |
1909 | |
1910 | INIT_LIST_HEAD(&region->list); |
1911 | region->start = start; |
1912 | region->end = end; |
1913 | |
1914 | list_add_tail(&region->list, head); |
1915 | return 0; |
1916 | } |
1917 | |
1918 | /* |
1919 | * Check the new iommu aperture conflicts with existing aper or with any |
1920 | * existing dma mappings. |
1921 | */ |
1922 | static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu, |
1923 | dma_addr_t start, dma_addr_t end) |
1924 | { |
1925 | struct vfio_iova *first, *last; |
1926 | struct list_head *iova = &iommu->iova_list; |
1927 | |
1928 | if (list_empty(iova)) |
1929 | return false; |
1930 | |
1931 | /* Disjoint sets, return conflict */ |
1932 | first = list_first_entry(iova, struct vfio_iova, list); |
1933 | last = list_last_entry(iova, struct vfio_iova, list); |
1934 | if (start > last->end || end < first->start) |
1935 | return true; |
1936 | |
1937 | /* Check for any existing dma mappings below the new start */ |
1938 | if (start > first->start) { |
1939 | if (vfio_find_dma(iommu, first->start, start - first->start)) |
1940 | return true; |
1941 | } |
1942 | |
1943 | /* Check for any existing dma mappings beyond the new end */ |
1944 | if (end < last->end) { |
1945 | if (vfio_find_dma(iommu, end + 1, last->end - end)) |
1946 | return true; |
1947 | } |
1948 | |
1949 | return false; |
1950 | } |
1951 | |
1952 | /* |
1953 | * Resize iommu iova aperture window. This is called only if the new |
1954 | * aperture has no conflict with existing aperture and dma mappings. |
1955 | */ |
1956 | static int vfio_iommu_aper_resize(struct list_head *iova, |
1957 | dma_addr_t start, dma_addr_t end) |
1958 | { |
1959 | struct vfio_iova *node, *next; |
1960 | |
1961 | if (list_empty(iova)) |
1962 | return vfio_iommu_iova_insert(iova, start, end); |
1963 | |
1964 | /* Adjust iova list start */ |
1965 | list_for_each_entry_safe(node, next, iova, list) { |
1966 | if (start < node->start) |
1967 | break; |
1968 | if (start >= node->start && start < node->end) { |
1969 | node->start = start; |
1970 | break; |
1971 | } |
1972 | /* Delete nodes before new start */ |
1973 | list_del(&node->list); |
1974 | kfree(node); |
1975 | } |
1976 | |
1977 | /* Adjust iova list end */ |
1978 | list_for_each_entry_safe(node, next, iova, list) { |
1979 | if (end > node->end) |
1980 | continue; |
1981 | if (end > node->start && end <= node->end) { |
1982 | node->end = end; |
1983 | continue; |
1984 | } |
1985 | /* Delete nodes after new end */ |
1986 | list_del(&node->list); |
1987 | kfree(node); |
1988 | } |
1989 | |
1990 | return 0; |
1991 | } |
1992 | |
1993 | /* |
1994 | * Check reserved region conflicts with existing dma mappings |
1995 | */ |
1996 | static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu, |
1997 | struct list_head *resv_regions) |
1998 | { |
1999 | struct iommu_resv_region *region; |
2000 | |
2001 | /* Check for conflict with existing dma mappings */ |
2002 | list_for_each_entry(region, resv_regions, list) { |
2003 | if (region->type == IOMMU_RESV_DIRECT_RELAXABLE) |
2004 | continue; |
2005 | |
2006 | if (vfio_find_dma(iommu, region->start, region->length)) |
2007 | return true; |
2008 | } |
2009 | |
2010 | return false; |
2011 | } |
2012 | |
2013 | /* |
2014 | * Check iova region overlap with reserved regions and |
2015 | * exclude them from the iommu iova range |
2016 | */ |
2017 | static int vfio_iommu_resv_exclude(struct list_head *iova, |
2018 | struct list_head *resv_regions) |
2019 | { |
2020 | struct iommu_resv_region *resv; |
2021 | struct vfio_iova *n, *next; |
2022 | |
2023 | list_for_each_entry(resv, resv_regions, list) { |
2024 | phys_addr_t start, end; |
2025 | |
2026 | if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) |
2027 | continue; |
2028 | |
2029 | start = resv->start; |
2030 | end = resv->start + resv->length - 1; |
2031 | |
2032 | list_for_each_entry_safe(n, next, iova, list) { |
2033 | int ret = 0; |
2034 | |
2035 | /* No overlap */ |
2036 | if (start > n->end || end < n->start) |
2037 | continue; |
2038 | /* |
2039 | * Insert a new node if current node overlaps with the |
2040 | * reserve region to exclude that from valid iova range. |
2041 | * Note that, new node is inserted before the current |
2042 | * node and finally the current node is deleted keeping |
2043 | * the list updated and sorted. |
2044 | */ |
2045 | if (start > n->start) |
2046 | ret = vfio_iommu_iova_insert(&n->list, n->start, |
2047 | start - 1); |
2048 | if (!ret && end < n->end) |
2049 | ret = vfio_iommu_iova_insert(&n->list, end + 1, |
2050 | n->end); |
2051 | if (ret) |
2052 | return ret; |
2053 | |
2054 | list_del(&n->list); |
2055 | kfree(n); |
2056 | } |
2057 | } |
2058 | |
2059 | if (list_empty(iova)) |
2060 | return -EINVAL; |
2061 | |
2062 | return 0; |
2063 | } |
2064 | |
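/*
 * Worked example with illustrative values only: given a single valid
 * range of [0x0, 0xffffffff] and a reserved region of
 * [0xfee00000, 0xfeefffff], the loop above inserts [0x0, 0xfedfffff]
 * and [0xfef00000, 0xffffffff] before the current node and then frees
 * it, leaving the list sorted, non-overlapping, and free of the
 * reserved window.
 */
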
2065 | static void vfio_iommu_resv_free(struct list_head *resv_regions) |
2066 | { |
2067 | struct iommu_resv_region *n, *next; |
2068 | |
2069 | list_for_each_entry_safe(n, next, resv_regions, list) { |
2070 | list_del(&n->list); |
2071 | kfree(n); |
2072 | } |
2073 | } |
2074 | |
2075 | static void vfio_iommu_iova_free(struct list_head *iova) |
2076 | { |
2077 | struct vfio_iova *n, *next; |
2078 | |
2079 | list_for_each_entry_safe(n, next, iova, list) { |
2080 | list_del(&n->list); |
2081 | kfree(n); |
2082 | } |
2083 | } |
2084 | |
2085 | static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu, |
2086 | struct list_head *iova_copy) |
2087 | { |
2088 | struct list_head *iova = &iommu->iova_list; |
2089 | struct vfio_iova *n; |
2090 | int ret; |
2091 | |
2092 | list_for_each_entry(n, iova, list) { |
2093 | ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end); |
2094 | if (ret) |
2095 | goto out_free; |
2096 | } |
2097 | |
2098 | return 0; |
2099 | |
2100 | out_free: |
2101 | vfio_iommu_iova_free(iova_copy); |
2102 | return ret; |
2103 | } |
2104 | |
2105 | static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu, |
2106 | struct list_head *iova_copy) |
2107 | { |
2108 | struct list_head *iova = &iommu->iova_list; |
2109 | |
2110 | vfio_iommu_iova_free(iova); |
2111 | |
2112 | list_splice_tail(iova_copy, iova); |
2113 | } |
2114 | |
2115 | static int vfio_iommu_domain_alloc(struct device *dev, void *data) |
2116 | { |
2117 | struct iommu_domain **domain = data; |
2118 | |
2119 | *domain = iommu_paging_domain_alloc(dev); |
2120 | return 1; /* Don't iterate */ |
2121 | } |
2122 | |
2123 | static int vfio_iommu_type1_attach_group(void *iommu_data, |
2124 | struct iommu_group *iommu_group, enum vfio_group_type type) |
2125 | { |
2126 | struct vfio_iommu *iommu = iommu_data; |
2127 | struct vfio_iommu_group *group; |
2128 | struct vfio_domain *domain, *d; |
2129 | bool resv_msi; |
2130 | phys_addr_t resv_msi_base = 0; |
2131 | struct iommu_domain_geometry *geo; |
2132 | LIST_HEAD(iova_copy); |
2133 | LIST_HEAD(group_resv_regions); |
2134 | int ret = -EBUSY; |
2135 | |
2136 | mutex_lock(&iommu->lock); |
2137 | |
2138 | /* Attach could require pinning, so disallow while vaddr is invalid. */ |
2139 | if (iommu->vaddr_invalid_count) |
2140 | goto out_unlock; |
2141 | |
2142 | /* Check for duplicates */ |
2143 | ret = -EINVAL; |
2144 | if (vfio_iommu_find_iommu_group(iommu, iommu_group)) |
2145 | goto out_unlock; |
2146 | |
2147 | ret = -ENOMEM; |
2148 | group = kzalloc(sizeof(*group), GFP_KERNEL); |
2149 | if (!group) |
2150 | goto out_unlock; |
2151 | group->iommu_group = iommu_group; |
2152 | |
2153 | if (type == VFIO_EMULATED_IOMMU) { |
2154 | list_add(&group->next, &iommu->emulated_iommu_groups); |
2155 | /* |
2156 | * An emulated IOMMU group cannot dirty memory directly, it can |
2157 | * only use interfaces that provide dirty tracking. |
2158 | * The iommu scope can only be promoted with the addition of a |
2159 | * dirty tracking group. |
2160 | */ |
2161 | group->pinned_page_dirty_scope = true; |
2162 | ret = 0; |
2163 | goto out_unlock; |
2164 | } |
2165 | |
2166 | ret = -ENOMEM; |
2167 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); |
2168 | if (!domain) |
2169 | goto out_free_group; |
2170 | |
2171 | /* |
2172 | * Going via the iommu_group iterator avoids races, and trivially gives |
2173 | * us a representative device for the IOMMU API call. We don't actually |
2174 | * want to iterate beyond the first device (if any). |
2175 | */ |
2176 | iommu_group_for_each_dev(iommu_group, &domain->domain, |
2177 | vfio_iommu_domain_alloc); |
2178 | if (IS_ERR(domain->domain)) { |
2179 | ret = PTR_ERR(domain->domain); |
2180 | goto out_free_domain; |
2181 | } |
2182 | |
2183 | ret = iommu_attach_group(domain->domain, group->iommu_group); |
2184 | if (ret) |
2185 | goto out_domain; |
2186 | |
2187 | /* Get aperture info */ |
2188 | geo = &domain->domain->geometry; |
2189 | if (vfio_iommu_aper_conflict(iommu, geo->aperture_start, |
2190 | geo->aperture_end)) { |
2191 | ret = -EINVAL; |
2192 | goto out_detach; |
2193 | } |
2194 | |
2195 | ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions); |
2196 | if (ret) |
2197 | goto out_detach; |
2198 | |
2199 | if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) { |
2200 | ret = -EINVAL; |
2201 | goto out_detach; |
2202 | } |
2203 | |
2204 | /* |
2205 | * We don't want to work on the original iova list as the list |
2206 | * gets modified and in case of failure we have to retain the |
2207 | * original list. Get a copy here. |
2208 | */ |
2209 | ret = vfio_iommu_iova_get_copy(iommu, &iova_copy); |
2210 | if (ret) |
2211 | goto out_detach; |
2212 | |
2213 | ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start, |
2214 | geo->aperture_end); |
2215 | if (ret) |
2216 | goto out_detach; |
2217 | |
2218 | ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions); |
2219 | if (ret) |
2220 | goto out_detach; |
2221 | |
2222 | resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base); |
2223 | |
2224 | INIT_LIST_HEAD(&domain->group_list); |
2225 | list_add(&group->next, &domain->group_list); |
2226 | |
2227 | if (!allow_unsafe_interrupts && |
2228 | !iommu_group_has_isolated_msi(iommu_group)) { |
2229 | pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", |
2230 | __func__); |
2231 | ret = -EPERM; |
2232 | goto out_detach; |
2233 | } |
2234 | |
2235 | /* |
2236 | * If the IOMMU can block non-coherent operations (ie PCIe TLPs with |
2237 | * no-snoop set) then VFIO always turns this feature on because on Intel |
2238 | * platforms it optimizes KVM to disable wbinvd emulation. |
2239 | */ |
2240 | if (domain->domain->ops->enforce_cache_coherency) |
2241 | domain->enforce_cache_coherency = |
2242 | domain->domain->ops->enforce_cache_coherency( |
2243 | domain->domain); |
2244 | |
2245 | /* |
2246 | * Try to match an existing compatible domain. We don't want to |
2247 | * preclude an IOMMU driver supporting multiple bus_types and being |
2248 | * able to include different bus_types in the same IOMMU domain, so |
2249 | * we test whether the domains use the same iommu_ops rather than |
2250 | * testing if they're on the same bus_type. |
2251 | */ |
2252 | list_for_each_entry(d, &iommu->domain_list, next) { |
2253 | if (d->domain->ops == domain->domain->ops && |
2254 | d->enforce_cache_coherency == |
2255 | domain->enforce_cache_coherency) { |
2256 | iommu_detach_group(domain->domain, group->iommu_group); |
2257 | if (!iommu_attach_group(d->domain, |
2258 | group->iommu_group)) { |
2259 | list_add(&group->next, &d->group_list); |
2260 | iommu_domain_free(domain->domain); |
2261 | kfree(domain); |
2262 | goto done; |
2263 | } |
2264 | |
2265 | ret = iommu_attach_group(domain->domain, |
2266 | group->iommu_group); |
2267 | if (ret) |
2268 | goto out_domain; |
2269 | } |
2270 | } |
2271 | |
2272 | /* replay mappings on new domains */ |
2273 | ret = vfio_iommu_replay(iommu, domain); |
2274 | if (ret) |
2275 | goto out_detach; |
2276 | |
2277 | if (resv_msi) { |
2278 | ret = iommu_get_msi_cookie(domain->domain, resv_msi_base); |
2279 | if (ret && ret != -ENODEV) |
2280 | goto out_detach; |
2281 | } |
2282 | |
2283 | list_add(&domain->next, &iommu->domain_list); |
2284 | vfio_update_pgsize_bitmap(iommu); |
2285 | done: |
2286 | /* Delete the old one and insert new iova list */ |
2287 | vfio_iommu_iova_insert_copy(iommu, &iova_copy); |
2288 | |
2289 | /* |
2290 | * An iommu backed group can dirty memory directly and therefore |
2291 | * demotes the iommu scope until it declares itself dirty tracking |
2292 | * capable via the page pinning interface. |
2293 | */ |
2294 | iommu->num_non_pinned_groups++; |
2295 | mutex_unlock(&iommu->lock); |
2296 | vfio_iommu_resv_free(&group_resv_regions); |
2297 | |
2298 | return 0; |
2299 | |
2300 | out_detach: |
2301 | iommu_detach_group(domain->domain, group->iommu_group); |
2302 | out_domain: |
2303 | iommu_domain_free(domain->domain); |
2304 | vfio_iommu_iova_free(&iova_copy); |
2305 | vfio_iommu_resv_free(&group_resv_regions); |
2306 | out_free_domain: |
2307 | kfree(domain); |
2308 | out_free_group: |
2309 | kfree(group); |
2310 | out_unlock: |
2311 | mutex_unlock(&iommu->lock); |
2312 | return ret; |
2313 | } |
2314 | |
2315 | static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) |
2316 | { |
2317 | struct rb_node *node; |
2318 | |
2319 | while ((node = rb_first(&iommu->dma_list))) |
2320 | vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); |
2321 | } |
2322 | |
2323 | static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu) |
2324 | { |
2325 | struct rb_node *n, *p; |
2326 | |
2327 | n = rb_first(&iommu->dma_list); |
2328 | for (; n; n = rb_next(n)) { |
2329 | struct vfio_dma *dma; |
2330 | long locked = 0, unlocked = 0; |
2331 | |
2332 | dma = rb_entry(n, struct vfio_dma, node); |
2333 | unlocked += vfio_unmap_unpin(iommu, dma, false); |
2334 | p = rb_first(&dma->pfn_list); |
2335 | for (; p; p = rb_next(p)) { |
2336 | struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, |
2337 | node); |
2338 | |
2339 | if (!is_invalid_reserved_pfn(vpfn->pfn)) |
2340 | locked++; |
2341 | } |
2342 | vfio_lock_acct(dma, locked - unlocked, true); |
2343 | } |
2344 | } |
2345 | |
2346 | /* |
2347 | * Called when a domain is removed in detach. It is possible that |
2348 | * the removed domain decided the iova aperture window. Modify the |
2349 | * iova aperture with the smallest window among existing domains. |
2350 | */ |
2351 | static void vfio_iommu_aper_expand(struct vfio_iommu *iommu, |
2352 | struct list_head *iova_copy) |
2353 | { |
2354 | struct vfio_domain *domain; |
2355 | struct vfio_iova *node; |
2356 | dma_addr_t start = 0; |
2357 | dma_addr_t end = (dma_addr_t)~0; |
2358 | |
2359 | if (list_empty(iova_copy)) |
2360 | return; |
2361 | |
2362 | list_for_each_entry(domain, &iommu->domain_list, next) { |
2363 | struct iommu_domain_geometry *geo = &domain->domain->geometry; |
2364 | |
2365 | if (geo->aperture_start > start) |
2366 | start = geo->aperture_start; |
2367 | if (geo->aperture_end < end) |
2368 | end = geo->aperture_end; |
2369 | } |
2370 | |
2371 | /* Modify aperture limits. The new aper is either same or bigger */ |
2372 | node = list_first_entry(iova_copy, struct vfio_iova, list); |
2373 | node->start = start; |
2374 | node = list_last_entry(iova_copy, struct vfio_iova, list); |
2375 | node->end = end; |
2376 | } |
2377 | |
2378 | /* |
2379 | * Called when a group is detached. The reserved regions for that |
2380 | * group can be part of valid iova now. But since reserved regions |
2381 | * may be duplicated among groups, populate the iova valid regions |
2382 | * list again. |
2383 | */ |
2384 | static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu, |
2385 | struct list_head *iova_copy) |
2386 | { |
2387 | struct vfio_domain *d; |
2388 | struct vfio_iommu_group *g; |
2389 | struct vfio_iova *node; |
2390 | dma_addr_t start, end; |
2391 | LIST_HEAD(resv_regions); |
2392 | int ret; |
2393 | |
2394 | if (list_empty(iova_copy)) |
2395 | return -EINVAL; |
2396 | |
2397 | list_for_each_entry(d, &iommu->domain_list, next) { |
2398 | list_for_each_entry(g, &d->group_list, next) { |
2399 | ret = iommu_get_group_resv_regions(g->iommu_group, |
2400 | &resv_regions); |
2401 | if (ret) |
2402 | goto done; |
2403 | } |
2404 | } |
2405 | |
2406 | node = list_first_entry(iova_copy, struct vfio_iova, list); |
2407 | start = node->start; |
2408 | node = list_last_entry(iova_copy, struct vfio_iova, list); |
2409 | end = node->end; |
2410 | |
2411 | /* purge the iova list and create new one */ |
2412 | vfio_iommu_iova_free(iova_copy); |
2413 | |
2414 | ret = vfio_iommu_aper_resize(iova_copy, start, end); |
2415 | if (ret) |
2416 | goto done; |
2417 | |
2418 | /* Exclude current reserved regions from iova ranges */ |
2419 | ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions); |
2420 | done: |
2421 | vfio_iommu_resv_free(&resv_regions); |
2422 | return ret; |
2423 | } |
2424 | |
2425 | static void vfio_iommu_type1_detach_group(void *iommu_data, |
2426 | struct iommu_group *iommu_group) |
2427 | { |
2428 | struct vfio_iommu *iommu = iommu_data; |
2429 | struct vfio_domain *domain; |
2430 | struct vfio_iommu_group *group; |
2431 | bool update_dirty_scope = false; |
2432 | LIST_HEAD(iova_copy); |
2433 | |
2434 | mutex_lock(&iommu->lock); |
2435 | list_for_each_entry(group, &iommu->emulated_iommu_groups, next) { |
2436 | if (group->iommu_group != iommu_group) |
2437 | continue; |
2438 | update_dirty_scope = !group->pinned_page_dirty_scope; |
2439 | list_del(&group->next); |
2440 | kfree(group); |
2441 | |
2442 | if (list_empty(&iommu->emulated_iommu_groups) && |
2443 | list_empty(&iommu->domain_list)) { |
2444 | WARN_ON(!list_empty(&iommu->device_list)); |
2445 | vfio_iommu_unmap_unpin_all(iommu); |
2446 | } |
2447 | goto detach_group_done; |
2448 | } |
2449 | |
2450 | /* |
2451 | * Get a copy of iova list. This will be used to update |
2452 | * and to replace the current one later. Please note that |
2453 | * we will leave the original list as it is if update fails. |
2454 | */ |
2455 | vfio_iommu_iova_get_copy(iommu, &iova_copy); |
2456 | |
2457 | list_for_each_entry(domain, &iommu->domain_list, next) { |
2458 | group = find_iommu_group(domain, iommu_group); |
2459 | if (!group) |
2460 | continue; |
2461 | |
2462 | iommu_detach_group(domain->domain, group->iommu_group); |
2463 | update_dirty_scope = !group->pinned_page_dirty_scope; |
2464 | list_del(&group->next); |
2465 | kfree(group); |
2466 | /* |
2467 | * Group ownership provides privilege, if the group list is |
2468 | * empty, the domain goes away. If it's the last domain with |
2469 | * iommu and external domain doesn't exist, then all the |
2470 | * mappings go away too. If it's the last domain with iommu and |
2471 | * external domain exist, update accounting |
2472 | */ |
2473 | if (list_empty(&domain->group_list)) { |
2474 | if (list_is_singular(&iommu->domain_list)) { |
2475 | if (list_empty(&iommu->emulated_iommu_groups)) { |
2476 | WARN_ON(!list_empty( |
2477 | &iommu->device_list)); |
2478 | vfio_iommu_unmap_unpin_all(iommu); |
2479 | } else { |
2480 | vfio_iommu_unmap_unpin_reaccount(iommu); |
2481 | } |
2482 | } |
2483 | iommu_domain_free(domain->domain); |
2484 | list_del(&domain->next); |
2485 | kfree(domain); |
2486 | vfio_iommu_aper_expand(iommu, &iova_copy); |
2487 | vfio_update_pgsize_bitmap(iommu); |
2488 | } |
2489 | break; |
2490 | } |
2491 | |
2492 | if (!vfio_iommu_resv_refresh(iommu, &iova_copy)) |
2493 | vfio_iommu_iova_insert_copy(iommu, &iova_copy); |
2494 | else |
2495 | vfio_iommu_iova_free(&iova_copy); |
2496 | |
2497 | detach_group_done: |
2498 | /* |
2499 | * Removal of a group without dirty tracking may allow the iommu scope |
2500 | * to be promoted. |
2501 | */ |
2502 | if (update_dirty_scope) { |
2503 | iommu->num_non_pinned_groups--; |
2504 | if (iommu->dirty_page_tracking) |
2505 | vfio_iommu_populate_bitmap_full(iommu); |
2506 | } |
2507 | mutex_unlock(&iommu->lock); |
2508 | } |
2509 | |
2510 | static void *vfio_iommu_type1_open(unsigned long arg) |
2511 | { |
2512 | struct vfio_iommu *iommu; |
2513 | |
2514 | iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); |
2515 | if (!iommu) |
2516 | return ERR_PTR(-ENOMEM); |
2517 | |
2518 | switch (arg) { |
2519 | case VFIO_TYPE1_IOMMU: |
2520 | break; |
2521 | case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: |
2522 | case VFIO_TYPE1v2_IOMMU: |
2523 | iommu->v2 = true; |
2524 | break; |
2525 | default: |
2526 | kfree(iommu); |
2527 | return ERR_PTR(-EINVAL); |
2528 | } |
2529 | |
2530 | INIT_LIST_HEAD(&iommu->domain_list); |
2531 | INIT_LIST_HEAD(&iommu->iova_list); |
2532 | iommu->dma_list = RB_ROOT; |
2533 | iommu->dma_avail = dma_entry_limit; |
2534 | mutex_init(&iommu->lock); |
2535 | mutex_init(&iommu->device_list_lock); |
2536 | INIT_LIST_HEAD(&iommu->device_list); |
2537 | iommu->pgsize_bitmap = PAGE_MASK; |
2538 | INIT_LIST_HEAD(&iommu->emulated_iommu_groups); |
2539 | |
2540 | return iommu; |
2541 | } |
2542 | |
2543 | static void vfio_release_domain(struct vfio_domain *domain) |
2544 | { |
2545 | struct vfio_iommu_group *group, *group_tmp; |
2546 | |
2547 | list_for_each_entry_safe(group, group_tmp, |
2548 | &domain->group_list, next) { |
2549 | iommu_detach_group(domain->domain, group->iommu_group); |
2550 | list_del(&group->next); |
2551 | kfree(group); |
2552 | } |
2553 | |
2554 | iommu_domain_free(domain->domain); |
2555 | } |
2556 | |
2557 | static void vfio_iommu_type1_release(void *iommu_data) |
2558 | { |
2559 | struct vfio_iommu *iommu = iommu_data; |
2560 | struct vfio_domain *domain, *domain_tmp; |
2561 | struct vfio_iommu_group *group, *next_group; |
2562 | |
2563 | list_for_each_entry_safe(group, next_group, |
2564 | &iommu->emulated_iommu_groups, next) { |
2565 | list_del(&group->next); |
2566 | kfree(group); |
2567 | } |
2568 | |
2569 | vfio_iommu_unmap_unpin_all(iommu); |
2570 | |
2571 | list_for_each_entry_safe(domain, domain_tmp, |
2572 | &iommu->domain_list, next) { |
2573 | vfio_release_domain(domain); |
2574 | list_del(&domain->next); |
2575 | kfree(domain); |
2576 | } |
2577 | |
2578 | vfio_iommu_iova_free(&iommu->iova_list); |
2579 | |
2580 | kfree(iommu); |
2581 | } |
2582 | |
2583 | static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu) |
2584 | { |
2585 | struct vfio_domain *domain; |
2586 | int ret = 1; |
2587 | |
2588 | mutex_lock(&iommu->lock); |
2589 | list_for_each_entry(domain, &iommu->domain_list, next) { |
2590 | if (!(domain->enforce_cache_coherency)) { |
2591 | ret = 0; |
2592 | break; |
2593 | } |
2594 | } |
2595 | mutex_unlock(&iommu->lock); |
2596 | |
2597 | return ret; |
2598 | } |
2599 | |
2600 | static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu) |
2601 | { |
2602 | bool ret; |
2603 | |
2604 | mutex_lock(&iommu->lock); |
2605 | ret = !list_empty(&iommu->emulated_iommu_groups); |
2606 | mutex_unlock(&iommu->lock); |
2607 | return ret; |
2608 | } |
2609 | |
2610 | static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, |
2611 | unsigned long arg) |
2612 | { |
2613 | switch (arg) { |
2614 | case VFIO_TYPE1_IOMMU: |
2615 | case VFIO_TYPE1v2_IOMMU: |
2616 | case VFIO_UNMAP_ALL: |
2617 | return 1; |
2618 | case VFIO_UPDATE_VADDR: |
2619 | /* |
2620 | * Disable this feature if mdevs are present. They cannot |
2621 | * safely pin/unpin/rw while vaddrs are being updated. |
2622 | */ |
2623 | return iommu && !vfio_iommu_has_emulated(iommu); |
2624 | case VFIO_DMA_CC_IOMMU: |
2625 | if (!iommu) |
2626 | return 0; |
2627 | return vfio_domains_have_enforce_cache_coherency(iommu); |
2628 | default: |
2629 | return 0; |
2630 | } |
2631 | } |
2632 | |
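/*
 * Illustrative userspace sequence, not part of this driver: assuming a
 * container fd from /dev/vfio/vfio with at least one group added via
 * VFIO_GROUP_SET_CONTAINER, a caller would typically probe and select
 * the IOMMU model like so:
 *
 *	if (ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
 *		ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *	else
 *		ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */
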
2633 | static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps, |
2634 | struct vfio_iommu_type1_info_cap_iova_range *cap_iovas, |
2635 | size_t size) |
2636 | { |
2637 | struct vfio_info_cap_header *header; |
2638 | struct vfio_iommu_type1_info_cap_iova_range *iova_cap; |
2639 | |
2640 | header = vfio_info_cap_add(caps, size, |
2641 | VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1); |
2642 | if (IS_ERR(header)) |
2643 | return PTR_ERR(header); |
2644 | |
2645 | iova_cap = container_of(header, |
2646 | struct vfio_iommu_type1_info_cap_iova_range, |
2647 | header); |
2648 | iova_cap->nr_iovas = cap_iovas->nr_iovas; |
2649 | memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges, |
2650 | cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges)); |
2651 | return 0; |
2652 | } |
2653 | |
2654 | static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, |
2655 | struct vfio_info_cap *caps) |
2656 | { |
2657 | struct vfio_iommu_type1_info_cap_iova_range *cap_iovas; |
2658 | struct vfio_iova *iova; |
2659 | size_t size; |
2660 | int iovas = 0, i = 0, ret; |
2661 | |
2662 | list_for_each_entry(iova, &iommu->iova_list, list) |
2663 | iovas++; |
2664 | |
2665 | if (!iovas) { |
2666 | /* |
2667 | * Return 0 as a container with a single mdev device |
2668 | * will have an empty list |
2669 | */ |
2670 | return 0; |
2671 | } |
2672 | |
2673 | size = struct_size(cap_iovas, iova_ranges, iovas); |
2674 | |
2675 | cap_iovas = kzalloc(size, GFP_KERNEL); |
2676 | if (!cap_iovas) |
2677 | return -ENOMEM; |
2678 | |
2679 | cap_iovas->nr_iovas = iovas; |
2680 | |
2681 | list_for_each_entry(iova, &iommu->iova_list, list) { |
2682 | cap_iovas->iova_ranges[i].start = iova->start; |
2683 | cap_iovas->iova_ranges[i].end = iova->end; |
2684 | i++; |
2685 | } |
2686 | |
2687 | ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size); |
2688 | |
2689 | kfree(cap_iovas); |
2690 | return ret; |
2691 | } |
2692 | |
2693 | static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, |
2694 | struct vfio_info_cap *caps) |
2695 | { |
2696 | struct vfio_iommu_type1_info_cap_migration cap_mig = {}; |
2697 | |
2698 | cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; |
2699 | cap_mig.header.version = 1; |
2700 | |
2701 | cap_mig.flags = 0; |
2702 | /* support minimum pgsize */ |
2703 | cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap); |
2704 | cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX; |
2705 | |
2706 | return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); |
2707 | } |
2708 | |
2709 | static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, |
2710 | struct vfio_info_cap *caps) |
2711 | { |
2712 | struct vfio_iommu_type1_info_dma_avail cap_dma_avail; |
2713 | |
2714 | cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL; |
2715 | cap_dma_avail.header.version = 1; |
2716 | |
2717 | cap_dma_avail.avail = iommu->dma_avail; |
2718 | |
2719 | return vfio_info_add_capability(caps, &cap_dma_avail.header, |
2720 | sizeof(cap_dma_avail)); |
2721 | } |
2722 | |
2723 | static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, |
2724 | unsigned long arg) |
2725 | { |
2726 | struct vfio_iommu_type1_info info = {}; |
2727 | unsigned long minsz; |
2728 | struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; |
2729 | int ret; |
2730 | |
2731 | minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); |
2732 | |
2733 | if (copy_from_user(&info, (void __user *)arg, minsz)) |
2734 | return -EFAULT; |
2735 | |
2736 | if (info.argsz < minsz) |
2737 | return -EINVAL; |
2738 | |
2739 | minsz = min_t(size_t, info.argsz, sizeof(info)); |
2740 | |
2741 | mutex_lock(&iommu->lock); |
2742 | info.flags = VFIO_IOMMU_INFO_PGSIZES; |
2743 | |
2744 | info.iova_pgsizes = iommu->pgsize_bitmap; |
2745 | |
2746 | ret = vfio_iommu_migration_build_caps(iommu, &caps); |
2747 | |
2748 | if (!ret) |
2749 | ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); |
2750 | |
2751 | if (!ret) |
2752 | ret = vfio_iommu_iova_build_caps(iommu, &caps); |
2753 | |
2754 | mutex_unlock(&iommu->lock); |
2755 | |
2756 | if (ret) |
2757 | return ret; |
2758 | |
2759 | if (caps.size) { |
2760 | info.flags |= VFIO_IOMMU_INFO_CAPS; |
2761 | |
2762 | if (info.argsz < sizeof(info) + caps.size) { |
2763 | info.argsz = sizeof(info) + caps.size; |
2764 | } else { |
2765 | vfio_info_cap_shift(&caps, sizeof(info)); |
2766 | if (copy_to_user((void __user *)arg + |
2767 | sizeof(info), caps.buf, |
2768 | caps.size)) { |
2769 | kfree(caps.buf); |
2770 | return -EFAULT; |
2771 | } |
2772 | info.cap_offset = sizeof(info); |
2773 | } |
2774 | |
2775 | kfree(caps.buf); |
2776 | } |
2777 | |
2778 | return copy_to_user((void __user *)arg, &info, minsz) ? |
2779 | -EFAULT : 0; |
2780 | } |
2781 | |
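/*
 * Illustrative userspace sketch, not part of this driver: the caller
 * sizes the buffer with a first VFIO_IOMMU_GET_INFO call, then re-reads
 * with info.argsz bytes and walks the capability chain from cap_offset
 * when VFIO_IOMMU_INFO_CAPS is set. container_fd and buf are assumed
 * names:
 *
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(container_fd, VFIO_IOMMU_GET_INFO, &info);
 *	buf = calloc(1, info.argsz);
 *	memcpy(buf, &info, sizeof(info));
 *	ioctl(container_fd, VFIO_IOMMU_GET_INFO, buf);
 *
 * Each capability begins with a struct vfio_info_cap_header whose
 * "next" field is an offset from the start of the buffer; 0 terminates
 * the chain.
 */
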
2782 | static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu, |
2783 | unsigned long arg) |
2784 | { |
2785 | struct vfio_iommu_type1_dma_map map; |
2786 | unsigned long minsz; |
2787 | uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE | |
2788 | VFIO_DMA_MAP_FLAG_VADDR; |
2789 | |
2790 | minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); |
2791 | |
2792 | if (copy_from_user(&map, (void __user *)arg, minsz)) |
2793 | return -EFAULT; |
2794 | |
2795 | if (map.argsz < minsz || map.flags & ~mask) |
2796 | return -EINVAL; |
2797 | |
2798 | return vfio_dma_do_map(iommu, &map); |
2799 | } |
2800 | |
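/*
 * Illustrative userspace sketch, not part of this driver: container_fd
 * and buf are assumed to exist, buf must remain valid for the lifetime
 * of the mapping, and iova/size must be multiples of a page size
 * reported in iova_pgsizes:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000,
 *		.size  = 1024 * 1024,
 *	};
 *
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 */
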
2801 | static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu, |
2802 | unsigned long arg) |
2803 | { |
2804 | struct vfio_iommu_type1_dma_unmap unmap; |
2805 | struct vfio_bitmap bitmap = { 0 }; |
2806 | uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP | |
2807 | VFIO_DMA_UNMAP_FLAG_VADDR | |
2808 | VFIO_DMA_UNMAP_FLAG_ALL; |
2809 | unsigned long minsz; |
2810 | int ret; |
2811 | |
2812 | minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); |
2813 | |
2814 | if (copy_from_user(&unmap, (void __user *)arg, minsz)) |
2815 | return -EFAULT; |
2816 | |
2817 | if (unmap.argsz < minsz || unmap.flags & ~mask) |
2818 | return -EINVAL; |
2819 | |
2820 | if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) && |
2821 | (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL | |
2822 | VFIO_DMA_UNMAP_FLAG_VADDR))) |
2823 | return -EINVAL; |
2824 | |
2825 | if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) { |
2826 | unsigned long pgshift; |
2827 | |
2828 | if (unmap.argsz < (minsz + sizeof(bitmap))) |
2829 | return -EINVAL; |
2830 | |
2831 | if (copy_from_user(&bitmap, |
2832 | (void __user *)(arg + minsz), |
2833 | sizeof(bitmap))) |
2834 | return -EFAULT; |
2835 | |
2836 | if (!access_ok((void __user *)bitmap.data, bitmap.size)) |
2837 | return -EINVAL; |
2838 | |
2839 | pgshift = __ffs(bitmap.pgsize); |
2840 | ret = verify_bitmap_size(unmap.size >> pgshift, |
2841 | bitmap.size); |
2842 | if (ret) |
2843 | return ret; |
2844 | } |
2845 | |
2846 | ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap); |
2847 | if (ret) |
2848 | return ret; |
2849 | |
2850 | return copy_to_user((void __user *)arg, &unmap, minsz) ? |
2851 | -EFAULT : 0; |
2852 | } |
2853 | |
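/*
 * Illustrative userspace sketch, not part of this driver, undoing the
 * mapping from the previous example; on success unmap.size is written
 * back with the number of bytes actually unmapped:
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000,
 *		.size  = 1024 * 1024,
 *	};
 *
 *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
 */
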
2854 | static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu, |
2855 | unsigned long arg) |
2856 | { |
2857 | struct vfio_iommu_type1_dirty_bitmap dirty; |
2858 | uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START | |
2859 | VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP | |
2860 | VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; |
2861 | unsigned long minsz; |
2862 | int ret = 0; |
2863 | |
2864 | if (!iommu->v2) |
2865 | return -EACCES; |
2866 | |
2867 | minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags); |
2868 | |
2869 | if (copy_from_user(&dirty, (void __user *)arg, minsz)) |
2870 | return -EFAULT; |
2871 | |
2872 | if (dirty.argsz < minsz || dirty.flags & ~mask) |
2873 | return -EINVAL; |
2874 | |
2875 | /* only one flag should be set at a time */ |
2876 | if (__ffs(dirty.flags) != __fls(dirty.flags)) |
2877 | return -EINVAL; |
2878 | |
2879 | if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { |
2880 | size_t pgsize; |
2881 | |
2882 | mutex_lock(&iommu->lock); |
2883 | pgsize = 1 << __ffs(iommu->pgsize_bitmap); |
2884 | if (!iommu->dirty_page_tracking) { |
2885 | ret = vfio_dma_bitmap_alloc_all(iommu, pgsize); |
2886 | if (!ret) |
2887 | iommu->dirty_page_tracking = true; |
2888 | } |
2889 | mutex_unlock(&iommu->lock); |
2890 | return ret; |
2891 | } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { |
2892 | mutex_lock(&iommu->lock); |
2893 | if (iommu->dirty_page_tracking) { |
2894 | iommu->dirty_page_tracking = false; |
2895 | vfio_dma_bitmap_free_all(iommu); |
2896 | } |
2897 | mutex_unlock(&iommu->lock); |
2898 | return 0; |
2899 | } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { |
2900 | struct vfio_iommu_type1_dirty_bitmap_get range; |
2901 | unsigned long pgshift; |
2902 | size_t data_size = dirty.argsz - minsz; |
2903 | size_t iommu_pgsize; |
2904 | |
2905 | if (!data_size || data_size < sizeof(range)) |
2906 | return -EINVAL; |
2907 | |
2908 | if (copy_from_user(&range, (void __user *)(arg + minsz), |
2909 | sizeof(range))) |
2910 | return -EFAULT; |
2911 | |
2912 | if (range.iova + range.size < range.iova) |
2913 | return -EINVAL; |
2914 | if (!access_ok((void __user *)range.bitmap.data, |
2915 | range.bitmap.size)) |
2916 | return -EINVAL; |
2917 | |
2918 | pgshift = __ffs(range.bitmap.pgsize); |
2919 | ret = verify_bitmap_size(range.size >> pgshift, |
2920 | range.bitmap.size); |
2921 | if (ret) |
2922 | return ret; |
2923 | |
2924 | mutex_lock(&iommu->lock); |
2925 | |
2926 | iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap); |
2927 | |
2928 | /* allow only smallest supported pgsize */ |
2929 | if (range.bitmap.pgsize != iommu_pgsize) { |
2930 | ret = -EINVAL; |
2931 | goto out_unlock; |
2932 | } |
2933 | if (range.iova & (iommu_pgsize - 1)) { |
2934 | ret = -EINVAL; |
2935 | goto out_unlock; |
2936 | } |
2937 | if (!range.size || range.size & (iommu_pgsize - 1)) { |
2938 | ret = -EINVAL; |
2939 | goto out_unlock; |
2940 | } |
2941 | |
2942 | if (iommu->dirty_page_tracking) |
2943 | ret = vfio_iova_dirty_bitmap(range.bitmap.data, |
2944 | iommu, range.iova, |
2945 | range.size, |
2946 | range.bitmap.pgsize); |
2947 | else |
2948 | ret = -EINVAL; |
2949 | out_unlock: |
2950 | mutex_unlock(&iommu->lock); |
2951 | |
2952 | return ret; |
2953 | } |
2954 | |
2955 | return -EINVAL; |
2956 | } |
2957 | |
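/*
 * Illustrative userspace sketch, not part of this driver: dirty
 * tracking must first be started with VFIO_IOMMU_DIRTY_PAGES_FLAG_START.
 * pgsize is assumed to be the smallest bit set in iova_pgsizes, and
 * bitmap_buf/bitmap_bytes an assumed user buffer sized for one bit per
 * page of the queried range:
 *
 *	size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
 *		       sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
 *	struct vfio_iommu_type1_dirty_bitmap *db = calloc(1, argsz);
 *	struct vfio_iommu_type1_dirty_bitmap_get *get = (void *)db->data;
 *
 *	db->argsz = argsz;
 *	db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
 *	get->iova = 0x100000;
 *	get->size = 1024 * 1024;
 *	get->bitmap.pgsize = pgsize;
 *	get->bitmap.size = bitmap_bytes;
 *	get->bitmap.data = bitmap_buf;
 *
 *	ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);
 */
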
2958 | static long vfio_iommu_type1_ioctl(void *iommu_data, |
2959 | unsigned int cmd, unsigned long arg) |
2960 | { |
2961 | struct vfio_iommu *iommu = iommu_data; |
2962 | |
2963 | switch (cmd) { |
2964 | case VFIO_CHECK_EXTENSION: |
2965 | return vfio_iommu_type1_check_extension(iommu, arg); |
2966 | case VFIO_IOMMU_GET_INFO: |
2967 | return vfio_iommu_type1_get_info(iommu, arg); |
2968 | case VFIO_IOMMU_MAP_DMA: |
2969 | return vfio_iommu_type1_map_dma(iommu, arg); |
2970 | case VFIO_IOMMU_UNMAP_DMA: |
2971 | return vfio_iommu_type1_unmap_dma(iommu, arg); |
2972 | case VFIO_IOMMU_DIRTY_PAGES: |
2973 | return vfio_iommu_type1_dirty_pages(iommu, arg); |
2974 | default: |
2975 | return -ENOTTY; |
2976 | } |
2977 | } |
2978 | |
2979 | static void vfio_iommu_type1_register_device(void *iommu_data, |
2980 | struct vfio_device *vdev) |
2981 | { |
2982 | struct vfio_iommu *iommu = iommu_data; |
2983 | |
2984 | if (!vdev->ops->dma_unmap) |
2985 | return; |
2986 | |
2987 | /* |
2988 | * list_empty(&iommu->device_list) is tested under the iommu->lock while |
2989 | * iteration for dma_unmap must be done under the device_list_lock. |
2990 | * Holding both locks here allows avoiding the device_list_lock in |
2991 | * several fast paths. See vfio_notify_dma_unmap() |
2992 | */ |
2993 | mutex_lock(&iommu->lock); |
2994 | mutex_lock(&iommu->device_list_lock); |
2995 | list_add(&vdev->iommu_entry, &iommu->device_list); |
2996 | mutex_unlock(&iommu->device_list_lock); |
2997 | mutex_unlock(&iommu->lock); |
2998 | } |
2999 | |
3000 | static void vfio_iommu_type1_unregister_device(void *iommu_data, |
3001 | struct vfio_device *vdev) |
3002 | { |
3003 | struct vfio_iommu *iommu = iommu_data; |
3004 | |
3005 | if (!vdev->ops->dma_unmap) |
3006 | return; |
3007 | |
3008 | mutex_lock(&iommu->lock); |
3009 | mutex_lock(&iommu->device_list_lock); |
3010 | list_del(&vdev->iommu_entry); |
3011 | mutex_unlock(&iommu->device_list_lock); |
3012 | mutex_unlock(&iommu->lock); |
3013 | } |
3014 | |
3015 | static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, |
3016 | dma_addr_t user_iova, void *data, |
3017 | size_t count, bool write, |
3018 | size_t *copied) |
3019 | { |
3020 | struct mm_struct *mm; |
3021 | unsigned long vaddr; |
3022 | struct vfio_dma *dma; |
3023 | bool kthread = current->mm == NULL; |
3024 | size_t offset; |
3025 | |
3026 | *copied = 0; |
3027 | |
3028 | dma = vfio_find_dma(iommu, user_iova, 1); |
3029 | if (!dma) |
3030 | return -EINVAL; |
3031 | |
3032 | if ((write && !(dma->prot & IOMMU_WRITE)) || |
3033 | !(dma->prot & IOMMU_READ)) |
3034 | return -EPERM; |
3035 | |
3036 | mm = dma->mm; |
3037 | if (!mmget_not_zero(mm)) |
3038 | return -EPERM; |
3039 | |
3040 | if (kthread) |
3041 | kthread_use_mm(mm); |
3042 | else if (current->mm != mm) |
3043 | goto out; |
3044 | |
3045 | offset = user_iova - dma->iova; |
3046 | |
3047 | if (count > dma->size - offset) |
3048 | count = dma->size - offset; |
3049 | |
3050 | vaddr = dma->vaddr + offset; |
3051 | |
3052 | if (write) { |
3053 | *copied = copy_to_user((void __user *)vaddr, data, |
3054 | count) ? 0 : count; |
3055 | if (*copied && iommu->dirty_page_tracking) { |
3056 | unsigned long pgshift = __ffs(iommu->pgsize_bitmap); |
3057 | /* |
3058 | * Bitmap populated with the smallest supported page |
3059 | * size |
3060 | */ |
3061 | bitmap_set(dma->bitmap, offset >> pgshift, |
3062 | ((offset + *copied - 1) >> pgshift) - |
3063 | (offset >> pgshift) + 1); |
3064 | } |
3065 | } else |
3066 | *copied = copy_from_user(data, (void __user *)vaddr, |
3067 | count) ? 0 : count; |
3068 | if (kthread) |
3069 | kthread_unuse_mm(mm); |
3070 | out: |
3071 | mmput(mm); |
3072 | return *copied ? 0 : -EFAULT; |
3073 | } |
3074 | |
3075 | static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, |
3076 | void *data, size_t count, bool write) |
3077 | { |
3078 | struct vfio_iommu *iommu = iommu_data; |
3079 | int ret = 0; |
3080 | size_t done; |
3081 | |
3082 | mutex_lock(&iommu->lock); |
3083 | |
3084 | if (WARN_ONCE(iommu->vaddr_invalid_count, |
3085 | "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) { |
3086 | ret = -EBUSY; |
3087 | goto out; |
3088 | } |
3089 | |
3090 | while (count > 0) { |
3091 | ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data, |
3092 | count, write, &done); |
3093 | if (ret) |
3094 | break; |
3095 | |
3096 | count -= done; |
3097 | data += done; |
3098 | user_iova += done; |
3099 | } |
3100 | |
3101 | out: |
3102 | mutex_unlock(&iommu->lock); |
3103 | return ret; |
3104 | } |
3105 | |
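/*
 * Illustrative sketch, not part of this driver: the .dma_rw op above
 * backs the vfio core's vfio_dma_rw() helper, which lets an emulated
 * (mdev) vendor driver access guest memory by IOVA without pinning it.
 * mdev_state, desc_iova and some_hw_desc are hypothetical driver-side
 * names:
 *
 *	struct some_hw_desc desc;
 *
 *	ret = vfio_dma_rw(&mdev_state->vdev, desc_iova, &desc,
 *			  sizeof(desc), false);
 */
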
3106 | static struct iommu_domain * |
3107 | vfio_iommu_type1_group_iommu_domain(void *iommu_data, |
3108 | struct iommu_group *iommu_group) |
3109 | { |
3110 | struct iommu_domain *domain = ERR_PTR(-ENODEV); |
3111 | struct vfio_iommu *iommu = iommu_data; |
3112 | struct vfio_domain *d; |
3113 | |
3114 | if (!iommu || !iommu_group) |
3115 | return ERR_PTR(-EINVAL); |
3116 | |
3117 | mutex_lock(&iommu->lock); |
3118 | list_for_each_entry(d, &iommu->domain_list, next) { |
3119 | if (find_iommu_group(d, iommu_group)) { |
3120 | domain = d->domain; |
3121 | break; |
3122 | } |
3123 | } |
3124 | mutex_unlock(&iommu->lock); |
3125 | |
3126 | return domain; |
3127 | } |
3128 | |
3129 | static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { |
3130 | .name = "vfio-iommu-type1", |
3131 | .owner = THIS_MODULE, |
3132 | .open = vfio_iommu_type1_open, |
3133 | .release = vfio_iommu_type1_release, |
3134 | .ioctl = vfio_iommu_type1_ioctl, |
3135 | .attach_group = vfio_iommu_type1_attach_group, |
3136 | .detach_group = vfio_iommu_type1_detach_group, |
3137 | .pin_pages = vfio_iommu_type1_pin_pages, |
3138 | .unpin_pages = vfio_iommu_type1_unpin_pages, |
3139 | .register_device = vfio_iommu_type1_register_device, |
3140 | .unregister_device = vfio_iommu_type1_unregister_device, |
3141 | .dma_rw = vfio_iommu_type1_dma_rw, |
3142 | .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, |
3143 | }; |
3144 | |
3145 | static int __init vfio_iommu_type1_init(void) |
3146 | { |
3147 | return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); |
3148 | } |
3149 | |
3150 | static void __exit vfio_iommu_type1_cleanup(void) |
3151 | { |
3152 | vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1); |
3153 | } |
3154 | |
3155 | module_init(vfio_iommu_type1_init); |
3156 | module_exit(vfio_iommu_type1_cleanup); |
3157 | |
3158 | MODULE_VERSION(DRIVER_VERSION); |
3159 | MODULE_LICENSE("GPL v2"); |
3160 | MODULE_AUTHOR(DRIVER_AUTHOR); |
3161 | MODULE_DESCRIPTION(DRIVER_DESC); |
3162 |