// SPDX-License-Identifier: GPL-2.0
#include <linux/cred.h>
#include <linux/device.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/memfd.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/udmabuf.h>
#include <linux/vmalloc.h>
#include <linux/iosys-map.h>

static int list_limit = 1024;
module_param(list_limit, int, 0644);
MODULE_PARM_DESC(list_limit, "udmabuf_create_list->count limit. Default is 1024.");

static int size_limit_mb = 64;
module_param(size_limit_mb, int, 0644);
MODULE_PARM_DESC(size_limit_mb, "Max size of a dmabuf, in megabytes. Default is 64.");
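
/*
 * Example (userspace, illustrative sketch only, not part of this driver):
 * a udmabuf is created from a memfd that is sealed against shrinking and is
 * not write-sealed, then exported as a dma-buf fd via UDMABUF_CREATE on
 * /dev/udmabuf.  Error handling is omitted; "size", "memfd", "devfd" and
 * "buf_fd" are placeholder names.
 *
 *	int memfd = memfd_create("udmabuf-backing", MFD_ALLOW_SEALING);
 *	ftruncate(memfd, size);				// size must be page aligned
 *	fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);	// required by check_memfd_seals()
 *
 *	int devfd = open("/dev/udmabuf", O_RDWR);
 *	struct udmabuf_create create = {
 *		.memfd  = memfd,
 *		.flags  = UDMABUF_FLAGS_CLOEXEC,
 *		.offset = 0,				// page aligned offset into the memfd
 *		.size   = size,
 *	};
 *	int buf_fd = ioctl(devfd, UDMABUF_CREATE, &create);
 */
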
26 | |
27 | struct udmabuf { |
28 | pgoff_t pagecount; |
29 | struct folio **folios; |
30 | |
31 | /** |
32 | * Unlike folios, pinned_folios is only used for unpin. |
33 | * So, nr_pinned is not the same to pagecount, the pinned_folios |
34 | * only set each folio which already pinned when udmabuf_create. |
35 | * Note that, since a folio may be pinned multiple times, each folio |
36 | * can be added to pinned_folios multiple times, depending on how many |
37 | * times the folio has been pinned when create. |
38 | */ |
	pgoff_t nr_pinned;
	struct folio **pinned_folios;

	struct sg_table *sg;
	struct miscdevice *device;
	pgoff_t *offsets;
};
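
/*
 * Layout sketch (illustrative, assuming a 2 MiB hugetlb folio and 4 KiB
 * PAGE_SIZE): one pinned folio expands to 512 consecutive entries in
 * folios[]/offsets[], but to only a single entry in pinned_folios[]:
 *
 *	folios[0..511]   = the same struct folio *
 *	offsets[0..511]  = 0x0000, 0x1000, 0x2000, ..., 0x1ff000
 *	pinned_folios[0] = that folio (one unpin owed per pin taken)
 */
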
46 | |
47 | static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf) |
48 | { |
49 | struct vm_area_struct *vma = vmf->vma; |
50 | struct udmabuf *ubuf = vma->vm_private_data; |
51 | pgoff_t pgoff = vmf->pgoff; |
52 | unsigned long addr, pfn; |
53 | vm_fault_t ret; |
54 | |
55 | if (pgoff >= ubuf->pagecount) |
56 | return VM_FAULT_SIGBUS; |
57 | |
58 | pfn = folio_pfn(folio: ubuf->folios[pgoff]); |
59 | pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT; |
60 | |
61 | ret = vmf_insert_pfn(vma, addr: vmf->address, pfn); |
62 | if (ret & VM_FAULT_ERROR) |
63 | return ret; |
64 | |
65 | /* pre fault */ |
66 | pgoff = vma->vm_pgoff; |
67 | addr = vma->vm_start; |
68 | |
69 | for (; addr < vma->vm_end; pgoff++, addr += PAGE_SIZE) { |
70 | if (addr == vmf->address) |
71 | continue; |
72 | |
73 | if (WARN_ON(pgoff >= ubuf->pagecount)) |
74 | break; |
75 | |
76 | pfn = folio_pfn(folio: ubuf->folios[pgoff]); |
77 | pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT; |
78 | |
79 | /** |
80 | * If the below vmf_insert_pfn() fails, we do not return an |
81 | * error here during this pre-fault step. However, an error |
82 | * will be returned if the failure occurs when the addr is |
83 | * truly accessed. |
84 | */ |
85 | if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR) |
86 | break; |
87 | } |
88 | |
89 | return ret; |
90 | } |

static const struct vm_operations_struct udmabuf_vm_ops = {
	.fault = udmabuf_vm_fault,
};

static int mmap_udmabuf(struct dma_buf *buf, struct vm_area_struct *vma)
{
	struct udmabuf *ubuf = buf->priv;

	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
		return -EINVAL;

	vma->vm_ops = &udmabuf_vm_ops;
	vma->vm_private_data = ubuf;
	vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
	return 0;
}

static int vmap_udmabuf(struct dma_buf *buf, struct iosys_map *map)
{
	struct udmabuf *ubuf = buf->priv;
	unsigned long *pfns;
	void *vaddr;
	pgoff_t pg;

	dma_resv_assert_held(buf->resv);

	/*
	 * HVO (HugeTLB Vmemmap Optimization) may have freed the struct pages
	 * of tail pages, so map each folio into the vmalloc area by pfn
	 * rather than by page.
	 */
	pfns = kvmalloc_array(ubuf->pagecount, sizeof(*pfns), GFP_KERNEL);
	if (!pfns)
		return -ENOMEM;

	for (pg = 0; pg < ubuf->pagecount; pg++) {
		unsigned long pfn = folio_pfn(ubuf->folios[pg]);

		pfn += ubuf->offsets[pg] >> PAGE_SHIFT;
		pfns[pg] = pfn;
	}

	vaddr = vmap_pfn(pfns, ubuf->pagecount, PAGE_KERNEL);
	kvfree(pfns);
	if (!vaddr)
		return -EINVAL;

	iosys_map_set_vaddr(map, vaddr);
	return 0;
}

static void vunmap_udmabuf(struct dma_buf *buf, struct iosys_map *map)
{
	struct udmabuf *ubuf = buf->priv;

	dma_resv_assert_held(buf->resv);

	vm_unmap_ram(map->vaddr, ubuf->pagecount);
}

static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf,
				     enum dma_data_direction direction)
{
	struct udmabuf *ubuf = buf->priv;
	struct sg_table *sg;
	struct scatterlist *sgl;
	unsigned int i = 0;
	int ret;

	sg = kzalloc(sizeof(*sg), GFP_KERNEL);
	if (!sg)
		return ERR_PTR(-ENOMEM);

	ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
	if (ret < 0)
		goto err_alloc;

	for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)
		sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE,
			     ubuf->offsets[i]);

	ret = dma_map_sgtable(dev, sg, direction, 0);
	if (ret < 0)
		goto err_map;
	return sg;

err_map:
	sg_free_table(sg);
err_alloc:
	kfree(sg);
	return ERR_PTR(ret);
}

static void put_sg_table(struct device *dev, struct sg_table *sg,
			 enum dma_data_direction direction)
{
	dma_unmap_sgtable(dev, sg, direction, 0);
	sg_free_table(sg);
	kfree(sg);
}

static struct sg_table *map_udmabuf(struct dma_buf_attachment *at,
				    enum dma_data_direction direction)
{
	return get_sg_table(at->dev, at->dmabuf, direction);
}

static void unmap_udmabuf(struct dma_buf_attachment *at,
			  struct sg_table *sg,
			  enum dma_data_direction direction)
{
	return put_sg_table(at->dev, sg, direction);
}

static void unpin_all_folios(struct udmabuf *ubuf)
{
	pgoff_t i;

	for (i = 0; i < ubuf->nr_pinned; ++i)
		unpin_folio(ubuf->pinned_folios[i]);

	kvfree(ubuf->pinned_folios);
}

static __always_inline int init_udmabuf(struct udmabuf *ubuf, pgoff_t pgcnt)
{
	ubuf->folios = kvmalloc_array(pgcnt, sizeof(*ubuf->folios), GFP_KERNEL);
	if (!ubuf->folios)
		return -ENOMEM;

	ubuf->offsets = kvcalloc(pgcnt, sizeof(*ubuf->offsets), GFP_KERNEL);
	if (!ubuf->offsets)
		return -ENOMEM;

	ubuf->pinned_folios = kvmalloc_array(pgcnt,
					     sizeof(*ubuf->pinned_folios),
					     GFP_KERNEL);
	if (!ubuf->pinned_folios)
		return -ENOMEM;

	return 0;
}

static __always_inline void deinit_udmabuf(struct udmabuf *ubuf)
{
	unpin_all_folios(ubuf);
	kvfree(ubuf->offsets);
	kvfree(ubuf->folios);
}

static void release_udmabuf(struct dma_buf *buf)
{
	struct udmabuf *ubuf = buf->priv;
	struct device *dev = ubuf->device->this_device;

	if (ubuf->sg)
		put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL);

	deinit_udmabuf(ubuf);
	kfree(ubuf);
}

static int begin_cpu_udmabuf(struct dma_buf *buf,
			     enum dma_data_direction direction)
{
	struct udmabuf *ubuf = buf->priv;
	struct device *dev = ubuf->device->this_device;
	int ret = 0;

	if (!ubuf->sg) {
		ubuf->sg = get_sg_table(dev, buf, direction);
		if (IS_ERR(ubuf->sg)) {
			ret = PTR_ERR(ubuf->sg);
			ubuf->sg = NULL;
		}
	} else {
		dma_sync_sg_for_cpu(dev, ubuf->sg->sgl, ubuf->sg->nents,
				    direction);
	}

	return ret;
}

static int end_cpu_udmabuf(struct dma_buf *buf,
			   enum dma_data_direction direction)
{
	struct udmabuf *ubuf = buf->priv;
	struct device *dev = ubuf->device->this_device;

	if (!ubuf->sg)
		return -EINVAL;

	dma_sync_sg_for_device(dev, ubuf->sg->sgl, ubuf->sg->nents, direction);
	return 0;
}
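
/*
 * Example (userspace, illustrative sketch only): the begin/end_cpu_access
 * hooks above back the generic DMA_BUF_IOCTL_SYNC interface, so CPU access
 * to an mmap()ed udmabuf should be bracketed like this.  "buf_fd" and
 * "size" are placeholders; error handling is omitted.
 *
 *	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		       buf_fd, 0);
 *	struct dma_buf_sync sync = {
 *		.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW,
 *	};
 *	ioctl(buf_fd, DMA_BUF_IOCTL_SYNC, &sync);	// begin_cpu_access
 *	memset(p, 0, size);				// CPU touches the buffer
 *	sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
 *	ioctl(buf_fd, DMA_BUF_IOCTL_SYNC, &sync);	// end_cpu_access
 */
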

static const struct dma_buf_ops udmabuf_ops = {
	.map_dma_buf = map_udmabuf,
	.unmap_dma_buf = unmap_udmabuf,
	.release = release_udmabuf,
	.mmap = mmap_udmabuf,
	.vmap = vmap_udmabuf,
	.vunmap = vunmap_udmabuf,
	.begin_cpu_access = begin_cpu_udmabuf,
	.end_cpu_access = end_cpu_udmabuf,
};

#define SEALS_WANTED (F_SEAL_SHRINK)
#define SEALS_DENIED (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)

static int check_memfd_seals(struct file *memfd)
{
	int seals;

	if (!shmem_file(memfd) && !is_file_hugepages(memfd))
		return -EBADFD;

	seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
	if (seals == -EINVAL)
		return -EBADFD;

	if ((seals & SEALS_WANTED) != SEALS_WANTED ||
	    (seals & SEALS_DENIED) != 0)
		return -EINVAL;

	return 0;
}

static struct dma_buf *export_udmabuf(struct udmabuf *ubuf,
				      struct miscdevice *device)
{
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);

	ubuf->device = device;
	exp_info.ops = &udmabuf_ops;
	exp_info.size = ubuf->pagecount << PAGE_SHIFT;
	exp_info.priv = ubuf;
	exp_info.flags = O_RDWR;

	return dma_buf_export(&exp_info);
}

static long udmabuf_pin_folios(struct udmabuf *ubuf, struct file *memfd,
			       loff_t start, loff_t size, struct folio **folios)
{
	pgoff_t nr_pinned = ubuf->nr_pinned;
	pgoff_t upgcnt = ubuf->pagecount;
	u32 cur_folio, cur_pgcnt;
	pgoff_t pgoff, pgcnt;
	long nr_folios;
	loff_t end;

	pgcnt = size >> PAGE_SHIFT;
	end = start + (pgcnt << PAGE_SHIFT) - 1;
	nr_folios = memfd_pin_folios(memfd, start, end, folios, pgcnt, &pgoff);
	if (nr_folios <= 0)
		return nr_folios ? nr_folios : -EINVAL;

	cur_pgcnt = 0;
	for (cur_folio = 0; cur_folio < nr_folios; ++cur_folio) {
		pgoff_t subpgoff = pgoff;
		size_t fsize = folio_size(folios[cur_folio]);

		ubuf->pinned_folios[nr_pinned++] = folios[cur_folio];

		for (; subpgoff < fsize; subpgoff += PAGE_SIZE) {
			ubuf->folios[upgcnt] = folios[cur_folio];
			ubuf->offsets[upgcnt] = subpgoff;
			++upgcnt;

			if (++cur_pgcnt >= pgcnt)
				goto end;
		}

		/*
		 * Within a given range, only the first folio can start at a
		 * non-zero offset, which memfd_pin_folios() returns via
		 * pgoff; every subsequent folio in the range starts at
		 * offset 0.
		 */
		pgoff = 0;
	}
end:
	ubuf->pagecount = upgcnt;
	ubuf->nr_pinned = nr_pinned;
	return 0;
}

static long udmabuf_create(struct miscdevice *device,
			   struct udmabuf_create_list *head,
			   struct udmabuf_create_item *list)
{
	unsigned long max_nr_folios = 0;
	struct folio **folios = NULL;
	pgoff_t pgcnt = 0, pglimit;
	struct udmabuf *ubuf;
	struct dma_buf *dmabuf;
	long ret = -EINVAL;
	u32 i, flags;

	ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
	if (!ubuf)
		return -ENOMEM;

	pglimit = ((u64)size_limit_mb * 1024 * 1024) >> PAGE_SHIFT;
	for (i = 0; i < head->count; i++) {
		pgoff_t subpgcnt;

		if (!PAGE_ALIGNED(list[i].offset))
			goto err_noinit;
		if (!PAGE_ALIGNED(list[i].size))
			goto err_noinit;

		subpgcnt = list[i].size >> PAGE_SHIFT;
		pgcnt += subpgcnt;
		if (pgcnt > pglimit)
			goto err_noinit;

		max_nr_folios = max_t(unsigned long, subpgcnt, max_nr_folios);
	}

	if (!pgcnt)
		goto err_noinit;

	ret = init_udmabuf(ubuf, pgcnt);
	if (ret)
		goto err;

	folios = kvmalloc_array(max_nr_folios, sizeof(*folios), GFP_KERNEL);
	if (!folios) {
		ret = -ENOMEM;
		goto err;
	}

	for (i = 0; i < head->count; i++) {
		struct file *memfd = fget(list[i].memfd);

		if (!memfd) {
			ret = -EBADFD;
			goto err;
		}

		/*
		 * Take the inode lock to protect against concurrent
		 * memfd_add_seals(), which takes this lock in write mode.
		 */
		inode_lock_shared(file_inode(memfd));
		ret = check_memfd_seals(memfd);
		if (ret)
			goto out_unlock;

		ret = udmabuf_pin_folios(ubuf, memfd, list[i].offset,
					 list[i].size, folios);
out_unlock:
		inode_unlock_shared(file_inode(memfd));
		fput(memfd);
		if (ret)
			goto err;
	}

	flags = head->flags & UDMABUF_FLAGS_CLOEXEC ? O_CLOEXEC : 0;
	dmabuf = export_udmabuf(ubuf, device);
	if (IS_ERR(dmabuf)) {
		ret = PTR_ERR(dmabuf);
		goto err;
	}
	/*
	 * Ownership of ubuf is held by the dmabuf from here.
	 * If the following dma_buf_fd() fails, dma_buf_put() cleans up both
	 * the dmabuf and the ubuf (through udmabuf_ops.release).
	 */

	ret = dma_buf_fd(dmabuf, flags);
	if (ret < 0)
		dma_buf_put(dmabuf);

	kvfree(folios);
	return ret;

err:
	deinit_udmabuf(ubuf);
err_noinit:
	kfree(ubuf);
	kvfree(folios);
	return ret;
}

static long udmabuf_ioctl_create(struct file *filp, unsigned long arg)
{
	struct udmabuf_create create;
	struct udmabuf_create_list head;
	struct udmabuf_create_item list;

	if (copy_from_user(&create, (void __user *)arg,
			   sizeof(create)))
		return -EFAULT;

	head.flags = create.flags;
	head.count = 1;
	list.memfd = create.memfd;
	list.offset = create.offset;
	list.size = create.size;

	return udmabuf_create(filp->private_data, &head, &list);
}

static long udmabuf_ioctl_create_list(struct file *filp, unsigned long arg)
{
	struct udmabuf_create_list head;
	struct udmabuf_create_item *list;
	int ret = -EINVAL;
	u32 lsize;

	if (copy_from_user(&head, (void __user *)arg, sizeof(head)))
		return -EFAULT;
	if (head.count > list_limit)
		return -EINVAL;
	lsize = sizeof(struct udmabuf_create_item) * head.count;
	list = memdup_user((void __user *)(arg + sizeof(head)), lsize);
	if (IS_ERR(list))
		return PTR_ERR(list);

	ret = udmabuf_create(filp->private_data, &head, list);
	kfree(list);
	return ret;
}
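
/*
 * Example (userspace, illustrative sketch only): UDMABUF_CREATE_LIST takes a
 * struct udmabuf_create_list header followed immediately by "count" struct
 * udmabuf_create_item entries, so the request is typically built as one
 * contiguous allocation.  "devfd", "memfd_a"/"memfd_b" and the sizes are
 * placeholders; error handling is omitted.
 *
 *	size_t sz = sizeof(struct udmabuf_create_list) +
 *		    2 * sizeof(struct udmabuf_create_item);
 *	struct udmabuf_create_list *req = calloc(1, sz);
 *
 *	req->flags = UDMABUF_FLAGS_CLOEXEC;
 *	req->count = 2;
 *	req->list[0] = (struct udmabuf_create_item)
 *			{ .memfd = memfd_a, .offset = 0, .size = size_a };
 *	req->list[1] = (struct udmabuf_create_item)
 *			{ .memfd = memfd_b, .offset = 0, .size = size_b };
 *
 *	int buf_fd = ioctl(devfd, UDMABUF_CREATE_LIST, req);
 *	free(req);
 */
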

static long udmabuf_ioctl(struct file *filp, unsigned int ioctl,
			  unsigned long arg)
{
	long ret;

	switch (ioctl) {
	case UDMABUF_CREATE:
		ret = udmabuf_ioctl_create(filp, arg);
		break;
	case UDMABUF_CREATE_LIST:
		ret = udmabuf_ioctl_create_list(filp, arg);
		break;
	default:
		ret = -ENOTTY;
		break;
	}
	return ret;
}

static const struct file_operations udmabuf_fops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = udmabuf_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = udmabuf_ioctl,
#endif
};

static struct miscdevice udmabuf_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "udmabuf",
	.fops = &udmabuf_fops,
};

static int __init udmabuf_dev_init(void)
{
	int ret;

	ret = misc_register(&udmabuf_misc);
	if (ret < 0) {
		pr_err("Could not initialize udmabuf device\n");
		return ret;
	}

	ret = dma_coerce_mask_and_coherent(udmabuf_misc.this_device,
					   DMA_BIT_MASK(64));
	if (ret < 0) {
		pr_err("Could not setup DMA mask for udmabuf device\n");
		misc_deregister(&udmabuf_misc);
		return ret;
	}

	return 0;
}

static void __exit udmabuf_dev_exit(void)
{
	misc_deregister(&udmabuf_misc);
}

module_init(udmabuf_dev_init)
module_exit(udmabuf_dev_exit)

MODULE_AUTHOR("Gerd Hoffmann <kraxel@redhat.com>");
MODULE_LICENSE("GPL v2");