// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

static const struct io_mapped_ubuf dummy_ubuf = {
	/* set invalid range, so io_import_fixed() fails when it meets it */
	.ubuf = -1UL,
	.ubuf_end = 0,
};

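/*
 * Charge @nr_pages against the user's RLIMIT_MEMLOCK budget. The
 * cmpxchg loop updates ->locked_vm without taking a lock while still
 * rejecting any charge that would cross the rlimit.
 */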
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

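/*
 * Copy the iovec at @index from userspace, transparently converting
 * from the 32-bit compat layout when the ring was set up by a compat
 * task.
 */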
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

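/*
 * Basic sanity checks for a buffer about to be registered: a NULL base
 * is only valid for an empty (sparse) entry, the length must be
 * non-zero and within the arbitrary 1GB cap, and base + length must
 * not wrap the address space.
 */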
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != &dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

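/*
 * Drop the final reference to a resource: post the user-supplied tag
 * as an auxiliary CQE (if one was attached), then release the file or
 * buffer itself.
 */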
static void io_rsrc_put_work(struct io_rsrc_node *node)
{
	struct io_rsrc_put *prsrc = &node->item;

	if (prsrc->tag)
		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		io_rsrc_file_put(node->ctx, prsrc);
		break;
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(node->ctx, prsrc);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
		kfree(node);
}

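/*
 * Called when a node's refcount hits zero. Nodes are flushed from the
 * head of ->rsrc_ref_list in order, so resource puts complete in the
 * same sequence they were queued; stop at the first node that is still
 * referenced.
 */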
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);

		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);
	}
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}

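/* Allocate a node, preferring a recycled entry from the per-ctx cache. */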
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;
	struct io_cache_entry *entry;

	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (entry) {
		ref_node = container_of(entry, struct io_rsrc_node, cache);
	} else {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->ctx = ctx;
	ref_node->empty = 0;
	ref_node->refs = 1;
	return ref_node;
}

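/*
 * Wait until all in-flight references to @data have been dropped. The
 * current rsrc node is retired (marked empty and queued on
 * ->rsrc_ref_list) and replaced with a fresh one, then we sleep on
 * ->rsrc_quiesce_wq until the list drains. ->uring_lock is dropped
 * while waiting, so callers must re-validate anything it protects.
 */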
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *backup;
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	backup = io_rsrc_node_alloc(ctx);
	if (!backup)
		return -ENOMEM;
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		__set_current_state(TASK_RUNNING);
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

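/*
 * Allocate a two-level table for @size bytes: an array of pointers to
 * page-sized chunks. E.g. tags for 2048 entries need 2048 * 8 = 16KB,
 * which on 4KB pages becomes four separately allocated chunks. This
 * avoids high-order allocations for large registered sets.
 */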
static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->rsrc_type = type;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

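/*
 * Apply a file update: for each slot in the range, queue the old file
 * (if any) for deferred removal, then install the new descriptor. An
 * fd of -1 clears the slot, and IORING_REGISTER_FILES_SKIP leaves it
 * untouched.
 */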
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			err = io_queue_rsrc_removal(data, i,
						    io_slot_file(file_slot));
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
		}
		if (fd != -1) {
			struct file *file = fget(fd);

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

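/*
 * Buffer-side counterpart of __io_sqe_files_update(): re-register the
 * iovec for each slot in the range, queueing the previous buffer for
 * deferred unmap where one was present.
 */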
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != &dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

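/*
 * Hand @rsrc off to the current rsrc node for a deferred put, and swap
 * in a freshly allocated node so later removals don't pile onto one
 * that may already be referenced by in-flight requests. The tag moves
 * to the node so a CQE can be posted when the put finally happens.
 */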
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;
		return -ENOMEM;
	}

	node->item.rsrc = rsrc;
	node->type = data->rsrc_type;
	node->item.tag = *tag_slot;
	*tag_slot = 0;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->destructor = io_uring_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}

static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#endif
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;

	if (likely(!io_file_need_scm(file)))
		fput(file);
	else
		io_rsrc_file_scm_put(ctx, file);
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is only done at registration time. And we
 * do cache the last compound head, so generally we'll only do a full search
 * if we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

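/*
 * Work out how many pages to charge for this buffer. Normal pages count
 * one each; for a compound (huge) page the full head page size is
 * charged once, no matter how many of its tail pages appear in this
 * array or in previously registered buffers.
 */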
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

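/*
 * Pin the user pages backing [ubuf, ubuf + len) with FOLL_LONGTERM and
 * return the page array, storing the page count in *npages. A partial
 * pin is unwound and reported as -EFAULT.
 */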
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct page **pages = NULL;
	int ret;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;
	WARN_ON(!nr_pages);

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	mmap_read_lock(current->mm);
	ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
	mmap_read_unlock(current->mm);

	/* success, mapped all pages */
	if (ret == nr_pages) {
		*npages = nr_pages;
		return pages;
	}

	/* partial map, or didn't map anything */
	if (ret >= 0) {
		/* if we did partial map, release any pages we did get */
		if (ret)
			unpin_user_pages(pages, ret);
		ret = -EFAULT;
	}
	kvfree(pages);
	return ERR_PTR(ret);
}

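/*
 * Pin and map a single user buffer into an io_mapped_ubuf. Pages that
 * all sit in one folio (e.g. a huge page) are coalesced into a single
 * bvec entry. A NULL iov_base installs the dummy buffer for a sparse
 * slot.
 */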
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			/*
			 * Pages must be consecutive and on the same folio for
			 * this to work
			 */
			if (page_folio(pages[i]) != folio ||
			    pages[i] != pages[i - 1] + 1) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

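/*
 * Set up @iter over the registered buffer @imu for the user range
 * [buf_addr, buf_addr + len). E.g. with a page-aligned buffer
 * registered at imu->ubuf = 0x10000 and a request for
 * buf_addr = 0x13000, offset works out to 0x3000 and, with 4KB bvecs,
 * the iterator starts three segments in with no intra-page offset.
 */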
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or is the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}