rsrc.c source code [linux/io_uring/rsrc.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/kernel.h>
3	#include <linux/errno.h>
4	#include <linux/fs.h>
5	#include <linux/file.h>
6	#include <linux/mm.h>
7	#include <linux/slab.h>
8	#include <linux/nospec.h>
9	#include <linux/hugetlb.h>
10	#include <linux/compat.h>
11	#include <linux/io_uring.h>
12
13	#include <uapi/linux/io_uring.h>
14
15	#include "io_uring.h"
16	#include "openclose.h"
17	#include "rsrc.h"
18
19	struct io_rsrc_update {
20	struct file *file;
21	u64 arg;
22	u32 nr_args;
23	u32 offset;
24	};
25
/* Forward declarations for helpers defined later in this file. */
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);
31
/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)
35
36	static const struct io_mapped_ubuf dummy_ubuf = {
37	/ set invalid range, so io_import_fixed() fails meeting it /
38	.ubuf = -`1UL`,
39	.ubuf_end = `0`,
40	};
41
42	int __io_account_mem(struct user_struct user, unsigned* long nr_pages)
43	{
44	unsigned long page_limit, cur_pages, new_pages;
45
46	if (!nr_pages)
47	return `0`;
48
49	/ Don't allow more pages than we can safely lock /
50	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
51
52	cur_pages = atomic_long_read(v: &user->locked_vm);
53	do {
54	new_pages = cur_pages + nr_pages;
55	if (new_pages > page_limit)
56	return -ENOMEM;
57	} while (!atomic_long_try_cmpxchg(v: &user->locked_vm,
58	old: &cur_pages, new: new_pages));
59	return `0`;
60	}
61
62	static void io_unaccount_mem(struct io_ring_ctx ctx, unsigned* long nr_pages)
63	{
64	if (ctx->user)
65	__io_unaccount_mem(user: ctx->user, nr_pages);
66
67	if (ctx->mm_account)
68	atomic64_sub(i: nr_pages, v: &ctx->mm_account->pinned_vm);
69	}
70
71	static int io_account_mem(struct io_ring_ctx ctx, unsigned* long nr_pages)
72	{
73	int ret;
74
75	if (ctx->user) {
76	ret = __io_account_mem(user: ctx->user, nr_pages);
77	if (ret)
78	return ret;
79	}
80
81	if (ctx->mm_account)
82	atomic64_add(i: nr_pages, v: &ctx->mm_account->pinned_vm);
83
84	return `0`;
85	}
86
87	static int io_copy_iov(struct io_ring_ctx ctx, struct* iovec *dst,
88	void __user arg, unsigned* index)
89	{
90	struct iovec __user *src;
91
92	#ifdef CONFIG_COMPAT
93	if (ctx->compat) {
94	struct compat_iovec __user *ciovs;
95	struct compat_iovec ciov;
96
97	ciovs = (struct compat_iovec __user *) arg;
98	if (copy_from_user(to: &ciov, from: &ciovs[index], n: sizeof(ciov)))
99	return -EFAULT;
100
101	dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
102	dst->iov_len = ciov.iov_len;
103	return `0`;
104	}
105	#endif
106	src = (struct iovec __user *) arg;
107	if (copy_from_user(to: dst, from: &src[index], n: sizeof(*dst)))
108	return -EFAULT;
109	return `0`;
110	}
111
112	static int io_buffer_validate(struct iovec *iov)
113	{
114	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - `1`);
115
116	/*
117	* Don't impose further limits on the size and buffer
118	* constraints here, we'll -EINVAL later when IO is
119	* submitted if they are wrong.
120	*/
121	if (!iov->iov_base)
122	return iov->iov_len ? -EFAULT : `0`;
123	if (!iov->iov_len)
124	return -EFAULT;
125
126	/ arbitrary limit, but we need something /
127	if (iov->iov_len > SZ_1G)
128	return -EFAULT;
129
130	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
131	return -EOVERFLOW;
132
133	return `0`;
134	}
135
136	static void io_buffer_unmap(struct io_ring_ctx ctx, struct* io_mapped_ubuf **slot)
137	{
138	struct io_mapped_ubuf imu = slot;
139	unsigned int i;
140
141	if (imu != &dummy_ubuf) {
142	for (i = `0`; i < imu->nr_bvecs; i++)
143	unpin_user_page(page: imu->bvec[i].bv_page);
144	if (imu->acct_pages)
145	io_unaccount_mem(ctx, nr_pages: imu->acct_pages);
146	kvfree(addr: imu);
147	}
148	*slot = NULL;
149	}
150
151	static void io_rsrc_put_work(struct io_rsrc_node *node)
152	{
153	struct io_rsrc_put *prsrc = &node->item;
154
155	if (prsrc->tag)
156	io_post_aux_cqe(ctx: node->ctx, user_data: prsrc->tag, res: `0`, cflags: `0`);
157
158	switch (node->type) {
159	case IORING_RSRC_FILE:
160	io_rsrc_file_put(ctx: node->ctx, prsrc);
161	break;
162	case IORING_RSRC_BUFFER:
163	io_rsrc_buf_put(ctx: node->ctx, prsrc);
164	break;
165	default:
166	WARN_ON_ONCE(`1`);
167	break;
168	}
169	}
170
171	void io_rsrc_node_destroy(struct io_ring_ctx ctx, struct* io_rsrc_node *node)
172	{
173	if (!io_alloc_cache_put(cache: &ctx->rsrc_node_cache, entry: &node->cache))
174	kfree(objp: node);
175	}
176
177	void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
178	__must_hold(&node->ctx->uring_lock)
179	{
180	struct io_ring_ctx *ctx = node->ctx;
181
182	while (!list_empty(head: &ctx->rsrc_ref_list)) {
183	node = list_first_entry(&ctx->rsrc_ref_list,
184	struct io_rsrc_node, node);
185	/ recycle ref nodes in order /
186	if (node->refs)
187	break;
188	list_del(entry: &node->node);
189
190	if (likely(!node->empty))
191	io_rsrc_put_work(node);
192	io_rsrc_node_destroy(ctx, node);
193	}
194	if (list_empty(head: &ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
195	wake_up_all(&ctx->rsrc_quiesce_wq);
196	}
197
198	struct io_rsrc_node io_rsrc_node_alloc(struct* io_ring_ctx *ctx)
199	{
200	struct io_rsrc_node *ref_node;
201	struct io_cache_entry *entry;
202
203	entry = io_alloc_cache_get(cache: &ctx->rsrc_node_cache);
204	if (entry) {
205	ref_node = container_of(entry, struct io_rsrc_node, cache);
206	} else {
207	ref_node = kzalloc(size: sizeof(*ref_node), GFP_KERNEL);
208	if (!ref_node)
209	return NULL;
210	}
211
212	ref_node->ctx = ctx;
213	ref_node->empty = `0`;
214	ref_node->refs = `1`;
215	return ref_node;
216	}
217
218	__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
219	struct io_ring_ctx *ctx)
220	{
221	struct io_rsrc_node *backup;
222	DEFINE_WAIT(we);
223	int ret;
224
225	/ As We may drop ->uring_lock, other task may have started quiesce /
226	if (data->quiesce)
227	return -ENXIO;
228
229	backup = io_rsrc_node_alloc(ctx);
230	if (!backup)
231	return -ENOMEM;
232	ctx->rsrc_node->empty = true;
233	ctx->rsrc_node->type = -`1`;
234	list_add_tail(new: &ctx->rsrc_node->node, head: &ctx->rsrc_ref_list);
235	io_put_rsrc_node(ctx, node: ctx->rsrc_node);
236	ctx->rsrc_node = backup;
237
238	if (list_empty(head: &ctx->rsrc_ref_list))
239	return `0`;
240
241	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
242	atomic_set(v: &ctx->cq_wait_nr, i: `1`);
243	smp_mb();
244	}
245
246	ctx->rsrc_quiesce++;
247	data->quiesce = true;
248	do {
249	prepare_to_wait(wq_head: &ctx->rsrc_quiesce_wq, wq_entry: &we, TASK_INTERRUPTIBLE);
250	mutex_unlock(lock: &ctx->uring_lock);
251
252	ret = io_run_task_work_sig(ctx);
253	if (ret < `0`) {
254	mutex_lock(&ctx->uring_lock);
255	if (list_empty(head: &ctx->rsrc_ref_list))
256	ret = `0`;
257	break;
258	}
259
260	schedule();
261	__set_current_state(TASK_RUNNING);
262	mutex_lock(&ctx->uring_lock);
263	ret = `0`;
264	} while (!list_empty(head: &ctx->rsrc_ref_list));
265
266	finish_wait(wq_head: &ctx->rsrc_quiesce_wq, wq_entry: &we);
267	data->quiesce = false;
268	ctx->rsrc_quiesce--;
269
270	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
271	atomic_set(v: &ctx->cq_wait_nr, i: `0`);
272	smp_mb();
273	}
274	return ret;
275	}
276
277	static void io_free_page_table(void **table, size_t size)
278	{
279	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
280
281	for (i = `0`; i < nr_tables; i++)
282	kfree(objp: table[i]);
283	kfree(objp: table);
284	}
285
286	static void io_rsrc_data_free(struct io_rsrc_data *data)
287	{
288	size_t size = data->nr * sizeof(data->tags[`0`][`0`]);
289
290	if (data->tags)
291	io_free_page_table(table: (void **)data->tags, size);
292	kfree(objp: data);
293	}
294
295	static __cold void **io_alloc_page_table(size_t size)
296	{
297	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
298	size_t init_size = size;
299	void **table;
300
301	table = kcalloc(n: nr_tables, size: sizeof(*table), GFP_KERNEL_ACCOUNT);
302	if (!table)
303	return NULL;
304
305	for (i = `0`; i < nr_tables; i++) {
306	unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
307
308	table[i] = kzalloc(size: this_size, GFP_KERNEL_ACCOUNT);
309	if (!table[i]) {
310	io_free_page_table(table, size: init_size);
311	return NULL;
312	}
313	size -= this_size;
314	}
315	return table;
316	}
317
318	__cold static int io_rsrc_data_alloc(struct io_ring_ctx ctx, int* type,
319	u64 __user *utags,
320	unsigned nr, struct io_rsrc_data **pdata)
321	{
322	struct io_rsrc_data *data;
323	int ret = `0`;
324	unsigned i;
325
326	data = kzalloc(size: sizeof(*data), GFP_KERNEL);
327	if (!data)
328	return -ENOMEM;
329	data->tags = (u64 *)io_alloc_page_table(size: nr sizeof(data->tags[`0`][`0`]));
330	if (!data->tags) {
331	kfree(objp: data);
332	return -ENOMEM;
333	}
334
335	data->nr = nr;
336	data->ctx = ctx;
337	data->rsrc_type = type;
338	if (utags) {
339	ret = -EFAULT;
340	for (i = `0`; i < nr; i++) {
341	u64 *tag_slot = io_get_tag_slot(data, idx: i);
342
343	if (copy_from_user(to: tag_slot, from: &utags[i],
344	n: sizeof(*tag_slot)))
345	goto fail;
346	}
347	}
348	*pdata = data;
349	return `0`;
350	fail:
351	io_rsrc_data_free(data);
352	return ret;
353	}
354
355	static int __io_sqe_files_update(struct io_ring_ctx *ctx,
356	struct io_uring_rsrc_update2 *up,
357	unsigned nr_args)
358	{
359	u64 __user *tags = u64_to_user_ptr(up->tags);
360	__s32 __user *fds = u64_to_user_ptr(up->data);
361	struct io_rsrc_data *data = ctx->file_data;
362	struct io_fixed_file *file_slot;
363	int fd, i, err = `0`;
364	unsigned int done;
365
366	if (!ctx->file_data)
367	return -ENXIO;
368	if (up->offset + nr_args > ctx->nr_user_files)
369	return -EINVAL;
370
371	for (done = `0`; done < nr_args; done++) {
372	u64 tag = `0`;
373
374	if ((tags && copy_from_user(to: &tag, from: &tags[done], n: sizeof(tag))) \|\|
375	copy_from_user(to: &fd, from: &fds[done], n: sizeof(fd))) {
376	err = -EFAULT;
377	break;
378	}
379	if ((fd == IORING_REGISTER_FILES_SKIP \|\| fd == -`1`) && tag) {
380	err = -EINVAL;
381	break;
382	}
383	if (fd == IORING_REGISTER_FILES_SKIP)
384	continue;
385
386	i = array_index_nospec(up->offset + done, ctx->nr_user_files);
387	file_slot = io_fixed_file_slot(table: &ctx->file_table, i);
388
389	if (file_slot->file_ptr) {
390	err = io_queue_rsrc_removal(data, idx: i,
391	rsrc: io_slot_file(slot: file_slot));
392	if (err)
393	break;
394	file_slot->file_ptr = `0`;
395	io_file_bitmap_clear(table: &ctx->file_table, bit: i);
396	}
397	if (fd != -`1`) {
398	struct file *file = fget(fd);
399
400	if (!file) {
401	err = -EBADF;
402	break;
403	}
404	/*
405	* Don't allow io_uring instances to be registered. If
406	* UNIX isn't enabled, then this causes a reference
407	* cycle and this instance can never get freed. If UNIX
408	* is enabled we'll handle it just fine, but there's
409	* still no point in allowing a ring fd as it doesn't
410	* support regular read/write anyway.
411	*/
412	if (io_is_uring_fops(file)) {
413	fput(file);
414	err = -EBADF;
415	break;
416	}
417	err = io_scm_file_account(ctx, file);
418	if (err) {
419	fput(file);
420	break;
421	}
422	*io_get_tag_slot(data, idx: i) = tag;
423	io_fixed_file_set(file_slot, file);
424	io_file_bitmap_set(table: &ctx->file_table, bit: i);
425	}
426	}
427	return done ? done : err;
428	}
429
430	static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
431	struct io_uring_rsrc_update2 *up,
432	unsigned int nr_args)
433	{
434	u64 __user *tags = u64_to_user_ptr(up->tags);
435	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
436	struct page *last_hpage = NULL;
437	__u32 done;
438	int i, err;
439
440	if (!ctx->buf_data)
441	return -ENXIO;
442	if (up->offset + nr_args > ctx->nr_user_bufs)
443	return -EINVAL;
444
445	for (done = `0`; done < nr_args; done++) {
446	struct io_mapped_ubuf *imu;
447	u64 tag = `0`;
448
449	err = io_copy_iov(ctx, dst: &iov, arg: iovs, index: done);
450	if (err)
451	break;
452	if (tags && copy_from_user(to: &tag, from: &tags[done], n: sizeof(tag))) {
453	err = -EFAULT;
454	break;
455	}
456	err = io_buffer_validate(iov: &iov);
457	if (err)
458	break;
459	if (!iov.iov_base && tag) {
460	err = -EINVAL;
461	break;
462	}
463	err = io_sqe_buffer_register(ctx, iov: &iov, pimu: &imu, last_hpage: &last_hpage);
464	if (err)
465	break;
466
467	i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
468	if (ctx->user_bufs[i] != &dummy_ubuf) {
469	err = io_queue_rsrc_removal(data: ctx->buf_data, idx: i,
470	rsrc: ctx->user_bufs[i]);
471	if (unlikely(err)) {
472	io_buffer_unmap(ctx, slot: &imu);
473	break;
474	}
475	ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
476	}
477
478	ctx->user_bufs[i] = imu;
479	*io_get_tag_slot(data: ctx->buf_data, idx: i) = tag;
480	}
481	return done ? done : err;
482	}
483
484	static int __io_register_rsrc_update(struct io_ring_ctx ctx, unsigned* type,
485	struct io_uring_rsrc_update2 *up,
486	unsigned nr_args)
487	{
488	__u32 tmp;
489
490	lockdep_assert_held(&ctx->uring_lock);
491
492	if (check_add_overflow(up->offset, nr_args, &tmp))
493	return -EOVERFLOW;
494
495	switch (type) {
496	case IORING_RSRC_FILE:
497	return __io_sqe_files_update(ctx, up, nr_args);
498	case IORING_RSRC_BUFFER:
499	return __io_sqe_buffers_update(ctx, up, nr_args);
500	}
501	return -EINVAL;
502	}
503
504	int io_register_files_update(struct io_ring_ctx ctx, void* __user *arg,
505	unsigned nr_args)
506	{
507	struct io_uring_rsrc_update2 up;
508
509	if (!nr_args)
510	return -EINVAL;
511	memset(&up, `0`, sizeof(up));
512	if (copy_from_user(to: &up, from: arg, n: sizeof(struct io_uring_rsrc_update)))
513	return -EFAULT;
514	if (up.resv \|\| up.resv2)
515	return -EINVAL;
516	return __io_register_rsrc_update(ctx, type: IORING_RSRC_FILE, up: &up, nr_args);
517	}
518
519	int io_register_rsrc_update(struct io_ring_ctx ctx, void* __user *arg,
520	unsigned size, unsigned type)
521	{
522	struct io_uring_rsrc_update2 up;
523
524	if (size != sizeof(up))
525	return -EINVAL;
526	if (copy_from_user(to: &up, from: arg, n: sizeof(up)))
527	return -EFAULT;
528	if (!up.nr \|\| up.resv \|\| up.resv2)
529	return -EINVAL;
530	return __io_register_rsrc_update(ctx, type, up: &up, nr_args: up.nr);
531	}
532
533	__cold int io_register_rsrc(struct io_ring_ctx ctx, void* __user *arg,
534	unsigned int size, unsigned int type)
535	{
536	struct io_uring_rsrc_register rr;
537
538	/ keep it extendible /
539	if (size != sizeof(rr))
540	return -EINVAL;
541
542	memset(&rr, `0`, sizeof(rr));
543	if (copy_from_user(to: &rr, from: arg, n: size))
544	return -EFAULT;
545	if (!rr.nr \|\| rr.resv2)
546	return -EINVAL;
547	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
548	return -EINVAL;
549
550	switch (type) {
551	case IORING_RSRC_FILE:
552	if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
553	break;
554	return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
555	nr_args: rr.nr, u64_to_user_ptr(rr.tags));
556	case IORING_RSRC_BUFFER:
557	if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
558	break;
559	return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
560	nr_args: rr.nr, u64_to_user_ptr(rr.tags));
561	}
562	return -EINVAL;
563	}
564
565	int io_files_update_prep(struct io_kiocb req, const* struct io_uring_sqe *sqe)
566	{
567	struct io_rsrc_update up = io_kiocb_to_cmd(req, struct* io_rsrc_update);
568
569	if (unlikely(req->flags & (REQ_F_FIXED_FILE \| REQ_F_BUFFER_SELECT)))
570	return -EINVAL;
571	if (sqe->rw_flags \|\| sqe->splice_fd_in)
572	return -EINVAL;
573
574	up->offset = READ_ONCE(sqe->off);
575	up->nr_args = READ_ONCE(sqe->len);
576	if (!up->nr_args)
577	return -EINVAL;
578	up->arg = READ_ONCE(sqe->addr);
579	return `0`;
580	}
581
582	static int io_files_update_with_index_alloc(struct io_kiocb *req,
583	unsigned int issue_flags)
584	{
585	struct io_rsrc_update up = io_kiocb_to_cmd(req, struct* io_rsrc_update);
586	__s32 __user *fds = u64_to_user_ptr(up->arg);
587	unsigned int done;
588	struct file *file;
589	int ret, fd;
590
591	if (!req->ctx->file_data)
592	return -ENXIO;
593
594	for (done = `0`; done < up->nr_args; done++) {
595	if (copy_from_user(to: &fd, from: &fds[done], n: sizeof(fd))) {
596	ret = -EFAULT;
597	break;
598	}
599
600	file = fget(fd);
601	if (!file) {
602	ret = -EBADF;
603	break;
604	}
605	ret = io_fixed_fd_install(req, issue_flags, file,
606	IORING_FILE_INDEX_ALLOC);
607	if (ret < `0`)
608	break;
609	if (copy_to_user(to: &fds[done], from: &ret, n: sizeof(ret))) {
610	__io_close_fixed(ctx: req->ctx, issue_flags, offset: ret);
611	ret = -EFAULT;
612	break;
613	}
614	}
615
616	if (done)
617	return done;
618	return ret;
619	}
620
621	int io_files_update(struct io_kiocb req, unsigned* int issue_flags)
622	{
623	struct io_rsrc_update up = io_kiocb_to_cmd(req, struct* io_rsrc_update);
624	struct io_ring_ctx *ctx = req->ctx;
625	struct io_uring_rsrc_update2 up2;
626	int ret;
627
628	up2.offset = up->offset;
629	up2.data = up->arg;
630	up2.nr = `0`;
631	up2.tags = `0`;
632	up2.resv = `0`;
633	up2.resv2 = `0`;
634
635	if (up->offset == IORING_FILE_INDEX_ALLOC) {
636	ret = io_files_update_with_index_alloc(req, issue_flags);
637	} else {
638	io_ring_submit_lock(ctx, issue_flags);
639	ret = __io_register_rsrc_update(ctx, type: IORING_RSRC_FILE,
640	up: &up2, nr_args: up->nr_args);
641	io_ring_submit_unlock(ctx, issue_flags);
642	}
643
644	if (ret < `0`)
645	req_set_fail(req);
646	io_req_set_res(req, res: ret, cflags: `0`);
647	return IOU_OK;
648	}
649
650	int io_queue_rsrc_removal(struct io_rsrc_data data, unsigned* idx, void *rsrc)
651	{
652	struct io_ring_ctx *ctx = data->ctx;
653	struct io_rsrc_node *node = ctx->rsrc_node;
654	u64 *tag_slot = io_get_tag_slot(data, idx);
655
656	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
657	if (unlikely(!ctx->rsrc_node)) {
658	ctx->rsrc_node = node;
659	return -ENOMEM;
660	}
661
662	node->item.rsrc = rsrc;
663	node->type = data->rsrc_type;
664	node->item.tag = *tag_slot;
665	*tag_slot = `0`;
666	list_add_tail(new: &node->node, head: &ctx->rsrc_ref_list);
667	io_put_rsrc_node(ctx, node);
668	return `0`;
669	}
670
671	void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
672	{
673	int i;
674
675	for (i = `0`; i < ctx->nr_user_files; i++) {
676	struct file *file = io_file_from_index(table: &ctx->file_table, index: i);
677
678	/ skip scm accounted files, they'll be freed by ->ring_sock /
679	if (!file \|\| io_file_need_scm(filp: file))
680	continue;
681	io_file_bitmap_clear(table: &ctx->file_table, bit: i);
682	fput(file);
683	}
684
685	#if defined(CONFIG_UNIX)
686	if (ctx->ring_sock) {
687	struct sock *sock = ctx->ring_sock->sk;
688	struct sk_buff *skb;
689
690	while ((skb = skb_dequeue(list: &sock->sk_receive_queue)) != NULL)
691	kfree_skb(skb);
692	}
693	#endif
694	io_free_file_tables(table: &ctx->file_table);
695	io_file_table_set_alloc_range(ctx, off: `0`, len: `0`);
696	io_rsrc_data_free(data: ctx->file_data);
697	ctx->file_data = NULL;
698	ctx->nr_user_files = `0`;
699	}
700
701	int io_sqe_files_unregister(struct io_ring_ctx *ctx)
702	{
703	unsigned nr = ctx->nr_user_files;
704	int ret;
705
706	if (!ctx->file_data)
707	return -ENXIO;
708
709	/*
710	* Quiesce may unlock ->uring_lock, and while it's not held
711	* prevent new requests using the table.
712	*/
713	ctx->nr_user_files = `0`;
714	ret = io_rsrc_ref_quiesce(data: ctx->file_data, ctx);
715	ctx->nr_user_files = nr;
716	if (!ret)
717	__io_sqe_files_unregister(ctx);
718	return ret;
719	}
720
/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 *
 * Returns 0 on success (always, without CONFIG_UNIX or for files that
 * need no SCM accounting) or -ENOMEM if a new skb or fp list cannot be
 * allocated.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->destructor = io_uring_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	/* the fp list holds its own reference ... */
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	/* ... so the reference passed in by the caller is dropped here */
	fput(file);
#endif
	return 0;
}
781
782	static __cold void io_rsrc_file_scm_put(struct io_ring_ctx ctx, struct* file *file)
783	{
784	#if defined(CONFIG_UNIX)
785	struct sock *sock = ctx->ring_sock->sk;
786	struct sk_buff_head list, *head = &sock->sk_receive_queue;
787	struct sk_buff *skb;
788	int i;
789
790	__skb_queue_head_init(list: &list);
791
792	/*
793	* Find the skb that holds this file in its SCM_RIGHTS. When found,
794	* remove this entry and rearrange the file array.
795	*/
796	skb = skb_dequeue(list: head);
797	while (skb) {
798	struct scm_fp_list *fp;
799
800	fp = UNIXCB(skb).fp;
801	for (i = `0`; i < fp->count; i++) {
802	int left;
803
804	if (fp->fp[i] != file)
805	continue;
806
807	unix_notinflight(user: fp->user, fp: fp->fp[i]);
808	left = fp->count - `1` - i;
809	if (left) {
810	memmove(&fp->fp[i], &fp->fp[i + `1`],
811	left * sizeof(struct file *));
812	}
813	fp->count--;
814	if (!fp->count) {
815	kfree_skb(skb);
816	skb = NULL;
817	} else {
818	__skb_queue_tail(list: &list, newsk: skb);
819	}
820	fput(file);
821	file = NULL;
822	break;
823	}
824
825	if (!file)
826	break;
827
828	__skb_queue_tail(list: &list, newsk: skb);
829
830	skb = skb_dequeue(list: head);
831	}
832
833	if (skb_peek(list_: &list)) {
834	spin_lock_irq(lock: &head->lock);
835	while ((skb = __skb_dequeue(list: &list)) != NULL)
836	__skb_queue_tail(list: head, newsk: skb);
837	spin_unlock_irq(lock: &head->lock);
838	}
839	#endif
840	}
841
842	static void io_rsrc_file_put(struct io_ring_ctx ctx, struct* io_rsrc_put *prsrc)
843	{
844	struct file *file = prsrc->file;
845
846	if (likely(!io_file_need_scm(file)))
847	fput(file);
848	else
849	io_rsrc_file_scm_put(ctx, file);
850	}
851
852	int io_sqe_files_register(struct io_ring_ctx ctx, void* __user *arg,
853	unsigned nr_args, u64 __user *tags)
854	{
855	__s32 __user fds = (__s32 __user ) arg;
856	struct file *file;
857	int fd, ret;
858	unsigned i;
859
860	if (ctx->file_data)
861	return -EBUSY;
862	if (!nr_args)
863	return -EINVAL;
864	if (nr_args > IORING_MAX_FIXED_FILES)
865	return -EMFILE;
866	if (nr_args > rlimit(RLIMIT_NOFILE))
867	return -EMFILE;
868	ret = io_rsrc_data_alloc(ctx, type: IORING_RSRC_FILE, utags: tags, nr: nr_args,
869	pdata: &ctx->file_data);
870	if (ret)
871	return ret;
872
873	if (!io_alloc_file_tables(table: &ctx->file_table, nr_files: nr_args)) {
874	io_rsrc_data_free(data: ctx->file_data);
875	ctx->file_data = NULL;
876	return -ENOMEM;
877	}
878
879	for (i = `0`; i < nr_args; i++, ctx->nr_user_files++) {
880	struct io_fixed_file *file_slot;
881
882	if (fds && copy_from_user(to: &fd, from: &fds[i], n: sizeof(fd))) {
883	ret = -EFAULT;
884	goto fail;
885	}
886	/ allow sparse sets /
887	if (!fds \|\| fd == -`1`) {
888	ret = -EINVAL;
889	if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
890	goto fail;
891	continue;
892	}
893
894	file = fget(fd);
895	ret = -EBADF;
896	if (unlikely(!file))
897	goto fail;
898
899	/*
900	* Don't allow io_uring instances to be registered. If UNIX
901	* isn't enabled, then this causes a reference cycle and this
902	* instance can never get freed. If UNIX is enabled we'll
903	* handle it just fine, but there's still no point in allowing
904	* a ring fd as it doesn't support regular read/write anyway.
905	*/
906	if (io_is_uring_fops(file)) {
907	fput(file);
908	goto fail;
909	}
910	ret = io_scm_file_account(ctx, file);
911	if (ret) {
912	fput(file);
913	goto fail;
914	}
915	file_slot = io_fixed_file_slot(table: &ctx->file_table, i);
916	io_fixed_file_set(file_slot, file);
917	io_file_bitmap_set(table: &ctx->file_table, bit: i);
918	}
919
920	/ default it to the whole table /
921	io_file_table_set_alloc_range(ctx, off: `0`, len: ctx->nr_user_files);
922	return `0`;
923	fail:
924	__io_sqe_files_unregister(ctx);
925	return ret;
926	}
927
928	static void io_rsrc_buf_put(struct io_ring_ctx ctx, struct* io_rsrc_put *prsrc)
929	{
930	io_buffer_unmap(ctx, slot: &prsrc->buf);
931	prsrc->buf = NULL;
932	}
933
934	void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
935	{
936	unsigned int i;
937
938	for (i = `0`; i < ctx->nr_user_bufs; i++)
939	io_buffer_unmap(ctx, slot: &ctx->user_bufs[i]);
940	kfree(objp: ctx->user_bufs);
941	io_rsrc_data_free(data: ctx->buf_data);
942	ctx->user_bufs = NULL;
943	ctx->buf_data = NULL;
944	ctx->nr_user_bufs = `0`;
945	}
946
947	int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
948	{
949	unsigned nr = ctx->nr_user_bufs;
950	int ret;
951
952	if (!ctx->buf_data)
953	return -ENXIO;
954
955	/*
956	* Quiesce may unlock ->uring_lock, and while it's not held
957	* prevent new requests using the table.
958	*/
959	ctx->nr_user_bufs = `0`;
960	ret = io_rsrc_ref_quiesce(data: ctx->buf_data, ctx);
961	ctx->nr_user_bufs = nr;
962	if (!ret)
963	__io_sqe_buffers_unregister(ctx);
964	return ret;
965	}
966
967	/*
968	* Not super efficient, but this is just a registration time. And we do cache
969	* the last compound head, so generally we'll only do a full search if we don't
970	* match that one.
971	*
972	* We check if the given compound head page has already been accounted, to
973	* avoid double accounting it. This allows us to account the full size of the
974	* page, not just the constituent pages of a huge page.
975	*/
976	static bool headpage_already_acct(struct io_ring_ctx ctx, struct* page **pages,
977	int nr_pages, struct page *hpage)
978	{
979	int i, j;
980
981	/ check current page array /
982	for (i = `0`; i < nr_pages; i++) {
983	if (!PageCompound(page: pages[i]))
984	continue;
985	if (compound_head(pages[i]) == hpage)
986	return true;
987	}
988
989	/ check previously registered pages /
990	for (i = `0`; i < ctx->nr_user_bufs; i++) {
991	struct io_mapped_ubuf *imu = ctx->user_bufs[i];
992
993	for (j = `0`; j < imu->nr_bvecs; j++) {
994	if (!PageCompound(page: imu->bvec[j].bv_page))
995	continue;
996	if (compound_head(imu->bvec[j].bv_page) == hpage)
997	return true;
998	}
999	}
1000
1001	return false;
1002	}
1003
1004	static int io_buffer_account_pin(struct io_ring_ctx ctx, struct* page **pages,
1005	int nr_pages, struct io_mapped_ubuf *imu,
1006	struct page **last_hpage)
1007	{
1008	int i, ret;
1009
1010	imu->acct_pages = `0`;
1011	for (i = `0`; i < nr_pages; i++) {
1012	if (!PageCompound(page: pages[i])) {
1013	imu->acct_pages++;
1014	} else {
1015	struct page *hpage;
1016
1017	hpage = compound_head(pages[i]);
1018	if (hpage == *last_hpage)
1019	continue;
1020	*last_hpage = hpage;
1021	if (headpage_already_acct(ctx, pages, nr_pages: i, hpage))
1022	continue;
1023	imu->acct_pages += page_size(page: hpage) >> PAGE_SHIFT;
1024	}
1025	}
1026
1027	if (!imu->acct_pages)
1028	return `0`;
1029
1030	ret = io_account_mem(ctx, nr_pages: imu->acct_pages);
1031	if (ret)
1032	imu->acct_pages = `0`;
1033	return ret;
1034	}
1035
1036	struct page *io_pin_pages(unsigned* long ubuf, unsigned long len, int *npages)
1037	{
1038	unsigned long start, end, nr_pages;
1039	struct page **pages = NULL;
1040	int ret;
1041
1042	end = (ubuf + len + PAGE_SIZE - `1`) >> PAGE_SHIFT;
1043	start = ubuf >> PAGE_SHIFT;
1044	nr_pages = end - start;
1045	WARN_ON(!nr_pages);
1046
1047	pages = kvmalloc_array(n: nr_pages, size: sizeof(struct page *), GFP_KERNEL);
1048	if (!pages)
1049	return ERR_PTR(error: -ENOMEM);
1050
1051	mmap_read_lock(current->mm);
1052	ret = pin_user_pages(start: ubuf, nr_pages, gup_flags: FOLL_WRITE \| FOLL_LONGTERM, pages);
1053	mmap_read_unlock(current->mm);
1054
1055	/ success, mapped all pages /
1056	if (ret == nr_pages) {
1057	*npages = nr_pages;
1058	return pages;
1059	}
1060
1061	/ partial map, or didn't map anything /
1062	if (ret >= `0`) {
1063	/ if we did partial map, release any pages we did get /
1064	if (ret)
1065	unpin_user_pages(pages, npages: ret);
1066	ret = -EFAULT;
1067	}
1068	kvfree(addr: pages);
1069	return ERR_PTR(error: ret);
1070	}
1071
1072	static int io_sqe_buffer_register(struct io_ring_ctx ctx, struct* iovec *iov,
1073	struct io_mapped_ubuf **pimu,
1074	struct page **last_hpage)
1075	{
1076	struct io_mapped_ubuf *imu = NULL;
1077	struct page **pages = NULL;
1078	unsigned long off;
1079	size_t size;
1080	int ret, nr_pages, i;
1081	struct folio *folio = NULL;
1082
1083	pimu = (struct* io_mapped_ubuf *)&dummy_ubuf;
1084	if (!iov->iov_base)
1085	return `0`;
1086
1087	ret = -ENOMEM;
1088	pages = io_pin_pages(ubuf: (unsigned long) iov->iov_base, len: iov->iov_len,
1089	npages: &nr_pages);
1090	if (IS_ERR(ptr: pages)) {
1091	ret = PTR_ERR(ptr: pages);
1092	pages = NULL;
1093	goto done;
1094	}
1095
1096	/ If it's a huge page, try to coalesce them into a single bvec entry /
1097	if (nr_pages > `1`) {
1098	folio = page_folio(pages[`0`]);
1099	for (i = `1`; i < nr_pages; i++) {
1100	/*
1101	* Pages must be consecutive and on the same folio for
1102	* this to work
1103	*/
1104	if (page_folio(pages[i]) != folio \|\|
1105	pages[i] != pages[i - `1`] + `1`) {
1106	folio = NULL;
1107	break;
1108	}
1109	}
1110	if (folio) {
1111	/*
1112	* The pages are bound to the folio, it doesn't
1113	* actually unpin them but drops all but one reference,
1114	* which is usually put down by io_buffer_unmap().
1115	* Note, needs a better helper.
1116	*/
1117	unpin_user_pages(pages: &pages[`1`], npages: nr_pages - `1`);
1118	nr_pages = `1`;
1119	}
1120	}
1121
1122	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
1123	if (!imu)
1124	goto done;
1125
1126	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
1127	if (ret) {
1128	unpin_user_pages(pages, npages: nr_pages);
1129	goto done;
1130	}
1131
1132	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
1133	size = iov->iov_len;
1134	/ store original address for later verification /
1135	imu->ubuf = (unsigned long) iov->iov_base;
1136	imu->ubuf_end = imu->ubuf + iov->iov_len;
1137	imu->nr_bvecs = nr_pages;
1138	*pimu = imu;
1139	ret = `0`;
1140
1141	if (folio) {
1142	bvec_set_page(bv: &imu->bvec[`0`], page: pages[`0`], len: size, offset: off);
1143	goto done;
1144	}
1145	for (i = `0`; i < nr_pages; i++) {
1146	size_t vec_len;
1147
1148	vec_len = min_t(size_t, size, PAGE_SIZE - off);
1149	bvec_set_page(bv: &imu->bvec[i], page: pages[i], len: vec_len, offset: off);
1150	off = `0`;
1151	size -= vec_len;
1152	}
1153	done:
1154	if (ret)
1155	kvfree(addr: imu);
1156	kvfree(addr: pages);
1157	return ret;
1158	}
1159
1160	static int io_buffers_map_alloc(struct io_ring_ctx ctx, unsigned* int nr_args)
1161	{
1162	ctx->user_bufs = kcalloc(n: nr_args, size: sizeof(*ctx->user_bufs), GFP_KERNEL);
1163	return ctx->user_bufs ? `0` : -ENOMEM;
1164	}
1165
1166	int io_sqe_buffers_register(struct io_ring_ctx ctx, void* __user *arg,
1167	unsigned int nr_args, u64 __user *tags)
1168	{
1169	struct page *last_hpage = NULL;
1170	struct io_rsrc_data *data;
1171	int i, ret;
1172	struct iovec iov;
1173
1174	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (`1u` << `16`));
1175
1176	if (ctx->user_bufs)
1177	return -EBUSY;
1178	if (!nr_args \|\| nr_args > IORING_MAX_REG_BUFFERS)
1179	return -EINVAL;
1180	ret = io_rsrc_data_alloc(ctx, type: IORING_RSRC_BUFFER, utags: tags, nr: nr_args, pdata: &data);
1181	if (ret)
1182	return ret;
1183	ret = io_buffers_map_alloc(ctx, nr_args);
1184	if (ret) {
1185	io_rsrc_data_free(data);
1186	return ret;
1187	}
1188
1189	for (i = `0`; i < nr_args; i++, ctx->nr_user_bufs++) {
1190	if (arg) {
1191	ret = io_copy_iov(ctx, dst: &iov, arg, index: i);
1192	if (ret)
1193	break;
1194	ret = io_buffer_validate(iov: &iov);
1195	if (ret)
1196	break;
1197	} else {
1198	memset(&iov, `0`, sizeof(iov));
1199	}
1200
1201	if (!iov.iov_base && *io_get_tag_slot(data, idx: i)) {
1202	ret = -EINVAL;
1203	break;
1204	}
1205
1206	ret = io_sqe_buffer_register(ctx, iov: &iov, pimu: &ctx->user_bufs[i],
1207	last_hpage: &last_hpage);
1208	if (ret)
1209	break;
1210	}
1211
1212	WARN_ON_ONCE(ctx->buf_data);
1213
1214	ctx->buf_data = data;
1215	if (ret)
1216	__io_sqe_buffers_unregister(ctx);
1217	return ret;
1218	}
1219
1220	int io_import_fixed(int ddir, struct iov_iter *iter,
1221	struct io_mapped_ubuf *imu,
1222	u64 buf_addr, size_t len)
1223	{
1224	u64 buf_end;
1225	size_t offset;
1226
1227	if (WARN_ON_ONCE(!imu))
1228	return -EFAULT;
1229	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
1230	return -EFAULT;
1231	/ not inside the mapped region /
1232	if (unlikely(buf_addr < imu->ubuf \|\| buf_end > imu->ubuf_end))
1233	return -EFAULT;
1234
1235	/*
1236	* Might not be a start of buffer, set size appropriately
1237	* and advance us to the beginning.
1238	*/
1239	offset = buf_addr - imu->ubuf;
1240	iov_iter_bvec(i: iter, direction: ddir, bvec: imu->bvec, nr_segs: imu->nr_bvecs, count: offset + len);
1241
1242	if (offset) {
1243	/*
1244	* Don't use iov_iter_advance() here, as it's really slow for
1245	* using the latter parts of a big fixed buffer - it iterates
1246	* over each segment manually. We can cheat a bit here, because
1247	* we know that:
1248	*
1249	* 1) it's a BVEC iter, we set it up
1250	* 2) all bvecs are PAGE_SIZE in size, except potentially the
1251	* first and last bvec
1252	*
1253	* So just find our index, and adjust the iterator afterwards.
1254	* If the offset is within the first bvec (or the whole first
1255	* bvec, just use iov_iter_advance(). This makes it easier
1256	* since we can just skip the first segment, which may not
1257	* be PAGE_SIZE aligned.
1258	*/
1259	const struct bio_vec *bvec = imu->bvec;
1260
1261	if (offset <= bvec->bv_len) {
1262	/*
1263	* Note, huge pages buffers consists of one large
1264	* bvec entry and should always go this way. The other
1265	* branch doesn't expect non PAGE_SIZE'd chunks.
1266	*/
1267	iter->bvec = bvec;
1268	iter->nr_segs = bvec->bv_len;
1269	iter->count -= offset;
1270	iter->iov_offset = offset;
1271	} else {
1272	unsigned long seg_skip;
1273
1274	/ skip first vec /
1275	offset -= bvec->bv_len;
1276	seg_skip = `1` + (offset >> PAGE_SHIFT);
1277
1278	iter->bvec = bvec + seg_skip;
1279	iter->nr_segs -= seg_skip;
1280	iter->count -= bvec->bv_len + offset;
1281	iter->iov_offset = offset & ~PAGE_MASK;
1282	}
1283	}
1284
1285	return `0`;
1286	}
1287

source code of linux/io_uring/rsrc.c