// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
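/*
 * A minimal userspace-side sketch (not part of this file) of the CQ
 * consumption protocol described above, using the GCC/Clang atomic
 * builtins that liburing's barrier helpers wrap. cq_khead, cq_ktail,
 * cq_ring_mask and cqes are hypothetical names for the application's
 * mmap'ed ring fields:
 *
 *	unsigned head = *cq_khead;
 *	unsigned tail = __atomic_load_n(cq_ktail, __ATOMIC_ACQUIRE);
 *
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & cq_ring_mask];
 *		handle_cqe(cqe);
 *		head++;
 *	}
 *	__atomic_store_n(cq_khead, head, __ATOMIC_RELEASE);
 *
 * The acquire load of the tail pairs with the kernel's store_release of
 * it; the release store of the head orders the CQE loads before the
 * kernel can observe the new head and reuse those slots.
 */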
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io-wq.h"

#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
#include "notif.h"
#include "waitid.h"
#include "futex.h"

#include "timeout.h"
#include "poll.h"
#include "rw.h"
#include "alloc_cache.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
			    REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
			    REQ_F_ASYNC_DATA)

#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK | \
				 IO_REQ_CLEAN_FLAGS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

#define IO_COMPL_BATCH		32
#define IO_REQ_ALLOC_BATCH	8

enum {
	IO_CHECK_CQ_OVERFLOW_BIT,
	IO_CHECK_CQ_DROPPED_BIT,
};

enum {
	IO_EVENTFD_OP_SIGNAL_BIT,
	IO_EVENTFD_OP_FREE_BIT,
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)

static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);

static void io_queue_sqe(struct io_kiocb *req);

struct kmem_cache *req_cachep;

static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;

#ifdef CONFIG_SYSCTL
static struct ctl_table kernel_io_uring_disabled_table[] = {
	{
		.procname	= "io_uring_disabled",
		.data		= &sysctl_io_uring_disabled,
		.maxlen		= sizeof(sysctl_io_uring_disabled),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "io_uring_group",
		.data		= &sysctl_io_uring_group,
		.maxlen		= sizeof(gid_t),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{},
};
#endif

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (io_is_uring_fops(file)) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
	    ctx->submit_state.cqes_count)
		__io_submit_flush_completions(ctx);
}

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
{
	return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
}
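
/*
 * Note on the two helpers above: tail and head are free-running u32
 * counters, so the unsigned subtraction is wraparound-safe. E.g. with
 * head == 0xfffffffe and tail == 0x00000002, tail - head == 4 pending
 * events.
 */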

static bool io_match_linked(struct io_kiocb *head)
{
	struct io_kiocb *req;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all)
{
	bool matched;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	if (head->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = head->ctx;

		/* protect against races with linked timeouts */
		spin_lock_irq(&ctx->timeout_lock);
		matched = io_match_linked(head);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		matched = io_match_linked(head);
	}
	return matched;
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
	io_req_set_res(req, res, 0);
}

static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static __cold void io_fallback_req_func(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
	struct io_tw_state ts = { .locked = true, };

	mutex_lock(&ctx->uring_lock);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
		req->io_task_work.func(req, &ts);
	if (WARN_ON_ONCE(!ts.locked))
		return;
	io_submit_flush_completions(ctx);
	mutex_unlock(&ctx->uring_lock);
}

static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
{
	unsigned hash_buckets = 1U << bits;
	size_t hash_size = hash_buckets * sizeof(table->hbs[0]);

	table->hbs = kmalloc(hash_size, GFP_KERNEL);
	if (!table->hbs)
		return -ENOMEM;

	table->hash_bits = bits;
	init_hash_table(table, hash_buckets);
	return 0;
}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	xa_init(&ctx->io_bl_xa);

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread, but
	 * don't keep too many buckets to not overconsume memory.
	 */
	hash_bits = ilog2(p->cq_entries) - 5;
	hash_bits = clamp(hash_bits, 1, 8);
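	/*
	 * E.g. cq_entries == 4096: ilog2(4096) - 5 == 7, i.e. 128 buckets
	 * and 4096 / 128 == 32 entries per list when completely full. The
	 * clamp caps huge rings at 256 buckets (8 bits) and gives tiny
	 * rings at least 2 buckets (1 bit).
	 */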
	if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
		goto err;
	if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
		goto err;
	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    0, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
			    sizeof(struct io_rsrc_node));
	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct async_poll));
	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct io_async_msghdr));
	io_futex_cache_init(ctx);
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	init_waitqueue_head(&ctx->poll_wq);
	init_waitqueue_head(&ctx->rsrc_quiesce_wq);
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	INIT_LIST_HEAD(&ctx->ltimeout_list);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	init_llist_head(&ctx->work_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
	INIT_HLIST_HEAD(&ctx->waitid_list);
#ifdef CONFIG_FUTEX
	INIT_HLIST_HEAD(&ctx->futex_list);
#endif
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
	return ctx;
err:
	kfree(ctx->cancel_table.hbs);
	kfree(ctx->cancel_table_locked.hbs);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
	return NULL;
}

static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}

static void io_clean_op(struct io_kiocb *req)
{
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		spin_lock(&req->ctx->completion_lock);
		io_put_kbuf_comp(req);
		spin_unlock(&req->ctx->completion_lock);
	}

	if (req->flags & REQ_F_NEED_CLEANUP) {
		const struct io_cold_def *def = &io_cold_defs[req->opcode];

		if (def->cleanup)
			def->cleanup(req);
	}
	if ((req->flags & REQ_F_POLLED) && req->apoll) {
		kfree(req->apoll->double_poll);
		kfree(req->apoll);
		req->apoll = NULL;
	}
	if (req->flags & REQ_F_INFLIGHT) {
		struct io_uring_task *tctx = req->task->io_uring;

		atomic_dec(&tctx->inflight_tracked);
	}
	if (req->flags & REQ_F_CREDS)
		put_cred(req->creds);
	if (req->flags & REQ_F_ASYNC_DATA) {
		kfree(req->async_data);
		req->async_data = NULL;
	}
	req->flags &= ~IO_REQ_CLEAN_FLAGS;
}

static inline void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
		atomic_inc(&req->task->io_uring->inflight_tracked);
	}
}

static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!req->link))
		return NULL;

	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;

	/* linked timeouts should have two refs once prep'ed */
	io_req_set_refcount(req);
	__io_req_set_refcount(req->link, 2);
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}

static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
	io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
		__io_arm_ltimeout(req);
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(req->file);

	if (req->file && (req->flags & REQ_F_ISREG)) {
		bool should_hash = def->hash_reg_file;

		/* don't serialize this request if the fs doesn't need it */
		if (should_hash && (req->file->f_flags & O_DIRECT) &&
		    (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
			should_hash = false;
		if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
{
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}

static void io_eventfd_ops(struct rcu_head *rcu)
{
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
	int ops = atomic_xchg(&ev_fd->ops, 0);

	if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
		eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);

	/*
	 * IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
	 * ordering in a race, but if references are 0 we know we have to
	 * free it regardless.
	 */
	if (atomic_dec_and_test(&ev_fd->refs)) {
		eventfd_ctx_put(ev_fd->cq_ev_fd);
		kfree(ev_fd);
	}
}

static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd = NULL;

	rcu_read_lock();
	/*
	 * rcu_dereference ctx->io_ev_fd once and use it for both the check
	 * and the eventfd_signal.
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call
	 * completed between the NULL check of ctx->io_ev_fd at the start of
	 * the function and rcu_read_lock.
	 */
	if (unlikely(!ev_fd))
		goto out;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		goto out;
	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
		goto out;

	if (likely(eventfd_signal_allowed())) {
		eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
	} else {
		atomic_inc(&ev_fd->refs);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
			call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
		else
			atomic_dec(&ev_fd->refs);
	}

out:
	rcu_read_unlock();
}

static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
	bool skip;

	spin_lock(&ctx->completion_lock);

	/*
	 * Eventfd should only get triggered when at least one event has been
	 * posted. Some applications rely on the eventfd notification count
	 * only changing IFF a new CQE has been added to the CQ ring. There's
	 * no dependency on a 1:1 relationship between how many times this
	 * function is called (and hence the eventfd count) and the number of
	 * CQEs posted to the CQ ring.
	 */
	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);
	if (skip)
		return;

	io_eventfd_signal(ctx);
}

void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->poll_activated)
		io_poll_wq_wake(ctx);
	if (ctx->off_timeout_used)
		io_flush_timeouts(ctx);
	if (ctx->drain_active) {
		spin_lock(&ctx->completion_lock);
		io_queue_deferred(ctx);
		spin_unlock(&ctx->completion_lock);
	}
	if (ctx->has_evfd)
		io_eventfd_flush_signal(ctx);
}

static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{
	if (!ctx->lockless_cq)
		spin_lock(&ctx->completion_lock);
}

static inline void io_cq_lock(struct io_ring_ctx *ctx)
	__acquires(ctx->completion_lock)
{
	spin_lock(&ctx->completion_lock);
}

static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{
	io_commit_cqring(ctx);
	if (!ctx->task_complete) {
		if (!ctx->lockless_cq)
			spin_unlock(&ctx->completion_lock);
		/* IOPOLL rings only need to wake up if it's also SQPOLL */
		if (!ctx->syscall_iopoll)
			io_cqring_wake(ctx);
	}
	io_commit_cqring_flush(ctx);
}

static void io_cq_unlock_post(struct io_ring_ctx *ctx)
	__releases(ctx->completion_lock)
{
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_wake(ctx);
	io_commit_cqring_flush(ctx);
}

/* Drop all backlogged overflow entries without posting them */
static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
	struct io_overflow_cqe *ocqe;
	LIST_HEAD(list);

	spin_lock(&ctx->completion_lock);
	list_splice_init(&ctx->cq_overflow_list, &list);
	clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
	spin_unlock(&ctx->completion_lock);

	while (!list_empty(&list)) {
		ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
		list_del(&ocqe->list);
		kfree(ocqe);
	}
}

static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	size_t cqe_size = sizeof(struct io_uring_cqe);

	if (__io_cqring_events(ctx) == ctx->cq_entries)
		return;

	if (ctx->flags & IORING_SETUP_CQE32)
		cqe_size <<= 1;

	io_cq_lock(ctx);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe;
		struct io_overflow_cqe *ocqe;

		if (!io_get_cqe_overflow(ctx, &cqe, true))
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		memcpy(cqe, &ocqe->cqe, cqe_size);
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	if (list_empty(&ctx->cq_overflow_list)) {
		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}
	io_cq_unlock_post(ctx);
}

static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{
	/* iopoll syncs against uring_lock, not completion_lock */
	if (ctx->flags & IORING_SETUP_IOPOLL)
		mutex_lock(&ctx->uring_lock);
	__io_cqring_overflow_flush(ctx);
	if (ctx->flags & IORING_SETUP_IOPOLL)
		mutex_unlock(&ctx->uring_lock);
}

static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
		io_cqring_do_overflow_flush(ctx);
}

/* can be called by any task */
static void io_put_task_remote(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;

	percpu_counter_sub(&tctx->inflight, 1);
	if (unlikely(atomic_read(&tctx->in_cancel)))
		wake_up(&tctx->wait);
	put_task_struct(task);
}

/* used by a task to put its own references */
static void io_put_task_local(struct task_struct *task)
{
	task->io_uring->cached_refs++;
}

/* must be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task)
{
	if (likely(task == current))
		io_put_task_local(task);
	else
		io_put_task_remote(task);
}

void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;
	unsigned int refs = tctx->cached_refs;

	if (refs) {
		tctx->cached_refs = 0;
		percpu_counter_sub(&tctx->inflight, refs);
		put_task_struct_many(task, refs);
	}
}

static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags, u64 extra1, u64 extra2)
{
	struct io_overflow_cqe *ocqe;
	size_t ocq_size = sizeof(struct io_overflow_cqe);
	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);

	lockdep_assert_held(&ctx->completion_lock);

	if (is_cqe32)
		ocq_size += sizeof(struct io_uring_cqe);

	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
	if (!ocqe) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel mode,
		 * or cannot allocate an overflow entry, then we need to drop it
		 * on the floor.
		 */
		io_account_cq_overflow(ctx);
		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
		return false;
	}
	if (list_empty(&ctx->cq_overflow_list)) {
		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
	if (is_cqe32) {
		ocqe->cqe.big_cqe[0] = extra1;
		ocqe->cqe.big_cqe[1] = extra2;
	}
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
}

void io_req_cqe_overflow(struct io_kiocb *req)
{
	io_cqring_event_overflow(req->ctx, req->cqe.user_data,
				 req->cqe.res, req->cqe.flags,
				 req->big_cqe.extra1, req->big_cqe.extra2);
	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}

/*
 * writes to the cq entry need to come after reading head; the
 * control dependency is enough as we're using WRITE_ONCE to
 * fill the cq entry
 */
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
{
	struct io_rings *rings = ctx->rings;
	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
	unsigned int free, queued, len;

	/*
	 * Posting into the CQ when there are pending overflowed CQEs may break
	 * ordering guarantees, which will affect links, F_MORE users and more.
	 * Force overflow the completion.
	 */
	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
		return false;

	/* userspace may cheat modifying the tail, be safe and do min */
	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
	free = ctx->cq_entries - queued;
	/* we need a contiguous range, limit based on the current array offset */
	len = min(free, ctx->cq_entries - off);
	if (!len)
		return false;

	if (ctx->flags & IORING_SETUP_CQE32) {
		off <<= 1;
		len <<= 1;
	}

	ctx->cqe_cached = &rings->cqes[off];
	ctx->cqe_sentinel = ctx->cqe_cached + len;
	return true;
}
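
/*
 * Worked example for the sizing above, assuming cq_entries == 8: with
 * head == 3 and cached_cq_tail == 6, off == 6 & 7 == 6, queued == 3,
 * free == 5 and len == min(5, 8 - 6) == 2, i.e. the cache covers the
 * two contiguous slots before the array wraps; the next refill then
 * starts over at offset 0.
 */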

static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			    u32 cflags)
{
	struct io_uring_cqe *cqe;

	ctx->cq_extra++;

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	if (likely(io_get_cqe(ctx, &cqe))) {
		trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);

		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);

		if (ctx->flags & IORING_SETUP_CQE32) {
			WRITE_ONCE(cqe->big_cqe[0], 0);
			WRITE_ONCE(cqe->big_cqe[1], 0);
		}
		return true;
	}
	return false;
}

static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	unsigned int i;

	lockdep_assert_held(&ctx->uring_lock);
	for (i = 0; i < state->cqes_count; i++) {
		struct io_uring_cqe *cqe = &ctx->completion_cqes[i];

		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
			if (ctx->lockless_cq) {
				spin_lock(&ctx->completion_lock);
				io_cqring_event_overflow(ctx, cqe->user_data,
							 cqe->res, cqe->flags, 0, 0);
				spin_unlock(&ctx->completion_lock);
			} else {
				io_cqring_event_overflow(ctx, cqe->user_data,
							 cqe->res, cqe->flags, 0, 0);
			}
		}
	}
	state->cqes_count = 0;
}

static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
			      bool allow_overflow)
{
	bool filled;

	io_cq_lock(ctx);
	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
	if (!filled && allow_overflow)
		filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);

	io_cq_unlock_post(ctx);
	return filled;
}

bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
	return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
}

/*
 * A helper for multishot requests posting additional CQEs.
 * Should only be used from a task_work including IO_URING_F_MULTISHOT.
 */
bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
{
	struct io_ring_ctx *ctx = req->ctx;
	u64 user_data = req->cqe.user_data;
	struct io_uring_cqe *cqe;

	if (!defer)
		return __io_post_aux_cqe(ctx, user_data, res, cflags, false);

	lockdep_assert_held(&ctx->uring_lock);

	if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
		__io_cq_lock(ctx);
		__io_flush_post_cqes(ctx);
		/* no need to flush - flush is deferred */
		__io_cq_unlock_post(ctx);
	}

	/*
	 * For deferred completions this is not as strict as it is otherwise,
	 * however its main job is to prevent unbounded posted completions,
	 * and in that it works just as well.
	 */
	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
		return false;

	cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
	cqe->user_data = user_data;
	cqe->res = res;
	cqe->flags = cflags;
	return true;
}

static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *rsrc_node = NULL;

	io_cq_lock(ctx);
	if (!(req->flags & REQ_F_CQE_SKIP)) {
		if (!io_fill_cqe_req(ctx, req))
			io_req_cqe_overflow(req);
	}

	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (req_ref_put_and_test(req)) {
		if (req->flags & IO_REQ_LINK_FLAGS) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_put_kbuf_comp(req);
		if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
			io_clean_op(req);
		io_put_file(req);

		rsrc_node = req->rsrc_node;
		/*
		 * Selected buffer deallocation in io_clean_op() assumes that
		 * we don't hold ->completion_lock. Clean them here to avoid
		 * deadlocks.
		 */
		io_put_task_remote(req->task);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	}
	io_cq_unlock_post(ctx);

	if (rsrc_node) {
		io_ring_submit_lock(ctx, issue_flags);
		io_put_rsrc_node(ctx, rsrc_node);
		io_ring_submit_unlock(ctx, issue_flags);
	}
}

void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
	if (req->ctx->task_complete && req->ctx->submitter_task != current) {
		req->io_task_work.func = io_req_task_complete;
		io_req_task_work_add(req);
	} else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
		   !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
		__io_req_complete_post(req, issue_flags);
	} else {
		struct io_ring_ctx *ctx = req->ctx;

		mutex_lock(&ctx->uring_lock);
		__io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED);
		mutex_unlock(&ctx->uring_lock);
	}
}

void io_req_defer_failed(struct io_kiocb *req, s32 res)
	__must_hold(&ctx->uring_lock)
{
	const struct io_cold_def *def = &io_cold_defs[req->opcode];

	lockdep_assert_held(&req->ctx->uring_lock);

	req_set_fail(req);
	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
	if (def->fail)
		def->fail(req);
	io_req_complete_defer(req);
}

/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
	memset(&req->cqe, 0, sizeof(req->cqe));
	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}

static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
					struct io_submit_state *state)
{
	spin_lock(&ctx->completion_lock);
	wq_list_splice(&ctx->locked_free_list, &state->free_list);
	ctx->locked_free_nr = 0;
	spin_unlock(&ctx->completion_lock);
}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	void *reqs[IO_REQ_ALLOC_BATCH];
	int ret, i;

	/*
	 * If we have more than a batch's worth of requests in our IRQ side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
	 */
	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
		if (!io_req_cache_empty(ctx))
			return true;
	}

	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);

	/*
	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
	 * retry single alloc to be on the safe side.
	 */
	if (unlikely(ret <= 0)) {
		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
		if (!reqs[0])
			return false;
		ret = 1;
	}

	percpu_ref_get_many(&ctx->refs, ret);
	for (i = 0; i < ret; i++) {
		struct io_kiocb *req = reqs[i];

		io_preinit_req(req, ctx);
		io_req_add_to_cache(req, ctx);
	}
	return true;
}

__cold void io_free_req(struct io_kiocb *req)
{
	/* refs were already put, restore them for io_req_task_complete() */
	req->flags &= ~REQ_F_REFCOUNT;
	/* we only want to free it, don't post CQEs */
	req->flags |= REQ_F_CQE_SKIP;
	req->io_task_work.func = io_req_task_complete;
	io_req_task_work_add(req);
}

static void __io_req_find_next_prep(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock(&ctx->completion_lock);
	io_disarm_next(req);
	spin_unlock(&ctx->completion_lock);
}

static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt;

	/*
	 * If LINK is set, we have dependent requests in this chain. If we
	 * didn't fail this request, queue the first one up, moving any other
	 * dependencies to the next request. In case of failure, fail the rest
	 * of the chain.
	 */
	if (unlikely(req->flags & IO_DISARM_MASK))
		__io_req_find_next_prep(req);
	nxt = req->link;
	req->link = NULL;
	return nxt;
}

static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{
	if (!ctx)
		return;
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
	if (ts->locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
		ts->locked = false;
	}
	percpu_ref_put(&ctx->refs);
}

static unsigned int handle_tw_list(struct llist_node *node,
				   struct io_ring_ctx **ctx,
				   struct io_tw_state *ts,
				   struct llist_node *last)
{
	unsigned int count = 0;

	while (node && node != last) {
		struct llist_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		prefetch(container_of(next, struct io_kiocb, io_task_work.node));

		if (req->ctx != *ctx) {
			ctx_flush_and_put(*ctx, ts);
			*ctx = req->ctx;
			/* if not contended, grab and improve batching */
			ts->locked = mutex_trylock(&(*ctx)->uring_lock);
			percpu_ref_get(&(*ctx)->refs);
		}
		INDIRECT_CALL_2(req->io_task_work.func,
				io_poll_task_func, io_req_rw_complete,
				req, ts);
		node = next;
		count++;
		if (unlikely(need_resched())) {
			ctx_flush_and_put(*ctx, ts);
			*ctx = NULL;
			cond_resched();
		}
	}

	return count;
}

/**
 * io_llist_xchg - swap all entries in a lock-less list
 * @head:	the head of lock-less list to delete all entries
 * @new:	new entry as the head of the list
 *
 * If list is empty, return NULL, otherwise, return the pointer to the first entry.
 * The order of entries returned is from the newest to the oldest added one.
 */
static inline struct llist_node *io_llist_xchg(struct llist_head *head,
					       struct llist_node *new)
{
	return xchg(&head->first, new);
}
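
/*
 * Typical consumer-side use of the xchg above (illustrative sketch,
 * process() standing in for the real per-node callback): atomically
 * snapshot and empty the list, then walk the snapshot without racing
 * against concurrent llist_add() producers:
 *
 *	struct llist_node *node = io_llist_xchg(&tctx->task_list, NULL);
 *
 *	while (node) {
 *		struct llist_node *next = node->next;
 *
 *		process(node);
 *		node = next;
 *	}
 */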

/**
 * io_llist_cmpxchg - possibly swap all entries in a lock-less list
 * @head:	the head of lock-less list to delete all entries
 * @old:	expected old value of the first entry of the list
 * @new:	new entry as the head of the list
 *
 * perform a cmpxchg on the first entry of the list.
 */
static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
						  struct llist_node *old,
						  struct llist_node *new)
{
	return cmpxchg(&head->first, old, new);
}

static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
{
	struct llist_node *node = llist_del_all(&tctx->task_list);
	struct io_ring_ctx *last_ctx = NULL;
	struct io_kiocb *req;

	while (node) {
		req = container_of(node, struct io_kiocb, io_task_work.node);
		node = node->next;
		if (sync && last_ctx != req->ctx) {
			if (last_ctx) {
				flush_delayed_work(&last_ctx->fallback_work);
				percpu_ref_put(&last_ctx->refs);
			}
			last_ctx = req->ctx;
			percpu_ref_get(&last_ctx->refs);
		}
		if (llist_add(&req->io_task_work.node,
			      &req->ctx->fallback_llist))
			schedule_delayed_work(&req->ctx->fallback_work, 1);
	}

	if (last_ctx) {
		flush_delayed_work(&last_ctx->fallback_work);
		percpu_ref_put(&last_ctx->refs);
	}
}

void tctx_task_work(struct callback_head *cb)
{
	struct io_tw_state ts = {};
	struct io_ring_ctx *ctx = NULL;
	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
						  task_work);
	struct llist_node fake = {};
	struct llist_node *node;
	unsigned int loops = 0;
	unsigned int count = 0;

	if (unlikely(current->flags & PF_EXITING)) {
		io_fallback_tw(tctx, true);
		return;
	}

	do {
		loops++;
		node = io_llist_xchg(&tctx->task_list, &fake);
		count += handle_tw_list(node, &ctx, &ts, &fake);

		/* skip expensive cmpxchg if there are items in the list */
		if (READ_ONCE(tctx->task_list.first) != &fake)
			continue;
		if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
			io_submit_flush_completions(ctx);
			if (READ_ONCE(tctx->task_list.first) != &fake)
				continue;
		}
		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
	} while (node != &fake);

	ctx_flush_and_put(ctx, &ts);

	/* relaxed read is enough as only the task itself sets ->in_cancel */
	if (unlikely(atomic_read(&tctx->in_cancel)))
		io_uring_drop_tctx_refs(current);

	trace_io_uring_task_work_run(tctx, count, loops);
}

static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	unsigned nr_wait, nr_tw, nr_tw_prev;
	struct llist_node *first;

	if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
		flags &= ~IOU_F_TWQ_LAZY_WAKE;

	first = READ_ONCE(ctx->work_llist.first);
	do {
		nr_tw_prev = 0;
		if (first) {
			struct io_kiocb *first_req = container_of(first,
							struct io_kiocb,
							io_task_work.node);
			/*
			 * Might be executed at any moment, rely on
			 * SLAB_TYPESAFE_BY_RCU to keep it alive.
			 */
			nr_tw_prev = READ_ONCE(first_req->nr_tw);
		}
		nr_tw = nr_tw_prev + 1;
		/* Large enough to fail the nr_wait comparison below */
		if (!(flags & IOU_F_TWQ_LAZY_WAKE))
			nr_tw = -1U;

		req->nr_tw = nr_tw;
		req->io_task_work.node.next = first;
	} while (!try_cmpxchg(&ctx->work_llist.first, &first,
			      &req->io_task_work.node));

	if (!first) {
		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
			atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
		if (ctx->has_evfd)
			io_eventfd_signal(ctx);
	}

	nr_wait = atomic_read(&ctx->cq_wait_nr);
	/* no one is waiting */
	if (!nr_wait)
		return;
	/* either not enough or the previous add has already woken it up */
	if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
		return;
	/* pairs with set_current_state() in io_cqring_wait() */
	smp_mb__after_atomic();
	wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
}
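
/*
 * Lazy-wake example for the logic above: a waiter in io_cqring_wait()
 * advertising via ctx->cq_wait_nr that it needs 4 completions. The
 * first three IOU_F_TWQ_LAZY_WAKE adds record nr_tw == 1, 2, 3 and skip
 * the wakeup (nr_wait > nr_tw); the fourth sees nr_tw == 4 and wakes
 * the task exactly once, as any later add fails the nr_tw_prev check.
 * Non-lazy adds use nr_tw == -1U and thus always qualify for a wakeup.
 */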

static void io_req_normal_work_add(struct io_kiocb *req)
{
	struct io_uring_task *tctx = req->task->io_uring;
	struct io_ring_ctx *ctx = req->ctx;

	/* task_work already pending, we're done */
	if (!llist_add(&req->io_task_work.node, &tctx->task_list))
		return;

	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);

	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
		return;

	io_fallback_tw(tctx, false);
}

void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
{
	if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		rcu_read_lock();
		io_req_local_work_add(req, flags);
		rcu_read_unlock();
	} else {
		io_req_normal_work_add(req);
	}
}

static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{
	struct llist_node *node;

	node = llist_del_all(&ctx->work_llist);
	while (node) {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);

		node = node->next;
		io_req_normal_work_add(req);
	}
}

static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{
	struct llist_node *node;
	unsigned int loops = 0;
	int ret = 0;

	if (WARN_ON_ONCE(ctx->submitter_task != current))
		return -EEXIST;
	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
again:
	/*
	 * llists are in reverse order, flip it back the right way before
	 * running the pending items.
	 */
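	/*
	 * E.g. items queued in the order A, B, C sit on the llist as
	 * C -> B -> A; llist_reverse_order() below restores A -> B -> C
	 * so that task_work runs in queueing order.
	 */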
	node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
	while (node) {
		struct llist_node *next = node->next;
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    io_task_work.node);
		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
		INDIRECT_CALL_2(req->io_task_work.func,
				io_poll_task_func, io_req_rw_complete,
				req, ts);
		ret++;
		node = next;
	}
	loops++;

	if (!llist_empty(&ctx->work_llist))
		goto again;
	if (ts->locked) {
		io_submit_flush_completions(ctx);
		if (!llist_empty(&ctx->work_llist))
			goto again;
	}
	trace_io_uring_local_work_run(ctx, ret, loops);
	return ret;
}

static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
{
	struct io_tw_state ts = { .locked = true, };
	int ret;

	if (llist_empty(&ctx->work_llist))
		return 0;

	ret = __io_run_local_work(ctx, &ts);
	/* shouldn't happen! */
	if (WARN_ON_ONCE(!ts.locked))
		mutex_lock(&ctx->uring_lock);
	return ret;
}

static int io_run_local_work(struct io_ring_ctx *ctx)
{
	struct io_tw_state ts = {};
	int ret;

	ts.locked = mutex_trylock(&ctx->uring_lock);
	ret = __io_run_local_work(ctx, &ts);
	if (ts.locked)
		mutex_unlock(&ctx->uring_lock);

	return ret;
}

static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
{
	io_tw_lock(req->ctx, ts);
	io_req_defer_failed(req, req->cqe.res);
}

void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
{
	io_tw_lock(req->ctx, ts);
	/* req->task == current here, checking PF_EXITING is safe */
	if (unlikely(req->task->flags & PF_EXITING))
		io_req_defer_failed(req, -EFAULT);
	else if (req->flags & REQ_F_FORCE_ASYNC)
		io_queue_iowq(req, ts);
	else
		io_queue_sqe(req);
}

void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
	io_req_set_res(req, ret, 0);
	req->io_task_work.func = io_req_task_cancel;
	io_req_task_work_add(req);
}

void io_req_task_queue(struct io_kiocb *req)
{
	req->io_task_work.func = io_req_task_submit;
	io_req_task_work_add(req);
}

void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt = io_req_find_next(req);

	if (nxt)
		io_req_task_queue(nxt);
}

static void io_free_batch_list(struct io_ring_ctx *ctx,
			       struct io_wq_work_node *node)
	__must_hold(&ctx->uring_lock)
{
	do {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);

		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
			if (req->flags & REQ_F_REFCOUNT) {
				node = req->comp_list.next;
				if (!req_ref_put_and_test(req))
					continue;
			}
			if ((req->flags & REQ_F_POLLED) && req->apoll) {
				struct async_poll *apoll = req->apoll;

				if (apoll->double_poll)
					kfree(apoll->double_poll);
				if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
					kfree(apoll);
				req->flags &= ~REQ_F_POLLED;
			}
			if (req->flags & IO_REQ_LINK_FLAGS)
				io_queue_next(req);
			if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
				io_clean_op(req);
		}
		io_put_file(req);

		io_req_put_rsrc_locked(req, ctx);

		io_put_task(req->task);
		node = req->comp_list.next;
		io_req_add_to_cache(req, ctx);
	} while (node);
}

void __io_submit_flush_completions(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	struct io_wq_work_node *node;

	__io_cq_lock(ctx);
	/* must come first to preserve CQE ordering in failure cases */
	if (state->cqes_count)
		__io_flush_post_cqes(ctx);
	__wq_list_for_each(node, &state->compl_reqs) {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);

		if (!(req->flags & REQ_F_CQE_SKIP) &&
		    unlikely(!io_fill_cqe_req(ctx, req))) {
			if (ctx->lockless_cq) {
				spin_lock(&ctx->completion_lock);
				io_req_cqe_overflow(req);
				spin_unlock(&ctx->completion_lock);
			} else {
				io_req_cqe_overflow(req);
			}
		}
	}
	__io_cq_unlock_post(ctx);

	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
		io_free_batch_list(ctx, state->compl_reqs.first);
		INIT_WQ_LIST(&state->compl_reqs);
	}
}

static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	/* See comment at the top of this file */
	smp_rmb();
	return __io_cqring_events(ctx);
}

/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 */
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_IOPOLL))
		return;

	mutex_lock(&ctx->uring_lock);
	while (!wq_list_empty(&ctx->iopoll_list)) {
		/* let it sleep and repeat later if can't complete a request */
		if (io_do_iopoll(ctx, true) == 0)
			break;
		/*
		 * Ensure we allow local-to-the-cpu processing to take place,
		 * in this case we need to ensure that we reap all events.
		 * Also let task_work, etc. progress by releasing the mutex.
		 */
		if (need_resched()) {
			mutex_unlock(&ctx->uring_lock);
			cond_resched();
			mutex_lock(&ctx->uring_lock);
		}
	}
	mutex_unlock(&ctx->uring_lock);
}

static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	unsigned long check_cq;

	if (!io_allowed_run_tw(ctx))
		return -EEXIST;

	check_cq = READ_ONCE(ctx->check_cq);
	if (unlikely(check_cq)) {
		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
			__io_cqring_overflow_flush(ctx);
		/*
		 * Similarly do not spin if we have not informed the user of any
		 * dropped CQE.
		 */
		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
			return -EBADR;
	}
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
	if (io_cqring_events(ctx))
		return 0;

	do {
		int ret = 0;

		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
		if (wq_list_empty(&ctx->iopoll_list) ||
		    io_task_work_pending(ctx)) {
			u32 tail = ctx->cached_cq_tail;

			(void) io_run_local_work_locked(ctx);

			if (task_work_pending(current) ||
			    wq_list_empty(&ctx->iopoll_list)) {
				mutex_unlock(&ctx->uring_lock);
				io_run_task_work();
				mutex_lock(&ctx->uring_lock);
			}
			/* some requests don't go through iopoll_list */
			if (tail != ctx->cached_cq_tail ||
			    wq_list_empty(&ctx->iopoll_list))
				break;
		}
		ret = io_do_iopoll(ctx, !min);
		if (unlikely(ret < 0))
			return ret;

		if (task_sigpending(current))
			return -EINTR;
		if (need_resched())
			break;

		nr_events += ret;
	} while (nr_events < min);

	return 0;
}

void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
	if (ts->locked)
		io_req_complete_defer(req);
	else
		io_req_complete_post(req, IO_URING_F_UNLOCKED);
}

/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	/* workqueue context doesn't hold uring_lock, grab it now */
	if (unlikely(needs_lock))
		mutex_lock(&ctx->uring_lock);

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
	if (wq_list_empty(&ctx->iopoll_list)) {
		ctx->poll_multi_queue = false;
	} else if (!ctx->poll_multi_queue) {
		struct io_kiocb *list_req;

		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
					comp_list);
		if (list_req->file != req->file)
			ctx->poll_multi_queue = true;
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
	if (READ_ONCE(req->iopoll_completed))
		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
	else
		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);

	if (unlikely(needs_lock)) {
		/*
		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
		 * in the sq thread task context or in io worker task context.
		 * If current task context is sq thread, we don't need to check
		 * whether we should wake up the sq thread.
		 */
		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
		    wq_has_sleeper(&ctx->sq_data->wait))
			wake_up(&ctx->sq_data->wait);

		mutex_unlock(&ctx->uring_lock);
	}
}

unsigned int io_file_get_flags(struct file *file)
{
	unsigned int res = 0;

	if (S_ISREG(file_inode(file)->i_mode))
		res |= REQ_F_ISREG;
	if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
		res |= REQ_F_SUPPORT_NOWAIT;
	return res;
}

bool io_alloc_async_data(struct io_kiocb *req)
{
	WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size);
	req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL);
	if (req->async_data) {
		req->flags |= REQ_F_ASYNC_DATA;
		return false;
	}
	return true;
}

int io_req_prep_async(struct io_kiocb *req)
{
	const struct io_cold_def *cdef = &io_cold_defs[req->opcode];
	const struct io_issue_def *def = &io_issue_defs[req->opcode];

	/* assign early for deferred execution for non-fixed file */
	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
		req->file = io_file_get_normal(req, req->cqe.fd);
	if (!cdef->prep_async)
		return 0;
	if (WARN_ON_ONCE(req_has_async_data(req)))
		return -EFAULT;
	if (!def->manual_alloc) {
		if (io_alloc_async_data(req))
			return -EAGAIN;
	}
	return cdef->prep_async(req);
}

static u32 io_get_sequence(struct io_kiocb *req)
{
	u32 seq = req->ctx->cached_sq_head;
	struct io_kiocb *cur;

	/* need original cached_sq_head, but it was increased for each req */
	io_for_each_link(cur, req)
		seq--;
	return seq;
}
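
/*
 * E.g. a 3-request link submitted when cached_sq_head was 7 leaves
 * cached_sq_head at 10; walking the head request plus its two linked
 * requests above decrements seq back to 7, the chain's original
 * sequence number.
 */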
1814
static __cold void io_drain_req(struct io_kiocb *req)
	__must_hold(&ctx->uring_lock)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_defer_entry *de;
	int ret;
	u32 seq = io_get_sequence(req);

	/* Still need to defer if there are pending reqs in the defer list. */
	spin_lock(&ctx->completion_lock);
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
		spin_unlock(&ctx->completion_lock);
queue:
		ctx->drain_active = false;
		io_req_task_queue(req);
		return;
	}
	spin_unlock(&ctx->completion_lock);

	io_prep_async_link(req);
	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de) {
		ret = -ENOMEM;
		io_req_defer_failed(req, ret);
		return;
	}

	spin_lock(&ctx->completion_lock);
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
		spin_unlock(&ctx->completion_lock);
		kfree(de);
		goto queue;
	}

	trace_io_uring_defer(req);
	de->req = req;
	de->seq = seq;
	list_add_tail(&de->list, &ctx->defer_list);
	spin_unlock(&ctx->completion_lock);
}

static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
			   unsigned int issue_flags)
{
	if (req->file || !def->needs_file)
		return true;

	if (req->flags & REQ_F_FIXED_FILE)
		req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
	else
		req->file = io_file_get_normal(req, req->cqe.fd);

	return !!req->file;
}

static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	const struct cred *creds = NULL;
	int ret;

	if (unlikely(!io_assign_file(req, def, issue_flags)))
		return -EBADF;

	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
		creds = override_creds(req->creds);

	if (!def->audit_skip)
		audit_uring_entry(req->opcode);

	ret = def->issue(req, issue_flags);

	if (!def->audit_skip)
		audit_uring_exit(!ret, ret);

	if (creds)
		revert_creds(creds);

	if (ret == IOU_OK) {
		if (issue_flags & IO_URING_F_COMPLETE_DEFER)
			io_req_complete_defer(req);
		else
			io_req_complete_post(req, issue_flags);
	} else if (ret != IOU_ISSUE_SKIP_COMPLETE)
		return ret;

	/* If the op doesn't have a file, we're not polling for it */
	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
		io_iopoll_req_issued(req, issue_flags);

	return 0;
}

int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
{
	io_tw_lock(req->ctx, ts);
	return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
				 IO_URING_F_COMPLETE_DEFER);
}

struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	struct io_kiocb *nxt = NULL;

	if (req_ref_put_and_test(req)) {
		if (req->flags & IO_REQ_LINK_FLAGS)
			nxt = io_req_find_next(req);
		io_free_req(req);
	}
	return nxt ? &nxt->work : NULL;
}

void io_wq_submit_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
	bool needs_poll = false;
	int ret = 0, err = -ECANCELED;

	/* one will be dropped by ->io_wq_free_work() after returning to io-wq */
	if (!(req->flags & REQ_F_REFCOUNT))
		__io_req_set_refcount(req, 2);
	else
		req_ref_get(req);

	io_arm_ltimeout(req);

	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
	if (work->flags & IO_WQ_WORK_CANCEL) {
fail:
		io_req_task_queue_fail(req, err);
		return;
	}
	if (!io_assign_file(req, def, issue_flags)) {
		err = -EBADF;
		work->flags |= IO_WQ_WORK_CANCEL;
		goto fail;
	}

	if (req->flags & REQ_F_FORCE_ASYNC) {
		bool opcode_poll = def->pollin || def->pollout;

		if (opcode_poll && file_can_poll(req->file)) {
			needs_poll = true;
			issue_flags |= IO_URING_F_NONBLOCK;
		}
	}

	do {
		ret = io_issue_sqe(req, issue_flags);
		if (ret != -EAGAIN)
			break;

		/*
		 * If REQ_F_NOWAIT is set, then don't wait or retry with
		 * poll. -EAGAIN is final for that case.
		 */
		if (req->flags & REQ_F_NOWAIT)
			break;

		/*
		 * We can get EAGAIN for iopolled IO even though we're
		 * forcing a sync submission from here, since we can't
		 * wait for request slots on the block side.
		 */
		if (!needs_poll) {
			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
				break;
			if (io_wq_worker_stopped())
				break;
			cond_resched();
			continue;
		}

		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
			return;
		/* aborted or ready, in either case retry blocking */
		needs_poll = false;
		issue_flags &= ~IO_URING_F_NONBLOCK;
	} while (1);

	/* avoid locking problems by failing it from a clean context */
	if (ret < 0)
		io_req_task_queue_fail(req, ret);
}

inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
				      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_fixed_file *slot;
	struct file *file = NULL;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
		goto out;
	fd = array_index_nospec(fd, ctx->nr_user_files);
	slot = io_fixed_file_slot(&ctx->file_table, fd);
	file = io_slot_file(slot);
	req->flags |= io_slot_flags(slot);
	io_req_set_rsrc_node(req, ctx, 0);
out:
	io_ring_submit_unlock(ctx, issue_flags);
	return file;
}

struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
	struct file *file = fget(fd);

	trace_io_uring_file_get(req, fd);

	/* we don't allow fixed io_uring files */
	if (file && io_is_uring_fops(file))
		io_req_track_inflight(req);
	return file;
}

static void io_queue_async(struct io_kiocb *req, int ret)
	__must_hold(&req->ctx->uring_lock)
{
	struct io_kiocb *linked_timeout;

	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
		io_req_defer_failed(req, ret);
		return;
	}

	linked_timeout = io_prep_linked_timeout(req);

	switch (io_arm_poll_handler(req, 0)) {
	case IO_APOLL_READY:
		io_kbuf_recycle(req, 0);
		io_req_task_queue(req);
		break;
	case IO_APOLL_ABORTED:
		io_kbuf_recycle(req, 0);
		io_queue_iowq(req, NULL);
		break;
	case IO_APOLL_OK:
		break;
	}

	if (linked_timeout)
		io_queue_linked_timeout(linked_timeout);
}

static inline void io_queue_sqe(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	int ret;

	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);

	/*
	 * We async punt it if the file wasn't marked NOWAIT, or if the file
	 * doesn't support non-blocking read/write attempts
	 */
	if (likely(!ret))
		io_arm_ltimeout(req);
	else
		io_queue_async(req, ret);
}

static void io_queue_sqe_fallback(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	if (unlikely(req->flags & REQ_F_FAIL)) {
		/*
		 * We don't submit; fail them all. For that, replace hardlinks
		 * with normal links. An extra REQ_F_LINK is tolerated.
		 */
		req->flags &= ~REQ_F_HARDLINK;
		req->flags |= REQ_F_LINK;
		io_req_defer_failed(req, req->cqe.res);
	} else {
		int ret = io_req_prep_async(req);

		if (unlikely(ret)) {
			io_req_defer_failed(req, ret);
			return;
		}

		if (unlikely(req->ctx->drain_active))
			io_drain_req(req);
		else
			io_queue_iowq(req, NULL);
	}
}

/*
 * Check SQE restrictions (opcode and flags).
 *
 * Returns 'true' if SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
					struct io_kiocb *req,
					unsigned int sqe_flags)
{
	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
		return false;

	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
	    ctx->restrictions.sqe_flags_required)
		return false;

	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
			  ctx->restrictions.sqe_flags_required))
		return false;

	return true;
}
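
/*
 * For illustration, assuming a ring set up with IORING_SETUP_R_DISABLED
 * and then restricted via IORING_REGISTER_RESTRICTIONS: if only
 * IORING_OP_READV is set in sqe_op and sqe_flags_allowed is just
 * IOSQE_FIXED_FILE, then an SQE carrying IORING_OP_WRITEV, or a readv
 * SQE with IOSQE_IO_LINK set, fails the checks above and the
 * submission errors out with -EACCES in io_init_req().
 */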

static void io_init_req_drain(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *head = ctx->submit_state.link.head;

	ctx->drain_active = true;
	if (head) {
		/*
		 * If we need to drain a request in the middle of a link, drain
		 * the head request and the next request/link after the current
		 * link. Considering sequential execution of links,
		 * REQ_F_IO_DRAIN will be maintained for every request of our
		 * link.
		 */
		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
		ctx->drain_next = true;
	}
}

static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
		       const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	const struct io_issue_def *def;
	unsigned int sqe_flags;
	int personality;
	u8 opcode;

	/* req is partially pre-initialised, see io_preinit_req() */
	req->opcode = opcode = READ_ONCE(sqe->opcode);
	/* same numerical values with corresponding REQ_F_*, safe to copy */
	req->flags = sqe_flags = READ_ONCE(sqe->flags);
	req->cqe.user_data = READ_ONCE(sqe->user_data);
	req->file = NULL;
	req->rsrc_node = NULL;
	req->task = current;

	if (unlikely(opcode >= IORING_OP_LAST)) {
		req->opcode = 0;
		return -EINVAL;
	}
	def = &io_issue_defs[opcode];
	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
		/* enforce forwards compatibility on users */
		if (sqe_flags & ~SQE_VALID_FLAGS)
			return -EINVAL;
		if (sqe_flags & IOSQE_BUFFER_SELECT) {
			if (!def->buffer_select)
				return -EOPNOTSUPP;
			req->buf_index = READ_ONCE(sqe->buf_group);
		}
		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
			ctx->drain_disabled = true;
		if (sqe_flags & IOSQE_IO_DRAIN) {
			if (ctx->drain_disabled)
				return -EOPNOTSUPP;
			io_init_req_drain(req);
		}
	}
	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
			return -EACCES;
		/* knock it to the slow queue path, will be drained there */
		if (ctx->drain_active)
			req->flags |= REQ_F_FORCE_ASYNC;
		/* if there is no link, we're at "next" request and need to drain */
		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
			ctx->drain_next = false;
			ctx->drain_active = true;
			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
		}
	}

	if (!def->ioprio && sqe->ioprio)
		return -EINVAL;
	if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	if (def->needs_file) {
		struct io_submit_state *state = &ctx->submit_state;

		req->cqe.fd = READ_ONCE(sqe->fd);

		/*
		 * Plug now if we have more than 2 IO left after this, and the
		 * target is potentially a read/write to block based storage.
		 */
		if (state->need_plug && def->plug) {
			state->plug_started = true;
			state->need_plug = false;
			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
		}
	}

	personality = READ_ONCE(sqe->personality);
	if (personality) {
		int ret;

		req->creds = xa_load(&ctx->personalities, personality);
		if (!req->creds)
			return -EINVAL;
		get_cred(req->creds);
		ret = security_uring_override_creds(req->creds);
		if (ret) {
			put_cred(req->creds);
			return ret;
		}
		req->flags |= REQ_F_CREDS;
	}

	return def->prep(req, sqe);
}

static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
				      struct io_kiocb *req, int ret)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_submit_link *link = &ctx->submit_state.link;
	struct io_kiocb *head = link->head;

	trace_io_uring_req_failed(sqe, req, ret);

	/*
	 * Avoid breaking links in the middle as it renders links with SQPOLL
	 * unusable. Instead of failing eagerly, continue assembling the link if
	 * applicable and mark the head with REQ_F_FAIL. The link flushing code
	 * should find the flag and handle the rest.
	 */
	req_fail_link_node(req, ret);
	if (head && !(head->flags & REQ_F_FAIL))
		req_fail_link_node(head, -ECANCELED);

	if (!(req->flags & IO_REQ_LINK_FLAGS)) {
		if (head) {
			link->last->link = req;
			link->head = NULL;
			req = head;
		}
		io_queue_sqe_fallback(req);
		return ret;
	}

	if (head)
		link->last->link = req;
	else
		link->head = req;
	link->last = req;
	return 0;
}

static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
				const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_link *link = &ctx->submit_state.link;
	int ret;

	ret = io_init_req(ctx, req, sqe);
	if (unlikely(ret))
		return io_submit_fail_init(sqe, req, ret);

	trace_io_uring_submit_req(req);

	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
	if (unlikely(link->head)) {
		ret = io_req_prep_async(req);
		if (unlikely(ret))
			return io_submit_fail_init(sqe, req, ret);

		trace_io_uring_link(req, link->head);
		link->last->link = req;
		link->last = req;

		if (req->flags & IO_REQ_LINK_FLAGS)
			return 0;
		/* last request of the link, flush it */
		req = link->head;
		link->head = NULL;
		if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
			goto fallback;

	} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
					  REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
		if (req->flags & IO_REQ_LINK_FLAGS) {
			link->head = req;
			link->last = req;
		} else {
fallback:
			io_queue_sqe_fallback(req);
		}
		return 0;
	}

	io_queue_sqe(req);
	return 0;
}

/*
 * Batched submission is done, ensure local IO is flushed out.
 */
static void io_submit_state_end(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;

	if (unlikely(state->link.head))
		io_queue_sqe_fallback(state->link.head);
	/* flush only after queuing links as they can generate completions */
	io_submit_flush_completions(ctx);
	if (state->plug_started)
		blk_finish_plug(&state->plug);
}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
				  unsigned int max_ios)
{
	state->plug_started = false;
	state->need_plug = max_ios > 2;
	state->submit_nr = max_ios;
	/* set only head, no need to init link_last in advance */
	state->link.head = NULL;
}
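
/*
 * Note on the plug threshold: need_plug mirrors the "more than 2 IO"
 * comment in io_init_req() above. Block plugging only pays off when
 * several requests can be batched into one plug, so a submission of
 * one or two SQEs never calls blk_start_plug_nr_ios() at all.
 */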

static void io_commit_sqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/*
	 * Ensure any loads from the SQEs are done at this point,
	 * since once we write the new head, the application could
	 * write new data to them.
	 */
	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}

/*
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
{
	unsigned mask = ctx->sq_entries - 1;
	unsigned head = ctx->cached_sq_head++ & mask;

	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
		head = READ_ONCE(ctx->sq_array[head]);
		if (unlikely(head >= ctx->sq_entries)) {
			/* drop invalid entries */
			spin_lock(&ctx->completion_lock);
			ctx->cq_extra--;
			spin_unlock(&ctx->completion_lock);
			WRITE_ONCE(ctx->rings->sq_dropped,
				   READ_ONCE(ctx->rings->sq_dropped) + 1);
			return false;
		}
	}

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */

	/* double index for 128-byte SQEs, twice as long */
	if (ctx->flags & IORING_SETUP_SQE128)
		head <<= 1;
	*sqe = &ctx->sq_sqes[head];
	return true;
}
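
/*
 * A sketch of the two SQE lookup modes handled above. In the default
 * layout, userspace fills sq_array with indices into the SQE array:
 *
 *	slot = tail & ring_mask;
 *	sq_array[slot] = sqe_index;		// userspace
 *	sqe = &sq_sqes[sq_array[slot]];		// kernel, bounds-checked
 *
 * With IORING_SETUP_NO_SQARRAY the indirection is gone and the masked
 * head indexes sq_sqes directly, so no out-of-range index is possible
 * and no entries can be dropped.
 */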

int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
	__must_hold(&ctx->uring_lock)
{
	unsigned int entries = io_sqring_entries(ctx);
	unsigned int left;
	int ret;

	if (unlikely(!entries))
		return 0;
	/* make sure SQ entry isn't read before tail */
	ret = left = min(nr, entries);
	io_get_task_refs(left);
	io_submit_state_start(&ctx->submit_state, left);

	do {
		const struct io_uring_sqe *sqe;
		struct io_kiocb *req;

		if (unlikely(!io_alloc_req(ctx, &req)))
			break;
		if (unlikely(!io_get_sqe(ctx, &sqe))) {
			io_req_add_to_cache(req, ctx);
			break;
		}

		/*
		 * Continue submitting even for sqe failure if the
		 * ring was set up with IORING_SETUP_SUBMIT_ALL
		 */
		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
			left--;
			break;
		}
	} while (--left);

	if (unlikely(left)) {
		ret -= left;
		/* try again if it submitted nothing and can't allocate a req */
		if (!ret && io_req_cache_empty(ctx))
			ret = -EAGAIN;
		current->io_uring->cached_refs += left;
	}

	io_submit_state_end(ctx);
	/* Commit SQ ring head once we've consumed and submitted all SQEs */
	io_commit_sqring(ctx);
	return ret;
}

struct io_wait_queue {
	struct wait_queue_entry wq;
	struct io_ring_ctx *ctx;
	unsigned cq_tail;
	unsigned nr_timeouts;
	ktime_t timeout;
};

static inline bool io_has_work(struct io_ring_ctx *ctx)
{
	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
	       !llist_empty(&ctx->work_llist);
}

static inline bool io_should_wake(struct io_wait_queue *iowq)
{
	struct io_ring_ctx *ctx = iowq->ctx;
	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;

	/*
	 * Wake up if we have enough events, or if a timeout occurred since we
	 * started waiting. For timeouts, we always want to return to userspace,
	 * regardless of event count.
	 */
	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
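
/*
 * The signed subtraction above is deliberate: iowq->cq_tail is the CQ
 * tail the waiter wants to reach (head at wait time + min_events), so
 * computing (current_tail - wanted_tail) as an int stays correct even
 * when the 32-bit tail counter wraps. For example, with a wanted tail
 * of 0xfffffffe and a current tail of 0x00000001, the difference is
 * 3 >= 0 and the waiter is woken, despite the raw values having
 * wrapped.
 */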

static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			    int wake_flags, void *key)
{
	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);

	/*
	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
	 * the task, and the next invocation will do it.
	 */
	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
		return autoremove_wake_function(curr, mode, wake_flags, key);
	return -1;
}

int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
	if (!llist_empty(&ctx->work_llist)) {
		__set_current_state(TASK_RUNNING);
		if (io_run_local_work(ctx) > 0)
			return 0;
	}
	if (io_run_task_work() > 0)
		return 0;
	if (task_sigpending(current))
		return -EINTR;
	return 0;
}

static bool current_pending_io(void)
{
	struct io_uring_task *tctx = current->io_uring;

	if (!tctx)
		return false;
	return percpu_counter_read_positive(&tctx->inflight);
}

/* when this returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
					  struct io_wait_queue *iowq)
{
	int io_wait, ret;

	if (unlikely(READ_ONCE(ctx->check_cq)))
		return 1;
	if (unlikely(!llist_empty(&ctx->work_llist)))
		return 1;
	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
		return 1;
	if (unlikely(task_sigpending(current)))
		return -EINTR;
	if (unlikely(io_should_wake(iowq)))
		return 0;

	/*
	 * Mark us as being in io_wait if we have pending requests, so cpufreq
	 * can take into account that the task is waiting for IO - turns out
	 * to be important for low QD IO.
	 */
	io_wait = current->in_iowait;
	if (current_pending_io())
		current->in_iowait = 1;
	ret = 0;
	if (iowq->timeout == KTIME_MAX)
		schedule();
	else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
		ret = -ETIME;
	current->in_iowait = io_wait;
	return ret;
}
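
/*
 * To summarise the contract of io_cqring_wait_schedule(): >0 means
 * "don't sleep yet, there's work to process first" (pending task_work,
 * overflow state, or a signal notification), 0 means it either slept
 * or the wake conditions were already met, and <0 is a hard error
 * (-EINTR on a pending signal, -ETIME if the hrtimeout expired).
 * io_cqring_wait() below bails out on errors and otherwise loops,
 * running task_work and re-checking io_should_wake().
 */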

/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
			  const sigset_t __user *sig, size_t sigsz,
			  struct __kernel_timespec __user *uts)
{
	struct io_wait_queue iowq;
	struct io_rings *rings = ctx->rings;
	int ret;

	if (!io_allowed_run_tw(ctx))
		return -EEXIST;
	if (!llist_empty(&ctx->work_llist))
		io_run_local_work(ctx);
	io_run_task_work();
	io_cqring_overflow_flush(ctx);
	/* if user messes with these they will just get an early return */
	if (__io_cqring_events_user(ctx) >= min_events)
		return 0;

	if (sig) {
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
						      sigsz);
		else
#endif
			ret = set_user_sigmask(sig, sigsz);

		if (ret)
			return ret;
	}

	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
	iowq.wq.private = current;
	INIT_LIST_HEAD(&iowq.wq.entry);
	iowq.ctx = ctx;
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
	iowq.timeout = KTIME_MAX;

	if (uts) {
		struct timespec64 ts;

		if (get_timespec64(&ts, uts))
			return -EFAULT;
		iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
	}

	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		unsigned long check_cq;

		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
			int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);

			atomic_set(&ctx->cq_wait_nr, nr_wait);
			set_current_state(TASK_INTERRUPTIBLE);
		} else {
			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
						  TASK_INTERRUPTIBLE);
		}

		ret = io_cqring_wait_schedule(ctx, &iowq);
		__set_current_state(TASK_RUNNING);
		atomic_set(&ctx->cq_wait_nr, 0);

		if (ret < 0)
			break;
		/*
		 * Run task_work after scheduling and before io_should_wake().
		 * If we got woken because of task_work being processed, run it
		 * now rather than let the caller do another wait loop.
		 */
		io_run_task_work();
		if (!llist_empty(&ctx->work_llist))
			io_run_local_work(ctx);

		check_cq = READ_ONCE(ctx->check_cq);
		if (unlikely(check_cq)) {
			/* let the caller flush overflows, retry */
			if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
				io_cqring_do_overflow_flush(ctx);
			if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
				ret = -EBADR;
				break;
			}
		}

		if (io_should_wake(&iowq)) {
			ret = 0;
			break;
		}
		cond_resched();
	} while (1);

	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		finish_wait(&ctx->cq_wait, &iowq.wq);
	restore_saved_sigmask_unless(ret == -EINTR);

	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}

static void io_mem_free(void *ptr)
{
	if (!ptr)
		return;

	folio_put(virt_to_folio(ptr));
}

static void io_pages_free(struct page ***pages, int npages)
{
	struct page **page_array;
	int i;

	if (!pages)
		return;

	page_array = *pages;
	if (!page_array)
		return;

	for (i = 0; i < npages; i++)
		unpin_user_page(page_array[i]);
	kvfree(page_array);
	*pages = NULL;
}

static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
			    unsigned long uaddr, size_t size)
{
	struct page **page_array;
	unsigned int nr_pages;
	int ret, i;

	*npages = 0;

	if (uaddr & (PAGE_SIZE - 1) || !size)
		return ERR_PTR(-EINVAL);

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages > USHRT_MAX)
		return ERR_PTR(-EINVAL);
	page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!page_array)
		return ERR_PTR(-ENOMEM);

	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
				  page_array);
	if (ret != nr_pages) {
err:
		io_pages_free(&page_array, ret > 0 ? ret : 0);
		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
	}
	/*
	 * Should be a single page. If the ring is small enough that we can
	 * use a normal page, that is fine. If we need multiple pages, then
	 * userspace should use a huge page. That's the only way to guarantee
	 * that we get contiguous memory, outside of just being lucky or
	 * (currently) having low memory fragmentation.
	 */
	if (page_array[0] != page_array[ret - 1])
		goto err;

	/*
	 * Can't support mapping user allocated ring memory on 32-bit archs
	 * where it could potentially reside in highmem. Just fail those with
	 * -EINVAL, just like we did on kernels that didn't support this
	 * feature.
	 */
	for (i = 0; i < nr_pages; i++) {
		if (PageHighMem(page_array[i])) {
			ret = -EINVAL;
			goto err;
		}
	}

	*pages = page_array;
	*npages = nr_pages;
	return page_to_virt(page_array[0]);
}

static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
			  size_t size)
{
	return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
			      size);
}

static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
			 size_t size)
{
	return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
			      size);
}

static void io_rings_free(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
		io_mem_free(ctx->rings);
		io_mem_free(ctx->sq_sqes);
		ctx->rings = NULL;
		ctx->sq_sqes = NULL;
	} else {
		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
		ctx->n_ring_pages = 0;
		io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
		ctx->n_sqe_pages = 0;
	}
}

static void *io_mem_alloc(size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	void *ret;

	ret = (void *) __get_free_pages(gfp, get_order(size));
	if (ret)
		return ret;
	return ERR_PTR(-ENOMEM);
}

static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
				unsigned int cq_entries, size_t *sq_offset)
{
	struct io_rings *rings;
	size_t off, sq_array_size;

	off = struct_size(rings, cqes, cq_entries);
	if (off == SIZE_MAX)
		return SIZE_MAX;
	if (ctx->flags & IORING_SETUP_CQE32) {
		if (check_shl_overflow(off, 1, &off))
			return SIZE_MAX;
	}

#ifdef CONFIG_SMP
	off = ALIGN(off, SMP_CACHE_BYTES);
	if (off == 0)
		return SIZE_MAX;
#endif

	if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
		if (sq_offset)
			*sq_offset = SIZE_MAX;
		return off;
	}

	if (sq_offset)
		*sq_offset = off;

	sq_array_size = array_size(sizeof(u32), sq_entries);
	if (sq_array_size == SIZE_MAX)
		return SIZE_MAX;

	if (check_add_overflow(off, sq_array_size, &off))
		return SIZE_MAX;

	return off;
}
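
/*
 * In other words, the allocation size computed above is built up as:
 *
 *	off  = sizeof(struct io_rings) + cq_entries * sizeof(struct io_uring_cqe);
 *	off *= 2;			// only for IORING_SETUP_CQE32
 *	off  = ALIGN(off, SMP_CACHE_BYTES);
 *	off += sq_entries * sizeof(u32);	// sq_array, skipped for NO_SQARRAY
 *
 * with every step checked for overflow, so a SIZE_MAX return always
 * means "request too large", never a silently truncated size.
 */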

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					  lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					  lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

static void io_req_caches_free(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;
	int nr = 0;

	mutex_lock(&ctx->uring_lock);
	io_flush_cached_locked_reqs(ctx, &ctx->submit_state);

	while (!io_req_cache_empty(ctx)) {
		req = io_extract_req(ctx);
		kmem_cache_free(req_cachep, req);
		nr++;
	}
	if (nr)
		percpu_ref_put_many(&ctx->refs, nr);
	mutex_unlock(&ctx->uring_lock);
}

static void io_rsrc_node_cache_free(struct io_cache_entry *entry)
{
	kfree(container_of(entry, struct io_rsrc_node, cache));
}

static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_sq_thread_finish(ctx);
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
		return;

	mutex_lock(&ctx->uring_lock);
	if (ctx->buf_data)
		__io_sqe_buffers_unregister(ctx);
	if (ctx->file_data)
		__io_sqe_files_unregister(ctx);
	io_cqring_overflow_kill(ctx);
	io_eventfd_unregister(ctx);
	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
	io_futex_cache_free(ctx);
	io_destroy_buffers(ctx);
	mutex_unlock(&ctx->uring_lock);
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);
	if (ctx->submitter_task)
		put_task_struct(ctx->submitter_task);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx, ctx->rsrc_node);

	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

	io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
	if (ctx->mm_account) {
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
	}
	io_rings_free(ctx);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	io_req_caches_free(ctx);
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_table.hbs);
	kfree(ctx->cancel_table_locked.hbs);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
}

static __cold void io_activate_pollwq_cb(struct callback_head *cb)
{
	struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
					       poll_wq_task_work);

	mutex_lock(&ctx->uring_lock);
	ctx->poll_activated = true;
	mutex_unlock(&ctx->uring_lock);

	/*
	 * Wake ups for some events between start of polling and activation
	 * might've been lost due to loose synchronisation.
	 */
	wake_up_all(&ctx->poll_wq);
	percpu_ref_put(&ctx->refs);
}

static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
	spin_lock(&ctx->completion_lock);
	/* already activated or in progress */
	if (ctx->poll_activated || ctx->poll_wq_task_work.func)
		goto out;
	if (WARN_ON_ONCE(!ctx->task_complete))
		goto out;
	if (!ctx->submitter_task)
		goto out;
	/*
	 * With ->submitter_task only the submitter task completes requests, we
	 * only need to sync with it, which is done by injecting a task_work.
	 */
	init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
	percpu_ref_get(&ctx->refs);
	if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
		percpu_ref_put(&ctx->refs);
out:
	spin_unlock(&ctx->completion_lock);
}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	if (unlikely(!ctx->poll_activated))
		io_activate_pollwq(ctx);

	poll_wait(file, &ctx->poll_wq, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
	smp_rmb();
	if (!io_sqring_full(ctx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	/*
	 * Don't flush the cqring overflow list here, just do a simple check.
	 * Otherwise there could possibly be an ABBA deadlock:
	 *      CPU0                    CPU1
	 *      ----                    ----
	 * lock(&ctx->uring_lock);
	 *                              lock(&ep->mtx);
	 *                              lock(&ctx->uring_lock);
	 * lock(&ep->mtx);
	 *
	 * Users may get EPOLLIN while seeing nothing in the cqring, which
	 * pushes them to do the flush.
	 */

	if (__io_cqring_events_user(ctx) || io_has_work(ctx))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

struct io_tctx_exit {
	struct callback_head task_work;
	struct completion completion;
	struct io_ring_ctx *ctx;
};

static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_exit *work;

	work = container_of(cb, struct io_tctx_exit, task_work);
	/*
	 * When @in_cancel, we're in cancellation and it's racy to remove the
	 * node. It'll be removed by the end of cancellation, just ignore it.
	 * tctx can be NULL if the queueing of this task_work raced with
	 * work cancelation off the exec path.
	 */
	if (tctx && !atomic_read(&tctx->in_cancel))
		io_uring_del_tctx_node((unsigned long)work->ctx);
	complete(&work->completion);
}

static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	return req->ctx == data;
}

static __cold void io_ring_exit_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
	unsigned long timeout = jiffies + HZ * 60 * 5;
	unsigned long interval = HZ / 20;
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;

	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
	do {
		if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
			mutex_lock(&ctx->uring_lock);
			io_cqring_overflow_kill(ctx);
			mutex_unlock(&ctx->uring_lock);
		}

		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
			io_move_task_work_from_local(ctx);

		while (io_uring_try_cancel_requests(ctx, NULL, true))
			cond_resched();

		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}

		io_req_caches_free(ctx);

		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
			interval = HZ * 60;
		}
		/*
		 * This is really an uninterruptible wait, as it has to be
		 * complete. But it's also run from a kworker, which doesn't
		 * take signals, so it's fine to make it interruptible. This
		 * avoids scenarios where we knowingly can wait much longer
		 * on completions, for example if someone does a SIGSTOP on
		 * a task that needs to finish task_work to make this loop
		 * complete. That's a synthetic situation that should not
		 * cause a stuck task backtrace, and hence a potential panic
		 * on stuck tasks if that is enabled.
		 */
	} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));

	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
	 * completion_lock, see io_req_task_submit(). Apart from other work,
	 * this lock/unlock section also waits for them to finish.
	 */
	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
		WARN_ON_ONCE(time_after(jiffies, timeout));

		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
					ctx_node);
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))
			continue;

		mutex_unlock(&ctx->uring_lock);
		/*
		 * See comment above for
		 * wait_for_completion_interruptible_timeout() on why this
		 * wait is marked as interruptible.
		 */
		wait_for_completion_interruptible(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	}
	mutex_unlock(&ctx->uring_lock);
	spin_lock(&ctx->completion_lock);
	spin_unlock(&ctx->completion_lock);

	/* pairs with RCU read section in io_req_local_work_add() */
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
		synchronize_rcu();

	io_ring_ctx_free(ctx);
}

static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
	unsigned long index;
	struct creds *creds;

	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	xa_for_each(&ctx->personalities, index, creds)
		io_unregister_personality(ctx, index);
	if (ctx->rings)
		io_poll_remove_all(ctx, NULL, true);
	mutex_unlock(&ctx->uring_lock);

	/*
	 * If we failed setting up the ctx, we might not have any rings
	 * and therefore did not submit any requests
	 */
	if (ctx->rings)
		io_kill_timeouts(ctx, NULL, true);

	flush_delayed_work(&ctx->fallback_work);

	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noise and overhead, there's no discernible change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);
}

static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}

struct io_task_cancel {
	struct task_struct *task;
	bool all;
};

static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	struct io_task_cancel *cancel = data;

	return io_match_task_safe(req, cancel->task, cancel->all);
}

static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all)
{
	struct io_defer_entry *de;
	LIST_HEAD(list);

	spin_lock(&ctx->completion_lock);
	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
		if (io_match_task_safe(de->req, task, cancel_all)) {
			list_cut_position(&list, &ctx->defer_list, &de->list);
			break;
		}
	}
	spin_unlock(&ctx->completion_lock);
	if (list_empty(&list))
		return false;

	while (!list_empty(&list)) {
		de = list_first_entry(&list, struct io_defer_entry, list);
		list_del_init(&de->list);
		io_req_task_queue_fail(de->req, -ECANCELED);
		kfree(de);
	}
	return true;
}

static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
	struct io_tctx_node *node;
	enum io_wq_cancel cret;
	bool ret = false;

	mutex_lock(&ctx->uring_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		/*
		 * io_wq will stay alive while we hold uring_lock, because it's
		 * killed after ctx nodes, which requires to take the lock.
		 */
		if (!tctx || !tctx->io_wq)
			continue;
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
	}
	mutex_unlock(&ctx->uring_lock);

	return ret;
}

static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
		struct task_struct *task, bool cancel_all)
{
	struct hlist_node *tmp;
	struct io_kiocb *req;
	bool ret = false;

	lockdep_assert_held(&ctx->uring_lock);

	hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
			hash_node) {
		struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
				struct io_uring_cmd);
		struct file *file = req->file;

		if (!cancel_all && req->task != task)
			continue;

		if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
			/* ->sqe isn't available if no async data */
			if (!req_has_async_data(req))
				cmd->sqe = NULL;
			file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
			ret = true;
		}
	}
	io_submit_flush_completions(ctx);

	return ret;
}

static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
						bool cancel_all)
{
	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
	struct io_uring_task *tctx = task ? task->io_uring : NULL;
	enum io_wq_cancel cret;
	bool ret = false;

	/* set it so io_req_local_work_add() would wake us up */
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	/* failed during ring init, it couldn't have issued any requests */
	if (!ctx->rings)
		return false;

	if (!task) {
		ret |= io_uring_try_cancel_iowq(ctx);
	} else if (tctx && tctx->io_wq) {
		/*
		 * Cancels requests of all rings, not only @ctx, but
		 * it's fine as the task is in exit/exec.
		 */
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
				       &cancel, true);
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
	}

	/* SQPOLL thread does its own polling */
	if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
	    (ctx->sq_data && ctx->sq_data->thread == current)) {
		while (!wq_list_empty(&ctx->iopoll_list)) {
			io_iopoll_try_reap_events(ctx);
			ret = true;
			cond_resched();
		}
	}

	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
	    io_allowed_defer_tw_run(ctx))
		ret |= io_run_local_work(ctx) > 0;
	ret |= io_cancel_defer_files(ctx, task, cancel_all);
	mutex_lock(&ctx->uring_lock);
	ret |= io_poll_remove_all(ctx, task, cancel_all);
	ret |= io_waitid_remove_all(ctx, task, cancel_all);
	ret |= io_futex_remove_all(ctx, task, cancel_all);
	ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
	mutex_unlock(&ctx->uring_lock);
	ret |= io_kill_timeouts(ctx, task, cancel_all);
	if (task)
		ret |= io_run_task_work() > 0;
	return ret;
}

static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
	if (tracked)
		return atomic_read(&tctx->inflight_tracked);
	return percpu_counter_sum(&tctx->inflight);
}

/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
 */
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_ring_ctx *ctx;
	struct io_tctx_node *node;
	unsigned long index;
	s64 inflight;
	DEFINE_WAIT(wait);

	WARN_ON_ONCE(sqd && sqd->thread != current);

	if (!current->io_uring)
		return;
	if (tctx->io_wq)
		io_wq_exit_start(tctx->io_wq);

	atomic_inc(&tctx->in_cancel);
	do {
		bool loop = false;

		io_uring_drop_tctx_refs(current);
		/* read completions before cancelations */
		inflight = tctx_inflight(tctx, !cancel_all);
		if (!inflight)
			break;

		if (!sqd) {
			xa_for_each(&tctx->xa, index, node) {
				/* sqpoll task will cancel all its requests */
				if (node->ctx->sq_data)
					continue;
				loop |= io_uring_try_cancel_requests(node->ctx,
							current, cancel_all);
			}
		} else {
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				loop |= io_uring_try_cancel_requests(ctx,
								     current,
								     cancel_all);
		}

		if (loop) {
			cond_resched();
			continue;
		}

		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
		io_run_task_work();
		io_uring_drop_tctx_refs(current);
		xa_for_each(&tctx->xa, index, node) {
			if (!llist_empty(&node->ctx->work_llist)) {
				WARN_ON_ONCE(node->ctx->submitter_task &&
					     node->ctx->submitter_task != current);
				goto end_wait;
			}
		}
		/*
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
		 */
		if (inflight == tctx_inflight(tctx, !cancel_all))
			schedule();
end_wait:
		finish_wait(&tctx->wait, &wait);
	} while (1);

	io_uring_clean_tctx(tctx);
	if (cancel_all) {
		/*
		 * We shouldn't run task_works after cancel, so just leave
		 * ->in_cancel set for normal exit.
		 */
		atomic_dec(&tctx->in_cancel);
		/* for exec all current's requests should be gone, kill tctx */
		__io_uring_free(current);
	}
}

void __io_uring_cancel(bool cancel_all)
{
	io_uring_cancel_generic(cancel_all, NULL);
}

static void *io_uring_validate_mmap_request(struct file *file,
					    loff_t pgoff, size_t sz)
{
	struct io_ring_ctx *ctx = file->private_data;
	loff_t offset = pgoff << PAGE_SHIFT;
	struct page *page;
	void *ptr;

	/* Don't allow mmap if the ring was setup without it */
	if (ctx->flags & IORING_SETUP_NO_MMAP)
		return ERR_PTR(-EINVAL);

	switch (offset & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		ptr = ctx->rings;
		break;
	case IORING_OFF_SQES:
		ptr = ctx->sq_sqes;
		break;
	case IORING_OFF_PBUF_RING: {
		unsigned int bgid;

		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
		mutex_lock(&ctx->uring_lock);
		ptr = io_pbuf_get_address(ctx, bgid);
		mutex_unlock(&ctx->uring_lock);
		if (!ptr)
			return ERR_PTR(-EINVAL);
		break;
	}
	default:
		return ERR_PTR(-EINVAL);
	}

	page = virt_to_head_page(ptr);
	if (sz > page_size(page))
		return ERR_PTR(-EINVAL);

	return ptr;
}
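
/*
 * For reference, the offsets decoded above are the ones userspace
 * passes to mmap(2) against the ring fd. A minimal (liburing-less)
 * sketch of mapping the SQ ring, assuming a successful io_uring_setup()
 * that filled in 'p':
 *
 *	sq_ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *		      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		      ring_fd, IORING_OFF_SQ_RING);
 *
 * with IORING_OFF_SQES and IORING_OFF_CQ_RING used the same way for
 * the SQE array and the CQ ring respectively.
 */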

#ifdef CONFIG_MMU

static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned long pfn;
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
			unsigned long addr, unsigned long len,
			unsigned long pgoff, unsigned long flags)
{
	void *ptr;

	/*
	 * Do not allow mapping to a user-provided address, to avoid breaking
	 * the aliasing rules. Userspace is not able to guess the offset
	 * address of kernel kmalloc()ed memory areas.
	 */
	if (addr)
		return -EINVAL;

	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
	if (IS_ERR(ptr))
		return -ENOMEM;

	/*
	 * Some architectures have strong cache aliasing requirements.
	 * For such architectures we need a coherent mapping which aliases
	 * kernel memory *and* userspace memory. To achieve that:
	 * - use a NULL file pointer to reference physical memory, and
	 * - use the kernel virtual address of the shared io_uring context
	 *   (instead of the userspace-provided address, which has to be 0UL
	 *   anyway).
	 * - use the same pgoff which the get_unmapped_area() uses to
	 *   calculate the page colouring.
	 * For architectures without such aliasing requirements, the
	 * architecture will return any suitable mapping because addr is 0.
	 */
	filp = NULL;
	flags |= MAP_SHARED;
	pgoff = 0;	/* has been translated to ptr above */
#ifdef SHM_COLOUR
	addr = (uintptr_t) ptr;
	pgoff = addr >> PAGE_SHIFT;
#else
	addr = 0UL;
#endif
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}

#else /* !CONFIG_MMU */

static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
}

static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *ptr;

	ptr = io_uring_validate_mmap_request(file, pgoff, len);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */

static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{
	if (flags & IORING_ENTER_EXT_ARG) {
		struct io_uring_getevents_arg arg;

		if (argsz != sizeof(arg))
			return -EINVAL;
		if (copy_from_user(&arg, argp, sizeof(arg)))
			return -EFAULT;
	}
	return 0;
}

static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
			  struct __kernel_timespec __user **ts,
			  const sigset_t __user **sig)
{
	struct io_uring_getevents_arg arg;

	/*
	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
	 * is just a pointer to the sigset_t.
	 */
	if (!(flags & IORING_ENTER_EXT_ARG)) {
		*sig = (const sigset_t __user *) argp;
		*ts = NULL;
		return 0;
	}

	/*
	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
	 * timespec and sigset_t pointers if good.
	 */
	if (*argsz != sizeof(arg))
		return -EINVAL;
	if (copy_from_user(&arg, argp, sizeof(arg)))
		return -EFAULT;
	if (arg.pad)
		return -EINVAL;
	*sig = u64_to_user_ptr(arg.sigmask);
	*argsz = arg.sigmask_sz;
	*ts = u64_to_user_ptr(arg.ts);
	return 0;
}
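
/*
 * For reference, struct io_uring_getevents_arg (uapi) packs both
 * pointers as u64s, so a waiter wanting a sigmask plus a timeout would
 * roughly do:
 *
 *	struct io_uring_getevents_arg arg = {
 *		.sigmask    = (__u64)(uintptr_t)&mask,
 *		.sigmask_sz = _NSIG / 8,
 *		.ts         = (__u64)(uintptr_t)&timeout,
 *	};
 *	io_uring_enter2(fd, 0, 1, IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
 *			&arg, sizeof(arg));
 *
 * io_uring_enter2() here stands in for the 6-argument enter syscall;
 * the wrapper name is liburing's convention, not this file's.
 */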

SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const void __user *, argp,
		size_t, argsz)
{
	struct io_ring_ctx *ctx;
	struct fd f;
	long ret;

	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
			       IORING_ENTER_REGISTERED_RING)))
		return -EINVAL;

	/*
	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
	 * need only dereference our task private array to find it.
	 */
	if (flags & IORING_ENTER_REGISTERED_RING) {
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
		f.flags = 0;
		if (unlikely(!f.file))
			return -EBADF;
	} else {
		f = fdget(fd);
		if (unlikely(!f.file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (unlikely(!io_is_uring_fops(f.file)))
			goto out;
	}

	ctx = f.file->private_data;
	ret = -EBADFD;
	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
		goto out;

	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
	ret = 0;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		io_cqring_overflow_flush(ctx);

		if (unlikely(ctx->sq_data->thread == NULL)) {
			ret = -EOWNERDEAD;
			goto out;
		}
		if (flags & IORING_ENTER_SQ_WAKEUP)
			wake_up(&ctx->sq_data->wait);
		if (flags & IORING_ENTER_SQ_WAIT)
			io_sqpoll_wait_sq(ctx);

		ret = to_submit;
	} else if (to_submit) {
		ret = io_uring_add_tctx_node(ctx);
		if (unlikely(ret))
			goto out;

		mutex_lock(&ctx->uring_lock);
		ret = io_submit_sqes(ctx, to_submit);
		if (ret != to_submit) {
			mutex_unlock(&ctx->uring_lock);
			goto out;
		}
		if (flags & IORING_ENTER_GETEVENTS) {
			if (ctx->syscall_iopoll)
				goto iopoll_locked;
			/*
			 * Ignore errors, we'll soon call io_cqring_wait() and
			 * it should handle ownership problems if any.
			 */
			if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
				(void)io_run_local_work_locked(ctx);
		}
		mutex_unlock(&ctx->uring_lock);
	}

	if (flags & IORING_ENTER_GETEVENTS) {
		int ret2;

		if (ctx->syscall_iopoll) {
			/*
			 * We disallow the app entering submit/complete with
			 * polling, but we still need to lock the ring to
			 * prevent racing with polled issue that got punted to
			 * a workqueue.
			 */
			mutex_lock(&ctx->uring_lock);
iopoll_locked:
			ret2 = io_validate_ext_arg(flags, argp, argsz);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
				ret2 = io_iopoll_check(ctx, min_complete);
			}
			mutex_unlock(&ctx->uring_lock);
		} else {
			const sigset_t __user *sig;
			struct __kernel_timespec __user *ts;

			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
			if (likely(!ret2)) {
				min_complete = min(min_complete,
						   ctx->cq_entries);
				ret2 = io_cqring_wait(ctx, min_complete, sig,
						      argsz, ts);
			}
		}

		if (!ret) {
			ret = ret2;

			/*
			 * EBADR indicates that one or more CQE were dropped.
			 * Once the user has been informed we can clear the bit
			 * as they are obviously ok with those drops.
			 */
			if (unlikely(ret2 == -EBADR))
				clear_bit(IO_CHECK_CQ_DROPPED_BIT,
					  &ctx->check_cq);
		}
	}
out:
	fdput(f);
	return ret;
}
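
/*
 * Illustrative userspace sketch (assumption: raw syscall(2), since glibc
 * provides no io_uring_enter wrapper; ring_fd and to_submit are the
 * caller's): submit previously written SQEs and wait for at least one
 * completion in a single call.
 *
 *	int submitted = syscall(__NR_io_uring_enter, ring_fd, to_submit, 1,
 *				IORING_ENTER_GETEVENTS, NULL, 0);
 *
 * With IORING_SETUP_SQPOLL the same call degenerates to a wakeup check:
 * the SQ poll thread does the actual submission, and the syscall simply
 * returns to_submit (see the SQPOLL branch above).
 */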

static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
#ifndef CONFIG_MMU
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#else
	.get_unmapped_area = io_uring_mmu_get_unmapped_area,
#endif
	.poll		= io_uring_poll,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= io_uring_show_fdinfo,
#endif
};

bool io_is_uring_fops(struct file *file)
{
	return file->f_op == &io_uring_fops;
}

static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
					 struct io_uring_params *p)
{
	struct io_rings *rings;
	size_t size, sq_array_offset;
	void *ptr;

	/* make sure these are sane, as we already accounted them */
	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
		rings = io_mem_alloc(size);
	else
		rings = io_rings_map(ctx, p->cq_off.user_addr, size);

	if (IS_ERR(rings))
		return PTR_ERR(rings);

	ctx->rings = rings;
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
	rings->sq_ring_mask = p->sq_entries - 1;
	rings->cq_ring_mask = p->cq_entries - 1;
	rings->sq_ring_entries = p->sq_entries;
	rings->cq_ring_entries = p->cq_entries;

	if (p->flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
	if (size == SIZE_MAX) {
		io_rings_free(ctx);
		return -EOVERFLOW;
	}

	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
		ptr = io_mem_alloc(size);
	else
		ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);

	if (IS_ERR(ptr)) {
		io_rings_free(ctx);
		return PTR_ERR(ptr);
	}

	ctx->sq_sqes = ptr;
	return 0;
}

static int io_uring_install_fd(struct file *file)
{
	int fd;

	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return fd;
	fd_install(fd, file);
	return fd;
}

/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
	struct file *file;
#if defined(CONFIG_UNIX)
	int ret;

	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
			       &ctx->ring_sock);
	if (ret)
		return ERR_PTR(ret);
#endif

	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
					 O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
	if (IS_ERR(file)) {
		sock_release(ctx->ring_sock);
		ctx->ring_sock = NULL;
	} else {
		ctx->ring_sock->file = file;
	}
#endif
	return file;
}
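
/*
 * Illustrative userspace sketch: mapping the rings through the fd handed
 * out above. The IORING_OFF_* constants are UAPI; p is assumed to be the
 * io_uring_params filled in by io_uring_setup(), and the single-mmap
 * layout (IORING_FEAT_SINGLE_MMAP) is assumed for brevity.
 *
 *	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
 *	size_t ring_sz = sq_sz > cq_sz ? sq_sz : cq_sz;
 *
 *	void *rings = mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
 *	void *sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
 *			  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			  fd, IORING_OFF_SQES);
 */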

static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
				  struct io_uring_params __user *params)
{
	struct io_ring_ctx *ctx;
	struct io_uring_task *tctx;
	struct file *file;
	int ret;

	if (!entries)
		return -EINVAL;
	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = IORING_MAX_ENTRIES;
	}

	if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
	    && !(p->flags & IORING_SETUP_NO_MMAP))
		return -EINVAL;

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
		if (!p->cq_entries)
			return -EINVAL;
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		}
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
		if (p->cq_entries < p->sq_entries)
			return -EINVAL;
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}

	ctx = io_ring_ctx_alloc(p);
	if (!ctx)
		return -ENOMEM;

	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
	    !(ctx->flags & IORING_SETUP_SQPOLL))
		ctx->task_complete = true;

	if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
		ctx->lockless_cq = true;

	/*
	 * lazy poll_wq activation relies on ->task_complete for synchronisation
	 * purposes, see io_activate_pollwq()
	 */
	if (!ctx->task_complete)
		ctx->poll_activated = true;

	/*
	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
	 * space applications don't need to do io completion events
	 * polling again, they can rely on io_sq_thread to do polling
	 * work, which can reduce cpu usage and uring_lock contention.
	 */
	if (ctx->flags & IORING_SETUP_IOPOLL &&
	    !(ctx->flags & IORING_SETUP_SQPOLL))
		ctx->syscall_iopoll = 1;

	ctx->compat = in_compat_syscall();
	if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());

	/*
	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
	 * COOP_TASKRUN is set, then IPIs are never needed by the app.
	 */
	ret = -EINVAL;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		/* IPI related flags don't make sense with SQPOLL */
		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
				  IORING_SETUP_TASKRUN_FLAG |
				  IORING_SETUP_DEFER_TASKRUN))
			goto err;
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
	} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
		ctx->notify_method = TWA_SIGNAL_NO_IPI;
	} else {
		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG &&
		    !(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
			goto err;
		ctx->notify_method = TWA_SIGNAL;
	}

	/*
	 * For DEFER_TASKRUN we require the completion task to be the same as the
	 * submission task. This implies that there is only one submitter, so enforce
	 * that.
	 */
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	    !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
		goto err;
	}

	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
	mmgrab(current->mm);
	ctx->mm_account = current->mm;

	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	ret = io_sq_offload_create(ctx, p);
	if (ret)
		goto err;

	ret = io_rsrc_init(ctx);
	if (ret)
		goto err;

	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
	p->sq_off.resv1 = 0;
	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
		p->sq_off.user_addr = 0;

	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
	p->cq_off.resv1 = 0;
	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
		p->cq_off.user_addr = 0;

	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;

	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
	    && !(ctx->flags & IORING_SETUP_R_DISABLED))
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));

	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

	ret = __io_uring_add_tctx_node(ctx);
	if (ret)
		goto err_fput;
	tctx = current->io_uring;

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
		ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
	else
		ret = io_uring_install_fd(file);
	if (ret < 0)
		goto err_fput;

	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
err_fput:
	fput(file);
	return ret;
}
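
/*
 * Illustrative sketch of the sizing rules enforced above: with
 * IORING_SETUP_CQSIZE the caller picks the CQ depth explicitly, and both
 * ring sizes are rounded up to powers of two. The numbers below follow
 * directly from the code; error handling is elided.
 *
 *	struct io_uring_params p = {
 *		.flags		= IORING_SETUP_CQSIZE,
 *		.cq_entries	= 4096,		// must end up >= sq_entries
 *	};
 *	int fd = syscall(__NR_io_uring_setup, 100, &p);
 *	// on return: p.sq_entries == 128, p.cq_entries == 4096
 */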

/*
 * Sets up an io_uring context, and returns the fd. The application asks for
 * a ring size; we return the actual sq/cq ring sizes (among other things)
 * in the params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
			IORING_SETUP_NO_SQARRAY))
		return -EINVAL;

	return io_uring_create(entries, &p, params);
}

static inline bool io_uring_allowed(void)
{
	int disabled = READ_ONCE(sysctl_io_uring_disabled);
	kgid_t io_uring_group;

	if (disabled == 2)
		return false;

	if (disabled == 0 || capable(CAP_SYS_ADMIN))
		return true;

	io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
	if (!gid_valid(io_uring_group))
		return false;

	return in_group_p(io_uring_group);
}
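
/*
 * Summary of the sysctl gate above, as implemented: kernel.io_uring_disabled
 * set to 0 allows everyone (the default); 1 allows only CAP_SYS_ADMIN or
 * members of the group named by kernel.io_uring_group; 2 disables io_uring
 * entirely, so io_uring_setup() fails with -EPERM.
 */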

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	if (!io_uring_allowed())
		return -EPERM;

	return io_uring_setup(entries, params);
}
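
/*
 * Illustrative userspace sketch (assumption: raw syscall(2), as glibc has
 * no wrapper): create a ring and check one of the feature bits reported
 * back through the params structure.
 *
 *	struct io_uring_params p = { 0 };
 *	int fd = syscall(__NR_io_uring_setup, 256, &p);
 *
 *	if (fd >= 0 && (p.features & IORING_FEAT_EXT_ARG))
 *		;	// io_uring_enter() accepts IORING_ENTER_EXT_ARG
 */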

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
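
/*
 * Illustrative userspace sketch: querying supported opcodes through
 * IORING_REGISTER_PROBE, serviced by io_probe() above. The buffer must be
 * zeroed (io_probe() rejects non-zero input with -EINVAL), hence calloc();
 * error handling is elided.
 *
 *	struct io_uring_probe *probe;
 *	size_t len = sizeof(*probe) +
 *		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *
 *	probe = calloc(1, len);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		probe, IORING_OP_LAST);
 *	if (probe->ops[IORING_OP_SENDMSG].flags & IO_URING_OP_SUPPORTED)
 *		puts("sendmsg opcode available on this kernel");
 */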

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
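
/*
 * Illustrative userspace sketch: the id allocated from the xarray above is
 * returned to the caller, who can later stamp it into an SQE so the request
 * is issued with the credentials captured at registration time.
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	sqe->personality = id;	// request now runs with the registered creds
 */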

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}
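
/*
 * Illustrative userspace sketch: restricting a ring created with
 * IORING_SETUP_R_DISABLED to readv/writev only, then enabling it. Uses the
 * UAPI struct io_uring_restriction; error handling elided.
 *
 *	struct io_uring_restriction res[2] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITEV },
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */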

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall()) {
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	} else {
		ret = copy_from_user(new_mask, arg, len);
	}

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}
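
/*
 * Illustrative userspace sketch: pinning the io-wq workers to CPU 0 via
 * IORING_REGISTER_IOWQ_AFF. nr_args carries the mask size in bytes, which
 * io_register_iowq_aff() above clamps to cpumask_size(). Error handling
 * elided.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_IOWQ_AFF,
 *		&mask, sizeof(mask));
 */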

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
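
/*
 * Illustrative userspace sketch: capping io-wq worker counts through
 * IORING_REGISTER_IOWQ_MAX_WORKERS. Index 0 is the bounded pool, index 1
 * the unbounded one; a zero entry means "leave unchanged", and the previous
 * limits are copied back, as the function above implements.
 *
 *	__u32 counts[2] = { 4, 16 };	// bounded, unbounded
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
 *	// counts[] now holds the limits in effect before the call
 */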

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		f.file = tctx->registered_rings[fd];
		f.flags = 0;
		if (unlikely(!f.file))
			return -EBADF;
	} else {
		f = fdget(fd);
		if (unlikely(!f.file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(f.file))
			goto out_fput;
	}

	ctx = f.file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	fdput(f);
	return ret;
}
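
/*
 * Illustrative userspace sketch: registering the ring fd itself via
 * IORING_REGISTER_RING_FDS, so later io_uring_register()/io_uring_enter()
 * calls can skip fdget() by passing the returned index together with
 * IORING_REGISTER_USE_REGISTERED_RING / IORING_ENTER_REGISTERED_RING. The
 * offset = -1U convention for "pick a free slot" follows liburing's usage;
 * error handling elided.
 *
 *	struct io_uring_rsrc_update reg = {
 *		.offset	= -1U,			// let the kernel pick a slot
 *		.data	= (unsigned long) ring_fd,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RING_FDS,
 *		&reg, 1);
 *	// reg.offset now holds the registered index to use as "fd"
 */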

static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
#define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
	BUILD_BUG_SQE_ELEM(1, __u8, flags);
	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
	BUILD_BUG_SQE_ELEM(4, __s32, fd);
	BUILD_BUG_SQE_ELEM(8, __u64, off);
	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
	BUILD_BUG_SQE_ELEM(8, __u32, cmd_op);
	BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
	BUILD_BUG_SQE_ELEM(16, __u64, addr);
	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32, len);
	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, rename_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags);
	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16, personality);
	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32, file_index);
	BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
	BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
	BUILD_BUG_SQE_ELEM(48, __u64, addr3);
	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
	BUILD_BUG_SQE_ELEM(56, __u64, __pad2);

	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
		     offsetof(struct io_uring_buf_ring, tail));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

	/* top 8bits are for internal use */
	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);

	io_uring_optable_init();

	/*
	 * Allow user copy in the per-command field, which starts after the
	 * file in io_kiocb and until the opcode field. The openat2 handling
	 * requires copying in user memory into the io_kiocb object in that
	 * range, and HARDENED_USERCOPY will complain if we haven't
	 * correctly annotated this range.
	 */
	req_cachep = kmem_cache_create_usercopy("io_kiocb",
				sizeof(struct io_kiocb), 0,
				SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
				offsetof(struct io_kiocb, cmd.data),
				sizeof_field(struct io_kiocb, cmd.data), NULL);
	io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
					  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
					  NULL);

#ifdef CONFIG_SYSCTL
	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif

	return 0;
}
__initcall(io_uring_init);