// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/userfaultfd.c
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Copyright (C) 2015 Red Hat, Inc.
 *
 * Some part derived from fs/eventfd.c (anon inode setup) and
 * mm/ksm.c (mm hashing).
 */

#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
#include <linux/miscdevice.h>

static int sysctl_unprivileged_userfaultfd __read_mostly;

#ifdef CONFIG_SYSCTL
static struct ctl_table vm_userfaultfd_table[] = {
	{
		.procname	= "unprivileged_userfaultfd",
		.data		= &sysctl_unprivileged_userfaultfd,
		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};
#endif
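
/*
 * Example (userspace sketch, not part of this file): the knob above is
 * exposed as /proc/sys/vm/unprivileged_userfaultfd and, thanks to the
 * SYSCTL_ZERO/SYSCTL_ONE clamps, only accepts 0 or 1:
 *
 *	int fd = open("/proc/sys/vm/unprivileged_userfaultfd", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1", 1);	- allow unprivileged userfaultfd()
 *		close(fd);
 *	}
 */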

static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;

/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *		fault_pending_wqh.lock
 *			fault_wqh.lock
 *		event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	seqcount_spinlock_t refile_seq;
	/* pseudo fd refcounting */
	refcount_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
	atomic_t mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};

struct userfaultfd_fork_ctx {
	struct userfaultfd_ctx *orig;
	struct userfaultfd_ctx *new;
	struct list_head list;
};

struct userfaultfd_unmap_ctx {
	struct userfaultfd_ctx *ctx;
	unsigned long start;
	unsigned long end;
	struct list_head list;
};

struct userfaultfd_wait_queue {
	struct uffd_msg msg;
	wait_queue_entry_t wq;
	struct userfaultfd_ctx *ctx;
	bool waken;
};

struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};

/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED		(1u << 31)

static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
	return ctx->features & UFFD_FEATURE_INITIALIZED;
}

static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
{
	return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
}

/*
 * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
 * meaningful when userfaultfd_wp()==true on the vma and when it's
 * anonymous.
 */
bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
{
	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx)
		return false;

	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
}

static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;

	vm_flags_reset(vma, flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->msg.arg.pagefault.address ||
		    start + len <= uwq->msg.arg.pagefault.address))
		goto out;
	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	ret = wake_up_state(wq->private, mode);
	if (ret) {
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the other
		 * CPUs, the waitqueue may disappear from under us, see the
		 * !list_empty_careful() in handle_userfault().
		 *
		 * try_to_wake_up() has an implicit smp_mb(), and the
		 * wq->private is read before calling the extern function
		 * "wake_up_state" (which in turn calls try_to_wake_up).
		 */
		list_del_init(&wq->entry);
	}
out:
	return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	refcount_inc(&ctx->refcount);
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
}

static inline void msg_init(struct uffd_msg *msg)
{
	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
	/*
	 * Must use memset to zero out the padding or kernel data is
	 * leaked to userland.
	 */
	memset(msg, 0, sizeof(struct uffd_msg));
}

static inline struct uffd_msg userfault_msg(unsigned long address,
					    unsigned long real_address,
					    unsigned int flags,
					    unsigned long reason,
					    unsigned int features)
{
	struct uffd_msg msg;

	msg_init(&msg);
	msg.event = UFFD_EVENT_PAGEFAULT;

	msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
				    real_address : address;

	/*
	 * These flags indicate why the userfault occurred:
	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
	 * - Neither of these flags being set indicates a MISSING fault.
	 *
	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
	 * fault. Otherwise, it was a read fault.
	 */
	if (flags & FAULT_FLAG_WRITE)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
	if (reason & VM_UFFD_WP)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
	if (reason & VM_UFFD_MINOR)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
	if (features & UFFD_FEATURE_THREAD_ID)
		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
	return msg;
}
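
/*
 * Example (userspace sketch, assuming a monitor already read a struct
 * uffd_msg "msg" from the uffd): decoding the message built by
 * userfault_msg() above.
 *
 *	if (msg.event == UFFD_EVENT_PAGEFAULT) {
 *		__u64 addr = msg.arg.pagefault.address;
 *		__u64 f = msg.arg.pagefault.flags;
 *		int wp = f & UFFD_PAGEFAULT_FLAG_WP;
 *		int minor = f & UFFD_PAGEFAULT_FLAG_MINOR;
 *		int write = f & UFFD_PAGEFAULT_FLAG_WRITE;
 *
 *		if (!wp && !minor)
 *			;	- neither flag set: a MISSING fault at addr
 *	}
 *
 * With UFFD_FEATURE_EXACT_ADDRESS negotiated, "addr" is the exact
 * faulting address instead of the start of the faulting page.
 */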

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications for
 * hugepmd ranges.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_fault *vmf,
					      unsigned long reason)
{
	struct vm_area_struct *vma = vmf->vma;
	pte_t *ptep, pte;
	bool ret = true;

	assert_fault_locked(vmf);

	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
	if (!ptep)
		goto out;

	ret = false;
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us. PTE markers should be handled the same as none
	 * ptes here.
	 */
	if (huge_pte_none_mostly(pte))
		ret = true;
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_fault *vmf,
					      unsigned long reason)
{
	return false;	/* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 struct vm_fault *vmf,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	unsigned long address = vmf->address;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pte_t ptent;
	bool ret = true;

	assert_fault_locked(vmf);

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, address);
again:
	_pmd = pmdp_get_lockless(pmd);
	if (pmd_none(_pmd))
		goto out;

	ret = false;
	if (!pmd_present(_pmd) || pmd_devmap(_pmd))
		goto out;

	if (pmd_trans_huge(_pmd)) {
		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
			ret = true;
		goto out;
	}

	pte = pte_offset_map(pmd, address);
	if (!pte) {
		ret = true;
		goto again;
	}
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us. PTE markers should be handled the same as none
	 * ptes here.
	 */
	ptent = ptep_get(pte);
	if (pte_none_mostly(ptent))
		ret = true;
	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
		ret = true;
	pte_unmap(pte);

out:
	return ret;
}

static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
{
	if (flags & FAULT_FLAG_INTERRUPTIBLE)
		return TASK_INTERRUPTIBLE;

	if (flags & FAULT_FLAG_KILLABLE)
		return TASK_KILLABLE;

	return TASK_UNINTERRUPTIBLE;
}

/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait;
	unsigned int blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update.
	 *
	 * We also don't do userfault handling during
	 * coredumping. hugetlbfs has the special
	 * hugetlb_follow_page_mask() to skip missing pages in the
	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
	 * the no_page_table() helper in follow_page_mask(), but the
	 * shmem_vm_ops->fault method is invoked even during
	 * coredumping and it ends up here.
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	assert_fault_locked(vmf);

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	BUG_ON(ctx->mm != mm);

	/* Any unrecognized flag is a bug. */
	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
	VM_BUG_ON(!reason || (reason & (reason - 1)));

	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;
	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
		goto out;

	/*
	 * If it's already released don't get it. This avoids looping
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_lock.
	 */
	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * Don't return VM_FAULT_SIGBUS in this case, so a non
		 * cooperative manager can close the uffd after the
		 * last UFFDIO_COPY, without risking triggering an
		 * involuntary SIGBUS if the process was starting the
		 * userfaultfd while the userfaultfd was still armed
		 * (but after the last UFFDIO_COPY). If the uffd
		 * wasn't already closed when the userfault reached
		 * this point, that would normally be solved by
		 * userfaultfd_must_wait returning 'false'.
		 *
		 * If we were to return VM_FAULT_SIGBUS here, the non
		 * cooperative manager would be instead forced to
		 * always call UFFDIO_UNREGISTER before it can safely
		 * close the uffd.
		 */
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
			       vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	/* take the reference before dropping the mmap_lock */
	userfaultfd_ctx_get(ctx);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
				reason, ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

	/*
	 * Take the vma lock now, in order to safely call
	 * userfaultfd_huge_must_wait() later. Since acquiring the
	 * (sleepable) vma lock can modify the current task state, that
	 * must be before explicitly calling set_current_state().
	 */
	if (is_vm_hugetlb_page(vma))
		hugetlb_vma_lock_read(vma);

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock from happening before the list_add
	 * in __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	if (!is_vm_hugetlb_page(vma))
		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
	else
		must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
	if (is_vm_hugetlb_page(vma))
		hugetlb_vma_unlock_read(vma);
	release_fault_lock(vmf);

	if (likely(must_wait && !READ_ONCE(ctx->released))) {
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
	}

	__set_current_state(TASK_RUNNING);

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(), however because we don't ever run
	 * list_del_init() to refile across the two lists, the prev
	 * and next pointers will never point to self. list_add also
	 * would never let either of the two pointers point to
	 * self. So list_empty_careful won't risk to see both pointers
	 * pointing to self at any time during the list refile. The
	 * only case where list_del_init() is called is the full
	 * removal in the wake function and there we don't re-list_add
	 * and it's fine not to block on the spinlock. The uwq on this
	 * kernel stack can be released after the list_del_init.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock_irq(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}
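
/*
 * Example (userspace sketch, not part of this file): the counterpart of
 * handle_userfault() is a monitor thread that polls the uffd, reads the
 * message and resolves the fault, e.g. with UFFDIO_COPY for a MISSING
 * fault. "page" is an assumed page-aligned buffer holding the contents,
 * "page_size" is sysconf(_SC_PAGESIZE).
 *
 *	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
 *	struct uffd_msg msg;
 *	struct uffdio_copy copy;
 *
 *	while (poll(&pfd, 1, -1) > 0) {
 *		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
 *			continue;
 *		if (msg.event != UFFD_EVENT_PAGEFAULT)
 *			continue;
 *		copy.dst = msg.arg.pagefault.address & ~(page_size - 1);
 *		copy.src = (unsigned long)page;
 *		copy.len = page_size;
 *		copy.mode = 0;
 *		ioctl(uffd, UFFDIO_COPY, &copy);
 *	}
 *
 * The UFFDIO_COPY wakes the faulting thread blocked in handle_userfault()
 * unless UFFDIO_COPY_MODE_DONTWAKE is set.
 */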

static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
					      struct userfaultfd_wait_queue *ewq)
{
	struct userfaultfd_ctx *release_new_ctx;

	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

	ewq->ctx = ctx;
	init_waitqueue_entry(&ewq->wq, current);
	release_new_ctx = NULL;

	spin_lock_irq(&ctx->event_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (ewq->msg.event == 0)
			break;
		if (READ_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
			/*
			 * &ewq->wq may be queued in fork_event, but
			 * __remove_wait_queue ignores the head
			 * parameter. It would be a problem if it
			 * didn't.
			 */
			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;

				new = (struct userfaultfd_ctx *)
					(unsigned long)
					ewq->msg.arg.reserved.reserved1;
				release_new_ctx = new;
			}
			break;
		}

		spin_unlock_irq(&ctx->event_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock_irq(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->event_wqh.lock);

	if (release_new_ctx) {
		struct vm_area_struct *vma;
		struct mm_struct *mm = release_new_ctx->mm;
		VMA_ITERATOR(vmi, mm, 0);

		/* the various vma->vm_userfaultfd_ctx still points to it */
		mmap_write_lock(mm);
		for_each_vma(vmi, vma) {
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				vma_start_write(vma);
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				userfaultfd_set_vm_flags(vma,
							 vma->vm_flags & ~__VM_UFFD_FLAGS);
			}
		}
		mmap_write_unlock(mm);

		userfaultfd_ctx_put(release_new_ctx);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
out:
	atomic_dec(&ctx->mmap_changing);
	VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
	userfaultfd_ctx_put(ctx);
}

static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				       struct userfaultfd_wait_queue *ewq)
{
	ewq->msg.event = 0;
	wake_up_locked(&ctx->event_wqh);
	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}

int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
	struct userfaultfd_ctx *ctx = NULL, *octx;
	struct userfaultfd_fork_ctx *fctx;

	octx = vma->vm_userfaultfd_ctx.ctx;
	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
		vma_start_write(vma);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
		return 0;
	}

	list_for_each_entry(fctx, fcs, list)
		if (fctx->orig == octx) {
			ctx = fctx->new;
			break;
		}

	if (!ctx) {
		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
		if (!fctx)
			return -ENOMEM;

		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
		if (!ctx) {
			kfree(fctx);
			return -ENOMEM;
		}

		refcount_set(&ctx->refcount, 1);
		ctx->flags = octx->flags;
		ctx->features = octx->features;
		ctx->released = false;
		atomic_set(&ctx->mmap_changing, 0);
		ctx->mm = vma->vm_mm;
		mmgrab(ctx->mm);

		userfaultfd_ctx_get(octx);
		atomic_inc(&octx->mmap_changing);
		fctx->orig = octx;
		fctx->new = ctx;
		list_add_tail(&fctx->list, fcs);
	}

	vma->vm_userfaultfd_ctx.ctx = ctx;
	return 0;
}

static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
	struct userfaultfd_ctx *ctx = fctx->orig;
	struct userfaultfd_wait_queue ewq;

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_FORK;
	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

void dup_userfaultfd_complete(struct list_head *fcs)
{
	struct userfaultfd_fork_ctx *fctx, *n;

	list_for_each_entry_safe(fctx, n, fcs, list) {
		dup_fctx(fctx);
		list_del(&fctx->list);
		kfree(fctx);
	}
}
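
/*
 * Example (userspace sketch): with UFFD_FEATURE_EVENT_FORK negotiated, a
 * fork() in the tracked process surfaces as a UFFD_EVENT_FORK message and
 * the machinery above hands the monitor a new userfaultfd for the child:
 *
 *	if (msg.event == UFFD_EVENT_FORK) {
 *		int child_uffd = (int)msg.arg.fork.ufd;
 *
 *		- start monitoring child_uffd alongside the parent's uffd
 *	}
 */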

void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			     struct vm_userfaultfd_ctx *vm_ctx)
{
	struct userfaultfd_ctx *ctx;

	ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx)
		return;

	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
		vm_ctx->ctx = ctx;
		userfaultfd_ctx_get(ctx);
		atomic_inc(&ctx->mmap_changing);
	} else {
		/* Drop uffd context if remap feature not enabled */
		vma_start_write(vma);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
	}
}

void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				 unsigned long from, unsigned long to,
				 unsigned long len)
{
	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
	struct userfaultfd_wait_queue ewq;

	if (!ctx)
		return;

	if (to & ~PAGE_MASK) {
		userfaultfd_ctx_put(ctx);
		return;
	}

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMAP;
	ewq.msg.arg.remap.from = from;
	ewq.msg.arg.remap.to = to;
	ewq.msg.arg.remap.len = len;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

bool userfaultfd_remove(struct vm_area_struct *vma,
			unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue ewq;

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
		return true;

	userfaultfd_ctx_get(ctx);
	atomic_inc(&ctx->mmap_changing);
	mmap_read_unlock(mm);

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMOVE;
	ewq.msg.arg.remove.start = start;
	ewq.msg.arg.remove.end = end;

	userfaultfd_event_wait_completion(ctx, &ewq);

	return false;
}

static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
			  unsigned long start, unsigned long end)
{
	struct userfaultfd_unmap_ctx *unmap_ctx;

	list_for_each_entry(unmap_ctx, unmaps, list)
		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
		    unmap_ctx->end == end)
			return true;

	return false;
}

int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
			   unsigned long end, struct list_head *unmaps)
{
	struct userfaultfd_unmap_ctx *unmap_ctx;
	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
	    has_unmap_ctx(ctx, unmaps, start, end))
		return 0;

	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
	if (!unmap_ctx)
		return -ENOMEM;

	userfaultfd_ctx_get(ctx);
	atomic_inc(&ctx->mmap_changing);
	unmap_ctx->ctx = ctx;
	unmap_ctx->start = start;
	unmap_ctx->end = end;
	list_add_tail(&unmap_ctx->list, unmaps);

	return 0;
}

void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
	struct userfaultfd_unmap_ctx *ctx, *n;
	struct userfaultfd_wait_queue ewq;

	list_for_each_entry_safe(ctx, n, uf, list) {
		msg_init(&ewq.msg);

		ewq.msg.event = UFFD_EVENT_UNMAP;
		ewq.msg.arg.remove.start = ctx->start;
		ewq.msg.arg.remove.end = ctx->end;

		userfaultfd_event_wait_completion(ctx->ctx, &ewq);

		list_del(&ctx->list);
		kfree(ctx);
	}
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev;
	/* len == 0 means wake all */
	struct userfaultfd_wake_range range = { .len = 0, };
	unsigned long new_flags;
	VMA_ITERATOR(vmi, mm, 0);

	WRITE_ONCE(ctx->released, true);

	if (!mmget_not_zero(mm))
		goto wakeup;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}
		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
					    vma->vm_end, new_flags,
					    NULL_VM_UFFD_CTX);

		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

		prev = vma;
	}
	mmap_write_unlock(mm);
	mmput(mm);
wakeup:
	/*
	 * After no new page faults can wait on this fault_*wqh, flush
	 * the last page faults that may have been already waiting on
	 * the fault_*wqh.
	 */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/* Flush pending events that may still wait on event_wqh */
	wake_up_all(&ctx->event_wqh);

	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
	userfaultfd_ctx_put(ctx);
	return 0;
}

/* fault_pending_wqh.lock must be held by the caller */
static inline struct userfaultfd_wait_queue *find_userfault_in(
		wait_queue_head_t *wqh)
{
	wait_queue_entry_t *wq;
	struct userfaultfd_wait_queue *uwq;

	lockdep_assert_held(&wqh->lock);

	uwq = NULL;
	if (!waitqueue_active(wqh))
		goto out;
	/* walk in reverse to provide FIFO behavior to read userfaults */
	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
	return uwq;
}

static inline struct userfaultfd_wait_queue *find_userfault(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->fault_pending_wqh);
}

static inline struct userfaultfd_wait_queue *find_userfault_evt(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->event_wqh);
}

static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	__poll_t ret;

	poll_wait(file, &ctx->fd_wqh, wait);

	if (!userfaultfd_is_initialized(ctx))
		return EPOLLERR;

	/*
	 * poll() never guarantees that read won't block.
	 * userfaults can be woken before they're read().
	 */
	if (unlikely(!(file->f_flags & O_NONBLOCK)))
		return EPOLLERR;
	/*
	 * lockless access to see if there are pending faults
	 * __pollwait last action is the add_wait_queue but
	 * the spin_unlock would allow the waitqueue_active to
	 * pass above the actual list_add inside
	 * add_wait_queue critical section. So use a full
	 * memory barrier to serialize the list_add write of
	 * add_wait_queue() with the waitqueue_active read
	 * below.
	 */
	ret = 0;
	smp_mb();
	if (waitqueue_active(&ctx->fault_pending_wqh))
		ret = EPOLLIN;
	else if (waitqueue_active(&ctx->event_wqh))
		ret = EPOLLIN;

	return ret;
}
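
/*
 * Example (userspace sketch): because userfaultfd_poll() above returns
 * EPOLLERR for blocking fds, the descriptor must be created non-blocking
 * to be usable with poll()/epoll():
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *	int ep = epoll_create1(EPOLL_CLOEXEC);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = uffd };
 *
 *	epoll_ctl(ep, EPOLL_CTL_ADD, uffd, &ev);
 */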

static const struct file_operations userfaultfd_fops;

static int resolve_userfault_fork(struct userfaultfd_ctx *new,
				  struct inode *inode,
				  struct uffd_msg *msg)
{
	int fd;

	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
	if (fd < 0)
		return fd;

	msg->arg.reserved.reserved1 = 0;
	msg->arg.fork.ufd = fd;
	return 0;
}

static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    struct uffd_msg *msg, struct inode *inode)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling fork event requires sleeping operations, so
	 * we drop the event_wqh lock, then do these ops, then
	 * lock it back and wake up the waiter. While the lock is
	 * dropped the ewq may go away so we keep track of it
	 * carefully.
	 */
	LIST_HEAD(fork_event);
	struct userfaultfd_ctx *fork_nctx = NULL;

	/* always take the fd_wqh lock before the fault_pending_wqh lock */
	spin_lock_irq(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_pending_wqh.lock);
		uwq = find_userfault(ctx);
		if (uwq) {
			/*
			 * Use a seqcount to repeat the lockless check
			 * in wake_userfault() to avoid missing
			 * wakeups because during the refile both
			 * waitqueue could become empty if this is the
			 * only userfault.
			 */
			write_seqcount_begin(&ctx->refile_seq);

			/*
			 * The fault_pending_wqh.lock prevents the uwq
			 * from disappearing from under us.
			 *
			 * Refile this userfault from
			 * fault_pending_wqh to fault_wqh, it's not
			 * pending anymore after we read it.
			 *
			 * Use list_del() by hand (as
			 * userfaultfd_wake_function also uses
			 * list_del_init() by hand) to be sure nobody
			 * changes __remove_wait_queue() to use
			 * list_del_init() in turn breaking the
			 * !list_empty_careful() check in
			 * handle_userfault(). The uwq->wq.head list
			 * must never be empty at any time during the
			 * refile, or the waitqueue could disappear
			 * from under us. The "wait_queue_head_t"
			 * parameter of __remove_wait_queue() is unused
			 * anyway.
			 */
			list_del(&uwq->wq.entry);
			add_wait_queue(&ctx->fault_wqh, &uwq->wq);

			write_seqcount_end(&ctx->refile_seq);

			/* careful to always initialize msg if ret == 0 */
			*msg = uwq->msg;
			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_pending_wqh.lock);

		spin_lock(&ctx->event_wqh.lock);
		uwq = find_userfault_evt(ctx);
		if (uwq) {
			*msg = uwq->msg;

			if (uwq->msg.event == UFFD_EVENT_FORK) {
				fork_nctx = (struct userfaultfd_ctx *)
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				list_move(&uwq->wq.entry, &fork_event);
				/*
				 * fork_nctx can be freed as soon as
				 * we drop the lock, unless we take a
				 * reference on it.
				 */
				userfaultfd_ctx_get(fork_nctx);
				spin_unlock(&ctx->event_wqh.lock);
				ret = 0;
				break;
			}

			userfaultfd_event_complete(ctx, uwq);
			spin_unlock(&ctx->event_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->event_wqh.lock);

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock_irq(&ctx->fd_wqh.lock);
		schedule();
		spin_lock_irq(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->fd_wqh.lock);

	if (!ret && msg->event == UFFD_EVENT_FORK) {
		ret = resolve_userfault_fork(fork_nctx, inode, msg);
		spin_lock_irq(&ctx->event_wqh.lock);
		if (!list_empty(&fork_event)) {
			/*
			 * The fork thread didn't abort, so we can
			 * drop the temporary refcount.
			 */
			userfaultfd_ctx_put(fork_nctx);

			uwq = list_first_entry(&fork_event,
					       typeof(*uwq),
					       wq.entry);
			/*
			 * If fork_event list wasn't empty and in turn
			 * the event wasn't already released by fork
			 * (the event is allocated on fork kernel
			 * stack), put the event back to its place in
			 * the event_wq. fork_event head will be freed
			 * as soon as we return so the event cannot
			 * stay queued there no matter the current
			 * "ret" value.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->event_wqh, &uwq->wq);

			/*
			 * Leave the event in the waitqueue and report
			 * error to userland if we failed to resolve
			 * the userfault fork.
			 */
			if (likely(!ret))
				userfaultfd_event_complete(ctx, uwq);
		} else {
			/*
			 * Here the fork thread aborted and the
			 * refcount from the fork thread on fork_nctx
			 * has already been released. We still hold
			 * the reference we took before releasing the
			 * lock above. If resolve_userfault_fork
			 * failed we have to drop it because the
			 * fork_nctx has to be freed in such case. If
			 * it succeeded we'll hold it because the new
			 * uffd references it.
			 */
			if (ret)
				userfaultfd_ctx_put(fork_nctx);
		}
		spin_unlock_irq(&ctx->event_wqh.lock);
	}

	return ret;
}

static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	struct uffd_msg msg;
	int no_wait = file->f_flags & O_NONBLOCK;
	struct inode *inode = file_inode(file);

	if (!userfaultfd_is_initialized(ctx))
		return -EINVAL;

	for (;;) {
		if (count < sizeof(msg))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return ret ? ret : -EFAULT;
		ret += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * Allow reading more than one fault at a time, but only
		 * block when waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}
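
/*
 * Example (userspace sketch): userfaultfd_read() above returns as many
 * whole messages as fit in the buffer, blocking at most for the first
 * one, so a monitor can drain events in batches. "handle_one" is an
 * assumed helper, not a real API:
 *
 *	struct uffd_msg msgs[16];
 *	ssize_t n = read(uffd, msgs, sizeof(msgs));
 *
 *	if (n > 0)
 *		for (int i = 0; i < n / (int)sizeof(msgs[0]); i++)
 *			handle_one(&msgs[i]);
 */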

static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/* wake all in the range and autoremove */
	if (waitqueue_active(&ctx->fault_pending_wqh))
		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				     range);
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	unsigned seq;
	bool need_wakeup;

	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or mmap_read_unlock(mm) still
	 * have release semantics that can allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we have userfaults to wake.
	 */
	do {
		seq = read_seqcount_begin(&ctx->refile_seq);
		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
			waitqueue_active(&ctx->fault_wqh);
		cond_resched();
	} while (read_seqcount_retry(&ctx->refile_seq, seq));
	if (need_wakeup)
		__wake_userfault(ctx, range);
}

static __always_inline int validate_unaligned_range(
		struct mm_struct *mm, __u64 start, __u64 len)
{
	__u64 task_size = mm->task_size;

	if (len & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return -EINVAL;
	if (start < mmap_min_addr)
		return -EINVAL;
	if (start >= task_size)
		return -EINVAL;
	if (len > task_size - start)
		return -EINVAL;
	if (start + len <= start)
		return -EINVAL;
	return 0;
}

static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 start, __u64 len)
{
	if (start & ~PAGE_MASK)
		return -EINVAL;

	return validate_unaligned_range(mm, start, len);
}
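
/*
 * Example: with a 4KiB PAGE_SIZE, validate_range() above accepts
 * start = 0x7f0000001000, len = 0x2000, but rejects with -EINVAL a
 * misaligned start (e.g. 0x7f0000001234), a misaligned or zero len, a
 * start below mmap_min_addr, and any range that overflows or extends
 * past mm->task_size. UFFDIO_COPY's "src" only goes through
 * validate_unaligned_range(), so the source address itself may be
 * unaligned while the length must still be page aligned.
 */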

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	bool basic_ioctls;
	unsigned long start, end, vma_end;
	struct vma_iterator vmi;
	bool wp_async = userfaultfd_wp_async_ctx(ctx);

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
		goto out;
#endif
		vm_flags |= VM_UFFD_WP;
	}
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		goto out;
#endif
		vm_flags |= VM_UFFD_MINOR;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	ret = -EINVAL;
	mmap_write_lock(mm);
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for incompatible vmas.
	 */
	found = false;
	basic_ioctls = false;
	cur = vma;
	do {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/* check for incompatible vmas */
		ret = -EINVAL;
		if (!vma_can_userfault(cur, vm_flags, wp_async))
			goto out_unlock;

		/*
		 * UFFDIO_COPY will fill file holes even without
		 * PROT_WRITE. This check enforces that if this is a
		 * MAP_SHARED, the process has write permission to the backing
		 * file. If VM_MAYWRITE is set it also enforces that on a
		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
		 * F_WRITE_SEAL can be taken until the vma is destroyed.
		 */
		ret = -EPERM;
		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
			goto out_unlock;

		/*
		 * If this vma contains the ending address and is a
		 * hugetlb vma, check that the ending address is
		 * aligned to the huge page size.
		 */
		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
		    end > cur->vm_start) {
			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

			ret = -EINVAL;

			if (end & (vma_hpagesize - 1))
				goto out_unlock;
		}
		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Note vmas containing huge pages
		 */
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	BUG_ON(!found);

	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags,
					    (struct vm_userfaultfd_ctx){ctx});
		if (IS_ERR(vma)) {
			ret = PTR_ERR(vma);
			break;
		}

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx.ctx = ctx;

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

	skip:
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
	if (!ret) {
		__u64 ioctls_out;

		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
		    UFFD_API_RANGE_IOCTLS;

		/*
		 * Declare the WP ioctl only if the WP mode is
		 * specified and all checks passed with the range.
		 */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

		/* CONTINUE ioctl is only supported for MINOR ranges. */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctls methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}
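
/*
 * Example (userspace sketch): registering a mapping for MISSING faults
 * with the ioctl implemented above. "area" and "len" are an assumed,
 * page-aligned mapping:
 *
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)area, .len = len },
 *		.mode = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == 0)
 *		assert(reg.ioctls & (1ull << _UFFDIO_COPY));
 */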

static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;
	struct vma_iterator vmi;
	bool wp_async = userfaultfd_wp_async_ctx(ctx);

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	mmap_write_lock(mm);
	ret = -EINVAL;
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for incompatible vmas.
	 */
	found = false;
	cur = vma;
	do {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/*
		 * Check for incompatible vmas. This is not strictly
		 * required here, as incompatible vmas cannot have a
		 * userfaultfd_ctx registered on them, but this
		 * provides stricter behavior to notice
		 * unregistration errors.
		 */
		if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
			goto out_unlock;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	BUG_ON(!found);

	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));

		/*
		 * Nothing to do: this vma is not registered with any
		 * userfaultfd.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		if (userfaultfd_missing(vma)) {
			/*
			 * Wake any concurrent pending userfault while
			 * we unregister, so they will not hang
			 * permanently and it spares userland from
			 * calling UFFDIO_WAKE explicitly.
			 */
			struct userfaultfd_wake_range range;
			range.start = start;
			range.len = vma_end - start;
			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
		}

		/* Reset ptes for the whole vma range if wr-protected */
		if (userfaultfd_wp(vma))
			uffd_wp_range(vma, start, vma_end - start, false);

		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
		vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
					    new_flags, NULL_VM_UFFD_CTX);
		if (IS_ERR(vma)) {
			ret = PTR_ERR(vma);
			break;
		}

		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
out:
	return ret;
}

/*
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	int ret;
	struct uffdio_range uffdio_wake;
	struct userfaultfd_wake_range range;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
	if (ret)
		goto out;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);
	ret = 0;

out:
	return ret;
}
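
/*
 * Example (userspace sketch): batching wakeups. Resolve several faults
 * with UFFDIO_COPY_MODE_DONTWAKE, then wake the whole range at once with
 * the ioctl above. "start" and "len" are the assumed batch bounds:
 *
 *	struct uffdio_range r = { .start = start, .len = len };
 *
 *	ioctl(uffd, UFFDIO_WAKE, &r);
 */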

static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_copy uffdio_copy;
	struct uffdio_copy __user *user_uffdio_copy;
	struct userfaultfd_wake_range range;
	uffd_flags_t flags = 0;

	user_uffdio_copy = (struct uffdio_copy __user *) arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
			   /* don't copy "copy" last field */
			   sizeof(uffdio_copy)-sizeof(__s64)))
		goto out;

	ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
				       uffdio_copy.len);
	if (ret)
		goto out;
	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
		goto out;
	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
		flags |= MFILL_ATOMIC_WP;
	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
					uffdio_copy.len, &ctx->mmap_changing,
					flags);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	BUG_ON(!ret);
	/* len == 0 would wake all */
	range.len = ret;
	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
		range.start = uffdio_copy.dst;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
	return ret;
}
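
/*
 * Example (userspace sketch): UFFDIO_COPY reports progress in the
 * uffdio_copy.copy output field, so a monitor should be prepared to
 * retry when the ioctl fails with -EAGAIN, e.g. after racing with a
 * fork(). A minimal retry loop, assuming "dst"/"src"/"len" describe a
 * valid request:
 *
 *	struct uffdio_copy copy = {
 *		.dst = dst, .src = src, .len = len, .mode = 0,
 *	};
 *
 *	while (ioctl(uffd, UFFDIO_COPY, &copy) == -1 && errno == EAGAIN) {
 *		if (copy.copy > 0) {	- bytes already copied
 *			copy.dst += copy.copy;
 *			copy.src += copy.copy;
 *			copy.len -= copy.copy;
 *		}
 *		copy.copy = 0;
 *	}
 */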

static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	__s64 ret;
	struct uffdio_zeropage uffdio_zeropage;
	struct uffdio_zeropage __user *user_uffdio_zeropage;
	struct userfaultfd_wake_range range;

	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
			   /* don't copy "zeropage" last field */
			   sizeof(uffdio_zeropage)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
			     uffdio_zeropage.range.len);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
					    uffdio_zeropage.range.len,
					    &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
		range.start = uffdio_zeropage.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
	return ret;
}

static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
				    unsigned long arg)
{
	int ret;
	struct uffdio_writeprotect uffdio_wp;
	struct uffdio_writeprotect __user *user_uffdio_wp;
	struct userfaultfd_wake_range range;
	bool mode_wp, mode_dontwake;

	if (atomic_read(&ctx->mmap_changing))
		return -EAGAIN;

	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;

	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
			   sizeof(struct uffdio_writeprotect)))
		return -EFAULT;

	ret = validate_range(ctx->mm, uffdio_wp.range.start,
			     uffdio_wp.range.len);
	if (ret)
		return ret;

	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
			       UFFDIO_WRITEPROTECT_MODE_WP))
		return -EINVAL;

	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;

	if (mode_wp && mode_dontwake)
		return -EINVAL;

	if (mmget_not_zero(ctx->mm)) {
		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
					  uffdio_wp.range.len, mode_wp,
					  &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (ret)
		return ret;

	if (!mode_wp && !mode_dontwake) {
		range.start = uffdio_wp.range.start;
		range.len = uffdio_wp.range.len;
		wake_userfault(ctx, &range);
	}
	return ret;
}
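
/*
 * Example (userspace sketch): write-protecting and later un-protecting a
 * range that was registered with UFFDIO_REGISTER_MODE_WP. "start"/"len"
 * are the assumed, page-aligned bounds:
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = start, .len = len },
 *		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	- arm: writes now fault
 *
 *	wp.mode = 0;
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	- disarm and wake waiters
 */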

static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
        __s64 ret;
        struct uffdio_continue uffdio_continue;
        struct uffdio_continue __user *user_uffdio_continue;
        struct userfaultfd_wake_range range;
        uffd_flags_t flags = 0;

        user_uffdio_continue = (struct uffdio_continue __user *)arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_continue, user_uffdio_continue,
                           /* don't copy the output fields */
                           sizeof(uffdio_continue) - (sizeof(__s64))))
                goto out;

        ret = validate_range(ctx->mm, uffdio_continue.range.start,
                             uffdio_continue.range.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
                                     UFFDIO_CONTINUE_MODE_WP))
                goto out;
        if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
                flags |= MFILL_ATOMIC_WP;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
                                            uffdio_continue.range.len,
                                            &ctx->mmap_changing, flags);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        BUG_ON(!ret);
        range.len = ret;
        if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
                range.start = uffdio_continue.range.start;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
        return ret;
}
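
/*
 * Userspace sketch of resolving a minor fault with UFFDIO_CONTINUE. Not
 * part of this file; "uffd", "addr" and "len" are placeholders, and it
 * assumes registration with UFFDIO_REGISTER_MODE_MINOR plus page cache
 * already populated for the range:
 *
 *	struct uffdio_continue cont = {
 *		.range = { .start = (unsigned long)addr, .len = len },
 *		.mode  = 0,	// or _MODE_DONTWAKE / _MODE_WP
 *	};
 *	ioctl(uffd, UFFDIO_CONTINUE, &cont);
 *	// cont.mapped: bytes mapped, or a negative errno on failure
 */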

static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
{
        __s64 ret;
        struct uffdio_poison uffdio_poison;
        struct uffdio_poison __user *user_uffdio_poison;
        struct userfaultfd_wake_range range;

        user_uffdio_poison = (struct uffdio_poison __user *)arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_poison, user_uffdio_poison,
                           /* don't copy the output fields */
                           sizeof(uffdio_poison) - (sizeof(__s64))))
                goto out;

        ret = validate_range(ctx->mm, uffdio_poison.range.start,
                             uffdio_poison.range.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
                goto out;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
                                          uffdio_poison.range.len,
                                          &ctx->mmap_changing, 0);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        BUG_ON(!ret);
        range.len = ret;
        if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
                range.start = uffdio_poison.range.start;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;

out:
        return ret;
}
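
/*
 * Userspace sketch of UFFDIO_POISON (not part of this file; "uffd",
 * "addr" and "len" are placeholders). Poisoning installs PTE markers so
 * that later access to the range raises SIGBUS instead of another
 * userfault, e.g. to replicate hardware memory errors on the destination
 * of a live migration:
 *
 *	struct uffdio_poison poison = {
 *		.range = { .start = (unsigned long)addr, .len = len },
 *		.mode  = 0,	// or UFFDIO_POISON_MODE_DONTWAKE
 *	};
 *	ioctl(uffd, UFFDIO_POISON, &poison);
 *	// poison.updated: bytes poisoned, or a negative errno
 */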

bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
        return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
}

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
        /*
         * For the current set of features the bits just coincide. Set
         * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
         */
        return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for that API
 * version, or -EINVAL if the version is unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
                           unsigned long arg)
{
        struct uffdio_api uffdio_api;
        void __user *buf = (void __user *)arg;
        unsigned int ctx_features;
        int ret;
        __u64 features;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
                goto out;
        features = uffdio_api.features;
        ret = -EINVAL;
        if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
                goto err_out;
        ret = -EPERM;
        if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
                goto err_out;

        /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
        if (features & UFFD_FEATURE_WP_ASYNC)
                features |= UFFD_FEATURE_WP_UNPOPULATED;

        /* report all available features and ioctls to userland */
        uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
        uffdio_api.features &=
                ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
        uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
        uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                goto out;

        /* only enable the requested features for this uffd context */
        ctx_features = uffd_ctx_features(features);
        ret = -EINVAL;
        if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
                goto err_out;

        ret = 0;
out:
        return ret;
err_out:
        memset(&uffdio_api, 0, sizeof(uffdio_api));
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                ret = -EFAULT;
        goto out;
}
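
/*
 * Userspace sketch of the one-shot UFFDIO_API handshake implemented
 * above (not part of this file; "uffd" is a placeholder):
 *
 *	struct uffdio_api api = {
 *		.api      = UFFD_API,
 *		.features = 0,	// or a subset of the advertised features
 *	};
 *	ioctl(uffd, UFFDIO_API, &api);
 *	// api.features/api.ioctls now hold everything the kernel supports
 *
 * A second UFFDIO_API call on the same context fails with -EINVAL due to
 * the cmpxchg() above, and all other ioctls are refused until the
 * handshake has succeeded (see userfaultfd_ioctl() below).
 */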

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
                              unsigned long arg)
{
        int ret = -EINVAL;
        struct userfaultfd_ctx *ctx = file->private_data;

        if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
                return -EINVAL;

        switch (cmd) {
        case UFFDIO_API:
                ret = userfaultfd_api(ctx, arg);
                break;
        case UFFDIO_REGISTER:
                ret = userfaultfd_register(ctx, arg);
                break;
        case UFFDIO_UNREGISTER:
                ret = userfaultfd_unregister(ctx, arg);
                break;
        case UFFDIO_WAKE:
                ret = userfaultfd_wake(ctx, arg);
                break;
        case UFFDIO_COPY:
                ret = userfaultfd_copy(ctx, arg);
                break;
        case UFFDIO_ZEROPAGE:
                ret = userfaultfd_zeropage(ctx, arg);
                break;
        case UFFDIO_WRITEPROTECT:
                ret = userfaultfd_writeprotect(ctx, arg);
                break;
        case UFFDIO_CONTINUE:
                ret = userfaultfd_continue(ctx, arg);
                break;
        case UFFDIO_POISON:
                ret = userfaultfd_poison(ctx, arg);
                break;
        }
        return ret;
}

#ifdef CONFIG_PROC_FS
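/*
 * Reports context state through /proc/<pid>/fdinfo/<fd>. "pending" counts
 * userfaults not yet read by the monitor; "total" additionally includes
 * faults that were read but are not yet resolved. Sample output (the
 * numbers are illustrative only):
 *
 *	pending:	2
 *	total:	3
 *	API:	aa:7ff:3f
 *
 * i.e. UFFD_API, the enabled features, and the supported ioctl bits.
 */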
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct userfaultfd_ctx *ctx = f->private_data;
        wait_queue_entry_t *wq;
        unsigned long pending = 0, total = 0;

        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
                pending++;
                total++;
        }
        list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
                total++;
        }
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /*
         * If more protocols are added in the future, they will all be
         * shown separated by a space, like this:
         * protocols: aa:... bb:...
         */
        seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
                   pending, total, UFFD_API, ctx->features,
                   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = userfaultfd_show_fdinfo,
#endif
        .release        = userfaultfd_release,
        .poll           = userfaultfd_poll,
        .read           = userfaultfd_read,
        .unlocked_ioctl = userfaultfd_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .llseek         = noop_llseek,
};

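/*
 * Slab constructor: kmem_cache_alloc() can hand back a previously freed
 * context without re-running this, so only fields that must stay valid
 * across free/alloc cycles (the waitqueue heads and refile_seq) are
 * initialized here; per-instance state is (re)set in new_userfaultfd().
 */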
static void init_once_userfaultfd_ctx(void *mem)
{
        struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

        init_waitqueue_head(&ctx->fault_pending_wqh);
        init_waitqueue_head(&ctx->fault_wqh);
        init_waitqueue_head(&ctx->event_wqh);
        init_waitqueue_head(&ctx->fd_wqh);
        seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

static int new_userfaultfd(int flags)
{
        struct userfaultfd_ctx *ctx;
        int fd;

        BUG_ON(!current->mm);

        /* Check the UFFD_* constants for consistency.  */
        BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
        BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
                return -EINVAL;

        ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        refcount_set(&ctx->refcount, 1);
        ctx->flags = flags;
        ctx->features = 0;
        ctx->released = false;
        atomic_set(&ctx->mmap_changing, 0);
        ctx->mm = current->mm;
        /* prevent the mm struct from being freed */
        mmgrab(ctx->mm);

        fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
                        O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
        if (fd < 0) {
                mmdrop(ctx->mm);
                kmem_cache_free(userfaultfd_ctx_cachep, ctx);
        }
        return fd;
}

static inline bool userfaultfd_syscall_allowed(int flags)
{
        /* Userspace-only page faults are always allowed */
        if (flags & UFFD_USER_MODE_ONLY)
                return true;

        /*
         * The user is requesting a userfaultfd which can handle kernel faults.
         * Privileged users are always allowed to do this.
         */
        if (capable(CAP_SYS_PTRACE))
                return true;

        /* Otherwise, access to kernel fault handling is sysctl controlled. */
        return sysctl_unprivileged_userfaultfd;
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
        if (!userfaultfd_syscall_allowed(flags))
                return -EPERM;

        return new_userfaultfd(flags);
}
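
/*
 * Minimal userspace sketch of creating a userfaultfd via the syscall (not
 * part of this file; there is no glibc wrapper):
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *	if (uffd == -1)
 *		// EPERM when unprivileged and the sysctl forbids it
 *		perror("userfaultfd");
 *	// then perform the UFFDIO_API handshake shown above
 */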

static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
        if (cmd != USERFAULTFD_IOC_NEW)
                return -EINVAL;

        return new_userfaultfd(flags);
}

static const struct file_operations userfaultfd_dev_fops = {
        .unlocked_ioctl = userfaultfd_dev_ioctl,
        .compat_ioctl   = userfaultfd_dev_ioctl,
        .owner          = THIS_MODULE,
        .llseek         = noop_llseek,
};

static struct miscdevice userfaultfd_misc = {
        .minor = MISC_DYNAMIC_MINOR,
        .name = "userfaultfd",
        .fops = &userfaultfd_dev_fops
};
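
/*
 * Userspace sketch of the /dev/userfaultfd path (not part of this file).
 * Unlike the syscall, access is governed by the permissions on the device
 * node rather than by the sysctl:
 *
 *	int dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
 *	int uffd = ioctl(dev, USERFAULTFD_IOC_NEW, O_CLOEXEC | O_NONBLOCK);
 *	close(dev);
 */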

static int __init userfaultfd_init(void)
{
        int ret;

        ret = misc_register(&userfaultfd_misc);
        if (ret)
                return ret;

        userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
                                                sizeof(struct userfaultfd_ctx),
                                                0,
                                                SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                                init_once_userfaultfd_ctx);
#ifdef CONFIG_SYSCTL
        register_sysctl_init("vm", vm_userfaultfd_table);
#endif
        return 0;
}
__initcall(userfaultfd_init);