vma.c source code [linux/mm/vma.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2
3	/*
4	* VMA-specific functions.
5	*/
6
7	#include "vma_internal.h"
8	#include "vma.h"
9
10	struct mmap_state {
11	struct mm_struct *mm;
12	struct vma_iterator *vmi;
13
14	unsigned long addr;
15	unsigned long end;
16	pgoff_t pgoff;
17	unsigned long pglen;
18	vm_flags_t vm_flags;
19	struct file *file;
20	pgprot_t page_prot;
21
22	/ User-defined fields, perhaps updated by .mmap_prepare(). /
23	const struct vm_operations_struct *vm_ops;
24	void *vm_private_data;
25
26	unsigned long charged;
27
28	struct vm_area_struct *prev;
29	struct vm_area_struct *next;
30
31	/ Unmapping state. /
32	struct vma_munmap_struct vms;
33	struct ma_state mas_detach;
34	struct maple_tree mt_detach;
35
36	/ Determine if we can check KSM flags early in mmap() logic. /
37	bool check_ksm_early :`1`;
38	/ If we map new, hold the file rmap lock on mapping. /
39	bool hold_file_rmap_lock :`1`;
40	/ If .mmap_prepare changed the file, we don't need to pin. /
41	bool file_doesnt_need_get :`1`;
42	};
43
/*
 * Declare and initialise a struct mmap_state called @name covering
 * [addr_, addr_ + len_). Members not named here are zero-initialised.
 */
#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
	struct mmap_state name = {					\
		.mm = mm_,						\
		.vmi = vmi_,						\
		.addr = addr_,						\
		.end = (addr_) + (len_),				\
		.pgoff = pgoff_,					\
		.pglen = PHYS_PFN(len_),				\
		.vm_flags = vm_flags_,					\
		.file = file_,						\
		.page_prot = vm_get_page_prot(vm_flags_),		\
	}
56
/*
 * Declare and initialise a struct vma_merge_struct called @name from the
 * mmap_state @map_. When a VMA @vma_ is supplied it becomes ->middle and
 * ->next is left NULL; otherwise ->next is taken from @map_.
 */
#define VMG_MMAP_STATE(name, map_, vma_)				\
	struct vma_merge_struct name = {				\
		.mm = (map_)->mm,					\
		.vmi = (map_)->vmi,					\
		.start = (map_)->addr,					\
		.end = (map_)->end,					\
		.vm_flags = (map_)->vm_flags,				\
		.pgoff = (map_)->pgoff,					\
		.file = (map_)->file,					\
		.prev = (map_)->prev,					\
		.middle = vma_,						\
		.next = (vma_) ? NULL : (map_)->next,			\
		.state = VMA_MERGE_START,				\
	}
71
72	/ Was this VMA ever forked from a parent, i.e. maybe contains CoW mappings? /
73	static bool vma_is_fork_child(struct vm_area_struct *vma)
74	{
75	/*
76	* The list_is_singular() test is to avoid merging VMA cloned from
77	* parents. This can improve scalability caused by the anon_vma root
78	* lock.
79	*/
80	return vma && vma->anon_vma && !list_is_singular(head: &vma->anon_vma_chain);
81	}
82
83	static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
84	{
85	struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
86
87	if (!mpol_equal(a: vmg->policy, vma_policy(vma)))
88	return false;
89	if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE)
90	return false;
91	if (vma->vm_file != vmg->file)
92	return false;
93	if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_ctx: vmg->uffd_ctx))
94	return false;
95	if (!anon_vma_name_eq(anon_name1: anon_vma_name(vma), anon_name2: vmg->anon_name))
96	return false;
97	return true;
98	}
99
100	static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
101	{
102	struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
103	struct vm_area_struct src = vmg->middle; /* existing merge case. /
104	struct anon_vma *tgt_anon = tgt->anon_vma;
105	struct anon_vma *src_anon = vmg->anon_vma;
106
107	/*
108	* We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we
109	* will remove the existing VMA's anon_vma's so there's no scalability
110	* concerns.
111	*/
112	VM_WARN_ON(src && src_anon != src->anon_vma);
113
114	/ Case 1 - we will dup_anon_vma() from src into tgt. /
115	if (!tgt_anon && src_anon) {
116	struct vm_area_struct *copied_from = vmg->copied_from;
117
118	if (vma_is_fork_child(vma: src))
119	return false;
120	if (vma_is_fork_child(vma: copied_from))
121	return false;
122
123	return true;
124	}
125	/ Case 2 - we will simply use tgt's anon_vma. /
126	if (tgt_anon && !src_anon)
127	return !vma_is_fork_child(vma: tgt);
128	/ Case 3 - the anon_vma's are already shared. /
129	return src_anon == tgt_anon;
130	}
131
132	/*
133	* init_multi_vma_prep() - Initializer for struct vma_prepare
134	* @vp: The vma_prepare struct
135	* @vma: The vma that will be altered once locked
136	* @vmg: The merge state that will be used to determine adjustment and VMA
137	* removal.
138	*/
139	static void init_multi_vma_prep(struct vma_prepare *vp,
140	struct vm_area_struct *vma,
141	struct vma_merge_struct *vmg)
142	{
143	struct vm_area_struct *adjust;
144	struct vm_area_struct **remove = &vp->remove;
145
146	memset(vp, `0`, sizeof(struct vma_prepare));
147	vp->vma = vma;
148	vp->anon_vma = vma->anon_vma;
149
150	if (vmg && vmg->__remove_middle) {
151	*remove = vmg->middle;
152	remove = &vp->remove2;
153	}
154	if (vmg && vmg->__remove_next)
155	*remove = vmg->next;
156
157	if (vmg && vmg->__adjust_middle_start)
158	adjust = vmg->middle;
159	else if (vmg && vmg->__adjust_next_start)
160	adjust = vmg->next;
161	else
162	adjust = NULL;
163
164	vp->adj_next = adjust;
165	if (!vp->anon_vma && adjust)
166	vp->anon_vma = adjust->anon_vma;
167
168	VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma &&
169	vp->anon_vma != adjust->anon_vma);
170
171	vp->file = vma->vm_file;
172	if (vp->file)
173	vp->mapping = vma->vm_file->f_mapping;
174
175	if (vmg && vmg->skip_vma_uprobe)
176	vp->skip_vma_uprobe = true;
177	}
178
179	/*
180	* Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
181	* in front of (at a lower virtual address and file offset than) the vma.
182	*
183	* We cannot merge two vmas if they have differently assigned (non-NULL)
184	* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
185	*
186	* We don't check here for the merged mmap wrapping around the end of pagecache
187	* indices (16TB on ia32) because do_mmap() does not permit mmap's which
188	* wrap, nor mmaps which cover the final page at index -1UL.
189	*
190	* We assume the vma may be removed as part of the merge.
191	*/
192	static bool can_vma_merge_before(struct vma_merge_struct *vmg)
193	{
194	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
195
196	if (is_mergeable_vma(vmg, / merge_next = / true) &&
197	is_mergeable_anon_vma(vmg, / merge_next = / true)) {
198	if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
199	return true;
200	}
201
202	return false;
203	}
204
205	/*
206	* Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
207	* beyond (at a higher virtual address and file offset than) the vma.
208	*
209	* We cannot merge two vmas if they have differently assigned (non-NULL)
210	* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
211	*
212	* We assume that vma is not removed as part of the merge.
213	*/
214	static bool can_vma_merge_after(struct vma_merge_struct *vmg)
215	{
216	if (is_mergeable_vma(vmg, / merge_next = / false) &&
217	is_mergeable_anon_vma(vmg, / merge_next = / false)) {
218	if (vmg->prev->vm_pgoff + vma_pages(vma: vmg->prev) == vmg->pgoff)
219	return true;
220	}
221	return false;
222	}
223
224	static void __vma_link_file(struct vm_area_struct *vma,
225	struct address_space *mapping)
226	{
227	if (vma_is_shared_maywrite(vma))
228	mapping_allow_writable(mapping);
229
230	flush_dcache_mmap_lock(mapping);
231	vma_interval_tree_insert(node: vma, root: &mapping->i_mmap);
232	flush_dcache_mmap_unlock(mapping);
233	}
234
235	/*
236	* Requires inode->i_mapping->i_mmap_rwsem
237	*/
238	static void __remove_shared_vm_struct(struct vm_area_struct *vma,
239	struct address_space *mapping)
240	{
241	if (vma_is_shared_maywrite(vma))
242	mapping_unmap_writable(mapping);
243
244	flush_dcache_mmap_lock(mapping);
245	vma_interval_tree_remove(node: vma, root: &mapping->i_mmap);
246	flush_dcache_mmap_unlock(mapping);
247	}
248
249	/*
250	* vma has some anon_vma assigned, and is already inserted on that
251	* anon_vma's interval trees.
252	*
253	* Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
254	* vma must be removed from the anon_vma's interval trees using
255	* anon_vma_interval_tree_pre_update_vma().
256	*
257	* After the update, the vma will be reinserted using
258	* anon_vma_interval_tree_post_update_vma().
259	*
260	* The entire update must be protected by exclusive mmap_lock and by
261	* the root anon_vma's mutex.
262	*/
263	static void
264	anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
265	{
266	struct anon_vma_chain *avc;
267
268	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
269	anon_vma_interval_tree_remove(node: avc, root: &avc->anon_vma->rb_root);
270	}
271
272	static void
273	anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
274	{
275	struct anon_vma_chain *avc;
276
277	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
278	anon_vma_interval_tree_insert(node: avc, root: &avc->anon_vma->rb_root);
279	}
280
281	/*
282	* vma_prepare() - Helper function for handling locking VMAs prior to altering
283	* @vp: The initialized vma_prepare struct
284	*/
285	static void vma_prepare(struct vma_prepare *vp)
286	{
287	if (vp->file) {
288	uprobe_munmap(vma: vp->vma, start: vp->vma->vm_start, end: vp->vma->vm_end);
289
290	if (vp->adj_next)
291	uprobe_munmap(vma: vp->adj_next, start: vp->adj_next->vm_start,
292	end: vp->adj_next->vm_end);
293
294	i_mmap_lock_write(mapping: vp->mapping);
295	if (vp->insert && vp->insert->vm_file) {
296	/*
297	* Put into interval tree now, so instantiated pages
298	* are visible to arm/parisc __flush_dcache_page
299	* throughout; but we cannot insert into address
300	* space until vma start or end is updated.
301	*/
302	__vma_link_file(vma: vp->insert,
303	mapping: vp->insert->vm_file->f_mapping);
304	}
305	}
306
307	if (vp->anon_vma) {
308	anon_vma_lock_write(anon_vma: vp->anon_vma);
309	anon_vma_interval_tree_pre_update_vma(vma: vp->vma);
310	if (vp->adj_next)
311	anon_vma_interval_tree_pre_update_vma(vma: vp->adj_next);
312	}
313
314	if (vp->file) {
315	flush_dcache_mmap_lock(mapping: vp->mapping);
316	vma_interval_tree_remove(node: vp->vma, root: &vp->mapping->i_mmap);
317	if (vp->adj_next)
318	vma_interval_tree_remove(node: vp->adj_next,
319	root: &vp->mapping->i_mmap);
320	}
321
322	}
323
324	/*
325	* vma_complete- Helper function for handling the unlocking after altering VMAs,
326	* or for inserting a VMA.
327	*
328	* @vp: The vma_prepare struct
329	* @vmi: The vma iterator
330	* @mm: The mm_struct
331	*/
332	static void vma_complete(struct vma_prepare vp, struct* vma_iterator *vmi,
333	struct mm_struct *mm)
334	{
335	if (vp->file) {
336	if (vp->adj_next)
337	vma_interval_tree_insert(node: vp->adj_next,
338	root: &vp->mapping->i_mmap);
339	vma_interval_tree_insert(node: vp->vma, root: &vp->mapping->i_mmap);
340	flush_dcache_mmap_unlock(mapping: vp->mapping);
341	}
342
343	if (vp->remove && vp->file) {
344	__remove_shared_vm_struct(vma: vp->remove, mapping: vp->mapping);
345	if (vp->remove2)
346	__remove_shared_vm_struct(vma: vp->remove2, mapping: vp->mapping);
347	} else if (vp->insert) {
348	/*
349	* split_vma has split insert from vma, and needs
350	* us to insert it before dropping the locks
351	* (it may either follow vma or precede it).
352	*/
353	vma_iter_store_new(vmi, vma: vp->insert);
354	mm->map_count++;
355	}
356
357	if (vp->anon_vma) {
358	anon_vma_interval_tree_post_update_vma(vma: vp->vma);
359	if (vp->adj_next)
360	anon_vma_interval_tree_post_update_vma(vma: vp->adj_next);
361	anon_vma_unlock_write(anon_vma: vp->anon_vma);
362	}
363
364	if (vp->file) {
365	i_mmap_unlock_write(mapping: vp->mapping);
366
367	if (!vp->skip_vma_uprobe) {
368	uprobe_mmap(vma: vp->vma);
369
370	if (vp->adj_next)
371	uprobe_mmap(vma: vp->adj_next);
372	}
373	}
374
375	if (vp->remove) {
376	again:
377	vma_mark_detached(vma: vp->remove);
378	if (vp->file) {
379	uprobe_munmap(vma: vp->remove, start: vp->remove->vm_start,
380	end: vp->remove->vm_end);
381	fput(vp->file);
382	}
383	if (vp->remove->anon_vma)
384	anon_vma_merge(vma: vp->vma, next: vp->remove);
385	mm->map_count--;
386	mpol_put(vma_policy(vp->remove));
387	if (!vp->remove2)
388	WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
389	vm_area_free(vma: vp->remove);
390
391	/*
392	* In mprotect's case 6 (see comments on vma_merge),
393	* we are removing both mid and next vmas
394	*/
395	if (vp->remove2) {
396	vp->remove = vp->remove2;
397	vp->remove2 = NULL;
398	goto again;
399	}
400	}
401	if (vp->insert && vp->file)
402	uprobe_mmap(vma: vp->insert);
403	}
404
405	/*
406	* init_vma_prep() - Initializer wrapper for vma_prepare struct
407	* @vp: The vma_prepare struct
408	* @vma: The vma that will be altered once locked
409	*/
410	static void init_vma_prep(struct vma_prepare vp, struct* vm_area_struct *vma)
411	{
412	init_multi_vma_prep(vp, vma, NULL);
413	}
414
415	/*
416	* Can the proposed VMA be merged with the left (previous) VMA taking into
417	* account the start position of the proposed range.
418	*/
419	static bool can_vma_merge_left(struct vma_merge_struct *vmg)
420
421	{
422	return vmg->prev && vmg->prev->vm_end == vmg->start &&
423	can_vma_merge_after(vmg);
424	}
425
426	/*
427	* Can the proposed VMA be merged with the right (next) VMA taking into
428	* account the end position of the proposed range.
429	*
430	* In addition, if we can merge with the left VMA, ensure that left and right
431	* anon_vma's are also compatible.
432	*/
433	static bool can_vma_merge_right(struct vma_merge_struct *vmg,
434	bool can_merge_left)
435	{
436	struct vm_area_struct *next = vmg->next;
437	struct vm_area_struct *prev;
438
439	if (!next \|\| vmg->end != next->vm_start \|\| !can_vma_merge_before(vmg))
440	return false;
441
442	if (!can_merge_left)
443	return true;
444
445	/*
446	* If we can merge with prev (left) and next (right), indicating that
447	* each VMA's anon_vma is compatible with the proposed anon_vma, this
448	* does not mean prev and next are compatible with EACH OTHER.
449	*
450	* We therefore check this in addition to mergeability to either side.
451	*/
452	prev = vmg->prev;
453	return !prev->anon_vma \|\| !next->anon_vma \|\|
454	prev->anon_vma == next->anon_vma;
455	}
456
457	/*
458	* Close a vm structure and free it.
459	*/
460	void remove_vma(struct vm_area_struct *vma)
461	{
462	might_sleep();
463	vma_close(vma);
464	if (vma->vm_file)
465	fput(vma->vm_file);
466	mpol_put(vma_policy(vma));
467	vm_area_free(vma);
468	}
469
470	/*
471	* Get rid of page table information in the indicated region.
472	*
473	* Called with the mm semaphore held.
474	*/
475	void unmap_region(struct ma_state mas, struct* vm_area_struct *vma,
476	struct vm_area_struct prev, struct* vm_area_struct *next)
477	{
478	struct mm_struct *mm = vma->vm_mm;
479	struct mmu_gather tlb;
480
481	tlb_gather_mmu(tlb: &tlb, mm);
482	update_hiwater_rss(mm);
483	unmap_vmas(tlb: &tlb, mas, start_vma: vma, start: vma->vm_start, end: vma->vm_end, tree_end: vma->vm_end);
484	mas_set(mas, index: vma->vm_end);
485	free_pgtables(tlb: &tlb, mas, start_vma: vma, floor: prev ? prev->vm_end : FIRST_USER_ADDRESS,
486	ceiling: next ? next->vm_start : USER_PGTABLES_CEILING,
487	/ mm_wr_locked = / true);
488	tlb_finish_mmu(tlb: &tlb);
489	}
490
491	/*
492	* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
493	* has already been checked or doesn't make sense to fail.
494	* VMA Iterator will point to the original VMA.
495	*/
496	static __must_check int
497	__split_vma(struct vma_iterator vmi, struct* vm_area_struct *vma,
498	unsigned long addr, int new_below)
499	{
500	struct vma_prepare vp;
501	struct vm_area_struct *new;
502	int err;
503
504	WARN_ON(vma->vm_start >= addr);
505	WARN_ON(vma->vm_end <= addr);
506
507	if (vma->vm_ops && vma->vm_ops->may_split) {
508	err = vma->vm_ops->may_split(vma, addr);
509	if (err)
510	return err;
511	}
512
513	new = vm_area_dup(orig: vma);
514	if (!new)
515	return -ENOMEM;
516
517	if (new_below) {
518	new->vm_end = addr;
519	} else {
520	new->vm_start = addr;
521	new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
522	}
523
524	err = -ENOMEM;
525	vma_iter_config(vmi, index: new->vm_start, last: new->vm_end);
526	if (vma_iter_prealloc(vmi, vma: new))
527	goto out_free_vma;
528
529	err = vma_dup_policy(src: vma, dst: new);
530	if (err)
531	goto out_free_vmi;
532
533	err = anon_vma_clone(new, vma);
534	if (err)
535	goto out_free_mpol;
536
537	if (new->vm_file)
538	get_file(f: new->vm_file);
539
540	if (new->vm_ops && new->vm_ops->open)
541	new->vm_ops->open(new);
542
543	vma_start_write(vma);
544	vma_start_write(vma: new);
545
546	init_vma_prep(vp: &vp, vma);
547	vp.insert = new;
548	vma_prepare(vp: &vp);
549
550	/*
551	* Get rid of huge pages and shared page tables straddling the split
552	* boundary.
553	*/
554	vma_adjust_trans_huge(vma, start: vma->vm_start, end: addr, NULL);
555	if (is_vm_hugetlb_page(vma))
556	hugetlb_split(vma, addr);
557
558	if (new_below) {
559	vma->vm_start = addr;
560	vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
561	} else {
562	vma->vm_end = addr;
563	}
564
565	/ vma_complete stores the new vma /
566	vma_complete(vp: &vp, vmi, mm: vma->vm_mm);
567	validate_mm(mm: vma->vm_mm);
568
569	/ Success. /
570	if (new_below)
571	vma_next(vmi);
572	else
573	vma_prev(vmi);
574
575	return `0`;
576
577	out_free_mpol:
578	mpol_put(vma_policy(new));
579	out_free_vmi:
580	vma_iter_free(vmi);
581	out_free_vma:
582	vm_area_free(vma: new);
583	return err;
584	}
585
586	/*
587	* Split a vma into two pieces at address 'addr', a new vma is allocated
588	* either for the first part or the tail.
589	*/
590	static int split_vma(struct vma_iterator vmi, struct* vm_area_struct *vma,
591	unsigned long addr, int new_below)
592	{
593	if (vma->vm_mm->map_count >= sysctl_max_map_count)
594	return -ENOMEM;
595
596	return __split_vma(vmi, vma, addr, new_below);
597	}
598
599	/*
600	* dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
601	* instance that the destination VMA has no anon_vma but the source does.
602	*
603	* @dst: The destination VMA
604	* @src: The source VMA
605	* @dup: Pointer to the destination VMA when successful.
606	*
607	* Returns: 0 on success.
608	*/
609	static int dup_anon_vma(struct vm_area_struct *dst,
610	struct vm_area_struct src, struct* vm_area_struct **dup)
611	{
612	/*
613	* There are three cases to consider for correctly propagating
614	* anon_vma's on merge.
615	*
616	* The first is trivial - neither VMA has anon_vma, we need not do
617	* anything.
618	*
619	* The second where both have anon_vma is also a no-op, as they must
620	* then be the same, so there is simply nothing to copy.
621	*
622	* Here we cover the third - if the destination VMA has no anon_vma,
623	* that is it is unfaulted, we need to ensure that the newly merged
624	* range is referenced by the anon_vma's of the source.
625	*/
626	if (src->anon_vma && !dst->anon_vma) {
627	int ret;
628
629	vma_assert_write_locked(vma: dst);
630	dst->anon_vma = src->anon_vma;
631	ret = anon_vma_clone(dst, src);
632	if (ret)
633	return ret;
634
635	*dup = dst;
636	}
637
638	return `0`;
639	}
640
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
/*
 * Debug check of @mm's VMA tree: validates the maple tree, verifies that each
 * VMA's range matches the range the iterator reports for it, and that the
 * number of VMAs walked equals mm->map_count.
 */
void validate_mm(struct mm_struct *mm)
{
	int bug = 0;
	int i = 0;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mt_validate(&mm->mm_mt);
	for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
		struct anon_vma *anon_vma = vma->anon_vma;
		struct anon_vma_chain *avc;
#endif
		unsigned long vmi_start, vmi_end;
		bool warn = 0;

		vmi_start = vma_iter_addr(&vmi);
		vmi_end = vma_iter_end(&vmi);
		if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
			warn = 1;

		if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
			warn = 1;

		if (warn) {
			pr_emerg("issue in %s\n", current->comm);
			dump_stack();
			dump_vma(vma);
			pr_emerg("tree range: %px start %lx end %lx\n", vma,
				 vmi_start, vmi_end - 1);
			vma_iter_dump_tree(&vmi);
		}

#ifdef CONFIG_DEBUG_VM_RB
		if (anon_vma) {
			anon_vma_lock_read(anon_vma);
			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				anon_vma_interval_tree_verify(avc);
			anon_vma_unlock_read(anon_vma);
		}
#endif
		/* Check for an infinite loop */
		if (++i > mm->map_count + 10) {
			i = -1;
			break;
		}
	}
	if (i != mm->map_count) {
		pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
		bug = 1;
	}
	VM_BUG_ON_MM(bug, mm);
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
696
697	/*
698	* Based on the vmg flag indicating whether we need to adjust the vm_start field
699	* for the middle or next VMA, we calculate what the range of the newly adjusted
700	* VMA ought to be, and set the VMA's range accordingly.
701	*/
702	static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
703	{
704	struct vm_area_struct *adjust;
705	pgoff_t pgoff;
706
707	if (vmg->__adjust_middle_start) {
708	adjust = vmg->middle;
709	pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
710	} else if (vmg->__adjust_next_start) {
711	adjust = vmg->next;
712	pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
713	} else {
714	return;
715	}
716
717	vma_set_range(vma: adjust, start: vmg->end, end: adjust->vm_end, pgoff);
718	}
719
720	/*
721	* Actually perform the VMA merge operation.
722	*
723	* IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not
724	* modify any VMAs or cause inconsistent state should an OOM condition arise.
725	*
726	* Returns 0 on success, or an error value on failure.
727	*/
728	static int commit_merge(struct vma_merge_struct *vmg)
729	{
730	struct vm_area_struct *vma;
731	struct vma_prepare vp;
732
733	if (vmg->__adjust_next_start) {
734	/ We manipulate middle and adjust next, which is the target. /
735	vma = vmg->middle;
736	vma_iter_config(vmi: vmg->vmi, index: vmg->end, last: vmg->next->vm_end);
737	} else {
738	vma = vmg->target;
739	/ Note: vma iterator must be pointing to 'start'. /
740	vma_iter_config(vmi: vmg->vmi, index: vmg->start, last: vmg->end);
741	}
742
743	init_multi_vma_prep(vp: &vp, vma, vmg);
744
745	/*
746	* If vmg->give_up_on_oom is set, we're safe, because we don't actually
747	* manipulate any VMAs until we succeed at preallocation.
748	*
749	* Past this point, we will not return an error.
750	*/
751	if (vma_iter_prealloc(vmi: vmg->vmi, vma))
752	return -ENOMEM;
753
754	vma_prepare(vp: &vp);
755	/*
756	* THP pages may need to do additional splits if we increase
757	* middle->vm_start.
758	*/
759	vma_adjust_trans_huge(vma, start: vmg->start, end: vmg->end,
760	next: vmg->__adjust_middle_start ? vmg->middle : NULL);
761	vma_set_range(vma, start: vmg->start, end: vmg->end, pgoff: vmg->pgoff);
762	vmg_adjust_set_range(vmg);
763	vma_iter_store_overwrite(vmi: vmg->vmi, vma: vmg->target);
764
765	vma_complete(vp: &vp, vmi: vmg->vmi, mm: vma->vm_mm);
766
767	return `0`;
768	}
769
770	/ We can only remove VMAs when merging if they do not have a close hook. /
771	static bool can_merge_remove_vma(struct vm_area_struct *vma)
772	{
773	return !vma->vm_ops \|\| !vma->vm_ops->close;
774	}
775
776	/*
777	* vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
778	* attributes modified.
779	*
780	* @vmg: Describes the modifications being made to a VMA and associated
781	* metadata.
782	*
783	* When the attributes of a range within a VMA change, then it might be possible
784	* for immediately adjacent VMAs to be merged into that VMA due to having
785	* identical properties.
786	*
787	* This function checks for the existence of any such mergeable VMAs and updates
788	* the maple tree describing the @vmg->middle->vm_mm address space to account
789	* for this, as well as any VMAs shrunk/expanded/deleted as a result of this
790	* merge.
791	*
792	* As part of this operation, if a merge occurs, the @vmg object will have its
793	* vma, start, end, and pgoff fields modified to execute the merge. Subsequent
794	* calls to this function should reset these fields.
795	*
796	* Returns: The merged VMA if merge succeeds, or NULL otherwise.
797	*
798	* ASSUMPTIONS:
799	* - The caller must assign the VMA to be modified to @vmg->middle.
800	* - The caller must have set @vmg->prev to the previous VMA, if there is one.
801	* - The caller must not set @vmg->next, as we determine this.
802	* - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
803	* - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
804	*/
805	static __must_check struct vm_area_struct *vma_merge_existing_range(
806	struct vma_merge_struct *vmg)
807	{
808	vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY;
809	struct vm_area_struct *middle = vmg->middle;
810	struct vm_area_struct *prev = vmg->prev;
811	struct vm_area_struct *next;
812	struct vm_area_struct *anon_dup = NULL;
813	unsigned long start = vmg->start;
814	unsigned long end = vmg->end;
815	bool left_side = middle && start == middle->vm_start;
816	bool right_side = middle && end == middle->vm_end;
817	int err = `0`;
818	bool merge_left, merge_right, merge_both;
819
820	mmap_assert_write_locked(mm: vmg->mm);
821	VM_WARN_ON_VMG(!middle, vmg); / We are modifying a VMA, so caller must specify. /
822	VM_WARN_ON_VMG(vmg->next, vmg); / We set this. /
823	VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
824	VM_WARN_ON_VMG(start >= end, vmg);
825
826	/*
827	* If middle == prev, then we are offset into a VMA. Otherwise, if we are
828	* not, we must span a portion of the VMA.
829	*/
830	VM_WARN_ON_VMG(middle &&
831	((middle != prev && vmg->start != middle->vm_start) \|\|
832	vmg->end > middle->vm_end), vmg);
833	/ The vmi must be positioned within vmg->middle. /
834	VM_WARN_ON_VMG(middle &&
835	!(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
836	vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);
837	/ An existing merge can never be used by the mremap() logic. /
838	VM_WARN_ON_VMG(vmg->copied_from, vmg);
839
840	vmg->state = VMA_MERGE_NOMERGE;
841
842	/*
843	* If a special mapping or if the range being modified is neither at the
844	* furthermost left or right side of the VMA, then we have no chance of
845	* merging and should abort.
846	*/
847	if (vmg->vm_flags & VM_SPECIAL \|\| (!left_side && !right_side))
848	return NULL;
849
850	if (left_side)
851	merge_left = can_vma_merge_left(vmg);
852	else
853	merge_left = false;
854
855	if (right_side) {
856	next = vmg->next = vma_iter_next_range(vmi: vmg->vmi);
857	vma_iter_prev_range(vmi: vmg->vmi);
858
859	merge_right = can_vma_merge_right(vmg, can_merge_left: merge_left);
860	} else {
861	merge_right = false;
862	next = NULL;
863	}
864
865	if (merge_left) / If merging prev, position iterator there. /
866	vma_prev(vmi: vmg->vmi);
867	else if (!merge_right) / If we have nothing to merge, abort. /
868	return NULL;
869
870	merge_both = merge_left && merge_right;
871	/ If we span the entire VMA, a merge implies it will be deleted. /
872	vmg->__remove_middle = left_side && right_side;
873
874	/*
875	* If we need to remove middle in its entirety but are unable to do so,
876	* we have no sensible recourse but to abort the merge.
877	*/
878	if (vmg->__remove_middle && !can_merge_remove_vma(vma: middle))
879	return NULL;
880
881	/*
882	* If we merge both VMAs, then next is also deleted. This implies
883	* merge_will_delete_vma also.
884	*/
885	vmg->__remove_next = merge_both;
886
887	/*
888	* If we cannot delete next, then we can reduce the operation to merging
889	* prev and middle (thereby deleting middle).
890	*/
891	if (vmg->__remove_next && !can_merge_remove_vma(vma: next)) {
892	vmg->__remove_next = false;
893	merge_right = false;
894	merge_both = false;
895	}
896
897	/ No matter what happens, we will be adjusting middle. /
898	vma_start_write(vma: middle);
899
900	if (merge_right) {
901	vma_start_write(vma: next);
902	vmg->target = next;
903	sticky_flags \|= (next->vm_flags & VM_STICKY);
904	}
905
906	if (merge_left) {
907	vma_start_write(vma: prev);
908	vmg->target = prev;
909	sticky_flags \|= (prev->vm_flags & VM_STICKY);
910	}
911
912	if (merge_both) {
913	/*
914	* \|<-------------------->\|
915	* \|-------********-------\|
916	* prev middle next
917	* extend delete delete
918	*/
919
920	vmg->start = prev->vm_start;
921	vmg->end = next->vm_end;
922	vmg->pgoff = prev->vm_pgoff;
923
924	/*
925	* We already ensured anon_vma compatibility above, so now it's
926	* simply a case of, if prev has no anon_vma object, which of
927	* next or middle contains the anon_vma we must duplicate.
928	*/
929	err = dup_anon_vma(dst: prev, src: next->anon_vma ? next : middle,
930	dup: &anon_dup);
931	} else if (merge_left) {
932	/*
933	* \|<------------>\| OR
934	* \|<----------------->\|
935	* \|-------*************
936	* prev middle
937	* extend shrink/delete
938	*/
939
940	vmg->start = prev->vm_start;
941	vmg->pgoff = prev->vm_pgoff;
942
943	if (!vmg->__remove_middle)
944	vmg->__adjust_middle_start = true;
945
946	err = dup_anon_vma(dst: prev, src: middle, dup: &anon_dup);
947	} else { / merge_right /
948	/*
949	* \|<------------->\| OR
950	* \|<----------------->\|
951	* *************-------\|
952	* middle next
953	* shrink/delete extend
954	*/
955
956	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
957
958	VM_WARN_ON_VMG(!merge_right, vmg);
959	/ If we are offset into a VMA, then prev must be middle. /
960	VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg);
961
962	if (vmg->__remove_middle) {
963	vmg->end = next->vm_end;
964	vmg->pgoff = next->vm_pgoff - pglen;
965	} else {
966	/ We shrink middle and expand next. /
967	vmg->__adjust_next_start = true;
968	vmg->start = middle->vm_start;
969	vmg->end = start;
970	vmg->pgoff = middle->vm_pgoff;
971	}
972
973	err = dup_anon_vma(dst: next, src: middle, dup: &anon_dup);
974	}
975
976	if (err \|\| commit_merge(vmg))
977	goto abort;
978
979	vm_flags_set(vma: vmg->target, flags: sticky_flags);
980	khugepaged_enter_vma(vma: vmg->target, vm_flags: vmg->vm_flags);
981	vmg->state = VMA_MERGE_SUCCESS;
982	return vmg->target;
983
984	abort:
985	vma_iter_set(vmi: vmg->vmi, addr: start);
986	vma_iter_load(vmi: vmg->vmi);
987
988	if (anon_dup)
989	unlink_anon_vmas(anon_dup);
990
991	/*
992	* This means we have failed to clone anon_vma's correctly, but no
993	* actual changes to VMAs have occurred, so no harm no foul - if the
994	* user doesn't want this reported and instead just wants to give up on
995	* the merge, allow it.
996	*/
997	if (!vmg->give_up_on_oom)
998	vmg->state = VMA_MERGE_ERROR_NOMEM;
999	return NULL;
1000	}
1001
1002	/*
1003	* vma_merge_new_range - Attempt to merge a new VMA into address space
1004	*
1005	* @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
1006	* (exclusive), which we try to merge with any adjacent VMAs if possible.
1007	*
1008	* We are about to add a VMA to the address space starting at @vmg->start and
1009	* ending at @vmg->end. There are three different possible scenarios:
1010	*
1011	* 1. There is a VMA with identical properties immediately adjacent to the
1012	* proposed new VMA [@vmg->start, @vmg->end) either before or after it -
1013	* EXPAND that VMA:
1014	*
1015	* Proposed: \|-----\| or \|-----\|
1016	* Existing: \|----\| \|----\|
1017	*
1018	* 2. There are VMAs with identical properties immediately adjacent to the
1019	* proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
1020	* EXPAND the former and REMOVE the latter:
1021	*
1022	* Proposed: \|-----\|
1023	* Existing: \|----\| \|----\|
1024	*
1025	* 3. There are no VMAs immediately adjacent to the proposed new VMA or those
1026	* VMAs do not have identical attributes - NO MERGE POSSIBLE.
1027	*
1028	* In instances where we can merge, this function returns the expanded VMA which
1029	* will have its range adjusted accordingly and the underlying maple tree also
1030	* adjusted.
1031	*
1032	* Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
1033	* to the VMA we expanded.
1034	*
1035	* This function adjusts @vmg to provide @vmg->next if not already specified,
1036	* and adjusts [@vmg->start, @vmg->end) to span the expanded range.
1037	*
1038	* ASSUMPTIONS:
1039	* - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
1040	* - The caller must have determined that [@vmg->start, @vmg->end) is empty,
1041	other than VMAs that will be unmapped should the operation succeed.
1042	* - The caller must have specified the previous vma in @vmg->prev.
1043	* - The caller must have specified the next vma in @vmg->next.
1044	* - The caller must have positioned the vmi at or before the gap.
1045	*/
1046	struct vm_area_struct vma_merge_new_range(struct* vma_merge_struct *vmg)
1047	{
1048	struct vm_area_struct *prev = vmg->prev;
1049	struct vm_area_struct *next = vmg->next;
1050	unsigned long end = vmg->end;
1051	bool can_merge_left, can_merge_right;
1052
1053	mmap_assert_write_locked(mm: vmg->mm);
1054	VM_WARN_ON_VMG(vmg->middle, vmg);
1055	VM_WARN_ON_VMG(vmg->target, vmg);
1056	/ vmi must point at or before the gap. /
1057	VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
1058
1059	vmg->state = VMA_MERGE_NOMERGE;
1060
1061	/ Special VMAs are unmergeable, also if no prev/next. /
1062	if ((vmg->vm_flags & VM_SPECIAL) \|\| (!prev && !next))
1063	return NULL;
1064
1065	can_merge_left = can_vma_merge_left(vmg);
1066	can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left);
1067
1068	/ If we can merge with the next VMA, adjust vmg accordingly. /
1069	if (can_merge_right) {
1070	vmg->end = next->vm_end;
1071	vmg->target = next;
1072	}
1073
1074	/ If we can merge with the previous VMA, adjust vmg accordingly. /
1075	if (can_merge_left) {
1076	vmg->start = prev->vm_start;
1077	vmg->target = prev;
1078	vmg->pgoff = prev->vm_pgoff;
1079
1080	/*
1081	* If this merge would result in removal of the next VMA but we
1082	* are not permitted to do so, reduce the operation to merging
1083	* prev and vma.
1084	*/
1085	if (can_merge_right && !can_merge_remove_vma(vma: next))
1086	vmg->end = end;
1087
1088	/ In expand-only case we are already positioned at prev. /
1089	if (!vmg->just_expand) {
1090	/ Equivalent to going to the previous range. /
1091	vma_prev(vmi: vmg->vmi);
1092	}
1093	}
1094
1095	/*
1096	* Now try to expand adjacent VMA(s). This takes care of removing the
1097	* following VMA if we have VMAs on both sides.
1098	*/
1099	if (vmg->target && !vma_expand(vmg)) {
1100	khugepaged_enter_vma(vma: vmg->target, vm_flags: vmg->vm_flags);
1101	vmg->state = VMA_MERGE_SUCCESS;
1102	return vmg->target;
1103	}
1104
1105	return NULL;
1106	}
1107
1108	/*
1109	* vma_merge_copied_range - Attempt to merge a VMA that is being copied by
1110	* mremap()
1111	*
1112	* @vmg: Describes the VMA we are adding, in the copied-to range @vmg->start to
1113	* @vmg->end (exclusive), which we try to merge with any adjacent VMAs if
1114	* possible.
1115	*
1116	* vmg->prev, next, start, end, pgoff should all be relative to the COPIED TO
1117	* range, i.e. the target range for the VMA.
1118	*
1119	* Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
1120	* to the VMA we expanded.
1121	*
1122	* ASSUMPTIONS: Same as vma_merge_new_range(), except vmg->middle must contain
1123	* the copied-from VMA.
1124	*/
1125	static struct vm_area_struct vma_merge_copied_range(struct* vma_merge_struct *vmg)
1126	{
1127	/ We must have a copied-from VMA. /
1128	VM_WARN_ON_VMG(!vmg->middle, vmg);
1129
1130	vmg->copied_from = vmg->middle;
1131	vmg->middle = NULL;
1132	return vma_merge_new_range(vmg);
1133	}
1134
1135	/*
1136	* vma_expand - Expand an existing VMA
1137	*
1138	* @vmg: Describes a VMA expansion operation.
1139	*
1140	* Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
1141	* Will expand over vmg->next if it's different from vmg->target and vmg->end ==
1142	* vmg->next->vm_end. Checking if the vmg->target can expand and merge with
1143	* vmg->next needs to be handled by the caller.
1144	*
1145	* Returns: 0 on success.
1146	*
1147	* ASSUMPTIONS:
1148	* - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
1149	* - The caller must have set @vmg->target and @vmg->next.
1150	*/
1151	int vma_expand(struct vma_merge_struct *vmg)
1152	{
1153	struct vm_area_struct *anon_dup = NULL;
1154	struct vm_area_struct *target = vmg->target;
1155	struct vm_area_struct *next = vmg->next;
1156	bool remove_next = false;
1157	vm_flags_t sticky_flags;
1158	int ret = `0`;
1159
1160	mmap_assert_write_locked(mm: vmg->mm);
1161	vma_start_write(vma: target);
1162
1163	if (next && target != next && vmg->end == next->vm_end)
1164	remove_next = true;
1165
1166	/ We must have a target. /
1167	VM_WARN_ON_VMG(!target, vmg);
1168	/ This should have already been checked by this point. /
1169	VM_WARN_ON_VMG(remove_next && !can_merge_remove_vma(next), vmg);
1170	/ Not merging but overwriting any part of next is not handled. /
1171	VM_WARN_ON_VMG(next && !remove_next &&
1172	next != target && vmg->end > next->vm_start, vmg);
1173	/ Only handles expanding. /
1174	VM_WARN_ON_VMG(target->vm_start < vmg->start \|\|
1175	target->vm_end > vmg->end, vmg);
1176
1177	sticky_flags = vmg->vm_flags & VM_STICKY;
1178	sticky_flags \|= target->vm_flags & VM_STICKY;
1179	if (remove_next)
1180	sticky_flags \|= next->vm_flags & VM_STICKY;
1181
1182	/*
1183	* If we are removing the next VMA or copying from a VMA
1184	* (e.g. mremap()'ing), we must propagate anon_vma state.
1185	*
1186	* Note that, by convention, callers ignore OOM for this case, so
1187	* we don't need to account for vmg->give_up_on_mm here.
1188	*/
1189	if (remove_next)
1190	ret = dup_anon_vma(dst: target, src: next, dup: &anon_dup);
1191	if (!ret && vmg->copied_from)
1192	ret = dup_anon_vma(dst: target, src: vmg->copied_from, dup: &anon_dup);
1193	if (ret)
1194	return ret;
1195
1196	if (remove_next) {
1197	vma_start_write(vma: next);
1198	vmg->__remove_next = true;
1199	}
1200	if (commit_merge(vmg))
1201	goto nomem;
1202
1203	vm_flags_set(vma: target, flags: sticky_flags);
1204	return `0`;
1205
1206	nomem:
1207	if (anon_dup)
1208	unlink_anon_vmas(anon_dup);
1209	/*
1210	* If the user requests that we just give upon OOM, we are safe to do so
1211	* here, as commit merge provides this contract to us. Nothing has been
1212	* changed - no harm no foul, just don't report it.
1213	*/
1214	if (!vmg->give_up_on_oom)
1215	vmg->state = VMA_MERGE_ERROR_NOMEM;
1216	return -ENOMEM;
1217	}
1218
1219	/*
1220	* vma_shrink() - Reduce an existing VMAs memory area
1221	* @vmi: The vma iterator
1222	* @vma: The VMA to modify
1223	* @start: The new start
1224	* @end: The new end
1225	*
1226	* Returns: 0 on success, -ENOMEM otherwise
1227	*/
1228	int vma_shrink(struct vma_iterator vmi, struct* vm_area_struct *vma,
1229	unsigned long start, unsigned long end, pgoff_t pgoff)
1230	{
1231	struct vma_prepare vp;
1232
1233	WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
1234
1235	if (vma->vm_start < start)
1236	vma_iter_config(vmi, index: vma->vm_start, last: start);
1237	else
1238	vma_iter_config(vmi, index: end, last: vma->vm_end);
1239
1240	if (vma_iter_prealloc(vmi, NULL))
1241	return -ENOMEM;
1242
1243	vma_start_write(vma);
1244
1245	init_vma_prep(vp: &vp, vma);
1246	vma_prepare(vp: &vp);
1247	vma_adjust_trans_huge(vma, start, end, NULL);
1248
1249	vma_iter_clear(vmi);
1250	vma_set_range(vma, start, end, pgoff);
1251	vma_complete(vp: &vp, vmi, mm: vma->vm_mm);
1252	validate_mm(mm: vma->vm_mm);
1253	return `0`;
1254	}
1255
1256	static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
1257	struct ma_state *mas_detach, bool mm_wr_locked)
1258	{
1259	struct mmu_gather tlb;
1260
1261	if (!vms->clear_ptes) / Nothing to do /
1262	return;
1263
1264	/*
1265	* We can free page tables without write-locking mmap_lock because VMAs
1266	* were isolated before we downgraded mmap_lock.
1267	*/
1268	mas_set(mas: mas_detach, index: `1`);
1269	tlb_gather_mmu(tlb: &tlb, mm: vms->vma->vm_mm);
1270	update_hiwater_rss(mm: vms->vma->vm_mm);
1271	unmap_vmas(tlb: &tlb, mas: mas_detach, start_vma: vms->vma, start: vms->start, end: vms->end,
1272	tree_end: vms->vma_count);
1273
1274	mas_set(mas: mas_detach, index: `1`);
1275	/ start and end may be different if there is no prev or next vma. /
1276	free_pgtables(tlb: &tlb, mas: mas_detach, start_vma: vms->vma, floor: vms->unmap_start,
1277	ceiling: vms->unmap_end, mm_wr_locked);
1278	tlb_finish_mmu(tlb: &tlb);
1279	vms->clear_ptes = false;
1280	}
1281
1282	static void vms_clean_up_area(struct vma_munmap_struct *vms,
1283	struct ma_state *mas_detach)
1284	{
1285	struct vm_area_struct *vma;
1286
1287	if (!vms->nr_pages)
1288	return;
1289
1290	vms_clear_ptes(vms, mas_detach, mm_wr_locked: true);
1291	mas_set(mas: mas_detach, index: `0`);
1292	mas_for_each(mas_detach, vma, ULONG_MAX)
1293	vma_close(vma);
1294	}
1295
1296	/*
1297	* vms_complete_munmap_vmas() - Finish the munmap() operation
1298	* @vms: The vma munmap struct
1299	* @mas_detach: The maple state of the detached vmas
1300	*
1301	* This updates the mm_struct, unmaps the region, frees the resources
1302	* used for the munmap() and may downgrade the lock - if requested. Everything
1303	* needed to be done once the vma maple tree is updated.
1304	*/
1305	static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
1306	struct ma_state *mas_detach)
1307	{
1308	struct vm_area_struct *vma;
1309	struct mm_struct *mm;
1310
1311	mm = current->mm;
1312	mm->map_count -= vms->vma_count;
1313	mm->locked_vm -= vms->locked_vm;
1314	if (vms->unlock)
1315	mmap_write_downgrade(mm);
1316
1317	if (!vms->nr_pages)
1318	return;
1319
1320	vms_clear_ptes(vms, mas_detach, mm_wr_locked: !vms->unlock);
1321	/ Update high watermark before we lower total_vm /
1322	update_hiwater_vm(mm);
1323	/ Stat accounting /
1324	WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
1325	/ Paranoid bookkeeping /
1326	VM_WARN_ON(vms->exec_vm > mm->exec_vm);
1327	VM_WARN_ON(vms->stack_vm > mm->stack_vm);
1328	VM_WARN_ON(vms->data_vm > mm->data_vm);
1329	mm->exec_vm -= vms->exec_vm;
1330	mm->stack_vm -= vms->stack_vm;
1331	mm->data_vm -= vms->data_vm;
1332
1333	/ Remove and clean up vmas /
1334	mas_set(mas: mas_detach, index: `0`);
1335	mas_for_each(mas_detach, vma, ULONG_MAX)
1336	remove_vma(vma);
1337
1338	vm_unacct_memory(pages: vms->nr_accounted);
1339	validate_mm(mm);
1340	if (vms->unlock)
1341	mmap_read_unlock(mm);
1342
1343	__mt_destroy(mt: mas_detach->tree);
1344	}
1345
1346	/*
1347	* reattach_vmas() - Undo any munmap work and free resources
1348	* @mas_detach: The maple state with the detached maple tree
1349	*
1350	* Reattach any detached vmas and free up the maple tree used to track the vmas.
1351	*/
1352	static void reattach_vmas(struct ma_state *mas_detach)
1353	{
1354	struct vm_area_struct *vma;
1355
1356	mas_set(mas: mas_detach, index: `0`);
1357	mas_for_each(mas_detach, vma, ULONG_MAX)
1358	vma_mark_attached(vma);
1359
1360	__mt_destroy(mt: mas_detach->tree);
1361	}
1362
1363	/*
1364	* vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
1365	* for removal at a later date. Handles splitting first and last if necessary
1366	* and marking the vmas as isolated.
1367	*
1368	* @vms: The vma munmap struct
1369	* @mas_detach: The maple state tracking the detached tree
1370	*
1371	* Return: 0 on success, error otherwise
1372	*/
1373	static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
1374	struct ma_state *mas_detach)
1375	{
1376	struct vm_area_struct *next = NULL;
1377	int error;
1378
1379	/*
1380	* If we need to split any vma, do it now to save pain later.
1381	* Does it split the first one?
1382	*/
1383	if (vms->start > vms->vma->vm_start) {
1384
1385	/*
1386	* Make sure that map_count on return from munmap() will
1387	* not exceed its limit; but let map_count go just above
1388	* its limit temporarily, to help free resources as expected.
1389	*/
1390	if (vms->end < vms->vma->vm_end &&
1391	vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
1392	error = -ENOMEM;
1393	goto map_count_exceeded;
1394	}
1395
1396	/ Don't bother splitting the VMA if we can't unmap it anyway /
1397	if (vma_is_sealed(vma: vms->vma)) {
1398	error = -EPERM;
1399	goto start_split_failed;
1400	}
1401
1402	error = __split_vma(vmi: vms->vmi, vma: vms->vma, addr: vms->start, new_below: `1`);
1403	if (error)
1404	goto start_split_failed;
1405	}
1406	vms->prev = vma_prev(vmi: vms->vmi);
1407	if (vms->prev)
1408	vms->unmap_start = vms->prev->vm_end;
1409
1410	/*
1411	* Detach a range of VMAs from the mm. Using next as a temp variable as
1412	* it is always overwritten.
1413	*/
1414	for_each_vma_range(*(vms->vmi), next, vms->end) {
1415	long nrpages;
1416
1417	if (vma_is_sealed(vma: next)) {
1418	error = -EPERM;
1419	goto modify_vma_failed;
1420	}
1421	/ Does it split the end? /
1422	if (next->vm_end > vms->end) {
1423	error = __split_vma(vmi: vms->vmi, vma: next, addr: vms->end, new_below: `0`);
1424	if (error)
1425	goto end_split_failed;
1426	}
1427	vma_start_write(vma: next);
1428	mas_set(mas: mas_detach, index: vms->vma_count++);
1429	error = mas_store_gfp(mas: mas_detach, entry: next, GFP_KERNEL);
1430	if (error)
1431	goto munmap_gather_failed;
1432
1433	vma_mark_detached(vma: next);
1434	nrpages = vma_pages(vma: next);
1435
1436	vms->nr_pages += nrpages;
1437	if (next->vm_flags & VM_LOCKED)
1438	vms->locked_vm += nrpages;
1439
1440	if (next->vm_flags & VM_ACCOUNT)
1441	vms->nr_accounted += nrpages;
1442
1443	if (is_exec_mapping(flags: next->vm_flags))
1444	vms->exec_vm += nrpages;
1445	else if (is_stack_mapping(flags: next->vm_flags))
1446	vms->stack_vm += nrpages;
1447	else if (is_data_mapping(flags: next->vm_flags))
1448	vms->data_vm += nrpages;
1449
1450	if (vms->uf) {
1451	/*
1452	* If userfaultfd_unmap_prep returns an error the vmas
1453	* will remain split, but userland will get a
1454	* highly unexpected error anyway. This is no
1455	* different than the case where the first of the two
1456	* __split_vma fails, but we don't undo the first
1457	* split, despite we could. This is unlikely enough
1458	* failure that it's not worth optimizing it for.
1459	*/
1460	error = userfaultfd_unmap_prep(vma: next, start: vms->start,
1461	end: vms->end, uf: vms->uf);
1462	if (error)
1463	goto userfaultfd_error;
1464	}
1465	#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
1466	BUG_ON(next->vm_start < vms->start);
1467	BUG_ON(next->vm_start > vms->end);
1468	#endif
1469	}
1470
1471	vms->next = vma_next(vmi: vms->vmi);
1472	if (vms->next)
1473	vms->unmap_end = vms->next->vm_start;
1474
1475	#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
1476	/ Make sure no VMAs are about to be lost. /
1477	{
1478	MA_STATE(test, mas_detach->tree, `0`, `0`);
1479	struct vm_area_struct vma_mas, vma_test;
1480	int test_count = `0`;
1481
1482	vma_iter_set(vmi: vms->vmi, addr: vms->start);
1483	rcu_read_lock();
1484	vma_test = mas_find(mas: &test, max: vms->vma_count - `1`);
1485	for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
1486	BUG_ON(vma_mas != vma_test);
1487	test_count++;
1488	vma_test = mas_next(mas: &test, max: vms->vma_count - `1`);
1489	}
1490	rcu_read_unlock();
1491	BUG_ON(vms->vma_count != test_count);
1492	}
1493	#endif
1494
1495	while (vma_iter_addr(vmi: vms->vmi) > vms->start)
1496	vma_iter_prev_range(vmi: vms->vmi);
1497
1498	vms->clear_ptes = true;
1499	return `0`;
1500
1501	userfaultfd_error:
1502	munmap_gather_failed:
1503	end_split_failed:
1504	modify_vma_failed:
1505	reattach_vmas(mas_detach);
1506	start_split_failed:
1507	map_count_exceeded:
1508	return error;
1509	}
1510
1511	/*
1512	* init_vma_munmap() - Initializer wrapper for vma_munmap_struct
1513	* @vms: The vma munmap struct
1514	* @vmi: The vma iterator
1515	* @vma: The first vm_area_struct to munmap
1516	* @start: The aligned start address to munmap
1517	* @end: The aligned end address to munmap
1518	* @uf: The userfaultfd list_head
1519	* @unlock: Unlock after the operation. Only unlocked on success
1520	*/
1521	static void init_vma_munmap(struct vma_munmap_struct *vms,
1522	struct vma_iterator vmi, struct* vm_area_struct *vma,
1523	unsigned long start, unsigned long end, struct list_head *uf,
1524	bool unlock)
1525	{
1526	vms->vmi = vmi;
1527	vms->vma = vma;
1528	if (vma) {
1529	vms->start = start;
1530	vms->end = end;
1531	} else {
1532	vms->start = vms->end = `0`;
1533	}
1534	vms->unlock = unlock;
1535	vms->uf = uf;
1536	vms->vma_count = `0`;
1537	vms->nr_pages = vms->locked_vm = vms->nr_accounted = `0`;
1538	vms->exec_vm = vms->stack_vm = vms->data_vm = `0`;
1539	vms->unmap_start = FIRST_USER_ADDRESS;
1540	vms->unmap_end = USER_PGTABLES_CEILING;
1541	vms->clear_ptes = false;
1542	}
1543
1544	/*
1545	* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
1546	* @vmi: The vma iterator
1547	* @vma: The starting vm_area_struct
1548	* @mm: The mm_struct
1549	* @start: The aligned start address to munmap.
1550	* @end: The aligned end address to munmap.
1551	* @uf: The userfaultfd list_head
1552	* @unlock: Set to true to drop the mmap_lock. unlocking only happens on
1553	* success.
1554	*
1555	* Return: 0 on success and drops the lock if so directed, error and leaves the
1556	* lock held otherwise.
1557	*/
1558	int do_vmi_align_munmap(struct vma_iterator vmi, struct* vm_area_struct *vma,
1559	struct mm_struct mm, unsigned* long start, unsigned long end,
1560	struct list_head *uf, bool unlock)
1561	{
1562	struct maple_tree mt_detach;
1563	MA_STATE(mas_detach, &mt_detach, `0`, `0`);
1564	mt_init_flags(mt: &mt_detach, flags: vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
1565	mt_on_stack(mt_detach);
1566	struct vma_munmap_struct vms;
1567	int error;
1568
1569	init_vma_munmap(vms: &vms, vmi, vma, start, end, uf, unlock);
1570	error = vms_gather_munmap_vmas(vms: &vms, mas_detach: &mas_detach);
1571	if (error)
1572	goto gather_failed;
1573
1574	error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
1575	if (error)
1576	goto clear_tree_failed;
1577
1578	/ Point of no return /
1579	vms_complete_munmap_vmas(vms: &vms, mas_detach: &mas_detach);
1580	return `0`;
1581
1582	clear_tree_failed:
1583	reattach_vmas(mas_detach: &mas_detach);
1584	gather_failed:
1585	validate_mm(mm);
1586	return error;
1587	}
1588
1589	/*
1590	* do_vmi_munmap() - munmap a given range.
1591	* @vmi: The vma iterator
1592	* @mm: The mm_struct
1593	* @start: The start address to munmap
1594	* @len: The length of the range to munmap
1595	* @uf: The userfaultfd list_head
1596	* @unlock: set to true if the user wants to drop the mmap_lock on success
1597	*
1598	* This function takes a @mas that is either pointing to the previous VMA or set
1599	* to MA_START and sets it up to remove the mapping(s). The @len will be
1600	* aligned.
1601	*
1602	* Return: 0 on success and drops the lock if so directed, error and leaves the
1603	* lock held otherwise.
1604	*/
1605	int do_vmi_munmap(struct vma_iterator vmi, struct* mm_struct *mm,
1606	unsigned long start, size_t len, struct list_head *uf,
1607	bool unlock)
1608	{
1609	unsigned long end;
1610	struct vm_area_struct *vma;
1611
1612	if ((offset_in_page(start)) \|\| start > TASK_SIZE \|\| len > TASK_SIZE-start)
1613	return -EINVAL;
1614
1615	end = start + PAGE_ALIGN(len);
1616	if (end == start)
1617	return -EINVAL;
1618
1619	/ Find the first overlapping VMA /
1620	vma = vma_find(vmi, max: end);
1621	if (!vma) {
1622	if (unlock)
1623	mmap_write_unlock(mm);
1624	return `0`;
1625	}
1626
1627	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
1628	}
1629
1630	/*
1631	* We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
1632	* context and anonymous VMA name within the range [start, end).
1633	*
1634	* As a result, we might be able to merge the newly modified VMA range with an
1635	* adjacent VMA with identical properties.
1636	*
1637	* If no merge is possible and the range does not span the entirety of the VMA,
1638	* we then need to split the VMA to accommodate the change.
1639	*
1640	* The function returns either the merged VMA, the original VMA if a split was
1641	* required instead, or an error if the split failed.
1642	*/
1643	static struct vm_area_struct vma_modify(struct* vma_merge_struct *vmg)
1644	{
1645	struct vm_area_struct *vma = vmg->middle;
1646	unsigned long start = vmg->start;
1647	unsigned long end = vmg->end;
1648	struct vm_area_struct *merged;
1649
1650	/ First, try to merge. /
1651	merged = vma_merge_existing_range(vmg);
1652	if (merged)
1653	return merged;
1654	if (vmg_nomem(vmg))
1655	return ERR_PTR(error: -ENOMEM);
1656
1657	/*
1658	* Split can fail for reasons other than OOM, so if the user requests
1659	* this it's probably a mistake.
1660	*/
1661	VM_WARN_ON(vmg->give_up_on_oom &&
1662	(vma->vm_start != start \|\| vma->vm_end != end));
1663
1664	/ Split any preceding portion of the VMA. /
1665	if (vma->vm_start < start) {
1666	int err = split_vma(vmi: vmg->vmi, vma, addr: start, new_below: `1`);
1667
1668	if (err)
1669	return ERR_PTR(error: err);
1670	}
1671
1672	/ Split any trailing portion of the VMA. /
1673	if (vma->vm_end > end) {
1674	int err = split_vma(vmi: vmg->vmi, vma, addr: end, new_below: `0`);
1675
1676	if (err)
1677	return ERR_PTR(error: err);
1678	}
1679
1680	return vma;
1681	}
1682
1683	struct vm_area_struct vma_modify_flags(struct* vma_iterator *vmi,
1684	struct vm_area_struct prev, struct* vm_area_struct *vma,
1685	unsigned long start, unsigned long end,
1686	vm_flags_t *vm_flags_ptr)
1687	{
1688	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1689	const vm_flags_t vm_flags = *vm_flags_ptr;
1690	struct vm_area_struct *ret;
1691
1692	vmg.vm_flags = vm_flags;
1693
1694	ret = vma_modify(vmg: &vmg);
1695	if (IS_ERR(ptr: ret))
1696	return ret;
1697
1698	/*
1699	* For a merge to succeed, the flags must match those
1700	* requested. However, sticky flags may have been retained, so propagate
1701	* them to the caller.
1702	*/
1703	if (vmg.state == VMA_MERGE_SUCCESS)
1704	*vm_flags_ptr = ret->vm_flags;
1705	return ret;
1706	}
1707
1708	struct vm_area_struct vma_modify_name(struct* vma_iterator *vmi,
1709	struct vm_area_struct prev, struct* vm_area_struct *vma,
1710	unsigned long start, unsigned long end,
1711	struct anon_vma_name *new_name)
1712	{
1713	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1714
1715	vmg.anon_name = new_name;
1716
1717	return vma_modify(vmg: &vmg);
1718	}
1719
1720	struct vm_area_struct vma_modify_policy(struct* vma_iterator *vmi,
1721	struct vm_area_struct prev, struct* vm_area_struct *vma,
1722	unsigned long start, unsigned long end,
1723	struct mempolicy *new_pol)
1724	{
1725	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1726
1727	vmg.policy = new_pol;
1728
1729	return vma_modify(vmg: &vmg);
1730	}
1731
1732	struct vm_area_struct vma_modify_flags_uffd(struct* vma_iterator *vmi,
1733	struct vm_area_struct prev, struct* vm_area_struct *vma,
1734	unsigned long start, unsigned long end, vm_flags_t vm_flags,
1735	struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom)
1736	{
1737	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
1738
1739	vmg.vm_flags = vm_flags;
1740	vmg.uffd_ctx = new_ctx;
1741	if (give_up_on_oom)
1742	vmg.give_up_on_oom = true;
1743
1744	return vma_modify(vmg: &vmg);
1745	}
1746
1747	/*
1748	* Expand vma by delta bytes, potentially merging with an immediately adjacent
1749	* VMA with identical properties.
1750	*/
1751	struct vm_area_struct vma_merge_extend(struct* vma_iterator *vmi,
1752	struct vm_area_struct *vma,
1753	unsigned long delta)
1754	{
1755	VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
1756
1757	vmg.next = vma_iter_next_rewind(vmi, NULL);
1758	vmg.middle = NULL; / We use the VMA to populate VMG fields only. /
1759
1760	return vma_merge_new_range(vmg: &vmg);
1761	}
1762
1763	void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
1764	{
1765	vb->count = `0`;
1766	}
1767
1768	static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
1769	{
1770	struct address_space *mapping;
1771	int i;
1772
1773	mapping = vb->vmas[`0`]->vm_file->f_mapping;
1774	i_mmap_lock_write(mapping);
1775	for (i = `0`; i < vb->count; i++) {
1776	VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
1777	__remove_shared_vm_struct(vma: vb->vmas[i], mapping);
1778	}
1779	i_mmap_unlock_write(mapping);
1780
1781	unlink_file_vma_batch_init(vb);
1782	}
1783
1784	void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
1785	struct vm_area_struct *vma)
1786	{
1787	if (vma->vm_file == NULL)
1788	return;
1789
1790	if ((vb->count > `0` && vb->vmas[`0`]->vm_file != vma->vm_file) \|\|
1791	vb->count == ARRAY_SIZE(vb->vmas))
1792	unlink_file_vma_batch_process(vb);
1793
1794	vb->vmas[vb->count] = vma;
1795	vb->count++;
1796	}
1797
1798	void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
1799	{
1800	if (vb->count > `0`)
1801	unlink_file_vma_batch_process(vb);
1802	}
1803
1804	static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock)
1805	{
1806	struct file *file = vma->vm_file;
1807	struct address_space *mapping;
1808
1809	if (file) {
1810	mapping = file->f_mapping;
1811	i_mmap_lock_write(mapping);
1812	__vma_link_file(vma, mapping);
1813	if (!hold_rmap_lock)
1814	i_mmap_unlock_write(mapping);
1815	}
1816	}
1817
1818	static int vma_link(struct mm_struct mm, struct* vm_area_struct *vma)
1819	{
1820	VMA_ITERATOR(vmi, mm, `0`);
1821
1822	vma_iter_config(vmi: &vmi, index: vma->vm_start, last: vma->vm_end);
1823	if (vma_iter_prealloc(vmi: &vmi, vma))
1824	return -ENOMEM;
1825
1826	vma_start_write(vma);
1827	vma_iter_store_new(vmi: &vmi, vma);
1828	vma_link_file(vma, / hold_rmap_lock= /false);
1829	mm->map_count++;
1830	validate_mm(mm);
1831	return `0`;
1832	}
1833
1834	/*
1835	* Copy the vma structure to a new location in the same mm,
1836	* prior to moving page table entries, to effect an mremap move.
1837	*/
1838	struct vm_area_struct copy_vma(struct* vm_area_struct **vmap,
1839	unsigned long addr, unsigned long len, pgoff_t pgoff,
1840	bool *need_rmap_locks)
1841	{
1842	struct vm_area_struct vma = vmap;
1843	unsigned long vma_start = vma->vm_start;
1844	struct mm_struct *mm = vma->vm_mm;
1845	struct vm_area_struct *new_vma;
1846	bool faulted_in_anon_vma = true;
1847	VMA_ITERATOR(vmi, mm, addr);
1848	VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);
1849
1850	/*
1851	* If anonymous vma has not yet been faulted, update new pgoff
1852	* to match new location, to increase its chance of merging.
1853	*/
1854	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
1855	pgoff = addr >> PAGE_SHIFT;
1856	faulted_in_anon_vma = false;
1857	}
1858
1859	/*
1860	* If the VMA we are copying might contain a uprobe PTE, ensure
1861	* that we do not establish one upon merge. Otherwise, when mremap()
1862	* moves page tables, it will orphan the newly created PTE.
1863	*/
1864	if (vma->vm_file)
1865	vmg.skip_vma_uprobe = true;
1866
1867	new_vma = find_vma_prev(mm, addr, pprev: &vmg.prev);
1868	if (new_vma && new_vma->vm_start < addr + len)
1869	return NULL; / should never get here /
1870
1871	vmg.pgoff = pgoff;
1872	vmg.next = vma_iter_next_rewind(vmi: &vmi, NULL);
1873	new_vma = vma_merge_copied_range(vmg: &vmg);
1874
1875	if (new_vma) {
1876	/*
1877	* Source vma may have been merged into new_vma
1878	*/
1879	if (unlikely(vma_start >= new_vma->vm_start &&
1880	vma_start < new_vma->vm_end)) {
1881	/*
1882	* The only way we can get a vma_merge with
1883	* self during an mremap is if the vma hasn't
1884	* been faulted in yet and we were allowed to
1885	* reset the dst vma->vm_pgoff to the
1886	* destination address of the mremap to allow
1887	* the merge to happen. mremap must change the
1888	* vm_pgoff linearity between src and dst vmas
1889	* (in turn preventing a vma_merge) to be
1890	* safe. It is only safe to keep the vm_pgoff
1891	* linear if there are no pages mapped yet.
1892	*/
1893	VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
1894	*vmap = vma = new_vma;
1895	}
1896	*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
1897	} else {
1898	new_vma = vm_area_dup(orig: vma);
1899	if (!new_vma)
1900	goto out;
1901	vma_set_range(vma: new_vma, start: addr, end: addr + len, pgoff);
1902	if (vma_dup_policy(src: vma, dst: new_vma))
1903	goto out_free_vma;
1904	if (anon_vma_clone(new_vma, vma))
1905	goto out_free_mempol;
1906	if (new_vma->vm_file)
1907	get_file(f: new_vma->vm_file);
1908	if (new_vma->vm_ops && new_vma->vm_ops->open)
1909	new_vma->vm_ops->open(new_vma);
1910	if (vma_link(mm, vma: new_vma))
1911	goto out_vma_link;
1912	*need_rmap_locks = false;
1913	}
1914	return new_vma;
1915
1916	out_vma_link:
1917	fixup_hugetlb_reservations(vma: new_vma);
1918	vma_close(vma: new_vma);
1919
1920	if (new_vma->vm_file)
1921	fput(new_vma->vm_file);
1922
1923	unlink_anon_vmas(new_vma);
1924	out_free_mempol:
1925	mpol_put(vma_policy(new_vma));
1926	out_free_vma:
1927	vm_area_free(vma: new_vma);
1928	out:
1929	return NULL;
1930	}
1931
1932	/*
1933	* Rough compatibility check to quickly see if it's even worth looking
1934	* at sharing an anon_vma.
1935	*
1936	* They need to have the same vm_file, and the flags can only differ
1937	* in things that mprotect may change.
1938	*
1939	* NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1940	* we can merge the two vma's. For example, we refuse to merge a vma if
1941	* there is a vm_ops->close() function, because that indicates that the
1942	* driver is doing some kind of reference counting. But that doesn't
1943	* really matter for the anon_vma sharing case.
1944	*/
1945	static int anon_vma_compatible(struct vm_area_struct a, struct* vm_area_struct *b)
1946	{
1947	return a->vm_end == b->vm_start &&
1948	mpol_equal(vma_policy(a), vma_policy(b)) &&
1949	a->vm_file == b->vm_file &&
1950	!((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS \| VM_IGNORE_MERGE)) &&
1951	b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1952	}
1953
1954	/*
1955	* Do some basic sanity checking to see if we can re-use the anon_vma
1956	* from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1957	* the same as 'old', the other will be the new one that is trying
1958	* to share the anon_vma.
1959	*
1960	* NOTE! This runs with mmap_lock held for reading, so it is possible that
1961	* the anon_vma of 'old' is concurrently in the process of being set up
1962	* by another page fault trying to merge _that_. But that's ok: if it
1963	* is being set up, that automatically means that it will be a singleton
1964	* acceptable for merging, so we can do all of this optimistically. But
1965	* we do that READ_ONCE() to make sure that we never re-load the pointer.
1966	*
1967	* IOW: that the "list_is_singular()" test on the anon_vma_chain only
1968	* matters for the 'stable anon_vma' case (ie the thing we want to avoid
1969	* is to return an anon_vma that is "complex" due to having gone through
1970	* a fork).
1971	*
1972	* We also make sure that the two vma's are compatible (adjacent,
1973	* and with the same memory policies). That's all stable, even with just
1974	* a read lock on the mmap_lock.
1975	*/
1976	static struct anon_vma reusable_anon_vma(struct* vm_area_struct *old,
1977	struct vm_area_struct *a,
1978	struct vm_area_struct *b)
1979	{
1980	if (anon_vma_compatible(a, b)) {
1981	struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1982
1983	if (anon_vma && list_is_singular(head: &old->anon_vma_chain))
1984	return anon_vma;
1985	}
1986	return NULL;
1987	}
1988
1989	/*
1990	* find_mergeable_anon_vma is used by anon_vma_prepare, to check
1991	* neighbouring vmas for a suitable anon_vma, before it goes off
1992	* to allocate a new anon_vma. It checks because a repetitive
1993	* sequence of mprotects and faults may otherwise lead to distinct
1994	* anon_vmas being allocated, preventing vma merge in subsequent
1995	* mprotect.
1996	*/
1997	struct anon_vma find_mergeable_anon_vma(struct* vm_area_struct *vma)
1998	{
1999	struct anon_vma *anon_vma = NULL;
2000	struct vm_area_struct prev, next;
2001	VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
2002
2003	/ Try next first. /
2004	next = vma_iter_load(vmi: &vmi);
2005	if (next) {
2006	anon_vma = reusable_anon_vma(old: next, a: vma, b: next);
2007	if (anon_vma)
2008	return anon_vma;
2009	}
2010
2011	prev = vma_prev(vmi: &vmi);
2012	VM_BUG_ON_VMA(prev != vma, vma);
2013	prev = vma_prev(vmi: &vmi);
2014	/ Try prev next. /
2015	if (prev)
2016	anon_vma = reusable_anon_vma(old: prev, a: prev, b: vma);
2017
2018	/*
2019	* We might reach here with anon_vma == NULL if we can't find
2020	* any reusable anon_vma.
2021	* There's no absolute need to look only at touching neighbours:
2022	* we could search further afield for "compatible" anon_vmas.
2023	* But it would probably just be a waste of time searching,
2024	* or lead to too many vmas hanging off the same anon_vma.
2025	* We're trying to allow mprotect remerging later on,
2026	* not trying to minimize memory used for anon_vmas.
2027	*/
2028	return anon_vma;
2029	}
2030
2031	static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
2032	{
2033	return vm_ops && (vm_ops->page_mkwrite \|\| vm_ops->pfn_mkwrite);
2034	}
2035
2036	static bool vma_is_shared_writable(struct vm_area_struct *vma)
2037	{
2038	return (vma->vm_flags & (VM_WRITE \| VM_SHARED)) ==
2039	(VM_WRITE \| VM_SHARED);
2040	}
2041
2042	static bool vma_fs_can_writeback(struct vm_area_struct *vma)
2043	{
2044	/ No managed pages to writeback. /
2045	if (vma->vm_flags & VM_PFNMAP)
2046	return false;
2047
2048	return vma->vm_file && vma->vm_file->f_mapping &&
2049	mapping_can_writeback(mapping: vma->vm_file->f_mapping);
2050	}
2051
2052	/*
2053	* Does this VMA require the underlying folios to have their dirty state
2054	* tracked?
2055	*/
2056	bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
2057	{
2058	/ Only shared, writable VMAs require dirty tracking. /
2059	if (!vma_is_shared_writable(vma))
2060	return false;
2061
2062	/ Does the filesystem need to be notified? /
2063	if (vm_ops_needs_writenotify(vm_ops: vma->vm_ops))
2064	return true;
2065
2066	/*
2067	* Even if the filesystem doesn't indicate a need for writenotify, if it
2068	* can writeback, dirty tracking is still required.
2069	*/
2070	return vma_fs_can_writeback(vma);
2071	}
2072
2073	/*
2074	* Some shared mappings will want the pages marked read-only
2075	* to track write events. If so, we'll downgrade vm_page_prot
2076	* to the private version (using protection_map[] without the
2077	* VM_SHARED bit).
2078	*/
2079	bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
2080	{
2081	/ If it was private or non-writable, the write bit is already clear /
2082	if (!vma_is_shared_writable(vma))
2083	return false;
2084
2085	/ The backer wishes to know when pages are first written to? /
2086	if (vm_ops_needs_writenotify(vm_ops: vma->vm_ops))
2087	return true;
2088
2089	/ The open routine did something to the protections that pgprot_modify*
2090	* won't preserve? */
2091	if (pgprot_val(vm_page_prot) !=
2092	pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
2093	return false;
2094
2095	/*
2096	* Do we need to track softdirty? hugetlb does not support softdirty
2097	* tracking yet.
2098	*/
2099	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
2100	return true;
2101
2102	/ Do we need write faults for uffd-wp tracking? /
2103	if (userfaultfd_wp(vma))
2104	return true;
2105
2106	/ Can the mapping track the dirty pages? /
2107	return vma_fs_can_writeback(vma);
2108	}
2109
/*
 * Taken by mm_take_all_locks() and released by mm_drop_all_locks(), so that
 * only one task at a time can be holding "all locks" of an mm.
 */
static DEFINE_MUTEX(mm_all_locks_mutex);
2111
2112	static void vm_lock_anon_vma(struct mm_struct mm, struct* anon_vma *anon_vma)
2113	{
2114	if (!test_bit(`0`, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
2115	/*
2116	* The LSB of head.next can't change from under us
2117	* because we hold the mm_all_locks_mutex.
2118	*/
2119	down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
2120	/*
2121	* We can safely modify head.next after taking the
2122	* anon_vma->root->rwsem. If some other vma in this mm shares
2123	* the same anon_vma we won't take it again.
2124	*
2125	* No need of atomic instructions here, head.next
2126	* can't change from under us thanks to the
2127	* anon_vma->root->rwsem.
2128	*/
2129	if (__test_and_set_bit(`0`, (unsigned long *)
2130	&anon_vma->root->rb_root.rb_root.rb_node))
2131	BUG();
2132	}
2133	}
2134
2135	static void vm_lock_mapping(struct mm_struct mm, struct* address_space *mapping)
2136	{
2137	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2138	/*
2139	* AS_MM_ALL_LOCKS can't change from under us because
2140	* we hold the mm_all_locks_mutex.
2141	*
2142	* Operations on ->flags have to be atomic because
2143	* even if AS_MM_ALL_LOCKS is stable thanks to the
2144	* mm_all_locks_mutex, there may be other cpus
2145	* changing other bitflags in parallel to us.
2146	*/
2147	if (test_and_set_bit(nr: AS_MM_ALL_LOCKS, addr: &mapping->flags))
2148	BUG();
2149	down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
2150	}
2151	}
2152
2153	/*
2154	* This operation locks against the VM for all pte/vma/mm related
2155	* operations that could ever happen on a certain mm. This includes
2156	* vmtruncate, try_to_unmap, and all page faults.
2157	*
2158	* The caller must take the mmap_lock in write mode before calling
2159	* mm_take_all_locks(). The caller isn't allowed to release the
2160	* mmap_lock until mm_drop_all_locks() returns.
2161	*
2162	* mmap_lock in write mode is required in order to block all operations
2163	* that could modify pagetables and free pages without need of
2164	* altering the vma layout. It's also needed in write mode to avoid new
2165	* anon_vmas to be associated with existing vmas.
2166	*
2167	* A single task can't take more than one mm_take_all_locks() in a row
2168	* or it would deadlock.
2169	*
2170	* The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2171	* mapping->flags avoid to take the same lock twice, if more than one
2172	* vma in this mm is backed by the same anon_vma or address_space.
2173	*
2174	* We take locks in following order, accordingly to comment at beginning
2175	* of mm/rmap.c:
2176	* - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
2177	* hugetlb mapping);
2178	* - all vmas marked locked
2179	* - all i_mmap_rwsem locks;
2180	* - all anon_vma->rwseml
2181	*
2182	* We can take all locks within these types randomly because the VM code
2183	* doesn't nest them and we protected from parallel mm_take_all_locks() by
2184	* mm_all_locks_mutex.
2185	*
2186	* mm_take_all_locks() and mm_drop_all_locks are expensive operations
2187	* that may have to take thousand of locks.
2188	*
2189	* mm_take_all_locks() can fail if it's interrupted by signals.
2190	*/
2191	int mm_take_all_locks(struct mm_struct *mm)
2192	{
2193	struct vm_area_struct *vma;
2194	struct anon_vma_chain *avc;
2195	VMA_ITERATOR(vmi, mm, `0`);
2196
2197	mmap_assert_write_locked(mm);
2198
2199	mutex_lock(&mm_all_locks_mutex);
2200
2201	/*
2202	* vma_start_write() does not have a complement in mm_drop_all_locks()
2203	* because vma_start_write() is always asymmetrical; it marks a VMA as
2204	* being written to until mmap_write_unlock() or mmap_write_downgrade()
2205	* is reached.
2206	*/
2207	for_each_vma(vmi, vma) {
2208	if (signal_pending(current))
2209	goto out_unlock;
2210	vma_start_write(vma);
2211	}
2212
2213	vma_iter_init(vmi: &vmi, mm, addr: `0`);
2214	for_each_vma(vmi, vma) {
2215	if (signal_pending(current))
2216	goto out_unlock;
2217	if (vma->vm_file && vma->vm_file->f_mapping &&
2218	is_vm_hugetlb_page(vma))
2219	vm_lock_mapping(mm, mapping: vma->vm_file->f_mapping);
2220	}
2221
2222	vma_iter_init(vmi: &vmi, mm, addr: `0`);
2223	for_each_vma(vmi, vma) {
2224	if (signal_pending(current))
2225	goto out_unlock;
2226	if (vma->vm_file && vma->vm_file->f_mapping &&
2227	!is_vm_hugetlb_page(vma))
2228	vm_lock_mapping(mm, mapping: vma->vm_file->f_mapping);
2229	}
2230
2231	vma_iter_init(vmi: &vmi, mm, addr: `0`);
2232	for_each_vma(vmi, vma) {
2233	if (signal_pending(current))
2234	goto out_unlock;
2235	if (vma->anon_vma)
2236	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2237	vm_lock_anon_vma(mm, anon_vma: avc->anon_vma);
2238	}
2239
2240	return `0`;
2241
2242	out_unlock:
2243	mm_drop_all_locks(mm);
2244	return -EINTR;
2245	}
2246
2247	static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2248	{
2249	if (test_bit(`0`, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
2250	/*
2251	* The LSB of head.next can't change to 0 from under
2252	* us because we hold the mm_all_locks_mutex.
2253	*
2254	* We must however clear the bitflag before unlocking
2255	* the vma so the users using the anon_vma->rb_root will
2256	* never see our bitflag.
2257	*
2258	* No need of atomic instructions here, head.next
2259	* can't change from under us until we release the
2260	* anon_vma->root->rwsem.
2261	*/
2262	if (!__test_and_clear_bit(`0`, (unsigned long *)
2263	&anon_vma->root->rb_root.rb_root.rb_node))
2264	BUG();
2265	anon_vma_unlock_write(anon_vma);
2266	}
2267	}
2268
2269	static void vm_unlock_mapping(struct address_space *mapping)
2270	{
2271	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2272	/*
2273	* AS_MM_ALL_LOCKS can't change to 0 from under us
2274	* because we hold the mm_all_locks_mutex.
2275	*/
2276	i_mmap_unlock_write(mapping);
2277	if (!test_and_clear_bit(nr: AS_MM_ALL_LOCKS,
2278	addr: &mapping->flags))
2279	BUG();
2280	}
2281	}
2282
2283	/*
2284	* The mmap_lock cannot be released by the caller until
2285	* mm_drop_all_locks() returns.
2286	*/
2287	void mm_drop_all_locks(struct mm_struct *mm)
2288	{
2289	struct vm_area_struct *vma;
2290	struct anon_vma_chain *avc;
2291	VMA_ITERATOR(vmi, mm, `0`);
2292
2293	mmap_assert_write_locked(mm);
2294	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2295
2296	for_each_vma(vmi, vma) {
2297	if (vma->anon_vma)
2298	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2299	vm_unlock_anon_vma(anon_vma: avc->anon_vma);
2300	if (vma->vm_file && vma->vm_file->f_mapping)
2301	vm_unlock_mapping(mapping: vma->vm_file->f_mapping);
2302	}
2303
2304	mutex_unlock(lock: &mm_all_locks_mutex);
2305	}
2306
2307	/*
2308	* We account for memory if it's a private writeable mapping,
2309	* not hugepages and VM_NORESERVE wasn't set.
2310	*/
2311	static bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
2312	{
2313	/*
2314	* hugetlb has its own accounting separate from the core VM
2315	* VM_HUGETLB may not be set yet so we cannot check for that flag.
2316	*/
2317	if (file && is_file_hugepages(file))
2318	return false;
2319
2320	return (vm_flags & (VM_NORESERVE \| VM_SHARED \| VM_WRITE)) == VM_WRITE;
2321	}
2322
2323	/*
2324	* vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
2325	* operation.
2326	* @vms: The vma unmap structure
2327	* @mas_detach: The maple state with the detached maple tree
2328	*
2329	* Reattach any detached vmas, free up the maple tree used to track the vmas.
2330	* If that's not possible because the ptes are cleared (and vm_ops->closed() may
2331	* have been called), then a NULL is written over the vmas and the vmas are
2332	* removed (munmap() completed).
2333	*/
2334	static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
2335	struct ma_state *mas_detach)
2336	{
2337	struct ma_state *mas = &vms->vmi->mas;
2338
2339	if (!vms->nr_pages)
2340	return;
2341
2342	if (vms->clear_ptes)
2343	return reattach_vmas(mas_detach);
2344
2345	/*
2346	* Aborting cannot just call the vm_ops open() because they are often
2347	* not symmetrical and state data has been lost. Resort to the old
2348	* failure method of leaving a gap where the MAP_FIXED mapping failed.
2349	*/
2350	mas_set_range(mas, start: vms->start, last: vms->end - `1`);
2351	mas_store_gfp(mas, NULL, GFP_KERNEL\|__GFP_NOFAIL);
2352	/ Clean up the insertion of the unfortunate gap /
2353	vms_complete_munmap_vmas(vms, mas_detach);
2354	}
2355
2356	static void update_ksm_flags(struct mmap_state *map)
2357	{
2358	map->vm_flags = ksm_vma_flags(mm: map->mm, file: map->file, vm_flags: map->vm_flags);
2359	}
2360
2361	static void set_desc_from_map(struct vm_area_desc *desc,
2362	const struct mmap_state *map)
2363	{
2364	desc->start = map->addr;
2365	desc->end = map->end;
2366
2367	desc->pgoff = map->pgoff;
2368	desc->vm_file = map->file;
2369	desc->vm_flags = map->vm_flags;
2370	desc->page_prot = map->page_prot;
2371	}
2372
2373	/*
2374	* __mmap_setup() - Prepare to gather any overlapping VMAs that need to be
2375	* unmapped once the map operation is completed, check limits, account mapping
2376	* and clean up any pre-existing VMAs.
2377	*
2378	* As a result it sets up the @map and @desc objects.
2379	*
2380	* @map: Mapping state.
2381	* @desc: VMA descriptor
2382	* @uf: Userfaultfd context list.
2383	*
2384	* Returns: 0 on success, error code otherwise.
2385	*/
2386	static int __mmap_setup(struct mmap_state map, struct* vm_area_desc *desc,
2387	struct list_head *uf)
2388	{
2389	int error;
2390	struct vma_iterator *vmi = map->vmi;
2391	struct vma_munmap_struct *vms = &map->vms;
2392
2393	/ Find the first overlapping VMA and initialise unmap state. /
2394	vms->vma = vma_find(vmi, max: map->end);
2395	init_vma_munmap(vms, vmi, vma: vms->vma, start: map->addr, end: map->end, uf,
2396	/ unlock = / false);
2397
2398	/ OK, we have overlapping VMAs - prepare to unmap them. /
2399	if (vms->vma) {
2400	mt_init_flags(mt: &map->mt_detach,
2401	flags: vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
2402	mt_on_stack(map->mt_detach);
2403	mas_init(mas: &map->mas_detach, tree: &map->mt_detach, / addr = / `0`);
2404	/ Prepare to unmap any existing mapping in the area /
2405	error = vms_gather_munmap_vmas(vms, mas_detach: &map->mas_detach);
2406	if (error) {
2407	/ On error VMAs will already have been reattached. /
2408	vms->nr_pages = `0`;
2409	return error;
2410	}
2411
2412	map->next = vms->next;
2413	map->prev = vms->prev;
2414	} else {
2415	map->next = vma_iter_next_rewind(vmi, pprev: &map->prev);
2416	}
2417
2418	/ Check against address space limit. /
2419	if (!may_expand_vm(map->mm, map->vm_flags, npages: map->pglen - vms->nr_pages))
2420	return -ENOMEM;
2421
2422	/ Private writable mapping: check memory availability. /
2423	if (accountable_mapping(file: map->file, vm_flags: map->vm_flags)) {
2424	map->charged = map->pglen;
2425	map->charged -= vms->nr_accounted;
2426	if (map->charged) {
2427	error = security_vm_enough_memory_mm(mm: map->mm, pages: map->charged);
2428	if (error)
2429	return error;
2430	}
2431
2432	vms->nr_accounted = `0`;
2433	map->vm_flags \|= VM_ACCOUNT;
2434	}
2435
2436	/*
2437	* Clear PTEs while the vma is still in the tree so that rmap
2438	* cannot race with the freeing later in the truncate scenario.
2439	* This is also needed for mmap_file(), which is why vm_ops
2440	* close function is called.
2441	*/
2442	vms_clean_up_area(vms, mas_detach: &map->mas_detach);
2443
2444	set_desc_from_map(desc, map);
2445	return `0`;
2446	}
2447
2448
2449	static int __mmap_new_file_vma(struct mmap_state *map,
2450	struct vm_area_struct *vma)
2451	{
2452	struct vma_iterator *vmi = map->vmi;
2453	int error;
2454
2455	vma->vm_file = map->file;
2456	if (!map->file_doesnt_need_get)
2457	get_file(f: map->file);
2458
2459	if (!map->file->f_op->mmap)
2460	return `0`;
2461
2462	error = mmap_file(file: vma->vm_file, vma);
2463	if (error) {
2464	fput(vma->vm_file);
2465	vma->vm_file = NULL;
2466
2467	vma_iter_set(vmi, addr: vma->vm_end);
2468	/ Undo any partial mapping done by a device driver. /
2469	unmap_region(mas: &vmi->mas, vma, prev: map->prev, next: map->next);
2470
2471	return error;
2472	}
2473
2474	/ Drivers cannot alter the address of the VMA. /
2475	WARN_ON_ONCE(map->addr != vma->vm_start);
2476	/*
2477	* Drivers should not permit writability when previously it was
2478	* disallowed.
2479	*/
2480	VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags &&
2481	!(map->vm_flags & VM_MAYWRITE) &&
2482	(vma->vm_flags & VM_MAYWRITE));
2483
2484	map->file = vma->vm_file;
2485	map->vm_flags = vma->vm_flags;
2486
2487	return `0`;
2488	}
2489
2490	/*
2491	* __mmap_new_vma() - Allocate a new VMA for the region, as merging was not
2492	* possible.
2493	*
2494	* @map: Mapping state.
2495	* @vmap: Output pointer for the new VMA.
2496	*
2497	* Returns: Zero on success, or an error.
2498	*/
2499	static int __mmap_new_vma(struct mmap_state map, struct* vm_area_struct **vmap)
2500	{
2501	struct vma_iterator *vmi = map->vmi;
2502	int error = `0`;
2503	struct vm_area_struct *vma;
2504
2505	/*
2506	* Determine the object being mapped and call the appropriate
2507	* specific mapper. the address has already been validated, but
2508	* not unmapped, but the maps are removed from the list.
2509	*/
2510	vma = vm_area_alloc(mm: map->mm);
2511	if (!vma)
2512	return -ENOMEM;
2513
2514	vma_iter_config(vmi, index: map->addr, last: map->end);
2515	vma_set_range(vma, start: map->addr, end: map->end, pgoff: map->pgoff);
2516	vm_flags_init(vma, flags: map->vm_flags);
2517	vma->vm_page_prot = map->page_prot;
2518
2519	if (vma_iter_prealloc(vmi, vma)) {
2520	error = -ENOMEM;
2521	goto free_vma;
2522	}
2523
2524	if (map->file)
2525	error = __mmap_new_file_vma(map, vma);
2526	else if (map->vm_flags & VM_SHARED)
2527	error = shmem_zero_setup(vma);
2528	else
2529	vma_set_anonymous(vma);
2530
2531	if (error)
2532	goto free_iter_vma;
2533
2534	if (!map->check_ksm_early) {
2535	update_ksm_flags(map);
2536	vm_flags_init(vma, flags: map->vm_flags);
2537	}
2538
2539	#ifdef CONFIG_SPARC64
2540	/ TODO: Fix SPARC ADI! /
2541	WARN_ON_ONCE(!arch_validate_flags(map->vm_flags));
2542	#endif
2543
2544	/ Lock the VMA since it is modified after insertion into VMA tree /
2545	vma_start_write(vma);
2546	vma_iter_store_new(vmi, vma);
2547	map->mm->map_count++;
2548	vma_link_file(vma, hold_rmap_lock: map->hold_file_rmap_lock);
2549
2550	/*
2551	* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
2552	* call covers the non-merge case.
2553	*/
2554	if (!vma_is_anonymous(vma))
2555	khugepaged_enter_vma(vma, vm_flags: map->vm_flags);
2556	*vmap = vma;
2557	return `0`;
2558
2559	free_iter_vma:
2560	vma_iter_free(vmi);
2561	free_vma:
2562	vm_area_free(vma);
2563	return error;
2564	}
2565
2566	/*
2567	* __mmap_complete() - Unmap any VMAs we overlap, account memory mapping
2568	* statistics, handle locking and finalise the VMA.
2569	*
2570	* @map: Mapping state.
2571	* @vma: Merged or newly allocated VMA for the mmap()'d region.
2572	*/
2573	static void __mmap_complete(struct mmap_state map, struct* vm_area_struct *vma)
2574	{
2575	struct mm_struct *mm = map->mm;
2576	vm_flags_t vm_flags = vma->vm_flags;
2577
2578	perf_event_mmap(vma);
2579
2580	/ Unmap any existing mapping in the area. /
2581	vms_complete_munmap_vmas(vms: &map->vms, mas_detach: &map->mas_detach);
2582
2583	vm_stat_account(mm, vma->vm_flags, npages: map->pglen);
2584	if (vm_flags & VM_LOCKED) {
2585	if ((vm_flags & VM_SPECIAL) \|\| vma_is_dax(vma) \|\|
2586	is_vm_hugetlb_page(vma) \|\|
2587	vma == get_gate_vma(mm))
2588	vm_flags_clear(vma, VM_LOCKED_MASK);
2589	else
2590	mm->locked_vm += map->pglen;
2591	}
2592
2593	if (vma->vm_file)
2594	uprobe_mmap(vma);
2595
2596	/*
2597	* New (or expanded) vma always get soft dirty status.
2598	* Otherwise user-space soft-dirty page tracker won't
2599	* be able to distinguish situation when vma area unmapped,
2600	* then new mapped in-place (which must be aimed as
2601	* a completely new data area).
2602	*/
2603	if (pgtable_supports_soft_dirty())
2604	vm_flags_set(vma, VM_SOFTDIRTY);
2605
2606	vma_set_page_prot(vma);
2607	}
2608
2609	static void call_action_prepare(struct mmap_state *map,
2610	struct vm_area_desc *desc)
2611	{
2612	struct mmap_action *action = &desc->action;
2613
2614	mmap_action_prepare(action, desc);
2615
2616	if (action->hide_from_rmap_until_complete)
2617	map->hold_file_rmap_lock = true;
2618	}
2619
2620	/*
2621	* Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
2622	* specifies it.
2623	*
2624	* This is called prior to any merge attempt, and updates whitelisted fields
2625	* that are permitted to be updated by the caller.
2626	*
2627	* All but user-defined fields will be pre-populated with original values.
2628	*
2629	* Returns 0 on success, or an error code otherwise.
2630	*/
2631	static int call_mmap_prepare(struct mmap_state *map,
2632	struct vm_area_desc *desc)
2633	{
2634	int err;
2635
2636	/ Invoke the hook. /
2637	err = vfs_mmap_prepare(file: map->file, desc);
2638	if (err)
2639	return err;
2640
2641	call_action_prepare(map, desc);
2642
2643	/ Update fields permitted to be changed. /
2644	map->pgoff = desc->pgoff;
2645	if (desc->vm_file != map->file) {
2646	map->file_doesnt_need_get = true;
2647	map->file = desc->vm_file;
2648	}
2649	map->vm_flags = desc->vm_flags;
2650	map->page_prot = desc->page_prot;
2651	/ User-defined fields. /
2652	map->vm_ops = desc->vm_ops;
2653	map->vm_private_data = desc->private_data;
2654
2655	return `0`;
2656	}
2657
2658	static void set_vma_user_defined_fields(struct vm_area_struct *vma,
2659	struct mmap_state *map)
2660	{
2661	if (map->vm_ops)
2662	vma->vm_ops = map->vm_ops;
2663	vma->vm_private_data = map->vm_private_data;
2664	}
2665
2666	/*
2667	* Are we guaranteed no driver can change state such as to preclude KSM merging?
2668	* If so, let's set the KSM mergeable flag early so we don't break VMA merging.
2669	*/
2670	static bool can_set_ksm_flags_early(struct mmap_state *map)
2671	{
2672	struct file *file = map->file;
2673
2674	/ Anonymous mappings have no driver which can change them. /
2675	if (!file)
2676	return true;
2677
2678	/*
2679	* If .mmap_prepare() is specified, then the driver will have already
2680	* manipulated state prior to updating KSM flags. So no need to worry
2681	* about mmap callbacks modifying VMA flags after the KSM flag has been
2682	* updated here, which could otherwise affect KSM eligibility.
2683	*/
2684	if (file->f_op->mmap_prepare)
2685	return true;
2686
2687	/ shmem is safe. /
2688	if (shmem_file(file))
2689	return true;
2690
2691	/ Any other .mmap callback is not safe. /
2692	return false;
2693	}
2694
2695	static int call_action_complete(struct mmap_state *map,
2696	struct vm_area_desc *desc,
2697	struct vm_area_struct *vma)
2698	{
2699	struct mmap_action *action = &desc->action;
2700	int ret;
2701
2702	ret = mmap_action_complete(action, vma);
2703
2704	/ If we held the file rmap we need to release it. /
2705	if (map->hold_file_rmap_lock) {
2706	struct file *file = vma->vm_file;
2707
2708	i_mmap_unlock_write(mapping: file->f_mapping);
2709	}
2710	return ret;
2711	}
2712
2713	static unsigned long __mmap_region(struct file file, unsigned* long addr,
2714	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
2715	struct list_head *uf)
2716	{
2717	struct mm_struct *mm = current->mm;
2718	struct vm_area_struct *vma = NULL;
2719	bool have_mmap_prepare = file && file->f_op->mmap_prepare;
2720	VMA_ITERATOR(vmi, mm, addr);
2721	MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
2722	struct vm_area_desc desc = {
2723	.mm = mm,
2724	.file = file,
2725	.action = {
2726	.type = MMAP_NOTHING, / Default to no further action. /
2727	},
2728	};
2729	bool allocated_new = false;
2730	int error;
2731
2732	map.check_ksm_early = can_set_ksm_flags_early(map: &map);
2733
2734	error = __mmap_setup(map: &map, desc: &desc, uf);
2735	if (!error && have_mmap_prepare)
2736	error = call_mmap_prepare(map: &map, desc: &desc);
2737	if (error)
2738	goto abort_munmap;
2739
2740	if (map.check_ksm_early)
2741	update_ksm_flags(map: &map);
2742
2743	/ Attempt to merge with adjacent VMAs... /
2744	if (map.prev \|\| map.next) {
2745	VMG_MMAP_STATE(vmg, &map, / vma = / NULL);
2746
2747	vma = vma_merge_new_range(vmg: &vmg);
2748	}
2749
2750	/ ...but if we can't, allocate a new VMA. /
2751	if (!vma) {
2752	error = __mmap_new_vma(map: &map, vmap: &vma);
2753	if (error)
2754	goto unacct_error;
2755	allocated_new = true;
2756	}
2757
2758	if (have_mmap_prepare)
2759	set_vma_user_defined_fields(vma, map: &map);
2760
2761	__mmap_complete(map: &map, vma);
2762
2763	if (have_mmap_prepare && allocated_new) {
2764	error = call_action_complete(map: &map, desc: &desc, vma);
2765
2766	if (error)
2767	return error;
2768	}
2769
2770	return addr;
2771
2772	/ Accounting was done by __mmap_setup(). /
2773	unacct_error:
2774	if (map.charged)
2775	vm_unacct_memory(pages: map.charged);
2776	abort_munmap:
2777	vms_abort_munmap_vmas(vms: &map.vms, mas_detach: &map.mas_detach);
2778	return error;
2779	}
2780
2781	/**
2782	* mmap_region() - Actually perform the userland mapping of a VMA into
2783	* current->mm with known, aligned and overflow-checked @addr and @len, and
2784	* correctly determined VMA flags @vm_flags and page offset @pgoff.
2785	*
2786	* This is an internal memory management function, and should not be used
2787	* directly.
2788	*
2789	* The caller must write-lock current->mm->mmap_lock.
2790	*
2791	* @file: If a file-backed mapping, a pointer to the struct file describing the
2792	* file to be mapped, otherwise NULL.
2793	* @addr: The page-aligned address at which to perform the mapping.
2794	* @len: The page-aligned, non-zero, length of the mapping.
2795	* @vm_flags: The VMA flags which should be applied to the mapping.
2796	* @pgoff: If @file is specified, the page offset into the file, if not then
2797	* the virtual page offset in memory of the anonymous mapping.
2798	* @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap
2799	* events.
2800	*
2801	* Returns: Either an error, or the address at which the requested mapping has
2802	* been performed.
2803	*/
2804	unsigned long mmap_region(struct file file, unsigned* long addr,
2805	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
2806	struct list_head *uf)
2807	{
2808	unsigned long ret;
2809	bool writable_file_mapping = false;
2810
2811	mmap_assert_write_locked(current->mm);
2812
2813	/ Check to see if MDWE is applicable. /
2814	if (map_deny_write_exec(old: vm_flags, new: vm_flags))
2815	return -EACCES;
2816
2817	/ Allow architectures to sanity-check the vm_flags. /
2818	if (!arch_validate_flags(flags: vm_flags))
2819	return -EINVAL;
2820
2821	/ Map writable and ensure this isn't a sealed memfd. /
2822	if (file && is_shared_maywrite(vm_flags)) {
2823	int error = mapping_map_writable(mapping: file->f_mapping);
2824
2825	if (error)
2826	return error;
2827	writable_file_mapping = true;
2828	}
2829
2830	ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
2831
2832	/ Clear our write mapping regardless of error. /
2833	if (writable_file_mapping)
2834	mapping_unmap_writable(mapping: file->f_mapping);
2835
2836	validate_mm(current->mm);
2837	return ret;
2838	}
2839
2840	/*
2841	* do_brk_flags() - Increase the brk vma if the flags match.
2842	* @vmi: The vma iterator
2843	* @addr: The start address
2844	* @len: The length of the increase
2845	* @vma: The vma,
2846	* @vm_flags: The VMA Flags
2847	*
2848	* Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
2849	* do not match then create a new anonymous VMA. Eventually we may be able to
2850	* do some brk-specific accounting here.
2851	*/
2852	int do_brk_flags(struct vma_iterator vmi, struct* vm_area_struct *vma,
2853	unsigned long addr, unsigned long len, vm_flags_t vm_flags)
2854	{
2855	struct mm_struct *mm = current->mm;
2856
2857	/*
2858	* Check against address space limits by the changed size
2859	* Note: This happens after clearing old mappings in some code paths.
2860	*/
2861	vm_flags \|= VM_DATA_DEFAULT_FLAGS \| VM_ACCOUNT \| mm->def_flags;
2862	vm_flags = ksm_vma_flags(mm, NULL, vm_flags);
2863	if (!may_expand_vm(mm, vm_flags, npages: len >> PAGE_SHIFT))
2864	return -ENOMEM;
2865
2866	if (mm->map_count > sysctl_max_map_count)
2867	return -ENOMEM;
2868
2869	if (security_vm_enough_memory_mm(mm, pages: len >> PAGE_SHIFT))
2870	return -ENOMEM;
2871
2872	/*
2873	* Expand the existing vma if possible; Note that singular lists do not
2874	* occur after forking, so the expand will only happen on new VMAs.
2875	*/
2876	if (vma && vma->vm_end == addr) {
2877	VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr));
2878
2879	vmg.prev = vma;
2880	/ vmi is positioned at prev, which this mode expects. /
2881	vmg.just_expand = true;
2882
2883	if (vma_merge_new_range(vmg: &vmg))
2884	goto out;
2885	else if (vmg_nomem(vmg: &vmg))
2886	goto unacct_fail;
2887	}
2888
2889	if (vma)
2890	vma_iter_next_range(vmi);
2891	/ create a vma struct for an anonymous mapping /
2892	vma = vm_area_alloc(mm);
2893	if (!vma)
2894	goto unacct_fail;
2895
2896	vma_set_anonymous(vma);
2897	vma_set_range(vma, start: addr, end: addr + len, pgoff: addr >> PAGE_SHIFT);
2898	vm_flags_init(vma, flags: vm_flags);
2899	vma->vm_page_prot = vm_get_page_prot(vm_flags);
2900	vma_start_write(vma);
2901	if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
2902	goto mas_store_fail;
2903
2904	mm->map_count++;
2905	validate_mm(mm);
2906	out:
2907	perf_event_mmap(vma);
2908	mm->total_vm += len >> PAGE_SHIFT;
2909	mm->data_vm += len >> PAGE_SHIFT;
2910	if (vm_flags & VM_LOCKED)
2911	mm->locked_vm += (len >> PAGE_SHIFT);
2912	if (pgtable_supports_soft_dirty())
2913	vm_flags_set(vma, VM_SOFTDIRTY);
2914	return `0`;
2915
2916	mas_store_fail:
2917	vm_area_free(vma);
2918	unacct_fail:
2919	vm_unacct_memory(pages: len >> PAGE_SHIFT);
2920	return -ENOMEM;
2921	}
2922
2923	/**
2924	* unmapped_area() - Find an area between the low_limit and the high_limit with
2925	* the correct alignment and offset, all from @info. Note: current->mm is used
2926	* for the search.
2927	*
2928	* @info: The unmapped area information including the range [low_limit -
2929	* high_limit), the alignment offset and mask.
2930	*
2931	* Return: A memory address or -ENOMEM.
2932	*/
2933	unsigned long unmapped_area(struct vm_unmapped_area_info *info)
2934	{
2935	unsigned long length, gap;
2936	unsigned long low_limit, high_limit;
2937	struct vm_area_struct *tmp;
2938	VMA_ITERATOR(vmi, current->mm, `0`);
2939
2940	/ Adjust search length to account for worst case alignment overhead /
2941	length = info->length + info->align_mask + info->start_gap;
2942	if (length < info->length)
2943	return -ENOMEM;
2944
2945	low_limit = info->low_limit;
2946	if (low_limit < mmap_min_addr)
2947	low_limit = mmap_min_addr;
2948	high_limit = info->high_limit;
2949	retry:
2950	if (vma_iter_area_lowest(vmi: &vmi, min: low_limit, max: high_limit, size: length))
2951	return -ENOMEM;
2952
2953	/*
2954	* Adjust for the gap first so it doesn't interfere with the
2955	* later alignment. The first step is the minimum needed to
2956	* fulill the start gap, the next steps is the minimum to align
2957	* that. It is the minimum needed to fulill both.
2958	*/
2959	gap = vma_iter_addr(vmi: &vmi) + info->start_gap;
2960	gap += (info->align_offset - gap) & info->align_mask;
2961	tmp = vma_next(vmi: &vmi);
2962	if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { / Avoid prev check if possible /
2963	if (vm_start_gap(vma: tmp) < gap + length - `1`) {
2964	low_limit = tmp->vm_end;
2965	vma_iter_reset(vmi: &vmi);
2966	goto retry;
2967	}
2968	} else {
2969	tmp = vma_prev(vmi: &vmi);
2970	if (tmp && vm_end_gap(vma: tmp) > gap) {
2971	low_limit = vm_end_gap(vma: tmp);
2972	vma_iter_reset(vmi: &vmi);
2973	goto retry;
2974	}
2975	}
2976
2977	return gap;
2978	}
2979
2980	/**
2981	* unmapped_area_topdown() - Find an area between the low_limit and the
2982	* high_limit with the correct alignment and offset at the highest available
2983	* address, all from @info. Note: current->mm is used for the search.
2984	*
2985	* @info: The unmapped area information including the range [low_limit -
2986	* high_limit), the alignment offset and mask.
2987	*
2988	* Return: A memory address or -ENOMEM.
2989	*/
2990	unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
2991	{
2992	unsigned long length, gap, gap_end;
2993	unsigned long low_limit, high_limit;
2994	struct vm_area_struct *tmp;
2995	VMA_ITERATOR(vmi, current->mm, `0`);
2996
2997	/ Adjust search length to account for worst case alignment overhead /
2998	length = info->length + info->align_mask + info->start_gap;
2999	if (length < info->length)
3000	return -ENOMEM;
3001
3002	low_limit = info->low_limit;
3003	if (low_limit < mmap_min_addr)
3004	low_limit = mmap_min_addr;
3005	high_limit = info->high_limit;
3006	retry:
3007	if (vma_iter_area_highest(vmi: &vmi, min: low_limit, max: high_limit, size: length))
3008	return -ENOMEM;
3009
3010	gap = vma_iter_end(vmi: &vmi) - info->length;
3011	gap -= (gap - info->align_offset) & info->align_mask;
3012	gap_end = vma_iter_end(vmi: &vmi);
3013	tmp = vma_next(vmi: &vmi);
3014	if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { / Avoid prev check if possible /
3015	if (vm_start_gap(vma: tmp) < gap_end) {
3016	high_limit = vm_start_gap(vma: tmp);
3017	vma_iter_reset(vmi: &vmi);
3018	goto retry;
3019	}
3020	} else {
3021	tmp = vma_prev(vmi: &vmi);
3022	if (tmp && vm_end_gap(vma: tmp) > gap) {
3023	high_limit = tmp->vm_start;
3024	vma_iter_reset(vmi: &vmi);
3025	goto retry;
3026	}
3027	}
3028
3029	return gap;
3030	}
3031
3032	/*
3033	* Verify that the stack growth is acceptable and
3034	* update accounting. This is shared with both the
3035	* grow-up and grow-down cases.
3036	*/
3037	static int acct_stack_growth(struct vm_area_struct *vma,
3038	unsigned long size, unsigned long grow)
3039	{
3040	struct mm_struct *mm = vma->vm_mm;
3041	unsigned long new_start;
3042
3043	/ address space limit tests /
3044	if (!may_expand_vm(mm, vma->vm_flags, npages: grow))
3045	return -ENOMEM;
3046
3047	/ Stack limit test /
3048	if (size > rlimit(RLIMIT_STACK))
3049	return -ENOMEM;
3050
3051	/ mlock limit tests /
3052	if (!mlock_future_ok(mm, vm_flags: vma->vm_flags, bytes: grow << PAGE_SHIFT))
3053	return -ENOMEM;
3054
3055	/ Check to ensure the stack will not grow into a hugetlb-only region /
3056	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
3057	vma->vm_end - size;
3058	if (is_hugepage_only_range(mm: vma->vm_mm, addr: new_start, len: size))
3059	return -EFAULT;
3060
3061	/*
3062	* Overcommit.. This must be the final test, as it will
3063	* update security statistics.
3064	*/
3065	if (security_vm_enough_memory_mm(mm, pages: grow))
3066	return -ENOMEM;
3067
3068	return `0`;
3069	}
3070
#if defined(CONFIG_STACK_GROWSUP)
/*
 * PA-RISC uses this for its stack.
 * vma is the last one with address > vma->vm_end. Have to extend vma.
 *
 * The mmap lock of vma->vm_mm is asserted to be held for writing.
 *
 * Return: 0 on success (including when the VMA already covers @address),
 * -EFAULT if @vma is not VM_GROWSUP, -ENOMEM if the address is out of range,
 * a non-growable accessible mapping sits within the guard gap, or an
 * allocation fails; otherwise the error from acct_stack_growth().
 */
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *next;
	unsigned long gap_addr;
	int error = 0;
	VMA_ITERATOR(vmi, mm, vma->vm_start);

	if (!(vma->vm_flags & VM_GROWSUP))
		return -EFAULT;

	mmap_assert_write_locked(mm);

	/* Guard against exceeding limits of the address space. */
	address &= PAGE_MASK;
	if (address >= (TASK_SIZE & PAGE_MASK))
		return -ENOMEM;
	/* From here on @address is the prospective new vm_end. */
	address += PAGE_SIZE;

	/* Enforce stack_guard_gap */
	gap_addr = address + stack_guard_gap;

	/* Guard against overflow */
	if (gap_addr < address || gap_addr > TASK_SIZE)
		gap_addr = TASK_SIZE;

	next = find_vma_intersection(mm, vma->vm_end, gap_addr);
	if (next && vma_is_accessible(next)) {
		if (!(next->vm_flags & VM_GROWSUP))
			return -ENOMEM;
		/* Check that both stack segments have the same anon_vma? */
	}

	if (next)
		vma_iter_prev_range_limit(&vmi, address);

	/* Preallocate for storing the VMA over [vm_start, address). */
	vma_iter_config(&vmi, vma->vm_start, address);
	if (vma_iter_prealloc(&vmi, vma))
		return -ENOMEM;

	/* We must make sure the anon_vma is allocated. */
	if (unlikely(anon_vma_prepare(vma))) {
		vma_iter_free(&vmi);
		return -ENOMEM;
	}

	/* Lock the VMA before expanding to prevent concurrent page faults */
	vma_start_write(vma);
	/* We update the anon VMA tree. */
	anon_vma_lock_write(vma->anon_vma);

	/* Somebody else might have raced and expanded it already */
	if (address > vma->vm_end) {
		unsigned long size, grow;

		size = address - vma->vm_start;
		grow = (address - vma->vm_end) >> PAGE_SHIFT;

		error = -ENOMEM;
		/* Refuse a size whose page count would wrap vm_pgoff. */
		if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
			error = acct_stack_growth(vma, size, grow);
			if (!error) {
				if (vma->vm_flags & VM_LOCKED)
					mm->locked_vm += grow;
				vm_stat_account(mm, vma->vm_flags, grow);
				anon_vma_interval_tree_pre_update_vma(vma);
				vma->vm_end = address;
				/* Overwrite old entry in mtree. */
				vma_iter_store_overwrite(&vmi, vma);
				anon_vma_interval_tree_post_update_vma(vma);

				perf_event_mmap(vma);
			}
		}
	}
	anon_vma_unlock_write(vma->anon_vma);
	vma_iter_free(&vmi);
	validate_mm(mm);
	return error;
}
#endif /* CONFIG_STACK_GROWSUP */
3157
3158	/*
3159	* vma is the first one with address < vma->vm_start. Have to extend vma.
3160	* mmap_lock held for writing.
3161	*/
3162	int expand_downwards(struct vm_area_struct vma, unsigned* long address)
3163	{
3164	struct mm_struct *mm = vma->vm_mm;
3165	struct vm_area_struct *prev;
3166	int error = `0`;
3167	VMA_ITERATOR(vmi, mm, vma->vm_start);
3168
3169	if (!(vma->vm_flags & VM_GROWSDOWN))
3170	return -EFAULT;
3171
3172	mmap_assert_write_locked(mm);
3173
3174	address &= PAGE_MASK;
3175	if (address < mmap_min_addr \|\| address < FIRST_USER_ADDRESS)
3176	return -EPERM;
3177
3178	/ Enforce stack_guard_gap /
3179	prev = vma_prev(vmi: &vmi);
3180	/ Check that both stack segments have the same anon_vma? /
3181	if (prev) {
3182	if (!(prev->vm_flags & VM_GROWSDOWN) &&
3183	vma_is_accessible(vma: prev) &&
3184	(address - prev->vm_end < stack_guard_gap))
3185	return -ENOMEM;
3186	}
3187
3188	if (prev)
3189	vma_iter_next_range_limit(vmi: &vmi, max: vma->vm_start);
3190
3191	vma_iter_config(vmi: &vmi, index: address, last: vma->vm_end);
3192	if (vma_iter_prealloc(vmi: &vmi, vma))
3193	return -ENOMEM;
3194
3195	/ We must make sure the anon_vma is allocated. /
3196	if (unlikely(anon_vma_prepare(vma))) {
3197	vma_iter_free(vmi: &vmi);
3198	return -ENOMEM;
3199	}
3200
3201	/ Lock the VMA before expanding to prevent concurrent page faults /
3202	vma_start_write(vma);
3203	/ We update the anon VMA tree. /
3204	anon_vma_lock_write(anon_vma: vma->anon_vma);
3205
3206	/ Somebody else might have raced and expanded it already /
3207	if (address < vma->vm_start) {
3208	unsigned long size, grow;
3209
3210	size = vma->vm_end - address;
3211	grow = (vma->vm_start - address) >> PAGE_SHIFT;
3212
3213	error = -ENOMEM;
3214	if (grow <= vma->vm_pgoff) {
3215	error = acct_stack_growth(vma, size, grow);
3216	if (!error) {
3217	if (vma->vm_flags & VM_LOCKED)
3218	mm->locked_vm += grow;
3219	vm_stat_account(mm, vma->vm_flags, npages: grow);
3220	anon_vma_interval_tree_pre_update_vma(vma);
3221	vma->vm_start = address;
3222	vma->vm_pgoff -= grow;
3223	/ Overwrite old entry in mtree. /
3224	vma_iter_store_overwrite(vmi: &vmi, vma);
3225	anon_vma_interval_tree_post_update_vma(vma);
3226
3227	perf_event_mmap(vma);
3228	}
3229	}
3230	}
3231	anon_vma_unlock_write(anon_vma: vma->anon_vma);
3232	vma_iter_free(vmi: &vmi);
3233	validate_mm(mm);
3234	return error;
3235	}
3236
3237	int __vm_munmap(unsigned long start, size_t len, bool unlock)
3238	{
3239	int ret;
3240	struct mm_struct *mm = current->mm;
3241	LIST_HEAD(uf);
3242	VMA_ITERATOR(vmi, mm, start);
3243
3244	if (mmap_write_lock_killable(mm))
3245	return -EINTR;
3246
3247	ret = do_vmi_munmap(vmi: &vmi, mm, start, len, uf: &uf, unlock);
3248	if (ret \|\| !unlock)
3249	mmap_write_unlock(mm);
3250
3251	userfaultfd_unmap_complete(mm, uf: &uf);
3252	return ret;
3253	}
3254
3255	/ Insert vm structure into process list sorted by address*
3256	* and into the inode's i_mmap tree. If vm_file is non-NULL
3257	* then i_mmap_rwsem is taken here.
3258	*/
3259	int insert_vm_struct(struct mm_struct mm, struct* vm_area_struct *vma)
3260	{
3261	unsigned long charged = vma_pages(vma);
3262
3263
3264	if (find_vma_intersection(mm, start_addr: vma->vm_start, end_addr: vma->vm_end))
3265	return -ENOMEM;
3266
3267	if ((vma->vm_flags & VM_ACCOUNT) &&
3268	security_vm_enough_memory_mm(mm, pages: charged))
3269	return -ENOMEM;
3270
3271	/*
3272	* The vm_pgoff of a purely anonymous vma should be irrelevant
3273	* until its first write fault, when page's anon_vma and index
3274	* are set. But now set the vm_pgoff it will almost certainly
3275	* end up with (unless mremap moves it elsewhere before that
3276	* first wfault), so /proc/pid/maps tells a consistent story.
3277	*
3278	* By setting it to reflect the virtual start address of the
3279	* vma, merges and splits can happen in a seamless way, just
3280	* using the existing file pgoff checks and manipulations.
3281	* Similarly in do_mmap and in do_brk_flags.
3282	*/
3283	if (vma_is_anonymous(vma)) {
3284	BUG_ON(vma->anon_vma);
3285	vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
3286	}
3287
3288	if (vma_link(mm, vma)) {
3289	if (vma->vm_flags & VM_ACCOUNT)
3290	vm_unacct_memory(pages: charged);
3291	return -ENOMEM;
3292	}
3293
3294	return `0`;
3295	}
3296

source code of linux/mm/vma.c