// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

struct mlock_fbatch {
	local_lock_t lock;
	struct folio_batch fbatch;
};

static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

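/*
 * can_do_mlock - may the current task lock memory at all?
 * Locking is permitted when RLIMIT_MEMLOCK is non-zero (some amount may be
 * locked, subject to that limit) or when the task has CAP_IPC_LOCK.
 */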
bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked folios are marked with the PG_mlocked flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked folio [folio_test_mlocked(folio)] is unevictable.  As such, it
 * will be ostensibly placed on the LRU "unevictable" list (actually no such
 * list exists), rather than the [in]active lists.  PG_unevictable is set to
 * indicate the unevictable state.
 */

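/*
 * __mlock_folio - move an already-on-LRU folio to the unevictable state.
 *
 * Called from mlock_folio_batch() with a reference held on the folio.  The
 * folio is briefly taken off the LRU (folio_test_clear_lru()), moved to the
 * unevictable list if necessary, and its mlock_count updated.  Returns the
 * lruvec that is now locked (possibly a different one than was passed in),
 * so the caller can batch lruvec lock/unlock across folios.
 */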
static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
{
	/* There is nothing more we can do while it's off LRU */
	if (!folio_test_clear_lru(folio))
		return lruvec;

	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	if (unlikely(folio_evictable(folio))) {
		/*
		 * This is a little surprising, but quite possible: PG_mlocked
		 * must have got cleared already by another CPU.  Could this
		 * folio be unevictable?  I'm not sure, but move it now if so.
		 */
		if (folio_test_unevictable(folio)) {
			lruvec_del_folio(lruvec, folio);
			folio_clear_unevictable(folio);
			lruvec_add_folio(lruvec, folio);

			__count_vm_events(UNEVICTABLE_PGRESCUED,
					  folio_nr_pages(folio));
		}
		goto out;
	}

	if (folio_test_unevictable(folio)) {
		if (folio_test_mlocked(folio))
			folio->mlock_count++;
		goto out;
	}

	lruvec_del_folio(lruvec, folio);
	folio_clear_active(folio);
	folio_set_unevictable(folio);
	folio->mlock_count = !!folio_test_mlocked(folio);
	lruvec_add_folio(lruvec, folio);
	__count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
	folio_set_lru(folio);
	return lruvec;
}

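/*
 * __mlock_new_folio - add a newly allocated folio, not yet on the LRU,
 * straight to the unevictable list (unless it has become evictable in the
 * meantime), and initialize its mlock_count.  Returns the locked lruvec
 * for batching, as above.
 */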
static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
{
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	/* As above, this is a little surprising, but possible */
	if (unlikely(folio_evictable(folio)))
		goto out;

	folio_set_unevictable(folio);
	folio->mlock_count = !!folio_test_mlocked(folio);
	__count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
	lruvec_add_folio(lruvec, folio);
	folio_set_lru(folio);
	return lruvec;
}

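/*
 * __munlock_folio - undo one mlock of a folio: decrement its mlock_count
 * where that is being maintained, and once no mlocks remain clear
 * PG_mlocked, adjust NR_MLOCK, and move the folio back to an evictable
 * list if it has become evictable.  Returns the locked lruvec for batching.
 */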
static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
{
	int nr_pages = folio_nr_pages(folio);
	bool isolated = false;

	if (!folio_test_clear_lru(folio))
		goto munlock;

	isolated = true;
	lruvec = folio_lruvec_relock_irq(folio, lruvec);

	if (folio_test_unevictable(folio)) {
		/* Then mlock_count is maintained, but might undercount */
		if (folio->mlock_count)
			folio->mlock_count--;
		if (folio->mlock_count)
			goto out;
	}
	/* else assume that was the last mlock: reclaim will fix it if not */

munlock:
	if (folio_test_clear_mlocked(folio)) {
		__zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
		if (isolated || !folio_test_unevictable(folio))
			__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		else
			__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}

	/* folio_evictable() has to be checked *after* clearing Mlocked */
	if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
		lruvec_del_folio(lruvec, folio);
		folio_clear_unevictable(folio);
		lruvec_add_folio(lruvec, folio);
		__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
	}
out:
	if (isolated)
		folio_set_lru(folio);
	return lruvec;
}

/*
 * Flags held in the low bits of a struct folio pointer on the mlock_fbatch.
 */
#define LRU_FOLIO 0x1
#define NEW_FOLIO 0x2
static inline struct folio *mlock_lru(struct folio *folio)
{
	return (struct folio *)((unsigned long)folio + LRU_FOLIO);
}

static inline struct folio *mlock_new(struct folio *folio)
{
	return (struct folio *)((unsigned long)folio + NEW_FOLIO);
}

/*
 * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can
 * make use of such folio pointer flags in future, but for now just keep it for
 * mlock.  We could use three separate folio batches instead, but one feels
 * better (munlocking a full folio batch does not need to drain mlocking folio
 * batches first).
 */
static void mlock_folio_batch(struct folio_batch *fbatch)
{
	struct lruvec *lruvec = NULL;
	unsigned long mlock;
	struct folio *folio;
	int i;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		folio = fbatch->folios[i];
		mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO);
		folio = (struct folio *)((unsigned long)folio - mlock);
		fbatch->folios[i] = folio;

		if (mlock & LRU_FOLIO)
			lruvec = __mlock_folio(folio, lruvec);
		else if (mlock & NEW_FOLIO)
			lruvec = __mlock_new_folio(folio, lruvec);
		else
			lruvec = __munlock_folio(folio, lruvec);
	}

	if (lruvec)
		unlock_page_lruvec_irq(lruvec);
	folios_put(fbatch->folios, folio_batch_count(fbatch));
	folio_batch_reinit(fbatch);
}

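/*
 * Draining of the per-CPU mlock folio batch.  mlock_drain_local() flushes
 * the current CPU's batch under the local lock; mlock_drain_remote() flushes
 * another (already offlined) CPU's batch; need_mlock_drain() reports whether
 * a CPU has entries pending.
 */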
void mlock_drain_local(void)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
	if (folio_batch_count(fbatch))
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

void mlock_drain_remote(int cpu)
{
	struct folio_batch *fbatch;

	WARN_ON_ONCE(cpu_online(cpu));
	fbatch = &per_cpu(mlock_fbatch.fbatch, cpu);
	if (folio_batch_count(fbatch))
		mlock_folio_batch(fbatch);
}

bool need_mlock_drain(int cpu)
{
	return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu));
}

/**
 * mlock_folio - mlock a folio already on (or temporarily off) LRU
 * @folio: folio to be mlocked.
 */
void mlock_folio(struct folio *folio)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

	if (!folio_test_set_mlocked(folio)) {
		int nr_pages = folio_nr_pages(folio);

		zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
		__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
	}

	folio_get(folio);
	if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
	    folio_test_large(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

/**
 * mlock_new_folio - mlock a newly allocated folio not yet on LRU
 * @folio: folio to be mlocked, either normal or a THP head.
 */
void mlock_new_folio(struct folio *folio)
{
	struct folio_batch *fbatch;
	int nr_pages = folio_nr_pages(folio);

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
	folio_set_mlocked(folio);

	zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
	__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);

	folio_get(folio);
	if (!folio_batch_add(fbatch, mlock_new(folio)) ||
	    folio_test_large(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

/**
 * munlock_folio - munlock a folio
 * @folio: folio to be munlocked, either normal or a THP head.
 */
void munlock_folio(struct folio *folio)
{
	struct folio_batch *fbatch;

	local_lock(&mlock_fbatch.lock);
	fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
	/*
	 * folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
	 * which will check whether the folio is multiply mlocked.
	 */
	folio_get(folio);
	if (!folio_batch_add(fbatch, folio) ||
	    folio_test_large(folio) || lru_cache_disabled())
		mlock_folio_batch(fbatch);
	local_unlock(&mlock_fbatch.lock);
}

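/*
 * folio_mlock_step - how many PTEs to advance over for this folio.
 *
 * For a small folio this is 1.  For a large folio, count the PTEs from
 * @pte onward that are present and still map pages of @folio, bounded by
 * the remaining pages of the folio and by @end, so the caller can process
 * the whole mapped run in one step.
 */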
static inline unsigned int folio_mlock_step(struct folio *folio,
		pte_t *pte, unsigned long addr, unsigned long end)
{
	unsigned int count, i, nr = folio_nr_pages(folio);
	unsigned long pfn = folio_pfn(folio);
	pte_t ptent = ptep_get(pte);

	if (!folio_test_large(folio))
		return 1;

	count = pfn + nr - pte_pfn(ptent);
	count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT);

	for (i = 0; i < count; i++, pte++) {
		pte_t entry = ptep_get(pte);

		if (!pte_present(entry))
			break;
		if (pte_pfn(entry) - pfn >= nr)
			break;
	}

	return i;
}

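/*
 * allow_mlock_munlock - decide whether this folio should be (un)locked here.
 * Munlock is always allowed; mlock of a large folio is skipped unless the
 * folio lies entirely within [start, end) of the VMA and is fully mapped.
 */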
static inline bool allow_mlock_munlock(struct folio *folio,
		struct vm_area_struct *vma, unsigned long start,
		unsigned long end, unsigned int step)
{
	/*
	 * For munlock, allow munlocking a large folio that is only partially
	 * mapped to the VMA: the folio may have been mlocked while fully
	 * mapped, and the VMA split afterwards.
	 *
	 * Under memory pressure such a large folio can be split, and the
	 * pages no longer covered by a VM_LOCKED VMA can then be reclaimed.
	 */
	if (!(vma->vm_flags & VM_LOCKED))
		return true;

	/* folio_within_range() cannot take KSM, but any small folio is OK */
	if (!folio_test_large(folio))
		return true;

	/* folio not in range [start, end), skip mlock */
	if (!folio_within_range(folio, vma, start, end))
		return false;

	/* folio is not fully mapped, skip mlock */
	if (step != folio_nr_pages(folio))
		return false;

	return true;
}

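/*
 * mlock_pte_range - pagewalk pmd_entry callback for mlock_vma_pages_range().
 * For a huge PMD, mlock or munlock the whole folio; otherwise walk the PTEs
 * and mlock or munlock each mapped folio according to VM_LOCKED, stepping
 * over large folios in one go via folio_mlock_step().
 */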
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte;
	pte_t ptent;
	struct folio *folio;
	unsigned int step = 1;
	unsigned long start = addr;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (!pmd_present(*pmd))
			goto out;
		if (is_huge_zero_pmd(*pmd))
			goto out;
		folio = page_folio(pmd_page(*pmd));
		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(folio);
		else
			munlock_folio(folio);
		goto out;
	}

	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}

	for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (!pte_present(ptent))
			continue;
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		step = folio_mlock_step(folio, pte, addr, end);
		if (!allow_mlock_munlock(folio, vma, start, end, step))
			goto next_entry;

		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(folio);
		else
			munlock_folio(folio);

next_entry:
		pte += step - 1;
		addr += (step - 1) << PAGE_SHIFT;
	}
	pte_unmap(start_pte);
out:
	spin_unlock(ptl);
	cond_resched();
	return 0;
}

/*
 * mlock_vma_pages_range() - mlock any pages already in the range,
 *                           or munlock all pages in the range.
 * @vma - vma containing range to be mlock()ed or munlock()ed
 * @start - start address in @vma of the range
 * @end - end of range in @vma
 * @newflags - the new set of flags for @vma.
 *
 * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
 * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
 */
static void mlock_vma_pages_range(struct vm_area_struct *vma,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	static const struct mm_walk_ops mlock_walk_ops = {
		.pmd_entry = mlock_pte_range,
		.walk_lock = PGWALK_WRLOCK_VERIFY,
	};

	/*
	 * There is a slight chance that concurrent page migration,
	 * or page reclaim finding a page of this now-VM_LOCKED vma,
	 * will call mlock_vma_folio() and raise page's mlock_count:
	 * double counting, leaving the page unevictable indefinitely.
	 * Communicate this danger to mlock_vma_folio() with VM_IO,
	 * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
	 * mmap_lock is held in write mode here, so this weird
	 * combination should not be visible to other mmap_lock users;
	 * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
	 */
	if (newflags & VM_LOCKED)
		newflags |= VM_IO;
	vma_start_write(vma);
	vm_flags_reset_once(vma, newflags);

	lru_add_drain();
	walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
	lru_add_drain();

	if (newflags & VM_IO) {
		newflags &= ~VM_IO;
		vm_flags_reset_once(vma, newflags);
	}
}

/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       struct vm_area_struct **prev, unsigned long start,
	       unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	int nr_pages;
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma) || vma_is_secretmem(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!(newflags & VM_LOCKED))
		nr_pages = -nr_pages;
	else if (oldflags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */
	if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
		/* No work to do, and mlocking twice would be wrong */
		vma_start_write(vma);
		vm_flags_reset(vma, newflags);
	} else {
		mlock_vma_pages_range(vma, start, end, newflags);
	}
out:
	*prev = vma;
	return ret;
}

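/*
 * apply_vma_lock_flags - walk all VMAs overlapping [start, start + len) and
 * apply the given VM_LOCKED/VM_LOCKONFAULT flags (or clear them when @flags
 * is 0) via mlock_fixup().  Returns -ENOMEM if the range contains a gap or
 * is not fully covered by VMAs.
 */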
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	VMA_ITERATOR(vmi, current->mm, start);

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = vma_iter_load(&vmi);
	if (!vma)
		return -ENOMEM;

	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	tmp = vma->vm_start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		vm_flags_t newflags;

		if (vma->vm_start != tmp)
			return -ENOMEM;

		newflags = vma->vm_flags & ~VM_LOCKED_MASK;
		newflags |= flags;
		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		tmp = vma_iter_end(&vmi);
		nstart = tmp;
	}

	if (tmp < end)
		return -ENOMEM;

	return 0;
}

/*
 * Go through the VMAs and sum up the size of already-mlocked pages, which is
 * the return value.  Note that the deferred memory locking case
 * (mlock2(,,MLOCK_ONFAULT)) is also counted.
 * Return value: number of previously mlocked pages.
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;
	unsigned long end;
	VMA_ITERATOR(vmi, mm, start);

	/* Don't overflow past ULONG_MAX */
	if (unlikely(ULONG_MAX - len < start))
		end = ULONG_MAX;
	else
		end = start + len;

	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (end < vma->vm_end) {
				count += end - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}

/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}

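/*
 * do_mlock - common implementation of mlock() and mlock2().
 *
 * Page-aligns the range, checks RLIMIT_MEMLOCK (adjusted for any part of the
 * range that is already mlocked) unless the caller has CAP_IPC_LOCK, applies
 * the VM_LOCKED flags to the VMAs under mmap_lock, then populates the range
 * with __mm_populate() and converts any fault error to a POSIX errno.
 */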
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * It is possible that the requested region intersects with
		 * previously mlocked areas; that part is already accounted
		 * in "mm->locked_vm" and should not be counted again toward
		 * the new mlock total.  So check and adjust the locked count
		 * if necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(current->mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}

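/*
 * System call entry points: mlock() locks the range with VM_LOCKED;
 * mlock2() additionally accepts MLOCK_ONFAULT to set VM_LOCKONFAULT so
 * pages are locked only as they are faulted in; munlock() clears both.
 */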
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}

SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}

SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
	VMA_ITERATOR(vmi, current->mm, 0);
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= ~VM_LOCKED_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for_each_vma(vmi, vma) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & ~VM_LOCKED_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
			    newflags);
		cond_resched();
	}
out:
	return 0;
}

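/*
 * mlockall() requires at least one of MCL_CURRENT and MCL_FUTURE; MCL_ONFAULT
 * alone is rejected.  When MCL_CURRENT is set, the whole total_vm is checked
 * against RLIMIT_MEMLOCK unless the caller has CAP_IPC_LOCK, and on success
 * the entire address space is populated.
 */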
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

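/*
 * user_shm_lock - charge @size bytes of locked shm against @ucounts.
 * Returns 1 and takes a ucounts reference if the charge fits within
 * RLIMIT_MEMLOCK (or the caller has CAP_IPC_LOCK); returns 0 and undoes
 * the charge otherwise.  user_shm_unlock() reverses the accounting.
 */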
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long lock_limit, locked;
	long memlock;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit != RLIM_INFINITY)
		lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

	if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	if (!get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		allowed = 0;
		goto out;
	}
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}