1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | #include <linux/mm.h> |
3 | #include <linux/slab.h> |
4 | #include <linux/string.h> |
5 | #include <linux/compiler.h> |
6 | #include <linux/export.h> |
7 | #include <linux/err.h> |
8 | #include <linux/sched.h> |
9 | #include <linux/sched/mm.h> |
10 | #include <linux/sched/signal.h> |
11 | #include <linux/sched/task_stack.h> |
12 | #include <linux/security.h> |
13 | #include <linux/swap.h> |
14 | #include <linux/swapops.h> |
15 | #include <linux/mman.h> |
16 | #include <linux/hugetlb.h> |
17 | #include <linux/vmalloc.h> |
18 | #include <linux/userfaultfd_k.h> |
19 | #include <linux/elf.h> |
20 | #include <linux/elf-randomize.h> |
21 | #include <linux/personality.h> |
22 | #include <linux/random.h> |
23 | #include <linux/processor.h> |
24 | #include <linux/sizes.h> |
25 | #include <linux/compat.h> |
26 | |
27 | #include <linux/uaccess.h> |
28 | |
29 | #include "internal.h" |
30 | #include "swap.h" |
31 | |
32 | /** |
33 | * kfree_const - conditionally free memory |
34 | * @x: pointer to the memory |
35 | * |
 * Function calls kfree() only if @x is not in the .rodata section.
37 | */ |
38 | void kfree_const(const void *x) |
39 | { |
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
42 | } |
43 | EXPORT_SYMBOL(kfree_const); |
44 | |
45 | /** |
46 | * kstrdup - allocate space for and copy an existing string |
47 | * @s: the string to duplicate |
48 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
49 | * |
50 | * Return: newly allocated copy of @s or %NULL in case of error |
51 | */ |
52 | noinline |
53 | char *kstrdup(const char *s, gfp_t gfp) |
54 | { |
55 | size_t len; |
56 | char *buf; |
57 | |
58 | if (!s) |
59 | return NULL; |
60 | |
61 | len = strlen(s) + 1; |
62 | buf = kmalloc_track_caller(len, gfp); |
63 | if (buf) |
64 | memcpy(buf, s, len); |
65 | return buf; |
66 | } |
67 | EXPORT_SYMBOL(kstrdup); |
68 | |
69 | /** |
70 | * kstrdup_const - conditionally duplicate an existing const string |
71 | * @s: the string to duplicate |
72 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
73 | * |
74 | * Note: Strings allocated by kstrdup_const should be freed by kfree_const and |
75 | * must not be passed to krealloc(). |
76 | * |
 * Return: the source string if it is in the .rodata section, otherwise a
 * newly allocated copy made with kstrdup().
79 | */ |
80 | const char *kstrdup_const(const char *s, gfp_t gfp) |
81 | { |
	if (is_kernel_rodata((unsigned long)s))
83 | return s; |
84 | |
85 | return kstrdup(s, gfp); |
86 | } |
87 | EXPORT_SYMBOL(kstrdup_const); |
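
/*
 * A minimal usage sketch of the kstrdup_const()/kfree_const() pairing, kept
 * here for illustration only; "attr" and the surrounding error handling are
 * hypothetical and not part of this file:
 *
 *	const char *name = kstrdup_const(attr->name, GFP_KERNEL);
 *
 *	if (!name)
 *		return -ENOMEM;
 *	...
 *	kfree_const(name);
 *
 * Because the duplicate may alias .rodata, it must be released with
 * kfree_const() and never passed to kfree() or krealloc().
 */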
88 | |
89 | /** |
90 | * kstrndup - allocate space for and copy an existing string |
91 | * @s: the string to duplicate |
92 | * @max: read at most @max chars from @s |
93 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
94 | * |
95 | * Note: Use kmemdup_nul() instead if the size is known exactly. |
96 | * |
97 | * Return: newly allocated copy of @s or %NULL in case of error |
98 | */ |
99 | char *kstrndup(const char *s, size_t max, gfp_t gfp) |
100 | { |
101 | size_t len; |
102 | char *buf; |
103 | |
104 | if (!s) |
105 | return NULL; |
106 | |
	len = strnlen(s, max);
108 | buf = kmalloc_track_caller(len+1, gfp); |
109 | if (buf) { |
110 | memcpy(buf, s, len); |
111 | buf[len] = '\0'; |
112 | } |
113 | return buf; |
114 | } |
115 | EXPORT_SYMBOL(kstrndup); |
116 | |
117 | /** |
118 | * kmemdup - duplicate region of memory |
119 | * |
120 | * @src: memory region to duplicate |
121 | * @len: memory region length |
122 | * @gfp: GFP mask to use |
123 | * |
124 | * Return: newly allocated copy of @src or %NULL in case of error, |
125 | * result is physically contiguous. Use kfree() to free. |
126 | */ |
127 | void *kmemdup(const void *src, size_t len, gfp_t gfp) |
128 | { |
129 | void *p; |
130 | |
131 | p = kmalloc_track_caller(len, gfp); |
132 | if (p) |
133 | memcpy(p, src, len); |
134 | return p; |
135 | } |
136 | EXPORT_SYMBOL(kmemdup); |
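
/*
 * Illustrative sketch of kmemdup() taking a private, kfree()-able copy of a
 * caller-provided template; "template" and "struct foo" are hypothetical:
 *
 *	struct foo *copy = kmemdup(template, sizeof(*template), GFP_KERNEL);
 *
 *	if (!copy)
 *		return -ENOMEM;
 *	...
 *	kfree(copy);
 */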
137 | |
138 | /** |
139 | * kvmemdup - duplicate region of memory |
140 | * |
141 | * @src: memory region to duplicate |
142 | * @len: memory region length |
143 | * @gfp: GFP mask to use |
144 | * |
145 | * Return: newly allocated copy of @src or %NULL in case of error, |
146 | * result may be not physically contiguous. Use kvfree() to free. |
147 | */ |
148 | void *kvmemdup(const void *src, size_t len, gfp_t gfp) |
149 | { |
150 | void *p; |
151 | |
	p = kvmalloc(len, gfp);
153 | if (p) |
154 | memcpy(p, src, len); |
155 | return p; |
156 | } |
157 | EXPORT_SYMBOL(kvmemdup); |
158 | |
159 | /** |
160 | * kmemdup_nul - Create a NUL-terminated string from unterminated data |
161 | * @s: The data to stringify |
162 | * @len: The size of the data |
163 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
164 | * |
165 | * Return: newly allocated copy of @s with NUL-termination or %NULL in |
166 | * case of error |
167 | */ |
168 | char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) |
169 | { |
170 | char *buf; |
171 | |
172 | if (!s) |
173 | return NULL; |
174 | |
175 | buf = kmalloc_track_caller(len + 1, gfp); |
176 | if (buf) { |
177 | memcpy(buf, s, len); |
178 | buf[len] = '\0'; |
179 | } |
180 | return buf; |
181 | } |
182 | EXPORT_SYMBOL(kmemdup_nul); |
183 | |
184 | /** |
185 | * memdup_user - duplicate memory region from user space |
186 | * |
187 | * @src: source address in user space |
188 | * @len: number of bytes to copy |
189 | * |
190 | * Return: an ERR_PTR() on failure. Result is physically |
191 | * contiguous, to be freed by kfree(). |
192 | */ |
193 | void *memdup_user(const void __user *src, size_t len) |
194 | { |
195 | void *p; |
196 | |
197 | p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); |
198 | if (!p) |
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
204 | } |
205 | |
206 | return p; |
207 | } |
208 | EXPORT_SYMBOL(memdup_user); |
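
/*
 * Illustrative sketch of the usual memdup_user() error handling in an
 * ioctl-style path; "uarg" and "size" are hypothetical:
 *
 *	void *buf = memdup_user(uarg, size);
 *
 *	if (IS_ERR(buf))
 *		return PTR_ERR(buf);
 *	...
 *	kfree(buf);
 *
 * Note that failure is reported via ERR_PTR(), not NULL.
 */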
209 | |
210 | /** |
211 | * vmemdup_user - duplicate memory region from user space |
212 | * |
213 | * @src: source address in user space |
214 | * @len: number of bytes to copy |
215 | * |
216 | * Return: an ERR_PTR() on failure. Result may be not |
217 | * physically contiguous. Use kvfree() to free. |
218 | */ |
219 | void *vmemdup_user(const void __user *src, size_t len) |
220 | { |
221 | void *p; |
222 | |
	p = kvmalloc(len, GFP_USER);
	if (!p)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kvfree(p);
		return ERR_PTR(-EFAULT);
230 | } |
231 | |
232 | return p; |
233 | } |
234 | EXPORT_SYMBOL(vmemdup_user); |
235 | |
236 | /** |
237 | * strndup_user - duplicate an existing string from user space |
238 | * @s: The string to duplicate |
239 | * @n: Maximum number of bytes to copy, including the trailing NUL. |
240 | * |
241 | * Return: newly allocated copy of @s or an ERR_PTR() in case of error |
242 | */ |
243 | char *strndup_user(const char __user *s, long n) |
244 | { |
245 | char *p; |
246 | long length; |
247 | |
	length = strnlen_user(s, n);

	if (!length)
		return ERR_PTR(-EFAULT);

	if (length > n)
		return ERR_PTR(-EINVAL);

	p = memdup_user(s, length);

	if (IS_ERR(p))
259 | return p; |
260 | |
261 | p[length - 1] = '\0'; |
262 | |
263 | return p; |
264 | } |
265 | EXPORT_SYMBOL(strndup_user); |
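
/*
 * Illustrative sketch of strndup_user() with a bounded copy; "uname" is a
 * hypothetical user pointer and PATH_MAX merely an example bound:
 *
 *	char *name = strndup_user(uname, PATH_MAX);
 *
 *	if (IS_ERR(name))
 *		return PTR_ERR(name);
 *	...
 *	kfree(name);
 */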
266 | |
267 | /** |
268 | * memdup_user_nul - duplicate memory region from user space and NUL-terminate |
269 | * |
270 | * @src: source address in user space |
271 | * @len: number of bytes to copy |
272 | * |
273 | * Return: an ERR_PTR() on failure. |
274 | */ |
275 | void *memdup_user_nul(const void __user *src, size_t len) |
276 | { |
277 | char *p; |
278 | |
279 | /* |
280 | * Always use GFP_KERNEL, since copy_from_user() can sleep and |
281 | * cause pagefault, which makes it pointless to use GFP_NOFS |
282 | * or GFP_ATOMIC. |
283 | */ |
284 | p = kmalloc_track_caller(len + 1, GFP_KERNEL); |
285 | if (!p) |
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(p, src, len)) {
		kfree(p);
		return ERR_PTR(-EFAULT);
291 | } |
292 | p[len] = '\0'; |
293 | |
294 | return p; |
295 | } |
296 | EXPORT_SYMBOL(memdup_user_nul); |
297 | |
298 | /* Check if the vma is being used as a stack by this task */ |
299 | int vma_is_stack_for_current(struct vm_area_struct *vma) |
300 | { |
301 | struct task_struct * __maybe_unused t = current; |
302 | |
	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
304 | } |
305 | |
306 | /* |
307 | * Change backing file, only valid to use during initial VMA setup. |
308 | */ |
309 | void vma_set_file(struct vm_area_struct *vma, struct file *file) |
310 | { |
311 | /* Changing an anonymous vma with this is illegal */ |
	get_file(file);
313 | swap(vma->vm_file, file); |
314 | fput(file); |
315 | } |
316 | EXPORT_SYMBOL(vma_set_file); |
317 | |
318 | #ifndef STACK_RND_MASK |
319 | #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ |
320 | #endif |
321 | |
322 | unsigned long randomize_stack_top(unsigned long stack_top) |
323 | { |
324 | unsigned long random_variable = 0; |
325 | |
326 | if (current->flags & PF_RANDOMIZE) { |
327 | random_variable = get_random_long(); |
328 | random_variable &= STACK_RND_MASK; |
329 | random_variable <<= PAGE_SHIFT; |
330 | } |
331 | #ifdef CONFIG_STACK_GROWSUP |
332 | return PAGE_ALIGN(stack_top) + random_variable; |
333 | #else |
334 | return PAGE_ALIGN(stack_top) - random_variable; |
335 | #endif |
336 | } |
337 | |
338 | /** |
339 | * randomize_page - Generate a random, page aligned address |
340 | * @start: The smallest acceptable address the caller will take. |
341 | * @range: The size of the area, starting at @start, within which the |
342 | * random address must fall. |
343 | * |
344 | * If @start + @range would overflow, @range is capped. |
345 | * |
346 | * NOTE: Historical use of randomize_range, which this replaces, presumed that |
347 | * @start was already page aligned. We now align it regardless. |
348 | * |
349 | * Return: A page aligned address within [start, start + range). On error, |
350 | * @start is returned. |
351 | */ |
352 | unsigned long randomize_page(unsigned long start, unsigned long range) |
353 | { |
354 | if (!PAGE_ALIGNED(start)) { |
355 | range -= PAGE_ALIGN(start) - start; |
356 | start = PAGE_ALIGN(start); |
357 | } |
358 | |
359 | if (start > ULONG_MAX - range) |
360 | range = ULONG_MAX - start; |
361 | |
362 | range >>= PAGE_SHIFT; |
363 | |
364 | if (range == 0) |
365 | return start; |
366 | |
367 | return start + (get_random_long() % range << PAGE_SHIFT); |
368 | } |
369 | |
370 | #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT |
371 | unsigned long __weak arch_randomize_brk(struct mm_struct *mm) |
372 | { |
	/* Is the current task 32-bit? */
374 | if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) |
375 | return randomize_page(mm->brk, SZ_32M); |
376 | |
377 | return randomize_page(mm->brk, SZ_1G); |
378 | } |
379 | |
380 | unsigned long arch_mmap_rnd(void) |
381 | { |
382 | unsigned long rnd; |
383 | |
384 | #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS |
385 | if (is_compat_task()) |
386 | rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); |
387 | else |
388 | #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ |
389 | rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); |
390 | |
391 | return rnd << PAGE_SHIFT; |
392 | } |
393 | |
394 | static int mmap_is_legacy(struct rlimit *rlim_stack) |
395 | { |
396 | if (current->personality & ADDR_COMPAT_LAYOUT) |
397 | return 1; |
398 | |
	/* On parisc the stack always grows up - so an unlimited stack should
	 * not be an indicator to use the legacy memory layout. */
401 | if (rlim_stack->rlim_cur == RLIM_INFINITY && |
402 | !IS_ENABLED(CONFIG_STACK_GROWSUP)) |
403 | return 1; |
404 | |
405 | return sysctl_legacy_va_layout; |
406 | } |
407 | |
408 | /* |
409 | * Leave enough space between the mmap area and the stack to honour ulimit in |
410 | * the face of randomisation. |
411 | */ |
412 | #define MIN_GAP (SZ_128M) |
413 | #define MAX_GAP (STACK_TOP / 6 * 5) |
414 | |
415 | static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) |
416 | { |
417 | unsigned long gap = rlim_stack->rlim_cur; |
418 | unsigned long pad = stack_guard_gap; |
419 | |
420 | /* Account for stack randomization if necessary */ |
421 | if (current->flags & PF_RANDOMIZE) |
422 | pad += (STACK_RND_MASK << PAGE_SHIFT); |
423 | |
424 | /* Values close to RLIM_INFINITY can overflow. */ |
425 | if (gap + pad > gap) |
426 | gap += pad; |
427 | |
428 | if (gap < MIN_GAP) |
429 | gap = MIN_GAP; |
430 | else if (gap > MAX_GAP) |
431 | gap = MAX_GAP; |
432 | |
433 | return PAGE_ALIGN(STACK_TOP - gap - rnd); |
434 | } |
435 | |
436 | void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) |
437 | { |
438 | unsigned long random_factor = 0UL; |
439 | |
440 | if (current->flags & PF_RANDOMIZE) |
441 | random_factor = arch_mmap_rnd(); |
442 | |
443 | if (mmap_is_legacy(rlim_stack)) { |
444 | mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; |
445 | mm->get_unmapped_area = arch_get_unmapped_area; |
446 | } else { |
447 | mm->mmap_base = mmap_base(random_factor, rlim_stack); |
448 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; |
449 | } |
450 | } |
451 | #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
452 | void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) |
453 | { |
454 | mm->mmap_base = TASK_UNMAPPED_BASE; |
455 | mm->get_unmapped_area = arch_get_unmapped_area; |
456 | } |
457 | #endif |
458 | |
459 | /** |
460 | * __account_locked_vm - account locked pages to an mm's locked_vm |
461 | * @mm: mm to account against |
462 | * @pages: number of pages to account |
463 | * @inc: %true if @pages should be considered positive, %false if not |
464 | * @task: task used to check RLIMIT_MEMLOCK |
465 | * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped |
466 | * |
467 | * Assumes @task and @mm are valid (i.e. at least one reference on each), and |
468 | * that mmap_lock is held as writer. |
469 | * |
470 | * Return: |
471 | * * 0 on success |
472 | * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. |
473 | */ |
474 | int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, |
475 | struct task_struct *task, bool bypass_rlim) |
476 | { |
477 | unsigned long locked_vm, limit; |
478 | int ret = 0; |
479 | |
480 | mmap_assert_write_locked(mm); |
481 | |
482 | locked_vm = mm->locked_vm; |
483 | if (inc) { |
484 | if (!bypass_rlim) { |
485 | limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; |
486 | if (locked_vm + pages > limit) |
487 | ret = -ENOMEM; |
488 | } |
489 | if (!ret) |
490 | mm->locked_vm = locked_vm + pages; |
491 | } else { |
492 | WARN_ON_ONCE(pages > locked_vm); |
493 | mm->locked_vm = locked_vm - pages; |
494 | } |
495 | |
	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");
500 | |
501 | return ret; |
502 | } |
503 | EXPORT_SYMBOL_GPL(__account_locked_vm); |
504 | |
505 | /** |
506 | * account_locked_vm - account locked pages to an mm's locked_vm |
507 | * @mm: mm to account against, may be NULL |
508 | * @pages: number of pages to account |
509 | * @inc: %true if @pages should be considered positive, %false if not |
510 | * |
511 | * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). |
512 | * |
513 | * Return: |
514 | * * 0 on success, or if mm is NULL |
515 | * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. |
516 | */ |
517 | int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) |
518 | { |
519 | int ret; |
520 | |
521 | if (pages == 0 || !mm) |
522 | return 0; |
523 | |
524 | mmap_write_lock(mm); |
525 | ret = __account_locked_vm(mm, pages, inc, current, |
526 | capable(CAP_IPC_LOCK)); |
527 | mmap_write_unlock(mm); |
528 | |
529 | return ret; |
530 | } |
531 | EXPORT_SYMBOL_GPL(account_locked_vm); |
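
/*
 * Illustrative sketch of symmetric locked_vm accounting around a pinning
 * operation; "npages" and the pin/unpin steps are hypothetical:
 *
 *	if (account_locked_vm(current->mm, npages, true))
 *		return -ENOMEM;
 *	... pin the pages ...
 *	... later, when unpinning ...
 *	account_locked_vm(current->mm, npages, false);
 */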
532 | |
533 | unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, |
534 | unsigned long len, unsigned long prot, |
535 | unsigned long flag, unsigned long pgoff) |
536 | { |
537 | unsigned long ret; |
538 | struct mm_struct *mm = current->mm; |
539 | unsigned long populate; |
540 | LIST_HEAD(uf); |
541 | |
	ret = security_mmap_file(file, prot, flag);
543 | if (!ret) { |
544 | if (mmap_write_lock_killable(mm)) |
545 | return -EINTR; |
		ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
552 | } |
553 | return ret; |
554 | } |
555 | |
556 | unsigned long vm_mmap(struct file *file, unsigned long addr, |
557 | unsigned long len, unsigned long prot, |
558 | unsigned long flag, unsigned long offset) |
559 | { |
560 | if (unlikely(offset + PAGE_ALIGN(len) < offset)) |
561 | return -EINVAL; |
562 | if (unlikely(offset_in_page(offset))) |
563 | return -EINVAL; |
564 | |
	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
566 | } |
567 | EXPORT_SYMBOL(vm_mmap); |
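
/*
 * Illustrative sketch of an in-kernel mapping via vm_mmap(); "file" and
 * "size" are hypothetical:
 *
 *	unsigned long addr;
 *
 *	addr = vm_mmap(file, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED, 0);
 *	if (IS_ERR_VALUE(addr))
 *		return addr;
 *
 * On failure, addr holds a negative errno value rather than a mapping.
 */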
568 | |
569 | /** |
570 | * kvmalloc_node - attempt to allocate physically contiguous memory, but upon |
571 | * failure, fall back to non-contiguous (vmalloc) allocation. |
572 | * @size: size of the request. |
573 | * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. |
574 | * @node: numa node to allocate from |
575 | * |
576 | * Uses kmalloc to get the memory but if the allocation fails then falls back |
577 | * to the vmalloc allocator. Use kvfree for freeing the memory. |
578 | * |
 * GFP_NOWAIT and GFP_ATOMIC are not supported, and neither is the __GFP_NORETRY modifier.
580 | * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is |
581 | * preferable to the vmalloc fallback, due to visible performance drawbacks. |
582 | * |
 * Return: pointer to the allocated memory or %NULL in case of failure
584 | */ |
585 | void *kvmalloc_node(size_t size, gfp_t flags, int node) |
586 | { |
587 | gfp_t kmalloc_flags = flags; |
588 | void *ret; |
589 | |
590 | /* |
591 | * We want to attempt a large physically contiguous block first because |
592 | * it is less likely to fragment multiple larger blocks and therefore |
	 * contributes less to long-term fragmentation than the vmalloc fallback.
594 | * However make sure that larger requests are not too disruptive - no |
595 | * OOM killer and no allocation failure warnings as we have a fallback. |
596 | */ |
597 | if (size > PAGE_SIZE) { |
598 | kmalloc_flags |= __GFP_NOWARN; |
599 | |
600 | if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) |
601 | kmalloc_flags |= __GFP_NORETRY; |
602 | |
603 | /* nofail semantic is implemented by the vmalloc fallback */ |
604 | kmalloc_flags &= ~__GFP_NOFAIL; |
605 | } |
606 | |
	ret = kmalloc_node(size, kmalloc_flags, node);
608 | |
609 | /* |
	 * It doesn't really make sense to fall back to vmalloc for sub page
611 | * requests |
612 | */ |
613 | if (ret || size <= PAGE_SIZE) |
614 | return ret; |
615 | |
616 | /* non-sleeping allocations are not supported by vmalloc */ |
	if (!gfpflags_allow_blocking(flags))
618 | return NULL; |
619 | |
620 | /* Don't even allow crazy sizes */ |
621 | if (unlikely(size > INT_MAX)) { |
622 | WARN_ON_ONCE(!(flags & __GFP_NOWARN)); |
623 | return NULL; |
624 | } |
625 | |
626 | /* |
627 | * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, |
628 | * since the callers already cannot assume anything |
629 | * about the resulting pointer, and cannot play |
630 | * protection games. |
631 | */ |
	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
			node, __builtin_return_address(0));
635 | } |
636 | EXPORT_SYMBOL(kvmalloc_node); |
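
/*
 * Illustrative sketch of the kvmalloc_array()/kvfree() pattern built on top
 * of this helper; "nr" and "struct entry" are hypothetical:
 *
 *	struct entry *table = kvmalloc_array(nr, sizeof(*table), GFP_KERNEL);
 *
 *	if (!table)
 *		return -ENOMEM;
 *	...
 *	kvfree(table);
 *
 * The caller must not assume the memory is physically contiguous, since the
 * allocation may have been satisfied by the vmalloc fallback.
 */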
637 | |
638 | /** |
639 | * kvfree() - Free memory. |
640 | * @addr: Pointer to allocated memory. |
641 | * |
642 | * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). |
643 | * It is slightly more efficient to use kfree() or vfree() if you are certain |
644 | * that you know which one to use. |
645 | * |
646 | * Context: Either preemptible task context or not-NMI interrupt. |
647 | */ |
648 | void kvfree(const void *addr) |
649 | { |
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
654 | } |
655 | EXPORT_SYMBOL(kvfree); |
656 | |
657 | /** |
658 | * kvfree_sensitive - Free a data object containing sensitive information. |
659 | * @addr: address of the data object to be freed. |
660 | * @len: length of the data object. |
661 | * |
662 | * Use the special memzero_explicit() function to clear the content of a |
663 | * kvmalloc'ed object containing sensitive data to make sure that the |
664 | * compiler won't optimize out the data clearing. |
665 | */ |
666 | void kvfree_sensitive(const void *addr, size_t len) |
667 | { |
668 | if (likely(!ZERO_OR_NULL_PTR(addr))) { |
		memzero_explicit((void *)addr, len);
670 | kvfree(addr); |
671 | } |
672 | } |
673 | EXPORT_SYMBOL(kvfree_sensitive); |
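
/*
 * Illustrative sketch of freeing key material with kvfree_sensitive();
 * "key" and "key_len" are hypothetical:
 *
 *	u8 *key = kvmalloc(key_len, GFP_KERNEL);
 *	...
 *	kvfree_sensitive(key, key_len);
 *
 * The explicit zeroing keeps the secret from lingering in freed memory.
 */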
674 | |
675 | void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) |
676 | { |
677 | void *newp; |
678 | |
679 | if (oldsize >= newsize) |
680 | return (void *)p; |
	newp = kvmalloc(newsize, flags);
682 | if (!newp) |
683 | return NULL; |
684 | memcpy(newp, p, oldsize); |
685 | kvfree(p); |
686 | return newp; |
687 | } |
688 | EXPORT_SYMBOL(kvrealloc); |
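
/*
 * Illustrative sketch of growing a kvmalloc'ed buffer with kvrealloc();
 * "buf", "old_size" and "new_size" are hypothetical:
 *
 *	void *bigger = kvrealloc(buf, old_size, new_size, GFP_KERNEL);
 *
 *	if (!bigger)
 *		return -ENOMEM;
 *	buf = bigger;
 *
 * On failure the original buffer is left untouched and still valid, so the
 * caller keeps using (or frees) the old pointer.
 */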
689 | |
690 | /** |
691 | * __vmalloc_array - allocate memory for a virtually contiguous array. |
692 | * @n: number of elements. |
693 | * @size: element size. |
694 | * @flags: the type of memory to allocate (see kmalloc). |
695 | */ |
696 | void *__vmalloc_array(size_t n, size_t size, gfp_t flags) |
697 | { |
698 | size_t bytes; |
699 | |
700 | if (unlikely(check_mul_overflow(n, size, &bytes))) |
701 | return NULL; |
	return __vmalloc(bytes, flags);
703 | } |
704 | EXPORT_SYMBOL(__vmalloc_array); |
705 | |
706 | /** |
707 | * vmalloc_array - allocate memory for a virtually contiguous array. |
708 | * @n: number of elements. |
709 | * @size: element size. |
710 | */ |
711 | void *vmalloc_array(size_t n, size_t size) |
712 | { |
713 | return __vmalloc_array(n, size, GFP_KERNEL); |
714 | } |
715 | EXPORT_SYMBOL(vmalloc_array); |
716 | |
717 | /** |
718 | * __vcalloc - allocate and zero memory for a virtually contiguous array. |
719 | * @n: number of elements. |
720 | * @size: element size. |
721 | * @flags: the type of memory to allocate (see kmalloc). |
722 | */ |
723 | void *__vcalloc(size_t n, size_t size, gfp_t flags) |
724 | { |
725 | return __vmalloc_array(n, size, flags | __GFP_ZERO); |
726 | } |
727 | EXPORT_SYMBOL(__vcalloc); |
728 | |
729 | /** |
730 | * vcalloc - allocate and zero memory for a virtually contiguous array. |
731 | * @n: number of elements. |
732 | * @size: element size. |
733 | */ |
734 | void *vcalloc(size_t n, size_t size) |
735 | { |
736 | return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); |
737 | } |
738 | EXPORT_SYMBOL(vcalloc); |
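
/*
 * Illustrative sketch of a zeroed, virtually contiguous array allocation;
 * "nr_items" and "struct item" are hypothetical:
 *
 *	struct item *items = vcalloc(nr_items, sizeof(*items));
 *
 *	if (!items)
 *		return -ENOMEM;
 *	...
 *	vfree(items);
 *
 * The multiplication is overflow-checked, so an attacker-influenced
 * "nr_items" cannot silently wrap the allocation size.
 */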
739 | |
740 | struct anon_vma *folio_anon_vma(struct folio *folio) |
741 | { |
742 | unsigned long mapping = (unsigned long)folio->mapping; |
743 | |
744 | if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
745 | return NULL; |
746 | return (void *)(mapping - PAGE_MAPPING_ANON); |
747 | } |
748 | |
749 | /** |
750 | * folio_mapping - Find the mapping where this folio is stored. |
751 | * @folio: The folio. |
752 | * |
753 | * For folios which are in the page cache, return the mapping that this |
754 | * page belongs to. Folios in the swap cache return the swap mapping |
755 | * this page is stored in (which is different from the mapping for the |
756 | * swap file or swap device where the data is stored). |
757 | * |
758 | * You can call this for folios which aren't in the swap cache or page |
759 | * cache and it will return NULL. |
760 | */ |
761 | struct address_space *folio_mapping(struct folio *folio) |
762 | { |
763 | struct address_space *mapping; |
764 | |
	/* This happens if someone calls flush_dcache_page on a slab page */
766 | if (unlikely(folio_test_slab(folio))) |
767 | return NULL; |
768 | |
769 | if (unlikely(folio_test_swapcache(folio))) |
770 | return swap_address_space(folio->swap); |
771 | |
772 | mapping = folio->mapping; |
773 | if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) |
774 | return NULL; |
775 | |
776 | return mapping; |
777 | } |
778 | EXPORT_SYMBOL(folio_mapping); |
779 | |
780 | /** |
781 | * folio_copy - Copy the contents of one folio to another. |
782 | * @dst: Folio to copy to. |
783 | * @src: Folio to copy from. |
784 | * |
785 | * The bytes in the folio represented by @src are copied to @dst. |
786 | * Assumes the caller has validated that @dst is at least as large as @src. |
787 | * Can be called in atomic context for order-0 folios, but if the folio is |
788 | * larger, it may sleep. |
789 | */ |
790 | void folio_copy(struct folio *dst, struct folio *src) |
791 | { |
792 | long i = 0; |
	long nr = folio_nr_pages(src);
794 | |
795 | for (;;) { |
796 | copy_highpage(folio_page(dst, i), folio_page(src, i)); |
797 | if (++i == nr) |
798 | break; |
799 | cond_resched(); |
800 | } |
801 | } |
802 | EXPORT_SYMBOL(folio_copy); |
803 | |
804 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; |
805 | int sysctl_overcommit_ratio __read_mostly = 50; |
806 | unsigned long sysctl_overcommit_kbytes __read_mostly; |
807 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
808 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
809 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ |
810 | |
811 | int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, |
812 | size_t *lenp, loff_t *ppos) |
813 | { |
814 | int ret; |
815 | |
816 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
817 | if (ret == 0 && write) |
818 | sysctl_overcommit_kbytes = 0; |
819 | return ret; |
820 | } |
821 | |
822 | static void sync_overcommit_as(struct work_struct *dummy) |
823 | { |
	percpu_counter_sync(&vm_committed_as);
825 | } |
826 | |
827 | int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, |
828 | size_t *lenp, loff_t *ppos) |
829 | { |
830 | struct ctl_table t; |
831 | int new_policy = -1; |
832 | int ret; |
833 | |
834 | /* |
835 | * The deviation of sync_overcommit_as could be big with loose policy |
836 | * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to |
837 | * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply |
	 * with the strict "NEVER", and to avoid a possible race condition (even
	 * though users usually won't switch to OVERCOMMIT_NEVER very often),
	 * the switch is done in the following order:
841 | * 1. changing the batch |
842 | * 2. sync percpu count on each CPU |
843 | * 3. switch the policy |
844 | */ |
845 | if (write) { |
846 | t = *table; |
847 | t.data = &new_policy; |
848 | ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); |
849 | if (ret || new_policy == -1) |
850 | return ret; |
851 | |
		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
855 | sysctl_overcommit_memory = new_policy; |
856 | } else { |
857 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
858 | } |
859 | |
860 | return ret; |
861 | } |
862 | |
863 | int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, |
864 | size_t *lenp, loff_t *ppos) |
865 | { |
866 | int ret; |
867 | |
868 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
869 | if (ret == 0 && write) |
870 | sysctl_overcommit_ratio = 0; |
871 | return ret; |
872 | } |
873 | |
874 | /* |
875 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used |
876 | */ |
877 | unsigned long vm_commit_limit(void) |
878 | { |
879 | unsigned long allowed; |
880 | |
881 | if (sysctl_overcommit_kbytes) |
882 | allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); |
883 | else |
884 | allowed = ((totalram_pages() - hugetlb_total_pages()) |
885 | * sysctl_overcommit_ratio / 100); |
886 | allowed += total_swap_pages; |
887 | |
888 | return allowed; |
889 | } |
890 | |
891 | /* |
 * Make sure vm_committed_as is in one cacheline and does not share a
 * cacheline with other variables. It can be updated by several CPUs frequently.
894 | */ |
895 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; |
896 | |
897 | /* |
898 | * The global memory commitment made in the system can be a metric |
899 | * that can be used to drive ballooning decisions when Linux is hosted |
900 | * as a guest. On Hyper-V, the host implements a policy engine for dynamically |
901 | * balancing memory across competing virtual machines that are hosted. |
902 | * Several metrics drive this policy engine including the guest reported |
903 | * memory commitment. |
904 | * |
905 | * The time cost of this is very low for small platforms, and for big |
 * platforms like a 2S/36C/72T Skylake server, in the worst case where
907 | * vm_committed_as's spinlock is under severe contention, the time cost |
908 | * could be about 30~40 microseconds. |
909 | */ |
910 | unsigned long vm_memory_committed(void) |
911 | { |
	return percpu_counter_sum_positive(&vm_committed_as);
913 | } |
914 | EXPORT_SYMBOL_GPL(vm_memory_committed); |
915 | |
916 | /* |
917 | * Check that a process has enough memory to allocate a new virtual |
918 | * mapping. 0 means there is enough memory for the allocation to |
919 | * succeed and -ENOMEM implies there is not. |
920 | * |
921 | * We currently support three overcommit policies, which are set via the |
922 | * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst |
923 | * |
924 | * Strict overcommit modes added 2002 Feb 26 by Alan Cox. |
925 | * Additional code 2002 Jul 20 by Robert Love. |
926 | * |
927 | * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. |
928 | * |
929 | * Note this is a helper function intended to be used by LSMs which |
930 | * wish to use this logic. |
931 | */ |
932 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
933 | { |
934 | long allowed; |
935 | |
936 | vm_acct_memory(pages); |
937 | |
938 | /* |
939 | * Sometimes we want to use more memory than we have |
940 | */ |
941 | if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) |
942 | return 0; |
943 | |
944 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
945 | if (pages > totalram_pages() + total_swap_pages) |
946 | goto error; |
947 | return 0; |
948 | } |
949 | |
950 | allowed = vm_commit_limit(); |
951 | /* |
952 | * Reserve some for root |
953 | */ |
954 | if (!cap_sys_admin) |
955 | allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); |
956 | |
957 | /* |
958 | * Don't let a single process grow so big a user can't recover |
959 | */ |
960 | if (mm) { |
961 | long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
962 | |
963 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
964 | } |
965 | |
	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
967 | return 0; |
968 | error: |
	pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
970 | __func__, current->pid, current->comm); |
971 | vm_unacct_memory(pages); |
972 | |
973 | return -ENOMEM; |
974 | } |
975 | |
976 | /** |
977 | * get_cmdline() - copy the cmdline value to a buffer. |
978 | * @task: the task whose cmdline value to copy. |
979 | * @buffer: the buffer to copy to. |
980 | * @buflen: the length of the buffer. Larger cmdline values are truncated |
981 | * to this length. |
982 | * |
983 | * Return: the size of the cmdline field copied. Note that the copy does |
 * not guarantee an ending NUL byte.
985 | */ |
986 | int get_cmdline(struct task_struct *task, char *buffer, int buflen) |
987 | { |
988 | int res = 0; |
989 | unsigned int len; |
990 | struct mm_struct *mm = get_task_mm(task); |
991 | unsigned long arg_start, arg_end, env_start, env_end; |
992 | if (!mm) |
993 | goto out; |
994 | if (!mm->arg_end) |
995 | goto out_mm; /* Shh! No looking before we're done */ |
996 | |
	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);
1003 | |
1004 | len = arg_end - arg_start; |
1005 | |
1006 | if (len > buflen) |
1007 | len = buflen; |
1008 | |
	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
1010 | |
1011 | /* |
1012 | * If the nul at the end of args has been overwritten, then |
	 * assume the application is using setproctitle(3).
1014 | */ |
1015 | if (res > 0 && buffer[res-1] != '\0' && len < buflen) { |
		len = strnlen(buffer, res);
1017 | if (len < res) { |
1018 | res = len; |
1019 | } else { |
1020 | len = env_end - env_start; |
1021 | if (len > buflen - res) |
1022 | len = buflen - res; |
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
1027 | } |
1028 | } |
1029 | out_mm: |
1030 | mmput(mm); |
1031 | out: |
1032 | return res; |
1033 | } |
1034 | |
1035 | int __weak memcmp_pages(struct page *page1, struct page *page2) |
1036 | { |
1037 | char *addr1, *addr2; |
1038 | int ret; |
1039 | |
	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
1043 | kunmap_atomic(addr2); |
1044 | kunmap_atomic(addr1); |
1045 | return ret; |
1046 | } |
1047 | |
1048 | #ifdef CONFIG_PRINTK |
1049 | /** |
1050 | * mem_dump_obj - Print available provenance information |
1051 | * @object: object for which to find provenance information. |
1052 | * |
1053 | * This function uses pr_cont(), so that the caller is expected to have |
1054 | * printed out whatever preamble is appropriate. The provenance information |
1055 | * depends on the type of object and on how much debugging is enabled. |
1056 | * For example, for a slab-cache object, the slab name is printed, and, |
1057 | * if available, the return address and stack trace from the allocation |
1058 | * and last free path of that object. |
1059 | */ |
1060 | void mem_dump_obj(void *object) |
1061 | { |
1062 | const char *type; |
1063 | |
1064 | if (kmem_dump_obj(object)) |
1065 | return; |
1066 | |
1067 | if (vmalloc_dump_obj(object)) |
1068 | return; |
1069 | |
	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
1082 | } |
1083 | EXPORT_SYMBOL_GPL(mem_dump_obj); |
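
/*
 * Illustrative sketch of dumping provenance for a suspicious pointer; the
 * preamble is printed by the caller because mem_dump_obj() uses pr_cont():
 *
 *	pr_err("unexpected object %px", obj);
 *	mem_dump_obj(obj);
 *
 * "obj" is hypothetical; any kernel pointer may be passed.
 */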
1084 | #endif |
1085 | |
1086 | /* |
1087 | * A driver might set a page logically offline -- PageOffline() -- and |
1088 | * turn the page inaccessible in the hypervisor; after that, access to page |
1089 | * content can be fatal. |
1090 | * |
1091 | * Some special PFN walkers -- i.e., /proc/kcore -- read content of random |
1092 | * pages after checking PageOffline(); however, these PFN walkers can race |
1093 | * with drivers that set PageOffline(). |
1094 | * |
1095 | * page_offline_freeze()/page_offline_thaw() allows for a subsystem to |
1096 | * synchronize with such drivers, achieving that a page cannot be set |
1097 | * PageOffline() while frozen. |
1098 | * |
1099 | * page_offline_begin()/page_offline_end() is used by drivers that care about |
1100 | * such races when setting a page PageOffline(). |
1101 | */ |
1102 | static DECLARE_RWSEM(page_offline_rwsem); |
1103 | |
1104 | void page_offline_freeze(void) |
1105 | { |
	down_read(&page_offline_rwsem);
1107 | } |
1108 | |
1109 | void page_offline_thaw(void) |
1110 | { |
	up_read(&page_offline_rwsem);
1112 | } |
1113 | |
1114 | void page_offline_begin(void) |
1115 | { |
	down_write(&page_offline_rwsem);
1117 | } |
1118 | EXPORT_SYMBOL(page_offline_begin); |
1119 | |
1120 | void page_offline_end(void) |
1121 | { |
	up_write(&page_offline_rwsem);
1123 | } |
1124 | EXPORT_SYMBOL(page_offline_end); |
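
/*
 * Illustrative sketch of a PFN walker pairing page_offline_freeze() and
 * page_offline_thaw() around a PageOffline() check; the read step is
 * hypothetical:
 *
 *	page_offline_freeze();
 *	if (!PageOffline(page))
 *		... read the page contents ...
 *	page_offline_thaw();
 *
 * While frozen, no driver can newly mark a page PageOffline(), so the check
 * and the read cannot race with the page being taken offline.
 */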
1125 | |
1126 | #ifndef flush_dcache_folio |
1127 | void flush_dcache_folio(struct folio *folio) |
1128 | { |
1129 | long i, nr = folio_nr_pages(folio); |
1130 | |
1131 | for (i = 0; i < nr; i++) |
1132 | flush_dcache_page(folio_page(folio, i)); |
1133 | } |
1134 | EXPORT_SYMBOL(flush_dcache_folio); |
1135 | #endif |
1136 | |