mempolicy.c source code [linux/mm/mempolicy.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Simple NUMA memory policy for the Linux kernel.
4	*
5	* Copyright 2003,2004 Andi Kleen, SuSE Labs.
6	* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7	*
8	* NUMA policy allows the user to give hints in which node(s) memory should
9	* be allocated.
10	*
11	* Support six policies per VMA and per process:
12	*
13	* The VMA policy has priority over the process policy for a page fault.
14	*
15	* interleave Allocate memory interleaved over a set of nodes,
16	* with normal fallback if it fails.
17	* For VMA based allocations this interleaves based on the
18	* offset into the backing object or offset into the mapping
19	* for anonymous memory. For process policy a process counter
20	* is used.
21	*
22	* weighted interleave
23	* Allocate memory interleaved over a set of nodes based on
24	* a set of weights (per-node), with normal fallback if it
25	* fails. Otherwise operates the same as interleave.
26	* Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
27	* on node 0 for every 1 page allocated on node 1.
28	*
29	* bind Only allocate memory on a specific set of nodes,
30	* no fallback.
31	* FIXME: memory is allocated starting with the first node
32	* to the last. It would be better if bind would truly restrict
33	* the allocation to memory nodes instead
34	*
35	* preferred Try a specific node first before normal fallback.
36	* As a special case NUMA_NO_NODE here means do the allocation
37	* on the local CPU. This is normally identical to default,
38	* but useful to set in a VMA when you have a non default
39	* process policy.
40	*
41	* preferred many Try a set of nodes first before normal fallback. This is
42	* similar to preferred without the special case.
43	*
44	* default Allocate on the local node first, or when on a VMA
45	* use the process policy. This is what Linux always did
46	* in a NUMA aware kernel and still does by, ahem, default.
47	*
48	* The process policy is applied for most non interrupt memory allocations
49	* in that process' context. Interrupts ignore the policies and always
50	* try to allocate on the local CPU. The VMA policy is only applied for memory
51	* allocations for a VMA in the VM.
52	*
53	* Currently there are a few corner cases in swapping where the policy
54	* is not applied, but the majority should be handled. When process policy
55	* is used it is not remembered over swap outs/swap ins.
56	*
57	* Only the highest zone in the zone hierarchy gets policied. Allocations
58	* requesting a lower zone just use default policy. This implies that
59	* on systems with highmem kernel lowmem allocations don't get policied.
60	* Same with GFP_DMA allocations.
61	*
62	* For shmem/tmpfs shared memory the policy is shared between
63	* all users and remembered even when nobody has memory mapped.
64	*/
65
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
77
78	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80	#include <linux/mempolicy.h>
81	#include <linux/pagewalk.h>
82	#include <linux/highmem.h>
83	#include <linux/hugetlb.h>
84	#include <linux/kernel.h>
85	#include <linux/sched.h>
86	#include <linux/sched/mm.h>
87	#include <linux/sched/numa_balancing.h>
88	#include <linux/sched/sysctl.h>
89	#include <linux/sched/task.h>
90	#include <linux/nodemask.h>
91	#include <linux/cpuset.h>
92	#include <linux/slab.h>
93	#include <linux/string.h>
94	#include <linux/export.h>
95	#include <linux/nsproxy.h>
96	#include <linux/interrupt.h>
97	#include <linux/init.h>
98	#include <linux/compat.h>
99	#include <linux/ptrace.h>
100	#include <linux/swap.h>
101	#include <linux/seq_file.h>
102	#include <linux/proc_fs.h>
103	#include <linux/memory-tiers.h>
104	#include <linux/migrate.h>
105	#include <linux/ksm.h>
106	#include <linux/rmap.h>
107	#include <linux/security.h>
108	#include <linux/syscalls.h>
109	#include <linux/ctype.h>
110	#include <linux/mm_inline.h>
111	#include <linux/mmu_notifier.h>
112	#include <linux/printk.h>
113	#include <linux/leafops.h>
114	#include <linux/gcd.h>
115
116	#include <asm/tlbflush.h>
117	#include <asm/tlb.h>
118	#include <linux/uaccess.h>
119	#include <linux/memory.h>
120
121	#include "internal.h"
122
/* Internal flags, built on MPOL_MF_INTERNAL so they stay clear of the user-visible MPOL_MF_* bits */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
127
128	static struct kmem_cache *policy_cache;
129	static struct kmem_cache *sn_cache;
130
131	/ Highest zone. An specific allocation for a zone below that is not*
132	policied. /*
133	enum zone_type policy_zone = `0`;
134
135	/*
136	* run-time system-wide default policy => local allocation
137	*/
138	static struct mempolicy default_policy = {
139	.refcnt = ATOMIC_INIT(`1`), / never free it /
140	.mode = MPOL_LOCAL,
141	};
142
143	static struct mempolicy preferred_node_policy[MAX_NUMNODES];
144
/*
 * weightiness balances the tradeoff between small weights (cycles through nodes
 * faster, more fair/even distribution) and large weights (smaller errors
 * between actual bandwidth ratios and weight ratios). 32 is a number that has
 * been found to perform at a reasonable compromise between the two goals.
 */
static const int weightiness = 32;
152
153	/*
154	* A null weighted_interleave_state is interpreted as having .mode="auto",
155	* and .iw_table is interpreted as an array of 1s with length nr_node_ids.
156	*/
157	struct weighted_interleave_state {
158	bool mode_auto;
159	u8 iw_table[];
160	};
161	static struct weighted_interleave_state __rcu *wi_state;
162	static unsigned int *node_bw_table;
163
164	/*
165	* wi_state_lock protects both wi_state and node_bw_table.
166	* node_bw_table is only used by writers to update wi_state.
167	*/
168	static DEFINE_MUTEX(wi_state_lock);
169
170	static u8 get_il_weight(int node)
171	{
172	struct weighted_interleave_state *state;
173	u8 weight = `1`;
174
175	rcu_read_lock();
176	state = rcu_dereference(wi_state);
177	if (state)
178	weight = state->iw_table[node];
179	rcu_read_unlock();
180	return weight;
181	}
182
183	/*
184	* Convert bandwidth values into weighted interleave weights.
185	* Call with wi_state_lock.
186	*/
187	static void reduce_interleave_weights(unsigned int bw, u8 new_iw)
188	{
189	u64 sum_bw = `0`;
190	unsigned int cast_sum_bw, scaling_factor = `1`, iw_gcd = `0`;
191	int nid;
192
193	for_each_node_state(nid, N_MEMORY)
194	sum_bw += bw[nid];
195
196	/ Scale bandwidths to whole numbers in the range [1, weightiness] /
197	for_each_node_state(nid, N_MEMORY) {
198	/*
199	* Try not to perform 64-bit division.
200	* If sum_bw < scaling_factor, then sum_bw < U32_MAX.
201	* If sum_bw > scaling_factor, then round the weight up to 1.
202	*/
203	scaling_factor = weightiness * bw[nid];
204	if (bw[nid] && sum_bw < scaling_factor) {
205	cast_sum_bw = (unsigned int)sum_bw;
206	new_iw[nid] = scaling_factor / cast_sum_bw;
207	} else {
208	new_iw[nid] = `1`;
209	}
210	if (!iw_gcd)
211	iw_gcd = new_iw[nid];
212	iw_gcd = gcd(a: iw_gcd, b: new_iw[nid]);
213	}
214
215	/ 1:2 is strictly better than 16:32. Reduce by the weights' GCD. /
216	for_each_node_state(nid, N_MEMORY)
217	new_iw[nid] /= iw_gcd;
218	}
219
220	int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
221	{
222	struct weighted_interleave_state new_wi_state, old_wi_state = NULL;
223	unsigned int old_bw, new_bw;
224	unsigned int bw_val;
225	int i;
226
227	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
228	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
229	if (!new_bw)
230	return -ENOMEM;
231
232	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
233	GFP_KERNEL);
234	if (!new_wi_state) {
235	kfree(objp: new_bw);
236	return -ENOMEM;
237	}
238	new_wi_state->mode_auto = true;
239	for (i = `0`; i < nr_node_ids; i++)
240	new_wi_state->iw_table[i] = `1`;
241
242	/*
243	* Update bandwidth info, even in manual mode. That way, when switching
244	* to auto mode in the future, iw_table can be overwritten using
245	* accurate bw data.
246	*/
247	mutex_lock(&wi_state_lock);
248
249	old_bw = node_bw_table;
250	if (old_bw)
251	memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
252	new_bw[node] = bw_val;
253	node_bw_table = new_bw;
254
255	old_wi_state = rcu_dereference_protected(wi_state,
256	lockdep_is_held(&wi_state_lock));
257	if (old_wi_state && !old_wi_state->mode_auto) {
258	/ Manual mode; skip reducing weights and updating wi_state /
259	mutex_unlock(lock: &wi_state_lock);
260	kfree(objp: new_wi_state);
261	goto out;
262	}
263
264	/ NULL wi_state assumes auto=true; reduce weights and update wi_state/
265	reduce_interleave_weights(bw: new_bw, new_iw: new_wi_state->iw_table);
266	rcu_assign_pointer(wi_state, new_wi_state);
267
268	mutex_unlock(lock: &wi_state_lock);
269	if (old_wi_state) {
270	synchronize_rcu();
271	kfree(objp: old_wi_state);
272	}
273	out:
274	kfree(objp: old_bw);
275	return `0`;
276	}
277
278	/**
279	* numa_nearest_node - Find nearest node by state
280	* @node: Node id to start the search
281	* @state: State to filter the search
282	*
283	* Lookup the closest node by distance if @nid is not in state.
284	*
285	* Return: this @node if it is in state, otherwise the closest node by distance
286	*/
287	int numa_nearest_node(int node, unsigned int state)
288	{
289	int min_dist = INT_MAX, dist, n, min_node;
290
291	if (state >= NR_NODE_STATES)
292	return -EINVAL;
293
294	if (node == NUMA_NO_NODE \|\| node_state(node, state))
295	return node;
296
297	min_node = node;
298	for_each_node_state(n, state) {
299	dist = node_distance(node, n);
300	if (dist < min_dist) {
301	min_dist = dist;
302	min_node = n;
303	}
304	}
305
306	return min_node;
307	}
308	EXPORT_SYMBOL_GPL(numa_nearest_node);
309
310	/**
311	* nearest_node_nodemask - Find the node in @mask at the nearest distance
312	* from @node.
313	*
314	* @node: a valid node ID to start the search from.
315	* @mask: a pointer to a nodemask representing the allowed nodes.
316	*
317	* This function iterates over all nodes in @mask and calculates the
318	* distance from the starting @node, then it returns the node ID that is
319	* the closest to @node, or MAX_NUMNODES if no node is found.
320	*
321	* Note that @node must be a valid node ID usable with node_distance(),
322	* providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
323	* or unexpected behavior.
324	*/
325	int nearest_node_nodemask(int node, nodemask_t *mask)
326	{
327	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
328
329	for_each_node_mask(n, *mask) {
330	dist = node_distance(node, n);
331	if (dist < min_dist) {
332	min_dist = dist;
333	min_node = n;
334	}
335	}
336
337	return min_node;
338	}
339	EXPORT_SYMBOL_GPL(nearest_node_nodemask);
340
341	struct mempolicy get_task_policy(struct* task_struct *p)
342	{
343	struct mempolicy *pol = p->mempolicy;
344	int node;
345
346	if (pol)
347	return pol;
348
349	node = numa_node_id();
350	if (node != NUMA_NO_NODE) {
351	pol = &preferred_node_policy[node];
352	/ preferred_node_policy is not initialised early in boot /
353	if (pol->mode)
354	return pol;
355	}
356
357	return &default_policy;
358	}
359	EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
360
361	static const struct mempolicy_operations {
362	int (create)(struct* mempolicy pol, const* nodemask_t *nodes);
363	void (rebind)(struct* mempolicy pol, const* nodemask_t *nodes);
364	} mpol_ops[MPOL_MAX];
365
366	static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
367	{
368	return pol->flags & MPOL_MODE_FLAGS;
369	}
370
371	static void mpol_relative_nodemask(nodemask_t ret, const* nodemask_t *orig,
372	const nodemask_t *rel)
373	{
374	nodemask_t tmp;
375	nodes_fold(tmp, orig, nodes_weight(rel));
376	nodes_onto(ret, tmp, rel);
377	}
378
379	static int mpol_new_nodemask(struct mempolicy pol, const* nodemask_t *nodes)
380	{
381	if (nodes_empty(*nodes))
382	return -EINVAL;
383	pol->nodes = *nodes;
384	return `0`;
385	}
386
387	static int mpol_new_preferred(struct mempolicy pol, const* nodemask_t *nodes)
388	{
389	if (nodes_empty(*nodes))
390	return -EINVAL;
391
392	nodes_clear(pol->nodes);
393	node_set(first_node(*nodes), pol->nodes);
394	return `0`;
395	}
396
397	/*
398	* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
399	* any, for the new policy. mpol_new() has already validated the nodes
400	* parameter with respect to the policy mode and flags.
401	*
402	* Must be called holding task's alloc_lock to protect task's mems_allowed
403	* and mempolicy. May also be called holding the mmap_lock for write.
404	*/
405	static int mpol_set_nodemask(struct mempolicy *pol,
406	const nodemask_t nodes, struct* nodemask_scratch *nsc)
407	{
408	int ret;
409
410	/*
411	* Default (pol==NULL) resp. local memory policies are not a
412	* subject of any remapping. They also do not need any special
413	* constructor.
414	*/
415	if (!pol \|\| pol->mode == MPOL_LOCAL)
416	return `0`;
417
418	/ Check N_MEMORY /
419	nodes_and(nsc->mask1,
420	cpuset_current_mems_allowed, node_states[N_MEMORY]);
421
422	VM_BUG_ON(!nodes);
423
424	if (pol->flags & MPOL_F_RELATIVE_NODES)
425	mpol_relative_nodemask(ret: &nsc->mask2, orig: nodes, rel: &nsc->mask1);
426	else
427	nodes_and(nsc->mask2, *nodes, nsc->mask1);
428
429	if (mpol_store_user_nodemask(pol))
430	pol->w.user_nodemask = *nodes;
431	else
432	pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
433
434	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
435	return ret;
436	}
437
438	/*
439	* This function just creates a new policy, does some check and simple
440	* initialization. You must invoke mpol_set_nodemask() to set nodes.
441	*/
442	static struct mempolicy mpol_new(unsigned* short mode, unsigned short flags,
443	nodemask_t *nodes)
444	{
445	struct mempolicy *policy;
446
447	if (mode == MPOL_DEFAULT) {
448	if (nodes && !nodes_empty(*nodes))
449	return ERR_PTR(error: -EINVAL);
450	return NULL;
451	}
452	VM_BUG_ON(!nodes);
453
454	/*
455	* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
456	* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
457	* All other modes require a valid pointer to a non-empty nodemask.
458	*/
459	if (mode == MPOL_PREFERRED) {
460	if (nodes_empty(*nodes)) {
461	if (((flags & MPOL_F_STATIC_NODES) \|\|
462	(flags & MPOL_F_RELATIVE_NODES)))
463	return ERR_PTR(error: -EINVAL);
464
465	mode = MPOL_LOCAL;
466	}
467	} else if (mode == MPOL_LOCAL) {
468	if (!nodes_empty(*nodes) \|\|
469	(flags & MPOL_F_STATIC_NODES) \|\|
470	(flags & MPOL_F_RELATIVE_NODES))
471	return ERR_PTR(error: -EINVAL);
472	} else if (nodes_empty(*nodes))
473	return ERR_PTR(error: -EINVAL);
474
475	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
476	if (!policy)
477	return ERR_PTR(error: -ENOMEM);
478	atomic_set(v: &policy->refcnt, i: `1`);
479	policy->mode = mode;
480	policy->flags = flags;
481	policy->home_node = NUMA_NO_NODE;
482
483	return policy;
484	}
485
486	/ Slow path of a mpol destructor. /
487	void __mpol_put(struct mempolicy *pol)
488	{
489	if (!atomic_dec_and_test(v: &pol->refcnt))
490	return;
491	kmem_cache_free(s: policy_cache, objp: pol);
492	}
493	EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
494
495	static void mpol_rebind_default(struct mempolicy pol, const* nodemask_t *nodes)
496	{
497	}
498
499	static void mpol_rebind_nodemask(struct mempolicy pol, const* nodemask_t *nodes)
500	{
501	nodemask_t tmp;
502
503	if (pol->flags & MPOL_F_STATIC_NODES)
504	nodes_and(tmp, pol->w.user_nodemask, *nodes);
505	else if (pol->flags & MPOL_F_RELATIVE_NODES)
506	mpol_relative_nodemask(ret: &tmp, orig: &pol->w.user_nodemask, rel: nodes);
507	else {
508	nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
509	*nodes);
510	pol->w.cpuset_mems_allowed = *nodes;
511	}
512
513	if (nodes_empty(tmp))
514	tmp = *nodes;
515
516	pol->nodes = tmp;
517	}
518
519	static void mpol_rebind_preferred(struct mempolicy *pol,
520	const nodemask_t *nodes)
521	{
522	pol->w.cpuset_mems_allowed = *nodes;
523	}
524
525	/*
526	* mpol_rebind_policy - Migrate a policy to a different set of nodes
527	*
528	* Per-vma policies are protected by mmap_lock. Allocations using per-task
529	* policies are protected by task->mems_allowed_seq to prevent a premature
530	* OOM/allocation failure due to parallel nodemask modification.
531	*/
532	static void mpol_rebind_policy(struct mempolicy pol, const* nodemask_t *newmask)
533	{
534	if (!pol \|\| pol->mode == MPOL_LOCAL)
535	return;
536	if (!mpol_store_user_nodemask(pol) &&
537	nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
538	return;
539
540	mpol_ops[pol->mode].rebind(pol, newmask);
541	}
542
543	/*
544	* Wrapper for mpol_rebind_policy() that just requires task
545	* pointer, and updates task mempolicy.
546	*
547	* Called with task's alloc_lock held.
548	*/
549	void mpol_rebind_task(struct task_struct tsk, const* nodemask_t *new)
550	{
551	mpol_rebind_policy(pol: tsk->mempolicy, newmask: new);
552	}
553
554	/*
555	* Rebind each vma in mm to new nodemask.
556	*
557	* Call holding a reference to mm. Takes mm->mmap_lock during call.
558	*/
559	void mpol_rebind_mm(struct mm_struct mm, nodemask_t new)
560	{
561	struct vm_area_struct *vma;
562	VMA_ITERATOR(vmi, mm, `0`);
563
564	mmap_write_lock(mm);
565	for_each_vma(vmi, vma) {
566	vma_start_write(vma);
567	mpol_rebind_policy(pol: vma->vm_policy, newmask: new);
568	}
569	mmap_write_unlock(mm);
570	}
571
572	static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
573	[MPOL_DEFAULT] = {
574	.rebind = mpol_rebind_default,
575	},
576	[MPOL_INTERLEAVE] = {
577	.create = mpol_new_nodemask,
578	.rebind = mpol_rebind_nodemask,
579	},
580	[MPOL_PREFERRED] = {
581	.create = mpol_new_preferred,
582	.rebind = mpol_rebind_preferred,
583	},
584	[MPOL_BIND] = {
585	.create = mpol_new_nodemask,
586	.rebind = mpol_rebind_nodemask,
587	},
588	[MPOL_LOCAL] = {
589	.rebind = mpol_rebind_default,
590	},
591	[MPOL_PREFERRED_MANY] = {
592	.create = mpol_new_nodemask,
593	.rebind = mpol_rebind_preferred,
594	},
595	[MPOL_WEIGHTED_INTERLEAVE] = {
596	.create = mpol_new_nodemask,
597	.rebind = mpol_rebind_nodemask,
598	},
599	};
600
601	static bool migrate_folio_add(struct folio folio, struct* list_head *foliolist,
602	unsigned long flags);
603	static nodemask_t policy_nodemask(gfp_t gfp, struct* mempolicy *pol,
604	pgoff_t ilx, int *nid);
605
606	static bool strictly_unmovable(unsigned long flags)
607	{
608	/*
609	* STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
610	* if any misplaced page is found.
611	*/
612	return (flags & (MPOL_MF_STRICT \| MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) ==
613	MPOL_MF_STRICT;
614	}
615
616	struct migration_mpol { / for alloc_migration_target_by_mpol() /
617	struct mempolicy *pol;
618	pgoff_t ilx;
619	};
620
621	struct queue_pages {
622	struct list_head *pagelist;
623	unsigned long flags;
624	nodemask_t *nmask;
625	unsigned long start;
626	unsigned long end;
627	struct vm_area_struct *first;
628	struct folio large; /* note last large folio encountered /
629	long nr_failed; / could not be isolated at this time /
630	};
631
632	/*
633	* Check if the folio's nid is in qp->nmask.
634	*
635	* If MPOL_MF_INVERT is set in qp->flags, check if the nid is
636	* in the invert of qp->nmask.
637	*/
638	static inline bool queue_folio_required(struct folio *folio,
639	struct queue_pages *qp)
640	{
641	int nid = folio_nid(folio);
642	unsigned long flags = qp->flags;
643
644	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
645	}
646
647	static void queue_folios_pmd(pmd_t pmd, struct* mm_walk *walk)
648	{
649	struct folio *folio;
650	struct queue_pages *qp = walk->private;
651
652	if (unlikely(pmd_is_migration_entry(*pmd))) {
653	qp->nr_failed++;
654	return;
655	}
656	folio = pmd_folio(*pmd);
657	if (is_huge_zero_folio(folio)) {
658	walk->action = ACTION_CONTINUE;
659	return;
660	}
661	if (!queue_folio_required(folio, qp))
662	return;
663	if (!(qp->flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) \|\|
664	!vma_migratable(vma: walk->vma) \|\|
665	!migrate_folio_add(folio, foliolist: qp->pagelist, flags: qp->flags))
666	qp->nr_failed++;
667	}
668
669	/*
670	* Scan through folios, checking if they satisfy the required conditions,
671	* moving them from LRU to local pagelist for migration if they do (or not).
672	*
673	* queue_folios_pte_range() has two possible return values:
674	* 0 - continue walking to scan for more, even if an existing folio on the
675	* wrong node could not be isolated and queued for migration.
676	* -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
677	* and an existing folio was on a node that does not follow the policy.
678	*/
679	static int queue_folios_pte_range(pmd_t pmd, unsigned* long addr,
680	unsigned long end, struct mm_walk *walk)
681	{
682	struct vm_area_struct *vma = walk->vma;
683	struct folio *folio;
684	struct queue_pages *qp = walk->private;
685	unsigned long flags = qp->flags;
686	pte_t pte, mapped_pte;
687	pte_t ptent;
688	spinlock_t *ptl;
689	int max_nr, nr;
690
691	ptl = pmd_trans_huge_lock(pmd, vma);
692	if (ptl) {
693	queue_folios_pmd(pmd, walk);
694	spin_unlock(lock: ptl);
695	goto out;
696	}
697
698	mapped_pte = pte = pte_offset_map_lock(mm: walk->mm, pmd, addr, ptlp: &ptl);
699	if (!pte) {
700	walk->action = ACTION_AGAIN;
701	return `0`;
702	}
703	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
704	max_nr = (end - addr) >> PAGE_SHIFT;
705	nr = `1`;
706	ptent = ptep_get(ptep: pte);
707	if (pte_none(pte: ptent))
708	continue;
709	if (!pte_present(a: ptent)) {
710	const softleaf_t entry = softleaf_from_pte(pte: ptent);
711
712	if (softleaf_is_migration(entry))
713	qp->nr_failed++;
714	continue;
715	}
716	folio = vm_normal_folio(vma, addr, pte: ptent);
717	if (!folio \|\| folio_is_zone_device(folio))
718	continue;
719	if (folio_test_large(folio) && max_nr != `1`)
720	nr = folio_pte_batch(folio, ptep: pte, pte: ptent, max_nr);
721	/*
722	* vm_normal_folio() filters out zero pages, but there might
723	* still be reserved folios to skip, perhaps in a VDSO.
724	*/
725	if (folio_test_reserved(folio))
726	continue;
727	if (!queue_folio_required(folio, qp))
728	continue;
729	if (folio_test_large(folio)) {
730	/*
731	* A large folio can only be isolated from LRU once,
732	* but may be mapped by many PTEs (and Copy-On-Write may
733	* intersperse PTEs of other, order 0, folios). This is
734	* a common case, so don't mistake it for failure (but
735	* there can be other cases of multi-mapped pages which
736	* this quick check does not help to filter out - and a
737	* search of the pagelist might grow to be prohibitive).
738	*
739	* migrate_pages(&pagelist) returns nr_failed folios, so
740	* check "large" now so that queue_pages_range() returns
741	* a comparable nr_failed folios. This does imply that
742	* if folio could not be isolated for some racy reason
743	* at its first PTE, later PTEs will not give it another
744	* chance of isolation; but keeps the accounting simple.
745	*/
746	if (folio == qp->large)
747	continue;
748	qp->large = folio;
749	}
750	if (!(flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) \|\|
751	!vma_migratable(vma) \|\|
752	!migrate_folio_add(folio, foliolist: qp->pagelist, flags)) {
753	qp->nr_failed += nr;
754	if (strictly_unmovable(flags))
755	break;
756	}
757	}
758	pte_unmap_unlock(mapped_pte, ptl);
759	cond_resched();
760	out:
761	if (qp->nr_failed && strictly_unmovable(flags))
762	return -EIO;
763	return `0`;
764	}
765
766	static int queue_folios_hugetlb(pte_t pte, unsigned* long hmask,
767	unsigned long addr, unsigned long end,
768	struct mm_walk *walk)
769	{
770	#ifdef CONFIG_HUGETLB_PAGE
771	struct queue_pages *qp = walk->private;
772	unsigned long flags = qp->flags;
773	struct folio *folio;
774	spinlock_t *ptl;
775	pte_t ptep;
776
777	ptl = huge_pte_lock(h: hstate_vma(vma: walk->vma), mm: walk->mm, pte);
778	ptep = huge_ptep_get(mm: walk->mm, addr, ptep: pte);
779	if (!pte_present(a: ptep)) {
780	if (!huge_pte_none(pte: ptep)) {
781	const softleaf_t entry = softleaf_from_pte(pte: ptep);
782
783	if (unlikely(softleaf_is_migration(entry)))
784	qp->nr_failed++;
785	}
786
787	goto unlock;
788	}
789	folio = pfn_folio(pfn: pte_pfn(pte: ptep));
790	if (!queue_folio_required(folio, qp))
791	goto unlock;
792	if (!(flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) \|\|
793	!vma_migratable(vma: walk->vma)) {
794	qp->nr_failed++;
795	goto unlock;
796	}
797	/*
798	* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
799	* Choosing not to migrate a shared folio is not counted as a failure.
800	*
801	* See folio_maybe_mapped_shared() on possible imprecision when we
802	* cannot easily detect if a folio is shared.
803	*/
804	if ((flags & MPOL_MF_MOVE_ALL) \|\|
805	(!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
806	if (!folio_isolate_hugetlb(folio, list: qp->pagelist))
807	qp->nr_failed++;
808	unlock:
809	spin_unlock(lock: ptl);
810	if (qp->nr_failed && strictly_unmovable(flags))
811	return -EIO;
812	#endif
813	return `0`;
814	}
815
816	#ifdef CONFIG_NUMA_BALANCING
817	/**
818	* folio_can_map_prot_numa() - check whether the folio can map prot numa
819	* @folio: The folio whose mapping considered for being made NUMA hintable
820	* @vma: The VMA that the folio belongs to.
821	* @is_private_single_threaded: Is this a single-threaded private VMA or not
822	*
823	* This function checks to see if the folio actually indicates that
824	* we need to make the mapping one which causes a NUMA hinting fault,
825	* as there are cases where it's simply unnecessary, and the folio's
826	* access time is adjusted for memory tiering if prot numa needed.
827	*
828	* Return: True if the mapping of the folio needs to be changed, false otherwise.
829	*/
830	bool folio_can_map_prot_numa(struct folio folio, struct* vm_area_struct *vma,
831	bool is_private_single_threaded)
832	{
833	int nid;
834
835	if (!folio \|\| folio_is_zone_device(folio) \|\| folio_test_ksm(folio))
836	return false;
837
838	/ Also skip shared copy-on-write folios /
839	if (is_cow_mapping(flags: vma->vm_flags) && folio_maybe_mapped_shared(folio))
840	return false;
841
842	/ Folios are pinned and can't be migrated /
843	if (folio_maybe_dma_pinned(folio))
844	return false;
845
846	/*
847	* While migration can move some dirty folios,
848	* it cannot move them all from MIGRATE_ASYNC
849	* context.
850	*/
851	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
852	return false;
853
854	/*
855	* Don't mess with PTEs if folio is already on the node
856	* a single-threaded process is running on.
857	*/
858	nid = folio_nid(folio);
859	if (is_private_single_threaded && (nid == numa_node_id()))
860	return false;
861
862	/*
863	* Skip scanning top tier node if normal numa
864	* balancing is disabled
865	*/
866	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
867	node_is_toptier(node: nid))
868	return false;
869
870	if (folio_use_access_time(folio))
871	folio_xchg_access_time(folio, time: jiffies_to_msecs(j: jiffies));
872
873	return true;
874	}
875
876	/*
877	* This is used to mark a range of virtual addresses to be inaccessible.
878	* These are later cleared by a NUMA hinting fault. Depending on these
879	* faults, pages may be migrated for better NUMA placement.
880	*
881	* This is assuming that NUMA faults are handled using PROT_NONE. If
882	* an architecture makes a different choice, it will need further
883	* changes to the core.
884	*/
885	unsigned long change_prot_numa(struct vm_area_struct *vma,
886	unsigned long addr, unsigned long end)
887	{
888	struct mmu_gather tlb;
889	long nr_updated;
890
891	tlb_gather_mmu(tlb: &tlb, mm: vma->vm_mm);
892
893	nr_updated = change_protection(tlb: &tlb, vma, start: addr, end, MM_CP_PROT_NUMA);
894	if (nr_updated > `0`) {
895	count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
896	count_memcg_events_mm(mm: vma->vm_mm, idx: NUMA_PTE_UPDATES, count: nr_updated);
897	}
898
899	tlb_finish_mmu(tlb: &tlb);
900
901	return nr_updated;
902	}
903	#endif /* CONFIG_NUMA_BALANCING */
904
905	static int queue_pages_test_walk(unsigned long start, unsigned long end,
906	struct mm_walk *walk)
907	{
908	struct vm_area_struct next, vma = walk->vma;
909	struct queue_pages *qp = walk->private;
910	unsigned long flags = qp->flags;
911
912	/ range check first /
913	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
914
915	if (!qp->first) {
916	qp->first = vma;
917	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
918	(qp->start < vma->vm_start))
919	/ hole at head side of range /
920	return -EFAULT;
921	}
922	next = find_vma(mm: vma->vm_mm, addr: vma->vm_end);
923	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
924	((vma->vm_end < qp->end) &&
925	(!next \|\| vma->vm_end < next->vm_start)))
926	/ hole at middle or tail of range /
927	return -EFAULT;
928
929	/*
930	* Need check MPOL_MF_STRICT to return -EIO if possible
931	* regardless of vma_migratable
932	*/
933	if (!vma_migratable(vma) &&
934	!(flags & MPOL_MF_STRICT))
935	return `1`;
936
937	/*
938	* Check page nodes, and queue pages to move, in the current vma.
939	* But if no moving, and no strict checking, the scan can be skipped.
940	*/
941	if (flags & (MPOL_MF_STRICT \| MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL))
942	return `0`;
943	return `1`;
944	}
945
946	static const struct mm_walk_ops queue_pages_walk_ops = {
947	.hugetlb_entry = queue_folios_hugetlb,
948	.pmd_entry = queue_folios_pte_range,
949	.test_walk = queue_pages_test_walk,
950	.walk_lock = PGWALK_RDLOCK,
951	};
952
953	static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
954	.hugetlb_entry = queue_folios_hugetlb,
955	.pmd_entry = queue_folios_pte_range,
956	.test_walk = queue_pages_test_walk,
957	.walk_lock = PGWALK_WRLOCK,
958	};
959
960	/*
961	* Walk through page tables and collect pages to be migrated.
962	*
963	* If pages found in a given range are not on the required set of @nodes,
964	* and migration is allowed, they are isolated and queued to @pagelist.
965	*
966	* queue_pages_range() may return:
967	* 0 - all pages already on the right node, or successfully queued for moving
968	* (or neither strict checking nor moving requested: only range checking).
969	* >0 - this number of misplaced folios could not be queued for moving
970	* (a hugetlbfs page or a transparent huge page being counted as 1).
971	* -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
972	* -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
973	*/
974	static long
975	queue_pages_range(struct mm_struct mm, unsigned* long start, unsigned long end,
976	nodemask_t nodes, unsigned* long flags,
977	struct list_head *pagelist)
978	{
979	int err;
980	struct queue_pages qp = {
981	.pagelist = pagelist,
982	.flags = flags,
983	.nmask = nodes,
984	.start = start,
985	.end = end,
986	.first = NULL,
987	};
988	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
989	&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
990
991	err = walk_page_range(mm, start, end, ops, private: &qp);
992
993	if (!qp.first)
994	/ whole range in hole /
995	err = -EFAULT;
996
997	return err ? : qp.nr_failed;
998	}
999
1000	/*
1001	* Apply policy to a single VMA
1002	* This must be called with the mmap_lock held for writing.
1003	*/
1004	static int vma_replace_policy(struct vm_area_struct *vma,
1005	struct mempolicy *pol)
1006	{
1007	int err;
1008	struct mempolicy *old;
1009	struct mempolicy *new;
1010
1011	vma_assert_write_locked(vma);
1012
1013	new = mpol_dup(pol);
1014	if (IS_ERR(ptr: new))
1015	return PTR_ERR(ptr: new);
1016
1017	if (vma->vm_ops && vma->vm_ops->set_policy) {
1018	err = vma->vm_ops->set_policy(vma, new);
1019	if (err)
1020	goto err_out;
1021	}
1022
1023	old = vma->vm_policy;
1024	vma->vm_policy = new; / protected by mmap_lock /
1025	mpol_put(pol: old);
1026
1027	return `0`;
1028	err_out:
1029	mpol_put(pol: new);
1030	return err;
1031	}
1032
1033	/ Split or merge the VMA (if required) and apply the new policy /
1034	static int mbind_range(struct vma_iterator vmi, struct* vm_area_struct *vma,
1035	struct vm_area_struct *prev, unsigned* long start,
1036	unsigned long end, struct mempolicy *new_pol)
1037	{
1038	unsigned long vmstart, vmend;
1039
1040	vmend = min(end, vma->vm_end);
1041	if (start > vma->vm_start) {
1042	*prev = vma;
1043	vmstart = start;
1044	} else {
1045	vmstart = vma->vm_start;
1046	}
1047
1048	if (mpol_equal(a: vma->vm_policy, b: new_pol)) {
1049	*prev = vma;
1050	return `0`;
1051	}
1052
1053	vma = vma_modify_policy(vmi, prev: *prev, vma, start: vmstart, end: vmend, new_pol);
1054	if (IS_ERR(ptr: vma))
1055	return PTR_ERR(ptr: vma);
1056
1057	*prev = vma;
1058	return vma_replace_policy(vma, pol: new_pol);
1059	}
1060
1061	/ Set the process memory policy /
1062	static long do_set_mempolicy(unsigned short mode, unsigned short flags,
1063	nodemask_t *nodes)
1064	{
1065	struct mempolicy new, old;
1066	NODEMASK_SCRATCH(scratch);
1067	int ret;
1068
1069	if (!scratch)
1070	return -ENOMEM;
1071
1072	new = mpol_new(mode, flags, nodes);
1073	if (IS_ERR(ptr: new)) {
1074	ret = PTR_ERR(ptr: new);
1075	goto out;
1076	}
1077
1078	task_lock(current);
1079	ret = mpol_set_nodemask(pol: new, nodes, nsc: scratch);
1080	if (ret) {
1081	task_unlock(current);
1082	mpol_put(pol: new);
1083	goto out;
1084	}
1085
1086	old = current->mempolicy;
1087	current->mempolicy = new;
1088	if (new && (new->mode == MPOL_INTERLEAVE \|\|
1089	new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
1090	current->il_prev = MAX_NUMNODES-`1`;
1091	current->il_weight = `0`;
1092	}
1093	task_unlock(current);
1094	mpol_put(pol: old);
1095	ret = `0`;
1096	out:
1097	NODEMASK_SCRATCH_FREE(scratch);
1098	return ret;
1099	}
1100
1101	/*
1102	* Return nodemask for policy for get_mempolicy() query
1103	*
1104	* Called with task's alloc_lock held
1105	*/
1106	static void get_policy_nodemask(struct mempolicy pol, nodemask_t nodes)
1107	{
1108	nodes_clear(*nodes);
1109	if (pol == &default_policy)
1110	return;
1111
1112	switch (pol->mode) {
1113	case MPOL_BIND:
1114	case MPOL_INTERLEAVE:
1115	case MPOL_PREFERRED:
1116	case MPOL_PREFERRED_MANY:
1117	case MPOL_WEIGHTED_INTERLEAVE:
1118	*nodes = pol->nodes;
1119	break;
1120	case MPOL_LOCAL:
1121	/ return empty node mask for local allocation /
1122	break;
1123	default:
1124	BUG();
1125	}
1126	}
1127
1128	static int lookup_node(struct mm_struct mm, unsigned* long addr)
1129	{
1130	struct page *p = NULL;
1131	int ret;
1132
1133	ret = get_user_pages_fast(start: addr & PAGE_MASK, nr_pages: `1`, gup_flags: `0`, pages: &p);
1134	if (ret > `0`) {
1135	ret = page_to_nid(page: p);
1136	put_page(page: p);
1137	}
1138	return ret;
1139	}
1140
1141	/ Retrieve NUMA policy /
1142	static long do_get_mempolicy(int policy, nodemask_t nmask,
1143	unsigned long addr, unsigned long flags)
1144	{
1145	int err;
1146	struct mm_struct *mm = current->mm;
1147	struct vm_area_struct *vma = NULL;
1148	struct mempolicy pol = current->mempolicy, pol_refcount = NULL;
1149
1150	if (flags &
1151	~(unsigned long)(MPOL_F_NODE\|MPOL_F_ADDR\|MPOL_F_MEMS_ALLOWED))
1152	return -EINVAL;
1153
1154	if (flags & MPOL_F_MEMS_ALLOWED) {
1155	if (flags & (MPOL_F_NODE\|MPOL_F_ADDR))
1156	return -EINVAL;
1157	policy = `0`; /* just so it's initialized /
1158	task_lock(current);
1159	*nmask = cpuset_current_mems_allowed;
1160	task_unlock(current);
1161	return `0`;
1162	}
1163
1164	if (flags & MPOL_F_ADDR) {
1165	pgoff_t ilx; / ignored here /
1166	/*
1167	* Do NOT fall back to task policy if the
1168	* vma/shared policy at addr is NULL. We
1169	* want to return MPOL_DEFAULT in this case.
1170	*/
1171	mmap_read_lock(mm);
1172	vma = vma_lookup(mm, addr);
1173	if (!vma) {
1174	mmap_read_unlock(mm);
1175	return -EFAULT;
1176	}
1177	pol = __get_vma_policy(vma, addr, ilx: &ilx);
1178	} else if (addr)
1179	return -EINVAL;
1180
1181	if (!pol)
1182	pol = &default_policy; / indicates default behavior /
1183
1184	if (flags & MPOL_F_NODE) {
1185	if (flags & MPOL_F_ADDR) {
1186	/*
1187	* Take a refcount on the mpol, because we are about to
1188	* drop the mmap_lock, after which only "pol" remains
1189	* valid, "vma" is stale.
1190	*/
1191	pol_refcount = pol;
1192	vma = NULL;
1193	mpol_get(pol);
1194	mmap_read_unlock(mm);
1195	err = lookup_node(mm, addr);
1196	if (err < `0`)
1197	goto out;
1198	*policy = err;
1199	} else if (pol == current->mempolicy &&
1200	pol->mode == MPOL_INTERLEAVE) {
1201	*policy = next_node_in(current->il_prev, pol->nodes);
1202	} else if (pol == current->mempolicy &&
1203	pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1204	if (current->il_weight)
1205	*policy = current->il_prev;
1206	else
1207	*policy = next_node_in(current->il_prev,
1208	pol->nodes);
1209	} else {
1210	err = -EINVAL;
1211	goto out;
1212	}
1213	} else {
1214	*policy = pol == &default_policy ? MPOL_DEFAULT :
1215	pol->mode;
1216	/*
1217	* Internal mempolicy flags must be masked off before exposing
1218	* the policy to userspace.
1219	*/
1220	*policy \|= (pol->flags & MPOL_MODE_FLAGS);
1221	}
1222
1223	err = `0`;
1224	if (nmask) {
1225	if (mpol_store_user_nodemask(pol)) {
1226	*nmask = pol->w.user_nodemask;
1227	} else {
1228	task_lock(current);
1229	get_policy_nodemask(pol, nodes: nmask);
1230	task_unlock(current);
1231	}
1232	}
1233
1234	out:
1235	mpol_cond_put(pol);
1236	if (vma)
1237	mmap_read_unlock(mm);
1238	if (pol_refcount)
1239	mpol_put(pol: pol_refcount);
1240	return err;
1241	}
1242
1243	#ifdef CONFIG_MIGRATION
1244	static bool migrate_folio_add(struct folio folio, struct* list_head *foliolist,
1245	unsigned long flags)
1246	{
1247	/*
1248	* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
1249	* Choosing not to migrate a shared folio is not counted as a failure.
1250	*
1251	* See folio_maybe_mapped_shared() on possible imprecision when we
1252	* cannot easily detect if a folio is shared.
1253	*/
1254	if ((flags & MPOL_MF_MOVE_ALL) \|\| !folio_maybe_mapped_shared(folio)) {
1255	if (folio_isolate_lru(folio)) {
1256	list_add_tail(new: &folio->lru, head: foliolist);
1257	node_stat_mod_folio(folio,
1258	item: NR_ISOLATED_ANON + folio_is_file_lru(folio),
1259	nr: folio_nr_pages(folio));
1260	} else {
1261	/*
1262	* Non-movable folio may reach here. And, there may be
1263	* temporary off LRU folios or non-LRU movable folios.
1264	* Treat them as unmovable folios since they can't be
1265	* isolated, so they can't be moved at the moment.
1266	*/
1267	return false;
1268	}
1269	}
1270	return true;
1271	}
1272
1273	/*
1274	* Migrate pages from one node to a target node.
1275	* Returns error or the number of pages not migrated.
1276	*/
1277	static long migrate_to_node(struct mm_struct mm, int* source, int dest,
1278	int flags)
1279	{
1280	nodemask_t nmask;
1281	struct vm_area_struct *vma;
1282	LIST_HEAD(pagelist);
1283	long nr_failed;
1284	long err = `0`;
1285	struct migration_target_control mtc = {
1286	.nid = dest,
1287	.gfp_mask = GFP_HIGHUSER_MOVABLE \| __GFP_THISNODE,
1288	.reason = MR_SYSCALL,
1289	};
1290
1291	nodes_clear(nmask);
1292	node_set(source, nmask);
1293
1294	VM_BUG_ON(!(flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)));
1295
1296	mmap_read_lock(mm);
1297	vma = find_vma(mm, addr: `0`);
1298	if (unlikely(!vma)) {
1299	mmap_read_unlock(mm);
1300	return `0`;
1301	}
1302
1303	/*
1304	* This does not migrate the range, but isolates all pages that
1305	* need migration. Between passing in the full user address
1306	* space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1307	* but passes back the count of pages which could not be isolated.
1308	*/
1309	nr_failed = queue_pages_range(mm, start: vma->vm_start, end: mm->task_size, nodes: &nmask,
1310	flags: flags \| MPOL_MF_DISCONTIG_OK, pagelist: &pagelist);
1311	mmap_read_unlock(mm);
1312
1313	if (!list_empty(head: &pagelist)) {
1314	err = migrate_pages(l: &pagelist, new: alloc_migration_target, NULL,
1315	private: (unsigned long)&mtc, mode: MIGRATE_SYNC, reason: MR_SYSCALL, NULL);
1316	if (err)
1317	putback_movable_pages(l: &pagelist);
1318	}
1319
1320	if (err >= `0`)
1321	err += nr_failed;
1322	return err;
1323	}
1324
1325	/*
1326	* Move pages between the two nodesets so as to preserve the physical
1327	* layout as much as possible.
1328	*
1329	* Returns the number of page that could not be moved.
1330	*/
1331	int do_migrate_pages(struct mm_struct mm, const* nodemask_t *from,
1332	const nodemask_t to, int* flags)
1333	{
1334	long nr_failed = `0`;
1335	long err = `0`;
1336	nodemask_t tmp;
1337
1338	lru_cache_disable();
1339
1340	/*
1341	* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1342	* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1343	* bit in 'tmp', and return that <source, dest> pair for migration.
1344	* The pair of nodemasks 'to' and 'from' define the map.
1345	*
1346	* If no pair of bits is found that way, fallback to picking some
1347	* pair of 'source' and 'dest' bits that are not the same. If the
1348	* 'source' and 'dest' bits are the same, this represents a node
1349	* that will be migrating to itself, so no pages need move.
1350	*
1351	* If no bits are left in 'tmp', or if all remaining bits left
1352	* in 'tmp' correspond to the same bit in 'to', return false
1353	* (nothing left to migrate).
1354	*
1355	* This lets us pick a pair of nodes to migrate between, such that
1356	* if possible the dest node is not already occupied by some other
1357	* source node, minimizing the risk of overloading the memory on a
1358	* node that would happen if we migrated incoming memory to a node
1359	* before migrating outgoing memory source that same node.
1360	*
1361	* A single scan of tmp is sufficient. As we go, we remember the
1362	* most recent <s, d> pair that moved (s != d). If we find a pair
1363	* that not only moved, but what's better, moved to an empty slot
1364	* (d is not set in tmp), then we break out then, with that pair.
1365	* Otherwise when we finish scanning from_tmp, we at least have the
1366	* most recent <s, d> pair that moved. If we get all the way through
1367	* the scan of tmp without finding any node that moved, much less
1368	* moved to an empty node, then there is nothing left worth migrating.
1369	*/
1370
1371	tmp = *from;
1372	while (!nodes_empty(tmp)) {
1373	int s, d;
1374	int source = NUMA_NO_NODE;
1375	int dest = `0`;
1376
1377	for_each_node_mask(s, tmp) {
1378
1379	/*
1380	* do_migrate_pages() tries to maintain the relative
1381	* node relationship of the pages established between
1382	* threads and memory areas.
1383	*
1384	* However if the number of source nodes is not equal to
1385	* the number of destination nodes we can not preserve
1386	* this node relative relationship. In that case, skip
1387	* copying memory from a node that is in the destination
1388	* mask.
1389	*
1390	* Example: [2,3,4] -> [3,4,5] moves everything.
1391	* [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1392	*/
1393
1394	if ((nodes_weight(from) != nodes_weight(to)) &&
1395	(node_isset(s, *to)))
1396	continue;
1397
1398	d = node_remap(s, from, to);
1399	if (s == d)
1400	continue;
1401
1402	source = s; / Node moved. Memorize /
1403	dest = d;
1404
1405	/ dest not in remaining from nodes? /
1406	if (!node_isset(dest, tmp))
1407	break;
1408	}
1409	if (source == NUMA_NO_NODE)
1410	break;
1411
1412	node_clear(source, tmp);
1413	err = migrate_to_node(mm, source, dest, flags);
1414	if (err > `0`)
1415	nr_failed += err;
1416	if (err < `0`)
1417	break;
1418	}
1419
1420	lru_cache_enable();
1421	if (err < `0`)
1422	return err;
1423	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
1424	}
1425
1426	/*
1427	* Allocate a new folio for page migration, according to NUMA mempolicy.
1428	*/
1429	static struct folio alloc_migration_target_by_mpol(struct* folio *src,
1430	unsigned long private)
1431	{
1432	struct migration_mpol mmpol = (struct* migration_mpol *)private;
1433	struct mempolicy *pol = mmpol->pol;
1434	pgoff_t ilx = mmpol->ilx;
1435	unsigned int order;
1436	int nid = numa_node_id();
1437	gfp_t gfp;
1438
1439	order = folio_order(folio: src);
1440	ilx += src->index >> order;
1441
1442	if (folio_test_hugetlb(folio: src)) {
1443	nodemask_t *nodemask;
1444	struct hstate *h;
1445
1446	h = folio_hstate(folio: src);
1447	gfp = htlb_alloc_mask(h);
1448	nodemask = policy_nodemask(gfp, pol, ilx, nid: &nid);
1449	return alloc_hugetlb_folio_nodemask(h, preferred_nid: nid, nmask: nodemask, gfp_mask: gfp,
1450	allow_alloc_fallback: htlb_allow_alloc_fallback(reason: MR_MEMPOLICY_MBIND));
1451	}
1452
1453	if (folio_test_large(folio: src))
1454	gfp = GFP_TRANSHUGE;
1455	else
1456	gfp = GFP_HIGHUSER_MOVABLE \| __GFP_RETRY_MAYFAIL \| __GFP_COMP;
1457
1458	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
1459	}
1460	#else
1461
1462	static bool migrate_folio_add(struct folio folio, struct* list_head *foliolist,
1463	unsigned long flags)
1464	{
1465	return false;
1466	}
1467
1468	int do_migrate_pages(struct mm_struct mm, const* nodemask_t *from,
1469	const nodemask_t to, int* flags)
1470	{
1471	return -ENOSYS;
1472	}
1473
1474	static struct folio alloc_migration_target_by_mpol(struct* folio *src,
1475	unsigned long private)
1476	{
1477	return NULL;
1478	}
1479	#endif
1480
1481	static long do_mbind(unsigned long start, unsigned long len,
1482	unsigned short mode, unsigned short mode_flags,
1483	nodemask_t nmask, unsigned* long flags)
1484	{
1485	struct mm_struct *mm = current->mm;
1486	struct vm_area_struct vma, prev;
1487	struct vma_iterator vmi;
1488	struct migration_mpol mmpol;
1489	struct mempolicy *new;
1490	unsigned long end;
1491	long err;
1492	long nr_failed;
1493	LIST_HEAD(pagelist);
1494
1495	if (flags & ~(unsigned long)MPOL_MF_VALID)
1496	return -EINVAL;
1497	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1498	return -EPERM;
1499
1500	if (start & ~PAGE_MASK)
1501	return -EINVAL;
1502
1503	if (mode == MPOL_DEFAULT)
1504	flags &= ~MPOL_MF_STRICT;
1505
1506	len = PAGE_ALIGN(len);
1507	end = start + len;
1508
1509	if (end < start)
1510	return -EINVAL;
1511	if (end == start)
1512	return `0`;
1513
1514	new = mpol_new(mode, flags: mode_flags, nodes: nmask);
1515	if (IS_ERR(ptr: new))
1516	return PTR_ERR(ptr: new);
1517
1518	/*
1519	* If we are using the default policy then operation
1520	* on discontinuous address spaces is okay after all
1521	*/
1522	if (!new)
1523	flags \|= MPOL_MF_DISCONTIG_OK;
1524
1525	if (flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL))
1526	lru_cache_disable();
1527	{
1528	NODEMASK_SCRATCH(scratch);
1529	if (scratch) {
1530	mmap_write_lock(mm);
1531	err = mpol_set_nodemask(pol: new, nodes: nmask, nsc: scratch);
1532	if (err)
1533	mmap_write_unlock(mm);
1534	} else
1535	err = -ENOMEM;
1536	NODEMASK_SCRATCH_FREE(scratch);
1537	}
1538	if (err)
1539	goto mpol_out;
1540
1541	/*
1542	* Lock the VMAs before scanning for pages to migrate,
1543	* to ensure we don't miss a concurrently inserted page.
1544	*/
1545	nr_failed = queue_pages_range(mm, start, end, nodes: nmask,
1546	flags: flags \| MPOL_MF_INVERT \| MPOL_MF_WRLOCK, pagelist: &pagelist);
1547
1548	if (nr_failed < `0`) {
1549	err = nr_failed;
1550	nr_failed = `0`;
1551	} else {
1552	vma_iter_init(vmi: &vmi, mm, addr: start);
1553	prev = vma_prev(vmi: &vmi);
1554	for_each_vma_range(vmi, vma, end) {
1555	err = mbind_range(vmi: &vmi, vma, prev: &prev, start, end, new_pol: new);
1556	if (err)
1557	break;
1558	}
1559	}
1560
1561	if (!err && !list_empty(head: &pagelist)) {
1562	/ Convert MPOL_DEFAULT's NULL to task or default policy /
1563	if (!new) {
1564	new = get_task_policy(current);
1565	mpol_get(pol: new);
1566	}
1567	mmpol.pol = new;
1568	mmpol.ilx = `0`;
1569
1570	/*
1571	* In the interleaved case, attempt to allocate on exactly the
1572	* targeted nodes, for the first VMA to be migrated; for later
1573	* VMAs, the nodes will still be interleaved from the targeted
1574	* nodemask, but one by one may be selected differently.
1575	*/
1576	if (new->mode == MPOL_INTERLEAVE \|\|
1577	new->mode == MPOL_WEIGHTED_INTERLEAVE) {
1578	struct folio *folio;
1579	unsigned int order;
1580	unsigned long addr = -EFAULT;
1581
1582	list_for_each_entry(folio, &pagelist, lru) {
1583	if (!folio_test_ksm(folio))
1584	break;
1585	}
1586	if (!list_entry_is_head(folio, &pagelist, lru)) {
1587	vma_iter_init(vmi: &vmi, mm, addr: start);
1588	for_each_vma_range(vmi, vma, end) {
1589	addr = page_address_in_vma(folio,
1590	folio_page(folio, `0`), vma);
1591	if (addr != -EFAULT)
1592	break;
1593	}
1594	}
1595	if (addr != -EFAULT) {
1596	order = folio_order(folio);
1597	/ We already know the pol, but not the ilx /
1598	mpol_cond_put(pol: get_vma_policy(vma, addr, order,
1599	ilx: &mmpol.ilx));
1600	/ Set base from which to increment by index /
1601	mmpol.ilx -= folio->index >> order;
1602	}
1603	}
1604	}
1605
1606	mmap_write_unlock(mm);
1607
1608	if (!err && !list_empty(head: &pagelist)) {
1609	nr_failed \|= migrate_pages(l: &pagelist,
1610	new: alloc_migration_target_by_mpol, NULL,
1611	private: (unsigned long)&mmpol, mode: MIGRATE_SYNC,
1612	reason: MR_MEMPOLICY_MBIND, NULL);
1613	}
1614
1615	if (nr_failed && (flags & MPOL_MF_STRICT))
1616	err = -EIO;
1617	if (!list_empty(head: &pagelist))
1618	putback_movable_pages(l: &pagelist);
1619	mpol_out:
1620	mpol_put(pol: new);
1621	if (flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL))
1622	lru_cache_enable();
1623	return err;
1624	}
1625
1626	/*
1627	* User space interface with variable sized bitmaps for nodelists.
1628	*/
1629	static int get_bitmap(unsigned long mask, const* unsigned long __user *nmask,
1630	unsigned long maxnode)
1631	{
1632	unsigned long nlongs = BITS_TO_LONGS(maxnode);
1633	int ret;
1634
1635	if (in_compat_syscall())
1636	ret = compat_get_bitmap(mask,
1637	umask: (const compat_ulong_t __user *)nmask,
1638	bitmap_size: maxnode);
1639	else
1640	ret = copy_from_user(to: mask, from: nmask,
1641	n: nlongs * sizeof(unsigned long));
1642
1643	if (ret)
1644	return -EFAULT;
1645
1646	if (maxnode % BITS_PER_LONG)
1647	mask[nlongs - `1`] &= (`1UL` << (maxnode % BITS_PER_LONG)) - `1`;
1648
1649	return `0`;
1650	}
1651
1652	/ Copy a node mask from user space. /
1653	static int get_nodes(nodemask_t nodes, const* unsigned long __user *nmask,
1654	unsigned long maxnode)
1655	{
1656	--maxnode;
1657	nodes_clear(*nodes);
1658	if (maxnode == `0` \|\| !nmask)
1659	return `0`;
1660	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1661	return -EINVAL;
1662
1663	/*
1664	* When the user specified more nodes than supported just check
1665	* if the non supported part is all zero, one word at a time,
1666	* starting at the end.
1667	*/
1668	while (maxnode > MAX_NUMNODES) {
1669	unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1670	unsigned long t;
1671
1672	if (get_bitmap(mask: &t, nmask: &nmask[(maxnode - `1`) / BITS_PER_LONG], maxnode: bits))
1673	return -EFAULT;
1674
1675	if (maxnode - bits >= MAX_NUMNODES) {
1676	maxnode -= bits;
1677	} else {
1678	maxnode = MAX_NUMNODES;
1679	t &= ~((`1UL` << (MAX_NUMNODES % BITS_PER_LONG)) - `1`);
1680	}
1681	if (t)
1682	return -EINVAL;
1683	}
1684
1685	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1686	}
1687
1688	/ Copy a kernel node mask to user space /
1689	static int copy_nodes_to_user(unsigned long __user mask, unsigned* long maxnode,
1690	nodemask_t *nodes)
1691	{
1692	unsigned long copy = ALIGN(maxnode-`1`, `64`) / `8`;
1693	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1694	bool compat = in_compat_syscall();
1695
1696	if (compat)
1697	nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1698
1699	if (copy > nbytes) {
1700	if (copy > PAGE_SIZE)
1701	return -EINVAL;
1702	if (clear_user(to: (char __user *)mask + nbytes, n: copy - nbytes))
1703	return -EFAULT;
1704	copy = nbytes;
1705	maxnode = nr_node_ids;
1706	}
1707
1708	if (compat)
1709	return compat_put_bitmap(umask: (compat_ulong_t __user *)mask,
1710	nodes_addr(*nodes), bitmap_size: maxnode);
1711
1712	return copy_to_user(to: mask, nodes_addr(*nodes), n: copy) ? -EFAULT : `0`;
1713	}
1714
1715	/ Basic parameter sanity check used by both mbind() and set_mempolicy() /
1716	static inline int sanitize_mpol_flags(int mode, unsigned* short *flags)
1717	{
1718	flags = mode & MPOL_MODE_FLAGS;
1719	*mode &= ~MPOL_MODE_FLAGS;
1720
1721	if ((unsigned int)(*mode) >= MPOL_MAX)
1722	return -EINVAL;
1723	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1724	return -EINVAL;
1725	if (*flags & MPOL_F_NUMA_BALANCING) {
1726	if (mode == MPOL_BIND \|\| mode == MPOL_PREFERRED_MANY)
1727	*flags \|= (MPOL_F_MOF \| MPOL_F_MORON);
1728	else
1729	return -EINVAL;
1730	}
1731	return `0`;
1732	}
1733
1734	static long kernel_mbind(unsigned long start, unsigned long len,
1735	unsigned long mode, const unsigned long __user *nmask,
1736	unsigned long maxnode, unsigned int flags)
1737	{
1738	unsigned short mode_flags;
1739	nodemask_t nodes;
1740	int lmode = mode;
1741	int err;
1742
1743	start = untagged_addr(start);
1744	err = sanitize_mpol_flags(mode: &lmode, flags: &mode_flags);
1745	if (err)
1746	return err;
1747
1748	err = get_nodes(nodes: &nodes, nmask, maxnode);
1749	if (err)
1750	return err;
1751
1752	return do_mbind(start, len, mode: lmode, mode_flags, nmask: &nodes, flags);
1753	}
1754
1755	SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1756	unsigned long, home_node, unsigned long, flags)
1757	{
1758	struct mm_struct *mm = current->mm;
1759	struct vm_area_struct vma, prev;
1760	struct mempolicy new, old;
1761	unsigned long end;
1762	int err = -ENOENT;
1763	VMA_ITERATOR(vmi, mm, start);
1764
1765	start = untagged_addr(start);
1766	if (start & ~PAGE_MASK)
1767	return -EINVAL;
1768	/*
1769	* flags is used for future extension if any.
1770	*/
1771	if (flags != `0`)
1772	return -EINVAL;
1773
1774	/*
1775	* Check home_node is online to avoid accessing uninitialized
1776	* NODE_DATA.
1777	*/
1778	if (home_node >= MAX_NUMNODES \|\| !node_online(home_node))
1779	return -EINVAL;
1780
1781	len = PAGE_ALIGN(len);
1782	end = start + len;
1783
1784	if (end < start)
1785	return -EINVAL;
1786	if (end == start)
1787	return `0`;
1788	mmap_write_lock(mm);
1789	prev = vma_prev(vmi: &vmi);
1790	for_each_vma_range(vmi, vma, end) {
1791	/*
1792	* If any vma in the range got policy other than MPOL_BIND
1793	* or MPOL_PREFERRED_MANY we return error. We don't reset
1794	* the home node for vmas we already updated before.
1795	*/
1796	old = vma_policy(vma);
1797	if (!old) {
1798	prev = vma;
1799	continue;
1800	}
1801	if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1802	err = -EOPNOTSUPP;
1803	break;
1804	}
1805	new = mpol_dup(pol: old);
1806	if (IS_ERR(ptr: new)) {
1807	err = PTR_ERR(ptr: new);
1808	break;
1809	}
1810
1811	vma_start_write(vma);
1812	new->home_node = home_node;
1813	err = mbind_range(vmi: &vmi, vma, prev: &prev, start, end, new_pol: new);
1814	mpol_put(pol: new);
1815	if (err)
1816	break;
1817	}
1818	mmap_write_unlock(mm);
1819	return err;
1820	}
1821
1822	SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1823	unsigned long, mode, const unsigned long __user *, nmask,
1824	unsigned long, maxnode, unsigned int, flags)
1825	{
1826	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1827	}
1828
1829	/ Set the process memory policy /
1830	static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1831	unsigned long maxnode)
1832	{
1833	unsigned short mode_flags;
1834	nodemask_t nodes;
1835	int lmode = mode;
1836	int err;
1837
1838	err = sanitize_mpol_flags(mode: &lmode, flags: &mode_flags);
1839	if (err)
1840	return err;
1841
1842	err = get_nodes(nodes: &nodes, nmask, maxnode);
1843	if (err)
1844	return err;
1845
1846	return do_set_mempolicy(mode: lmode, flags: mode_flags, nodes: &nodes);
1847	}
1848
1849	SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1850	unsigned long, maxnode)
1851	{
1852	return kernel_set_mempolicy(mode, nmask, maxnode);
1853	}
1854
1855	static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1856	const unsigned long __user *old_nodes,
1857	const unsigned long __user *new_nodes)
1858	{
1859	struct mm_struct *mm = NULL;
1860	struct task_struct *task;
1861	nodemask_t task_nodes;
1862	int err;
1863	nodemask_t *old;
1864	nodemask_t *new;
1865	NODEMASK_SCRATCH(scratch);
1866
1867	if (!scratch)
1868	return -ENOMEM;
1869
1870	old = &scratch->mask1;
1871	new = &scratch->mask2;
1872
1873	err = get_nodes(nodes: old, nmask: old_nodes, maxnode);
1874	if (err)
1875	goto out;
1876
1877	err = get_nodes(nodes: new, nmask: new_nodes, maxnode);
1878	if (err)
1879	goto out;
1880
1881	/ Find the mm_struct /
1882	rcu_read_lock();
1883	task = pid ? find_task_by_vpid(nr: pid) : current;
1884	if (!task) {
1885	rcu_read_unlock();
1886	err = -ESRCH;
1887	goto out;
1888	}
1889	get_task_struct(t: task);
1890
1891	err = -EINVAL;
1892
1893	/*
1894	* Check if this process has the right to modify the specified process.
1895	* Use the regular "ptrace_may_access()" checks.
1896	*/
1897	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1898	rcu_read_unlock();
1899	err = -EPERM;
1900	goto out_put;
1901	}
1902	rcu_read_unlock();
1903
1904	task_nodes = cpuset_mems_allowed(p: task);
1905	/ Is the user allowed to access the target nodes? /
1906	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1907	err = -EPERM;
1908	goto out_put;
1909	}
1910
1911	task_nodes = cpuset_mems_allowed(current);
1912	nodes_and(new, new, task_nodes);
1913	if (nodes_empty(*new))
1914	goto out_put;
1915
1916	err = security_task_movememory(p: task);
1917	if (err)
1918	goto out_put;
1919
1920	mm = get_task_mm(task);
1921	put_task_struct(t: task);
1922
1923	if (!mm) {
1924	err = -EINVAL;
1925	goto out;
1926	}
1927
1928	err = do_migrate_pages(mm, from: old, to: new,
1929	flags: capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1930
1931	mmput(mm);
1932	out:
1933	NODEMASK_SCRATCH_FREE(scratch);
1934
1935	return err;
1936
1937	out_put:
1938	put_task_struct(t: task);
1939	goto out;
1940	}
1941
1942	SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1943	const unsigned long __user *, old_nodes,
1944	const unsigned long __user *, new_nodes)
1945	{
1946	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1947	}
1948
1949	/ Retrieve NUMA policy /
1950	static int kernel_get_mempolicy(int __user *policy,
1951	unsigned long __user *nmask,
1952	unsigned long maxnode,
1953	unsigned long addr,
1954	unsigned long flags)
1955	{
1956	int err;
1957	int pval;
1958	nodemask_t nodes;
1959
1960	if (nmask != NULL && maxnode < nr_node_ids)
1961	return -EINVAL;
1962
1963	addr = untagged_addr(addr);
1964
1965	err = do_get_mempolicy(policy: &pval, nmask: &nodes, addr, flags);
1966
1967	if (err)
1968	return err;
1969
1970	if (policy && put_user(pval, policy))
1971	return -EFAULT;
1972
1973	if (nmask)
1974	err = copy_nodes_to_user(mask: nmask, maxnode, nodes: &nodes);
1975
1976	return err;
1977	}
1978
1979	SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1980	unsigned long __user , nmask, unsigned* long, maxnode,
1981	unsigned long, addr, unsigned long, flags)
1982	{
1983	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1984	}
1985
1986	bool vma_migratable(struct vm_area_struct *vma)
1987	{
1988	if (vma->vm_flags & (VM_IO \| VM_PFNMAP))
1989	return false;
1990
1991	/*
1992	* DAX device mappings require predictable access latency, so avoid
1993	* incurring periodic faults.
1994	*/
1995	if (vma_is_dax(vma))
1996	return false;
1997
1998	if (is_vm_hugetlb_page(vma) &&
1999	!hugepage_migration_supported(h: hstate_vma(vma)))
2000	return false;
2001
2002	/*
2003	* Migration allocates pages in the highest zone. If we cannot
2004	* do so then migration (at least from node to node) is not
2005	* possible.
2006	*/
2007	if (vma->vm_file &&
2008	gfp_zone(flags: mapping_gfp_mask(mapping: vma->vm_file->f_mapping))
2009	< policy_zone)
2010	return false;
2011	return true;
2012	}
2013
2014	struct mempolicy __get_vma_policy(struct* vm_area_struct *vma,
2015	unsigned long addr, pgoff_t *ilx)
2016	{
2017	*ilx = `0`;
2018	return (vma->vm_ops && vma->vm_ops->get_policy) ?
2019	vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
2020	}
2021
2022	/*
2023	* get_vma_policy(@vma, @addr, @order, @ilx)
2024	* @vma: virtual memory area whose policy is sought
2025	* @addr: address in @vma for shared policy lookup
2026	* @order: 0, or appropriate huge_page_order for interleaving
2027	* @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
2028	* MPOL_WEIGHTED_INTERLEAVE
2029	*
2030	* Returns effective policy for a VMA at specified address.
2031	* Falls back to current->mempolicy or system default policy, as necessary.
2032	* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
2033	* count--added by the get_policy() vm_op, as appropriate--to protect against
2034	* freeing by another task. It is the caller's responsibility to free the
2035	* extra reference for shared policies.
2036	*/
2037	struct mempolicy get_vma_policy(struct* vm_area_struct *vma,
2038	unsigned long addr, int order, pgoff_t *ilx)
2039	{
2040	struct mempolicy *pol;
2041
2042	pol = __get_vma_policy(vma, addr, ilx);
2043	if (!pol)
2044	pol = get_task_policy(current);
2045	if (pol->mode == MPOL_INTERLEAVE \|\|
2046	pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
2047	*ilx += vma->vm_pgoff >> order;
2048	*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
2049	}
2050	return pol;
2051	}
2052
2053	bool vma_policy_mof(struct vm_area_struct *vma)
2054	{
2055	struct mempolicy *pol;
2056
2057	if (vma->vm_ops && vma->vm_ops->get_policy) {
2058	bool ret = false;
2059	pgoff_t ilx; / ignored here /
2060
2061	pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
2062	if (pol && (pol->flags & MPOL_F_MOF))
2063	ret = true;
2064	mpol_cond_put(pol);
2065
2066	return ret;
2067	}
2068
2069	pol = vma->vm_policy;
2070	if (!pol)
2071	pol = get_task_policy(current);
2072
2073	return pol->flags & MPOL_F_MOF;
2074	}
2075
2076	bool apply_policy_zone(struct mempolicy policy, enum* zone_type zone)
2077	{
2078	enum zone_type dynamic_policy_zone = policy_zone;
2079
2080	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
2081
2082	/*
2083	* if policy->nodes has movable memory only,
2084	* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
2085	*
2086	* policy->nodes is intersect with node_states[N_MEMORY].
2087	* so if the following test fails, it implies
2088	* policy->nodes has movable memory only.
2089	*/
2090	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
2091	dynamic_policy_zone = ZONE_MOVABLE;
2092
2093	return zone >= dynamic_policy_zone;
2094	}
2095
2096	static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
2097	{
2098	unsigned int node;
2099	unsigned int cpuset_mems_cookie;
2100
2101	retry:
2102	/ to prevent miscount use tsk->mems_allowed_seq to detect rebind /
2103	cpuset_mems_cookie = read_mems_allowed_begin();
2104	node = current->il_prev;
2105	if (!current->il_weight \|\| !node_isset(node, policy->nodes)) {
2106	node = next_node_in(node, policy->nodes);
2107	if (read_mems_allowed_retry(seq: cpuset_mems_cookie))
2108	goto retry;
2109	if (node == MAX_NUMNODES)
2110	return node;
2111	current->il_prev = node;
2112	current->il_weight = get_il_weight(node);
2113	}
2114	current->il_weight--;
2115	return node;
2116	}
2117
2118	/ Do dynamic interleaving for a process /
2119	static unsigned int interleave_nodes(struct mempolicy *policy)
2120	{
2121	unsigned int nid;
2122	unsigned int cpuset_mems_cookie;
2123
2124	/ to prevent miscount, use tsk->mems_allowed_seq to detect rebind /
2125	do {
2126	cpuset_mems_cookie = read_mems_allowed_begin();
2127	nid = next_node_in(current->il_prev, policy->nodes);
2128	} while (read_mems_allowed_retry(seq: cpuset_mems_cookie));
2129
2130	if (nid < MAX_NUMNODES)
2131	current->il_prev = nid;
2132	return nid;
2133	}
2134
2135	/*
2136	* Depending on the memory policy provide a node from which to allocate the
2137	* next slab entry.
2138	*/
2139	unsigned int mempolicy_slab_node(void)
2140	{
2141	struct mempolicy *policy;
2142	int node = numa_mem_id();
2143
2144	if (!in_task())
2145	return node;
2146
2147	policy = current->mempolicy;
2148	if (!policy)
2149	return node;
2150
2151	switch (policy->mode) {
2152	case MPOL_PREFERRED:
2153	return first_node(policy->nodes);
2154
2155	case MPOL_INTERLEAVE:
2156	return interleave_nodes(policy);
2157
2158	case MPOL_WEIGHTED_INTERLEAVE:
2159	return weighted_interleave_nodes(policy);
2160
2161	case MPOL_BIND:
2162	case MPOL_PREFERRED_MANY:
2163	{
2164	struct zoneref *z;
2165
2166	/*
2167	* Follow bind policy behavior and start allocation at the
2168	* first node.
2169	*/
2170	struct zonelist *zonelist;
2171	enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2172	zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2173	z = first_zones_zonelist(zonelist, highest_zoneidx,
2174	nodes: &policy->nodes);
2175	return zonelist_zone(zoneref: z) ? zonelist_node_idx(zoneref: z) : node;
2176	}
2177	case MPOL_LOCAL:
2178	return node;
2179
2180	default:
2181	BUG();
2182	}
2183	}
2184
2185	static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
2186	nodemask_t *mask)
2187	{
2188	/*
2189	* barrier stabilizes the nodemask locally so that it can be iterated
2190	* over safely without concern for changes. Allocators validate node
2191	* selection does not violate mems_allowed, so this is safe.
2192	*/
2193	barrier();
2194	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
2195	barrier();
2196	return nodes_weight(*mask);
2197	}
2198
2199	static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2200	{
2201	struct weighted_interleave_state *state;
2202	nodemask_t nodemask;
2203	unsigned int target, nr_nodes;
2204	u8 *table = NULL;
2205	unsigned int weight_total = `0`;
2206	u8 weight;
2207	int nid = `0`;
2208
2209	nr_nodes = read_once_policy_nodemask(pol, mask: &nodemask);
2210	if (!nr_nodes)
2211	return numa_node_id();
2212
2213	rcu_read_lock();
2214
2215	state = rcu_dereference(wi_state);
2216	/ Uninitialized wi_state means we should assume all weights are 1 /
2217	if (state)
2218	table = state->iw_table;
2219
2220	/ calculate the total weight /
2221	for_each_node_mask(nid, nodemask)
2222	weight_total += table ? table[nid] : `1`;
2223
2224	/ Calculate the node offset based on totals /
2225	target = ilx % weight_total;
2226	nid = first_node(nodemask);
2227	while (target) {
2228	/ detect system default usage /
2229	weight = table ? table[nid] : `1`;
2230	if (target < weight)
2231	break;
2232	target -= weight;
2233	nid = next_node_in(nid, nodemask);
2234	}
2235	rcu_read_unlock();
2236	return nid;
2237	}
2238
2239	/*
2240	* Do static interleaving for interleave index @ilx. Returns the ilx'th
2241	* node in pol->nodes (starting from ilx=0), wrapping around if ilx
2242	* exceeds the number of present nodes.
2243	*/
2244	static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
2245	{
2246	nodemask_t nodemask;
2247	unsigned int target, nnodes;
2248	int i;
2249	int nid;
2250
2251	nnodes = read_once_policy_nodemask(pol, mask: &nodemask);
2252	if (!nnodes)
2253	return numa_node_id();
2254	target = ilx % nnodes;
2255	nid = first_node(nodemask);
2256	for (i = `0`; i < target; i++)
2257	nid = next_node(nid, nodemask);
2258	return nid;
2259	}
2260
2261	/*
2262	* Return a nodemask representing a mempolicy for filtering nodes for
2263	* page allocation, together with preferred node id (or the input node id).
2264	*/
2265	static nodemask_t policy_nodemask(gfp_t gfp, struct* mempolicy *pol,
2266	pgoff_t ilx, int *nid)
2267	{
2268	nodemask_t *nodemask = NULL;
2269
2270	switch (pol->mode) {
2271	case MPOL_PREFERRED:
2272	/ Override input node id /
2273	*nid = first_node(pol->nodes);
2274	break;
2275	case MPOL_PREFERRED_MANY:
2276	nodemask = &pol->nodes;
2277	if (pol->home_node != NUMA_NO_NODE)
2278	*nid = pol->home_node;
2279	break;
2280	case MPOL_BIND:
2281	/ Restrict to nodemask (but not on lower zones) /
2282	if (apply_policy_zone(policy: pol, zone: gfp_zone(flags: gfp)) &&
2283	cpuset_nodemask_valid_mems_allowed(nodemask: &pol->nodes))
2284	nodemask = &pol->nodes;
2285	if (pol->home_node != NUMA_NO_NODE)
2286	*nid = pol->home_node;
2287	/*
2288	* __GFP_THISNODE shouldn't even be used with the bind policy
2289	* because we might easily break the expectation to stay on the
2290	* requested node and not break the policy.
2291	*/
2292	WARN_ON_ONCE(gfp & __GFP_THISNODE);
2293	break;
2294	case MPOL_INTERLEAVE:
2295	/ Override input node id /
2296	*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2297	interleave_nodes(policy: pol) : interleave_nid(pol, ilx);
2298	break;
2299	case MPOL_WEIGHTED_INTERLEAVE:
2300	*nid = (ilx == NO_INTERLEAVE_INDEX) ?
2301	weighted_interleave_nodes(policy: pol) :
2302	weighted_interleave_nid(pol, ilx);
2303	break;
2304	}
2305
2306	return nodemask;
2307	}
2308
2309	#ifdef CONFIG_HUGETLBFS
2310	/*
2311	* huge_node(@vma, @addr, @gfp_flags, @mpol)
2312	* @vma: virtual memory area whose policy is sought
2313	* @addr: address in @vma for shared policy lookup and interleave policy
2314	* @gfp_flags: for requested zone
2315	* @mpol: pointer to mempolicy pointer for reference counted mempolicy
2316	* @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2317	*
2318	* Returns a nid suitable for a huge page allocation and a pointer
2319	* to the struct mempolicy for conditional unref after allocation.
2320	* If the effective policy is 'bind' or 'prefer-many', returns a pointer
2321	* to the mempolicy's @nodemask for filtering the zonelist.
2322	*/
2323	int huge_node(struct vm_area_struct vma, unsigned* long addr, gfp_t gfp_flags,
2324	struct mempolicy mpol, nodemask_t nodemask)
2325	{
2326	pgoff_t ilx;
2327	int nid;
2328
2329	nid = numa_node_id();
2330	*mpol = get_vma_policy(vma, addr, order: hstate_vma(vma)->order, ilx: &ilx);
2331	nodemask = policy_nodemask(gfp: gfp_flags, pol: mpol, ilx, nid: &nid);
2332	return nid;
2333	}
2334
2335	/*
2336	* init_nodemask_of_mempolicy
2337	*
2338	* If the current task's mempolicy is "default" [NULL], return 'false'
2339	* to indicate default policy. Otherwise, extract the policy nodemask
2340	* for 'bind' or 'interleave' policy into the argument nodemask, or
2341	* initialize the argument nodemask to contain the single node for
2342	* 'preferred' or 'local' policy and return 'true' to indicate presence
2343	* of non-default mempolicy.
2344	*
2345	* We don't bother with reference counting the mempolicy [mpol_get/put]
2346	* because the current task is examining it's own mempolicy and a task's
2347	* mempolicy is only ever changed by the task itself.
2348	*
2349	* N.B., it is the caller's responsibility to free a returned nodemask.
2350	*/
2351	bool init_nodemask_of_mempolicy(nodemask_t *mask)
2352	{
2353	struct mempolicy *mempolicy;
2354
2355	if (!(mask && current->mempolicy))
2356	return false;
2357
2358	task_lock(current);
2359	mempolicy = current->mempolicy;
2360	switch (mempolicy->mode) {
2361	case MPOL_PREFERRED:
2362	case MPOL_PREFERRED_MANY:
2363	case MPOL_BIND:
2364	case MPOL_INTERLEAVE:
2365	case MPOL_WEIGHTED_INTERLEAVE:
2366	*mask = mempolicy->nodes;
2367	break;
2368
2369	case MPOL_LOCAL:
2370	init_nodemask_of_node(mask, node: numa_node_id());
2371	break;
2372
2373	default:
2374	BUG();
2375	}
2376	task_unlock(current);
2377
2378	return true;
2379	}
2380	#endif
2381
2382	/*
2383	* mempolicy_in_oom_domain
2384	*
2385	* If tsk's mempolicy is "bind", check for intersection between mask and
2386	* the policy nodemask. Otherwise, return true for all other policies
2387	* including "interleave", as a tsk with "interleave" policy may have
2388	* memory allocated from all nodes in system.
2389	*
2390	* Takes task_lock(tsk) to prevent freeing of its mempolicy.
2391	*/
2392	bool mempolicy_in_oom_domain(struct task_struct *tsk,
2393	const nodemask_t *mask)
2394	{
2395	struct mempolicy *mempolicy;
2396	bool ret = true;
2397
2398	if (!mask)
2399	return ret;
2400
2401	task_lock(p: tsk);
2402	mempolicy = tsk->mempolicy;
2403	if (mempolicy && mempolicy->mode == MPOL_BIND)
2404	ret = nodes_intersects(mempolicy->nodes, *mask);
2405	task_unlock(p: tsk);
2406
2407	return ret;
2408	}
2409
2410	static struct page alloc_pages_preferred_many(gfp_t gfp, unsigned* int order,
2411	int nid, nodemask_t *nodemask)
2412	{
2413	struct page *page;
2414	gfp_t preferred_gfp;
2415
2416	/*
2417	* This is a two pass approach. The first pass will only try the
2418	* preferred nodes but skip the direct reclaim and allow the
2419	* allocation to fail, while the second pass will try all the
2420	* nodes in system.
2421	*/
2422	preferred_gfp = gfp \| __GFP_NOWARN;
2423	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM \| __GFP_NOFAIL);
2424	page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
2425	if (!page)
2426	page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
2427
2428	return page;
2429	}
2430
2431	/**
2432	* alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2433	* @gfp: GFP flags.
2434	* @order: Order of the page allocation.
2435	* @pol: Pointer to the NUMA mempolicy.
2436	* @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2437	* @nid: Preferred node (usually numa_node_id() but @mpol may override it).
2438	*
2439	* Return: The page on success or NULL if allocation fails.
2440	*/
2441	static struct page alloc_pages_mpol(gfp_t gfp, unsigned* int order,
2442	struct mempolicy pol, pgoff_t ilx, int* nid)
2443	{
2444	nodemask_t *nodemask;
2445	struct page *page;
2446
2447	nodemask = policy_nodemask(gfp, pol, ilx, nid: &nid);
2448
2449	if (pol->mode == MPOL_PREFERRED_MANY)
2450	return alloc_pages_preferred_many(gfp, order, nid, nodemask);
2451
2452	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2453	/ filter "hugepage" allocation, unless from alloc_pages() /
2454	order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
2455	/*
2456	* For hugepage allocation and non-interleave policy which
2457	* allows the current node (or other explicitly preferred
2458	* node) we only try to allocate from the current/preferred
2459	* node and don't fall back to other nodes, as the cost of
2460	* remote accesses would likely offset THP benefits.
2461	*
2462	* If the policy is interleave or does not allow the current
2463	* node in its nodemask, we allocate the standard way.
2464	*/
2465	if (pol->mode != MPOL_INTERLEAVE &&
2466	pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2467	(!nodemask \|\| node_isset(nid, *nodemask))) {
2468	/*
2469	* First, try to allocate THP only on local node, but
2470	* don't reclaim unnecessarily, just compact.
2471	*/
2472	page = __alloc_frozen_pages_noprof(
2473	gfp \| __GFP_THISNODE \| __GFP_NORETRY, order,
2474	nid, NULL);
2475	if (page \|\| !(gfp & __GFP_DIRECT_RECLAIM))
2476	return page;
2477	/*
2478	* If hugepage allocations are configured to always
2479	* synchronous compact or the vma has been madvised
2480	* to prefer hugepage backing, retry allowing remote
2481	* memory with both reclaim and compact as well.
2482	*/
2483	}
2484	}
2485
2486	page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2487
2488	if (unlikely(pol->mode == MPOL_INTERLEAVE \|\|
2489	pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2490	/ skip NUMA_INTERLEAVE_HIT update if numa stats is disabled /
2491	if (static_branch_likely(&vm_numa_stat_key) &&
2492	page_to_nid(page) == nid) {
2493	preempt_disable();
2494	__count_numa_event(zone: page_zone(page), item: NUMA_INTERLEAVE_HIT);
2495	preempt_enable();
2496	}
2497	}
2498
2499	return page;
2500	}
2501
2502	struct folio folio_alloc_mpol_noprof(gfp_t gfp, unsigned* int order,
2503	struct mempolicy pol, pgoff_t ilx, int* nid)
2504	{
2505	struct page *page = alloc_pages_mpol(gfp: gfp \| __GFP_COMP, order, pol,
2506	ilx, nid);
2507	if (!page)
2508	return NULL;
2509
2510	set_page_refcounted(page);
2511	return page_rmappable_folio(page);
2512	}
2513
2514	/**
2515	* vma_alloc_folio - Allocate a folio for a VMA.
2516	* @gfp: GFP flags.
2517	* @order: Order of the folio.
2518	* @vma: Pointer to VMA.
2519	* @addr: Virtual address of the allocation. Must be inside @vma.
2520	*
2521	* Allocate a folio for a specific address in @vma, using the appropriate
2522	* NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
2523	* VMA to prevent it from going away. Should be used for all allocations
2524	* for folios that will be mapped into user space, excepting hugetlbfs, and
2525	* excepting where direct use of folio_alloc_mpol() is more appropriate.
2526	*
2527	* Return: The folio on success or NULL if allocation fails.
2528	*/
2529	struct folio vma_alloc_folio_noprof(gfp_t gfp, int* order, struct vm_area_struct *vma,
2530	unsigned long addr)
2531	{
2532	struct mempolicy *pol;
2533	pgoff_t ilx;
2534	struct folio *folio;
2535
2536	if (vma->vm_flags & VM_DROPPABLE)
2537	gfp \|= __GFP_NOWARN;
2538
2539	pol = get_vma_policy(vma, addr, order, ilx: &ilx);
2540	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, nid: numa_node_id());
2541	mpol_cond_put(pol);
2542	return folio;
2543	}
2544	EXPORT_SYMBOL(vma_alloc_folio_noprof);
2545
2546	struct page alloc_frozen_pages_noprof(gfp_t gfp, unsigned* order)
2547	{
2548	struct mempolicy *pol = &default_policy;
2549
2550	/*
2551	* No reference counting needed for current->mempolicy
2552	* nor system default_policy
2553	*/
2554	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2555	pol = get_task_policy(current);
2556
2557	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
2558	nid: numa_node_id());
2559	}
2560
2561	/**
2562	* alloc_pages - Allocate pages.
2563	* @gfp: GFP flags.
2564	* @order: Power of two of number of pages to allocate.
2565	*
2566	* Allocate 1 << @order contiguous pages. The physical address of the
2567	* first page is naturally aligned (eg an order-3 allocation will be aligned
2568	* to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2569	* process is honoured when in process context.
2570	*
2571	* Context: Can be called from any context, providing the appropriate GFP
2572	* flags are used.
2573	* Return: The page on success or NULL if allocation fails.
2574	*/
2575	struct page alloc_pages_noprof(gfp_t gfp, unsigned* int order)
2576	{
2577	struct page *page = alloc_frozen_pages_noprof(gfp, order);
2578
2579	if (page)
2580	set_page_refcounted(page);
2581	return page;
2582	}
2583	EXPORT_SYMBOL(alloc_pages_noprof);
2584
2585	struct folio folio_alloc_noprof(gfp_t gfp, unsigned* int order)
2586	{
2587	return page_rmappable_folio(page: alloc_pages_noprof(gfp \| __GFP_COMP, order));
2588	}
2589	EXPORT_SYMBOL(folio_alloc_noprof);
2590
2591	static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
2592	struct mempolicy pol, unsigned* long nr_pages,
2593	struct page **page_array)
2594	{
2595	int nodes;
2596	unsigned long nr_pages_per_node;
2597	int delta;
2598	int i;
2599	unsigned long nr_allocated;
2600	unsigned long total_allocated = `0`;
2601
2602	nodes = nodes_weight(pol->nodes);
2603	nr_pages_per_node = nr_pages / nodes;
2604	delta = nr_pages - nodes * nr_pages_per_node;
2605
2606	for (i = `0`; i < nodes; i++) {
2607	if (delta) {
2608	nr_allocated = alloc_pages_bulk_noprof(gfp,
2609	preferred_nid: interleave_nodes(policy: pol), NULL,
2610	nr_pages: nr_pages_per_node + `1`,
2611	page_array);
2612	delta--;
2613	} else {
2614	nr_allocated = alloc_pages_bulk_noprof(gfp,
2615	preferred_nid: interleave_nodes(policy: pol), NULL,
2616	nr_pages: nr_pages_per_node, page_array);
2617	}
2618
2619	page_array += nr_allocated;
2620	total_allocated += nr_allocated;
2621	}
2622
2623	return total_allocated;
2624	}
2625
2626	static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
2627	struct mempolicy pol, unsigned* long nr_pages,
2628	struct page **page_array)
2629	{
2630	struct weighted_interleave_state *state;
2631	struct task_struct *me = current;
2632	unsigned int cpuset_mems_cookie;
2633	unsigned long total_allocated = `0`;
2634	unsigned long nr_allocated = `0`;
2635	unsigned long rounds;
2636	unsigned long node_pages, delta;
2637	u8 *weights, weight;
2638	unsigned int weight_total = `0`;
2639	unsigned long rem_pages = nr_pages;
2640	nodemask_t nodes;
2641	int nnodes, node;
2642	int resume_node = MAX_NUMNODES - `1`;
2643	u8 resume_weight = `0`;
2644	int prev_node;
2645	int i;
2646
2647	if (!nr_pages)
2648	return `0`;
2649
2650	/ read the nodes onto the stack, retry if done during rebind /
2651	do {
2652	cpuset_mems_cookie = read_mems_allowed_begin();
2653	nnodes = read_once_policy_nodemask(pol, mask: &nodes);
2654	} while (read_mems_allowed_retry(seq: cpuset_mems_cookie));
2655
2656	/ if the nodemask has become invalid, we cannot do anything /
2657	if (!nnodes)
2658	return `0`;
2659
2660	/ Continue allocating from most recent node and adjust the nr_pages /
2661	node = me->il_prev;
2662	weight = me->il_weight;
2663	if (weight && node_isset(node, nodes)) {
2664	node_pages = min(rem_pages, weight);
2665	nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2666	page_array);
2667	page_array += nr_allocated;
2668	total_allocated += nr_allocated;
2669	/ if that's all the pages, no need to interleave /
2670	if (rem_pages <= weight) {
2671	me->il_weight -= rem_pages;
2672	return total_allocated;
2673	}
2674	/ Otherwise we adjust remaining pages, continue from there /
2675	rem_pages -= weight;
2676	}
2677	/ clear active weight in case of an allocation failure /
2678	me->il_weight = `0`;
2679	prev_node = node;
2680
2681	/ create a local copy of node weights to operate on outside rcu /
2682	weights = kzalloc(nr_node_ids, GFP_KERNEL);
2683	if (!weights)
2684	return total_allocated;
2685
2686	rcu_read_lock();
2687	state = rcu_dereference(wi_state);
2688	if (state) {
2689	memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
2690	rcu_read_unlock();
2691	} else {
2692	rcu_read_unlock();
2693	for (i = `0`; i < nr_node_ids; i++)
2694	weights[i] = `1`;
2695	}
2696
2697	/ calculate total, detect system default usage /
2698	for_each_node_mask(node, nodes)
2699	weight_total += weights[node];
2700
2701	/*
2702	* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
2703	* Track which node weighted interleave should resume from.
2704	*
2705	* if (rounds > 0) and (delta == 0), resume_node will always be
2706	* the node following prev_node and its weight.
2707	*/
2708	rounds = rem_pages / weight_total;
2709	delta = rem_pages % weight_total;
2710	resume_node = next_node_in(prev_node, nodes);
2711	resume_weight = weights[resume_node];
2712	for (i = `0`; i < nnodes; i++) {
2713	node = next_node_in(prev_node, nodes);
2714	weight = weights[node];
2715	node_pages = weight * rounds;
2716	/ If a delta exists, add this node's portion of the delta /
2717	if (delta > weight) {
2718	node_pages += weight;
2719	delta -= weight;
2720	} else if (delta) {
2721	/ when delta is depleted, resume from that node /
2722	node_pages += delta;
2723	resume_node = node;
2724	resume_weight = weight - delta;
2725	delta = `0`;
2726	}
2727	/ node_pages can be 0 if an allocation fails and rounds == 0 /
2728	if (!node_pages)
2729	break;
2730	nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
2731	page_array);
2732	page_array += nr_allocated;
2733	total_allocated += nr_allocated;
2734	if (total_allocated == nr_pages)
2735	break;
2736	prev_node = node;
2737	}
2738	me->il_prev = resume_node;
2739	me->il_weight = resume_weight;
2740	kfree(objp: weights);
2741	return total_allocated;
2742	}
2743
2744	static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
2745	struct mempolicy pol, unsigned* long nr_pages,
2746	struct page **page_array)
2747	{
2748	gfp_t preferred_gfp;
2749	unsigned long nr_allocated = `0`;
2750
2751	preferred_gfp = gfp \| __GFP_NOWARN;
2752	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM \| __GFP_NOFAIL);
2753
2754	nr_allocated = alloc_pages_bulk_noprof(gfp: preferred_gfp, preferred_nid: nid, nodemask: &pol->nodes,
2755	nr_pages, page_array);
2756
2757	if (nr_allocated < nr_pages)
2758	nr_allocated += alloc_pages_bulk_noprof(gfp, preferred_nid: numa_node_id(), NULL,
2759	nr_pages: nr_pages - nr_allocated,
2760	page_array: page_array + nr_allocated);
2761	return nr_allocated;
2762	}
2763
2764	/ alloc pages bulk and mempolicy should be considered at the*
2765	* same time in some situation such as vmalloc.
2766	*
2767	* It can accelerate memory allocation especially interleaving
2768	* allocate memory.
2769	*/
2770	unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2771	unsigned long nr_pages, struct page **page_array)
2772	{
2773	struct mempolicy *pol = &default_policy;
2774	nodemask_t *nodemask;
2775	int nid;
2776
2777	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2778	pol = get_task_policy(current);
2779
2780	if (pol->mode == MPOL_INTERLEAVE)
2781	return alloc_pages_bulk_interleave(gfp, pol,
2782	nr_pages, page_array);
2783
2784	if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
2785	return alloc_pages_bulk_weighted_interleave(
2786	gfp, pol, nr_pages, page_array);
2787
2788	if (pol->mode == MPOL_PREFERRED_MANY)
2789	return alloc_pages_bulk_preferred_many(gfp,
2790	nid: numa_node_id(), pol, nr_pages, page_array);
2791
2792	nid = numa_node_id();
2793	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, nid: &nid);
2794	return alloc_pages_bulk_noprof(gfp, preferred_nid: nid, nodemask,
2795	nr_pages, page_array);
2796	}
2797
2798	int vma_dup_policy(struct vm_area_struct src, struct* vm_area_struct *dst)
2799	{
2800	struct mempolicy *pol = mpol_dup(pol: src->vm_policy);
2801
2802	if (IS_ERR(ptr: pol))
2803	return PTR_ERR(ptr: pol);
2804	dst->vm_policy = pol;
2805	return `0`;
2806	}
2807
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2818
2819	/ Slow path of a mempolicy duplicate /
2820	struct mempolicy __mpol_dup(struct* mempolicy *old)
2821	{
2822	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2823
2824	if (!new)
2825	return ERR_PTR(error: -ENOMEM);
2826
2827	/ task's mempolicy is protected by alloc_lock /
2828	if (old == current->mempolicy) {
2829	task_lock(current);
2830	new = old;
2831	task_unlock(current);
2832	} else
2833	new = old;
2834
2835	if (current_cpuset_is_being_rebound()) {
2836	nodemask_t mems = cpuset_mems_allowed(current);
2837	mpol_rebind_policy(pol: new, newmask: &mems);
2838	}
2839	atomic_set(v: &new->refcnt, i: `1`);
2840	return new;
2841	}
2842
2843	/ Slow path of a mempolicy comparison /
2844	bool __mpol_equal(struct mempolicy a, struct* mempolicy *b)
2845	{
2846	if (!a \|\| !b)
2847	return false;
2848	if (a->mode != b->mode)
2849	return false;
2850	if (a->flags != b->flags)
2851	return false;
2852	if (a->home_node != b->home_node)
2853	return false;
2854	if (mpol_store_user_nodemask(pol: a))
2855	if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2856	return false;
2857
2858	switch (a->mode) {
2859	case MPOL_BIND:
2860	case MPOL_INTERLEAVE:
2861	case MPOL_PREFERRED:
2862	case MPOL_PREFERRED_MANY:
2863	case MPOL_WEIGHTED_INTERLEAVE:
2864	return !!nodes_equal(a->nodes, b->nodes);
2865	case MPOL_LOCAL:
2866	return true;
2867	default:
2868	BUG();
2869	return false;
2870	}
2871	}
2872
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any access to the tree.
 */
2881
2882	/*
2883	* lookup first element intersecting start-end. Caller holds sp->lock for
2884	* reading or for writing
2885	*/
2886	static struct sp_node sp_lookup(struct* shared_policy *sp,
2887	pgoff_t start, pgoff_t end)
2888	{
2889	struct rb_node *n = sp->root.rb_node;
2890
2891	while (n) {
2892	struct sp_node p = rb_entry(n, struct* sp_node, nd);
2893
2894	if (start >= p->end)
2895	n = n->rb_right;
2896	else if (end <= p->start)
2897	n = n->rb_left;
2898	else
2899	break;
2900	}
2901	if (!n)
2902	return NULL;
2903	for (;;) {
2904	struct sp_node *w = NULL;
2905	struct rb_node *prev = rb_prev(n);
2906	if (!prev)
2907	break;
2908	w = rb_entry(prev, struct sp_node, nd);
2909	if (w->end <= start)
2910	break;
2911	n = prev;
2912	}
2913	return rb_entry(n, struct sp_node, nd);
2914	}
2915
2916	/*
2917	* Insert a new shared policy into the list. Caller holds sp->lock for
2918	* writing.
2919	*/
2920	static void sp_insert(struct shared_policy sp, struct* sp_node *new)
2921	{
2922	struct rb_node **p = &sp->root.rb_node;
2923	struct rb_node *parent = NULL;
2924	struct sp_node *nd;
2925
2926	while (*p) {
2927	parent = *p;
2928	nd = rb_entry(parent, struct sp_node, nd);
2929	if (new->start < nd->start)
2930	p = &(*p)->rb_left;
2931	else if (new->end > nd->end)
2932	p = &(*p)->rb_right;
2933	else
2934	BUG();
2935	}
2936	rb_link_node(node: &new->nd, parent, rb_link: p);
2937	rb_insert_color(&new->nd, &sp->root);
2938	}
2939
2940	/ Find shared policy intersecting idx /
2941	struct mempolicy mpol_shared_policy_lookup(struct* shared_policy *sp,
2942	pgoff_t idx)
2943	{
2944	struct mempolicy *pol = NULL;
2945	struct sp_node *sn;
2946
2947	if (!sp->root.rb_node)
2948	return NULL;
2949	read_lock(&sp->lock);
2950	sn = sp_lookup(sp, start: idx, end: idx+`1`);
2951	if (sn) {
2952	mpol_get(pol: sn->policy);
2953	pol = sn->policy;
2954	}
2955	read_unlock(&sp->lock);
2956	return pol;
2957	}
2958	EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
2959
2960	static void sp_free(struct sp_node *n)
2961	{
2962	mpol_put(pol: n->policy);
2963	kmem_cache_free(s: sn_cache, objp: n);
2964	}
2965
2966	/**
2967	* mpol_misplaced - check whether current folio node is valid in policy
2968	*
2969	* @folio: folio to be checked
2970	* @vmf: structure describing the fault
2971	* @addr: virtual address in @vma for shared policy lookup and interleave policy
2972	*
2973	* Lookup current policy node id for vma,addr and "compare to" folio's
2974	* node id. Policy determination "mimics" alloc_page_vma().
2975	* Called from fault path where we know the vma and faulting address.
2976	*
2977	* Return: NUMA_NO_NODE if the page is in a node that is valid for this
2978	* policy, or a suitable node ID to allocate a replacement folio from.
2979	*/
2980	int mpol_misplaced(struct folio folio, struct* vm_fault *vmf,
2981	unsigned long addr)
2982	{
2983	struct mempolicy *pol;
2984	pgoff_t ilx;
2985	struct zoneref *z;
2986	int curnid = folio_nid(folio);
2987	struct vm_area_struct *vma = vmf->vma;
2988	int thiscpu = raw_smp_processor_id();
2989	int thisnid = numa_node_id();
2990	int polnid = NUMA_NO_NODE;
2991	int ret = NUMA_NO_NODE;
2992
2993	/*
2994	* Make sure ptl is held so that we don't preempt and we
2995	* have a stable smp processor id
2996	*/
2997	lockdep_assert_held(vmf->ptl);
2998	pol = get_vma_policy(vma, addr, order: folio_order(folio), ilx: &ilx);
2999	if (!(pol->flags & MPOL_F_MOF))
3000	goto out;
3001
3002	switch (pol->mode) {
3003	case MPOL_INTERLEAVE:
3004	polnid = interleave_nid(pol, ilx);
3005	break;
3006
3007	case MPOL_WEIGHTED_INTERLEAVE:
3008	polnid = weighted_interleave_nid(pol, ilx);
3009	break;
3010
3011	case MPOL_PREFERRED:
3012	if (node_isset(curnid, pol->nodes))
3013	goto out;
3014	polnid = first_node(pol->nodes);
3015	break;
3016
3017	case MPOL_LOCAL:
3018	polnid = numa_node_id();
3019	break;
3020
3021	case MPOL_BIND:
3022	case MPOL_PREFERRED_MANY:
3023	/*
3024	* Even though MPOL_PREFERRED_MANY can allocate pages outside
3025	* policy nodemask we don't allow numa migration to nodes
3026	* outside policy nodemask for now. This is done so that if we
3027	* want demotion to slow memory to happen, before allocating
3028	* from some DRAM node say 'x', we will end up using a
3029	* MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
3030	* we should not promote to node 'x' from slow memory node.
3031	*/
3032	if (pol->flags & MPOL_F_MORON) {
3033	/*
3034	* Optimize placement among multiple nodes
3035	* via NUMA balancing
3036	*/
3037	if (node_isset(thisnid, pol->nodes))
3038	break;
3039	goto out;
3040	}
3041
3042	/*
3043	* use current page if in policy nodemask,
3044	* else select nearest allowed node, if any.
3045	* If no allowed nodes, use current [!misplaced].
3046	*/
3047	if (node_isset(curnid, pol->nodes))
3048	goto out;
3049	z = first_zones_zonelist(
3050	zonelist: node_zonelist(nid: thisnid, GFP_HIGHUSER),
3051	highest_zoneidx: gfp_zone(GFP_HIGHUSER),
3052	nodes: &pol->nodes);
3053	polnid = zonelist_node_idx(zoneref: z);
3054	break;
3055
3056	default:
3057	BUG();
3058	}
3059
3060	/ Migrate the folio towards the node whose CPU is referencing it /
3061	if (pol->flags & MPOL_F_MORON) {
3062	polnid = thisnid;
3063
3064	if (!should_numa_migrate_memory(current, folio, src_nid: curnid,
3065	dst_cpu: thiscpu))
3066	goto out;
3067	}
3068
3069	if (curnid != polnid)
3070	ret = polnid;
3071	out:
3072	mpol_cond_put(pol);
3073
3074	return ret;
3075	}
3076
3077	/*
3078	* Drop the (possibly final) reference to task->mempolicy. It needs to be
3079	* dropped after task->mempolicy is set to NULL so that any allocation done as
3080	* part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3081	* policy.
3082	*/
3083	void mpol_put_task_policy(struct task_struct *task)
3084	{
3085	struct mempolicy *pol;
3086
3087	task_lock(p: task);
3088	pol = task->mempolicy;
3089	task->mempolicy = NULL;
3090	task_unlock(p: task);
3091	mpol_put(pol);
3092	}
3093
3094	static void sp_delete(struct shared_policy sp, struct* sp_node *n)
3095	{
3096	rb_erase(&n->nd, &sp->root);
3097	sp_free(n);
3098	}
3099
3100	static void sp_node_init(struct sp_node node, unsigned* long start,
3101	unsigned long end, struct mempolicy *pol)
3102	{
3103	node->start = start;
3104	node->end = end;
3105	node->policy = pol;
3106	}
3107
3108	static struct sp_node sp_alloc(unsigned* long start, unsigned long end,
3109	struct mempolicy *pol)
3110	{
3111	struct sp_node *n;
3112	struct mempolicy *newpol;
3113
3114	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3115	if (!n)
3116	return NULL;
3117
3118	newpol = mpol_dup(pol);
3119	if (IS_ERR(ptr: newpol)) {
3120	kmem_cache_free(s: sn_cache, objp: n);
3121	return NULL;
3122	}
3123	newpol->flags \|= MPOL_F_SHARED;
3124	sp_node_init(node: n, start, end, pol: newpol);
3125
3126	return n;
3127	}
3128
3129	/ Replace a policy range. /
3130	static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
3131	pgoff_t end, struct sp_node *new)
3132	{
3133	struct sp_node *n;
3134	struct sp_node *n_new = NULL;
3135	struct mempolicy *mpol_new = NULL;
3136	int ret = `0`;
3137
3138	restart:
3139	write_lock(&sp->lock);
3140	n = sp_lookup(sp, start, end);
3141	/ Take care of old policies in the same range. /
3142	while (n && n->start < end) {
3143	struct rb_node *next = rb_next(&n->nd);
3144	if (n->start >= start) {
3145	if (n->end <= end)
3146	sp_delete(sp, n);
3147	else
3148	n->start = end;
3149	} else {
3150	/ Old policy spanning whole new range. /
3151	if (n->end > end) {
3152	if (!n_new)
3153	goto alloc_new;
3154
3155	mpol_new = n->policy;
3156	atomic_set(v: &mpol_new->refcnt, i: `1`);
3157	sp_node_init(node: n_new, start: end, end: n->end, pol: mpol_new);
3158	n->end = start;
3159	sp_insert(sp, new: n_new);
3160	n_new = NULL;
3161	mpol_new = NULL;
3162	break;
3163	} else
3164	n->end = start;
3165	}
3166	if (!next)
3167	break;
3168	n = rb_entry(next, struct sp_node, nd);
3169	}
3170	if (new)
3171	sp_insert(sp, new);
3172	write_unlock(&sp->lock);
3173	ret = `0`;
3174
3175	err_out:
3176	if (mpol_new)
3177	mpol_put(pol: mpol_new);
3178	if (n_new)
3179	kmem_cache_free(s: sn_cache, objp: n_new);
3180
3181	return ret;
3182
3183	alloc_new:
3184	write_unlock(&sp->lock);
3185	ret = -ENOMEM;
3186	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
3187	if (!n_new)
3188	goto err_out;
3189	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3190	if (!mpol_new)
3191	goto err_out;
3192	atomic_set(v: &mpol_new->refcnt, i: `1`);
3193	goto restart;
3194	}
3195
3196	/**
3197	* mpol_shared_policy_init - initialize shared policy for inode
3198	* @sp: pointer to inode shared policy
3199	* @mpol: struct mempolicy to install
3200	*
3201	* Install non-NULL @mpol in inode's shared policy rb-tree.
3202	* On entry, the current task has a reference on a non-NULL @mpol.
3203	* This must be released on exit.
3204	* This is called at get_inode() calls and we can use GFP_KERNEL.
3205	*/
3206	void mpol_shared_policy_init(struct shared_policy sp, struct* mempolicy *mpol)
3207	{
3208	int ret;
3209
3210	sp->root = RB_ROOT; / empty tree == default mempolicy /
3211	rwlock_init(&sp->lock);
3212
3213	if (mpol) {
3214	struct sp_node *sn;
3215	struct mempolicy *npol;
3216	NODEMASK_SCRATCH(scratch);
3217
3218	if (!scratch)
3219	goto put_mpol;
3220
3221	/ contextualize the tmpfs mount point mempolicy to this file /
3222	npol = mpol_new(mode: mpol->mode, flags: mpol->flags, nodes: &mpol->w.user_nodemask);
3223	if (IS_ERR(ptr: npol))
3224	goto free_scratch; / no valid nodemask intersection /
3225
3226	task_lock(current);
3227	ret = mpol_set_nodemask(pol: npol, nodes: &mpol->w.user_nodemask, nsc: scratch);
3228	task_unlock(current);
3229	if (ret)
3230	goto put_npol;
3231
3232	/ alloc node covering entire file; adds ref to file's npol /
3233	sn = sp_alloc(start: `0`, MAX_LFS_FILESIZE >> PAGE_SHIFT, pol: npol);
3234	if (sn)
3235	sp_insert(sp, new: sn);
3236	put_npol:
3237	mpol_put(pol: npol); / drop initial ref on file's npol /
3238	free_scratch:
3239	NODEMASK_SCRATCH_FREE(scratch);
3240	put_mpol:
3241	mpol_put(pol: mpol); / drop our incoming ref on sb mpol /
3242	}
3243	}
3244	EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
3245
3246	int mpol_set_shared_policy(struct shared_policy *sp,
3247	struct vm_area_struct vma, struct* mempolicy *pol)
3248	{
3249	int err;
3250	struct sp_node *new = NULL;
3251	unsigned long sz = vma_pages(vma);
3252
3253	if (pol) {
3254	new = sp_alloc(start: vma->vm_pgoff, end: vma->vm_pgoff + sz, pol);
3255	if (!new)
3256	return -ENOMEM;
3257	}
3258	err = shared_policy_replace(sp, start: vma->vm_pgoff, end: vma->vm_pgoff + sz, new);
3259	if (err && new)
3260	sp_free(n: new);
3261	return err;
3262	}
3263	EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
3264
3265	/ Free a backing policy store on inode delete. /
3266	void mpol_free_shared_policy(struct shared_policy *sp)
3267	{
3268	struct sp_node *n;
3269	struct rb_node *next;
3270
3271	if (!sp->root.rb_node)
3272	return;
3273	write_lock(&sp->lock);
3274	next = rb_first(root: &sp->root);
3275	while (next) {
3276	n = rb_entry(next, struct sp_node, nd);
3277	next = rb_next(&n->nd);
3278	sp_delete(sp, n);
3279	}
3280	write_unlock(&sp->lock);
3281	}
3282	EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
3283
3284	#ifdef CONFIG_NUMA_BALANCING
3285	static int __initdata numabalancing_override;
3286
3287	static void __init check_numabalancing_enable(void)
3288	{
3289	bool numabalancing_default = false;
3290
3291	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
3292	numabalancing_default = true;
3293
3294	/ Parsed by setup_numabalancing. override == 1 enables, -1 disables /
3295	if (numabalancing_override)
3296	set_numabalancing_state(numabalancing_override == `1`);
3297
3298	if (num_online_nodes() > `1` && !numabalancing_override) {
3299	pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3300	numabalancing_default ? "Enabling" : "Disabling");
3301	set_numabalancing_state(numabalancing_default);
3302	}
3303	}
3304
3305	static int __init setup_numabalancing(char *str)
3306	{
3307	int ret = `0`;
3308	if (!str)
3309	goto out;
3310
3311	if (!strcmp(str, "enable")) {
3312	numabalancing_override = `1`;
3313	ret = `1`;
3314	} else if (!strcmp(str, "disable")) {
3315	numabalancing_override = -`1`;
3316	ret = `1`;
3317	}
3318	out:
3319	if (!ret)
3320	pr_warn("Unable to parse numa_balancing=\n");
3321
3322	return ret;
3323	}
3324	__setup("numa_balancing=", setup_numabalancing);
3325	#else
3326	static inline void __init check_numabalancing_enable(void)
3327	{
3328	}
3329	#endif /* CONFIG_NUMA_BALANCING */
3330
3331	void __init numa_policy_init(void)
3332	{
3333	nodemask_t interleave_nodes;
3334	unsigned long largest = `0`;
3335	int nid, prefer = `0`;
3336
3337	policy_cache = kmem_cache_create("numa_policy",
3338	sizeof(struct mempolicy),
3339	`0`, SLAB_PANIC, NULL);
3340
3341	sn_cache = kmem_cache_create("shared_policy_node",
3342	sizeof(struct sp_node),
3343	`0`, SLAB_PANIC, NULL);
3344
3345	for_each_node(nid) {
3346	preferred_node_policy[nid] = (struct mempolicy) {
3347	.refcnt = ATOMIC_INIT(`1`),
3348	.mode = MPOL_PREFERRED,
3349	.flags = MPOL_F_MOF \| MPOL_F_MORON,
3350	.nodes = nodemask_of_node(nid),
3351	};
3352	}
3353
3354	/*
3355	* Set interleaving policy for system init. Interleaving is only
3356	* enabled across suitably sized nodes (default is >= 16MB), or
3357	* fall back to the largest node if they're all smaller.
3358	*/
3359	nodes_clear(interleave_nodes);
3360	for_each_node_state(nid, N_MEMORY) {
3361	unsigned long total_pages = node_present_pages(nid);
3362
3363	/ Preserve the largest node /
3364	if (largest < total_pages) {
3365	largest = total_pages;
3366	prefer = nid;
3367	}
3368
3369	/ Interleave this node? /
3370	if ((total_pages << PAGE_SHIFT) >= (`16` << `20`))
3371	node_set(nid, interleave_nodes);
3372	}
3373
3374	/ All too small, use the largest /
3375	if (unlikely(nodes_empty(interleave_nodes)))
3376	node_set(prefer, interleave_nodes);
3377
3378	if (do_set_mempolicy(mode: MPOL_INTERLEAVE, flags: `0`, nodes: &interleave_nodes))
3379	pr_err("%s: interleaving failed\n", __func__);
3380
3381	check_numabalancing_enable();
3382	}
3383
3384	/ Reset policy of current process to default /
3385	void numa_default_policy(void)
3386	{
3387	do_set_mempolicy(mode: MPOL_DEFAULT, flags: `0`, NULL);
3388	}
3389
3390	/*
3391	* Parse and format mempolicy from/to strings
3392	*/
3393	static const char * const policy_modes[] =
3394	{
3395	[MPOL_DEFAULT] = "default",
3396	[MPOL_PREFERRED] = "prefer",
3397	[MPOL_BIND] = "bind",
3398	[MPOL_INTERLEAVE] = "interleave",
3399	[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3400	[MPOL_LOCAL] = "local",
3401	[MPOL_PREFERRED_MANY] = "prefer (many)",
3402	};
3403
3404	#ifdef CONFIG_TMPFS
3405	/**
3406	* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3407	* @str: string containing mempolicy to parse
3408	* @mpol: pointer to struct mempolicy pointer, returned on success.
3409	*
3410	* Format of input:
3411	* <mode>[=<flags>][:<nodelist>]
3412	*
3413	* Return: %0 on success, else %1
3414	*/
3415	int mpol_parse_str(char str, struct* mempolicy **mpol)
3416	{
3417	struct mempolicy *new = NULL;
3418	unsigned short mode_flags;
3419	nodemask_t nodes;
3420	char *nodelist = strchr(str, `':'`);
3421	char *flags = strchr(str, `'='`);
3422	int err = `1`, mode;
3423
3424	if (flags)
3425	flags++ = `'\0'`; /* terminate mode string /
3426
3427	if (nodelist) {
3428	/ NUL-terminate mode or flags string /
3429	*nodelist++ = `'\0'`;
3430	if (nodelist_parse(nodelist, nodes))
3431	goto out;
3432	if (!nodes_subset(nodes, node_states[N_MEMORY]))
3433	goto out;
3434	} else
3435	nodes_clear(nodes);
3436
3437	mode = match_string(array: policy_modes, n: MPOL_MAX, string: str);
3438	if (mode < `0`)
3439	goto out;
3440
3441	switch (mode) {
3442	case MPOL_PREFERRED:
3443	/*
3444	* Insist on a nodelist of one node only, although later
3445	* we use first_node(nodes) to grab a single node, so here
3446	* nodelist (or nodes) cannot be empty.
3447	*/
3448	if (nodelist) {
3449	char *rest = nodelist;
3450	while (isdigit(c: *rest))
3451	rest++;
3452	if (*rest)
3453	goto out;
3454	if (nodes_empty(nodes))
3455	goto out;
3456	}
3457	break;
3458	case MPOL_INTERLEAVE:
3459	case MPOL_WEIGHTED_INTERLEAVE:
3460	/*
3461	* Default to online nodes with memory if no nodelist
3462	*/
3463	if (!nodelist)
3464	nodes = node_states[N_MEMORY];
3465	break;
3466	case MPOL_LOCAL:
3467	/*
3468	* Don't allow a nodelist; mpol_new() checks flags
3469	*/
3470	if (nodelist)
3471	goto out;
3472	break;
3473	case MPOL_DEFAULT:
3474	/*
3475	* Insist on a empty nodelist
3476	*/
3477	if (!nodelist)
3478	err = `0`;
3479	goto out;
3480	case MPOL_PREFERRED_MANY:
3481	case MPOL_BIND:
3482	/*
3483	* Insist on a nodelist
3484	*/
3485	if (!nodelist)
3486	goto out;
3487	}
3488
3489	mode_flags = `0`;
3490	if (flags) {
3491	/*
3492	* Currently, we only support two mutually exclusive
3493	* mode flags.
3494	*/
3495	if (!strcmp(flags, "static"))
3496	mode_flags \|= MPOL_F_STATIC_NODES;
3497	else if (!strcmp(flags, "relative"))
3498	mode_flags \|= MPOL_F_RELATIVE_NODES;
3499	else
3500	goto out;
3501	}
3502
3503	new = mpol_new(mode, flags: mode_flags, nodes: &nodes);
3504	if (IS_ERR(ptr: new))
3505	goto out;
3506
3507	/*
3508	* Save nodes for mpol_to_str() to show the tmpfs mount options
3509	* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3510	*/
3511	if (mode != MPOL_PREFERRED) {
3512	new->nodes = nodes;
3513	} else if (nodelist) {
3514	nodes_clear(new->nodes);
3515	node_set(first_node(nodes), new->nodes);
3516	} else {
3517	new->mode = MPOL_LOCAL;
3518	}
3519
3520	/*
3521	* Save nodes for contextualization: this will be used to "clone"
3522	* the mempolicy in a specific context [cpuset] at a later time.
3523	*/
3524	new->w.user_nodemask = nodes;
3525
3526	err = `0`;
3527
3528	out:
3529	/ Restore string for error message /
3530	if (nodelist)
3531	*--nodelist = `':'`;
3532	if (flags)
3533	*--flags = `'='`;
3534	if (!err)
3535	*mpol = new;
3536	return err;
3537	}
3538	#endif /* CONFIG_TMPFS */
3539
3540	/**
3541	* mpol_to_str - format a mempolicy structure for printing
3542	* @buffer: to contain formatted mempolicy string
3543	* @maxlen: length of @buffer
3544	* @pol: pointer to mempolicy to be formatted
3545	*
3546	* Convert @pol into a string. If @buffer is too short, truncate the string.
3547	* Recommend a @maxlen of at least 51 for the longest mode, "weighted
3548	* interleave", plus the longest flag flags, "relative\|balancing", and to
3549	* display at least a few node ids.
3550	*/
3551	void mpol_to_str(char buffer, int* maxlen, struct mempolicy *pol)
3552	{
3553	char *p = buffer;
3554	nodemask_t nodes = NODE_MASK_NONE;
3555	unsigned short mode = MPOL_DEFAULT;
3556	unsigned short flags = `0`;
3557
3558	if (pol &&
3559	pol != &default_policy &&
3560	!(pol >= &preferred_node_policy[`0`] &&
3561	pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - `1`])) {
3562	mode = pol->mode;
3563	flags = pol->flags;
3564	}
3565
3566	switch (mode) {
3567	case MPOL_DEFAULT:
3568	case MPOL_LOCAL:
3569	break;
3570	case MPOL_PREFERRED:
3571	case MPOL_PREFERRED_MANY:
3572	case MPOL_BIND:
3573	case MPOL_INTERLEAVE:
3574	case MPOL_WEIGHTED_INTERLEAVE:
3575	nodes = pol->nodes;
3576	break;
3577	default:
3578	WARN_ON_ONCE(`1`);
3579	snprintf(buf: p, size: maxlen, fmt: "unknown");
3580	return;
3581	}
3582
3583	p += snprintf(buf: p, size: maxlen, fmt: "%s", policy_modes[mode]);
3584
3585	if (flags & MPOL_MODE_FLAGS) {
3586	p += snprintf(buf: p, size: buffer + maxlen - p, fmt: "=");
3587
3588	/*
3589	* Static and relative are mutually exclusive.
3590	*/
3591	if (flags & MPOL_F_STATIC_NODES)
3592	p += snprintf(buf: p, size: buffer + maxlen - p, fmt: "static");
3593	else if (flags & MPOL_F_RELATIVE_NODES)
3594	p += snprintf(buf: p, size: buffer + maxlen - p, fmt: "relative");
3595
3596	if (flags & MPOL_F_NUMA_BALANCING) {
3597	if (!is_power_of_2(n: flags & MPOL_MODE_FLAGS))
3598	p += snprintf(buf: p, size: buffer + maxlen - p, fmt: "\|");
3599	p += snprintf(buf: p, size: buffer + maxlen - p, fmt: "balancing");
3600	}
3601	}
3602
3603	if (!nodes_empty(nodes))
3604	p += scnprintf(buf: p, size: buffer + maxlen - p, fmt: ":%*pbl",
3605	nodemask_pr_args(&nodes));
3606	}
3607
3608	#ifdef CONFIG_SYSFS
3609	struct iw_node_attr {
3610	struct kobj_attribute kobj_attr;
3611	int nid;
3612	};
3613
3614	struct sysfs_wi_group {
3615	struct kobject wi_kobj;
3616	struct mutex kobj_lock;
3617	struct iw_node_attr *nattrs[];
3618	};
3619
3620	static struct sysfs_wi_group *wi_group;
3621
3622	static ssize_t node_show(struct kobject kobj, struct* kobj_attribute *attr,
3623	char *buf)
3624	{
3625	struct iw_node_attr *node_attr;
3626	u8 weight;
3627
3628	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3629	weight = get_il_weight(node: node_attr->nid);
3630	return sysfs_emit(buf, fmt: "%d\n", weight);
3631	}
3632
3633	static ssize_t node_store(struct kobject kobj, struct* kobj_attribute *attr,
3634	const char *buf, size_t count)
3635	{
3636	struct weighted_interleave_state new_wi_state, old_wi_state = NULL;
3637	struct iw_node_attr *node_attr;
3638	u8 weight = `0`;
3639	int i;
3640
3641	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3642	if (count == `0` \|\| sysfs_streq(s1: buf, s2: "") \|\|
3643	kstrtou8(s: buf, base: `0`, res: &weight) \|\| weight == `0`)
3644	return -EINVAL;
3645
3646	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3647	GFP_KERNEL);
3648	if (!new_wi_state)
3649	return -ENOMEM;
3650
3651	mutex_lock(&wi_state_lock);
3652	old_wi_state = rcu_dereference_protected(wi_state,
3653	lockdep_is_held(&wi_state_lock));
3654	if (old_wi_state) {
3655	memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3656	nr_node_ids * sizeof(u8));
3657	} else {
3658	for (i = `0`; i < nr_node_ids; i++)
3659	new_wi_state->iw_table[i] = `1`;
3660	}
3661	new_wi_state->iw_table[node_attr->nid] = weight;
3662	new_wi_state->mode_auto = false;
3663
3664	rcu_assign_pointer(wi_state, new_wi_state);
3665	mutex_unlock(lock: &wi_state_lock);
3666	if (old_wi_state) {
3667	synchronize_rcu();
3668	kfree(objp: old_wi_state);
3669	}
3670	return count;
3671	}
3672
3673	static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3674	struct kobj_attribute attr, char* *buf)
3675	{
3676	struct weighted_interleave_state *state;
3677	bool wi_auto = true;
3678
3679	rcu_read_lock();
3680	state = rcu_dereference(wi_state);
3681	if (state)
3682	wi_auto = state->mode_auto;
3683	rcu_read_unlock();
3684
3685	return sysfs_emit(buf, fmt: "%s\n", str_true_false(v: wi_auto));
3686	}
3687
3688	static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3689	struct kobj_attribute attr, const* char *buf, size_t count)
3690	{
3691	struct weighted_interleave_state new_wi_state, old_wi_state = NULL;
3692	unsigned int *bw;
3693	bool input;
3694	int i;
3695
3696	if (kstrtobool(s: buf, res: &input))
3697	return -EINVAL;
3698
3699	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
3700	GFP_KERNEL);
3701	if (!new_wi_state)
3702	return -ENOMEM;
3703	for (i = `0`; i < nr_node_ids; i++)
3704	new_wi_state->iw_table[i] = `1`;
3705
3706	mutex_lock(&wi_state_lock);
3707	if (!input) {
3708	old_wi_state = rcu_dereference_protected(wi_state,
3709	lockdep_is_held(&wi_state_lock));
3710	if (!old_wi_state)
3711	goto update_wi_state;
3712	if (input == old_wi_state->mode_auto) {
3713	mutex_unlock(lock: &wi_state_lock);
3714	return count;
3715	}
3716
3717	memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3718	nr_node_ids * sizeof(u8));
3719	goto update_wi_state;
3720	}
3721
3722	bw = node_bw_table;
3723	if (!bw) {
3724	mutex_unlock(lock: &wi_state_lock);
3725	kfree(objp: new_wi_state);
3726	return -ENODEV;
3727	}
3728
3729	new_wi_state->mode_auto = true;
3730	reduce_interleave_weights(bw, new_iw: new_wi_state->iw_table);
3731
3732	update_wi_state:
3733	rcu_assign_pointer(wi_state, new_wi_state);
3734	mutex_unlock(lock: &wi_state_lock);
3735	if (old_wi_state) {
3736	synchronize_rcu();
3737	kfree(objp: old_wi_state);
3738	}
3739	return count;
3740	}
3741
3742	static void sysfs_wi_node_delete(int nid)
3743	{
3744	struct iw_node_attr *attr;
3745
3746	if (nid < `0` \|\| nid >= nr_node_ids)
3747	return;
3748
3749	mutex_lock(&wi_group->kobj_lock);
3750	attr = wi_group->nattrs[nid];
3751	if (!attr) {
3752	mutex_unlock(lock: &wi_group->kobj_lock);
3753	return;
3754	}
3755
3756	wi_group->nattrs[nid] = NULL;
3757	mutex_unlock(lock: &wi_group->kobj_lock);
3758
3759	sysfs_remove_file(kobj: &wi_group->wi_kobj, attr: &attr->kobj_attr.attr);
3760	kfree(objp: attr->kobj_attr.attr.name);
3761	kfree(objp: attr);
3762	}
3763
3764	static void sysfs_wi_node_delete_all(void)
3765	{
3766	int nid;
3767
3768	for (nid = `0`; nid < nr_node_ids; nid++)
3769	sysfs_wi_node_delete(nid);
3770	}
3771
3772	static void wi_state_free(void)
3773	{
3774	struct weighted_interleave_state *old_wi_state;
3775
3776	mutex_lock(&wi_state_lock);
3777	old_wi_state = rcu_dereference_protected(wi_state,
3778	lockdep_is_held(&wi_state_lock));
3779	rcu_assign_pointer(wi_state, NULL);
3780	mutex_unlock(lock: &wi_state_lock);
3781
3782	if (old_wi_state) {
3783	synchronize_rcu();
3784	kfree(objp: old_wi_state);
3785	}
3786	}
3787
3788	static struct kobj_attribute wi_auto_attr =
3789	__ATTR(auto, `0664`, weighted_interleave_auto_show,
3790	weighted_interleave_auto_store);
3791
3792	static void wi_cleanup(void) {
3793	sysfs_remove_file(kobj: &wi_group->wi_kobj, attr: &wi_auto_attr.attr);
3794	sysfs_wi_node_delete_all();
3795	wi_state_free();
3796	}
3797
3798	static void wi_kobj_release(struct kobject *wi_kobj)
3799	{
3800	kfree(objp: wi_group);
3801	}
3802
3803	static const struct kobj_type wi_ktype = {
3804	.sysfs_ops = &kobj_sysfs_ops,
3805	.release = wi_kobj_release,
3806	};
3807
3808	static int sysfs_wi_node_add(int nid)
3809	{
3810	int ret;
3811	char *name;
3812	struct iw_node_attr *new_attr;
3813
3814	if (nid < `0` \|\| nid >= nr_node_ids) {
3815	pr_err("invalid node id: %d\n", nid);
3816	return -EINVAL;
3817	}
3818
3819	new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
3820	if (!new_attr)
3821	return -ENOMEM;
3822
3823	name = kasprintf(GFP_KERNEL, fmt: "node%d", nid);
3824	if (!name) {
3825	kfree(objp: new_attr);
3826	return -ENOMEM;
3827	}
3828
3829	sysfs_attr_init(&new_attr->kobj_attr.attr);
3830	new_attr->kobj_attr.attr.name = name;
3831	new_attr->kobj_attr.attr.mode = `0644`;
3832	new_attr->kobj_attr.show = node_show;
3833	new_attr->kobj_attr.store = node_store;
3834	new_attr->nid = nid;
3835
3836	mutex_lock(&wi_group->kobj_lock);
3837	if (wi_group->nattrs[nid]) {
3838	mutex_unlock(lock: &wi_group->kobj_lock);
3839	ret = -EEXIST;
3840	goto out;
3841	}
3842
3843	ret = sysfs_create_file(kobj: &wi_group->wi_kobj, attr: &new_attr->kobj_attr.attr);
3844	if (ret) {
3845	mutex_unlock(lock: &wi_group->kobj_lock);
3846	goto out;
3847	}
3848	wi_group->nattrs[nid] = new_attr;
3849	mutex_unlock(lock: &wi_group->kobj_lock);
3850	return `0`;
3851
3852	out:
3853	kfree(objp: new_attr->kobj_attr.attr.name);
3854	kfree(objp: new_attr);
3855	return ret;
3856	}
3857
3858	static int wi_node_notifier(struct notifier_block *nb,
3859	unsigned long action, void *data)
3860	{
3861	int err;
3862	struct node_notify *nn = data;
3863	int nid = nn->nid;
3864
3865	switch (action) {
3866	case NODE_ADDED_FIRST_MEMORY:
3867	err = sysfs_wi_node_add(nid);
3868	if (err)
3869	pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3870	nid, err);
3871	break;
3872	case NODE_REMOVED_LAST_MEMORY:
3873	sysfs_wi_node_delete(nid);
3874	break;
3875	}
3876
3877	return NOTIFY_OK;
3878	}
3879
3880	static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
3881	{
3882	int nid, err;
3883
3884	wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
3885	GFP_KERNEL);
3886	if (!wi_group)
3887	return -ENOMEM;
3888	mutex_init(&wi_group->kobj_lock);
3889
3890	err = kobject_init_and_add(kobj: &wi_group->wi_kobj, ktype: &wi_ktype, parent: mempolicy_kobj,
3891	fmt: "weighted_interleave");
3892	if (err)
3893	goto err_put_kobj;
3894
3895	err = sysfs_create_file(kobj: &wi_group->wi_kobj, attr: &wi_auto_attr.attr);
3896	if (err)
3897	goto err_put_kobj;
3898
3899	for_each_online_node(nid) {
3900	if (!node_state(node: nid, state: N_MEMORY))
3901	continue;
3902
3903	err = sysfs_wi_node_add(nid);
3904	if (err) {
3905	pr_err("failed to add sysfs for node%d during init: %d\n",
3906	nid, err);
3907	goto err_cleanup_kobj;
3908	}
3909	}
3910
3911	hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
3912	return `0`;
3913
3914	err_cleanup_kobj:
3915	wi_cleanup();
3916	kobject_del(kobj: &wi_group->wi_kobj);
3917	err_put_kobj:
3918	kobject_put(kobj: &wi_group->wi_kobj);
3919	return err;
3920	}
3921
3922	static int __init mempolicy_sysfs_init(void)
3923	{
3924	int err;
3925	static struct kobject *mempolicy_kobj;
3926
3927	mempolicy_kobj = kobject_create_and_add(name: "mempolicy", parent: mm_kobj);
3928	if (!mempolicy_kobj)
3929	return -ENOMEM;
3930
3931	err = add_weighted_interleave_group(mempolicy_kobj);
3932	if (err)
3933	goto err_kobj;
3934
3935	return `0`;
3936
3937	err_kobj:
3938	kobject_del(kobj: mempolicy_kobj);
3939	kobject_put(kobj: mempolicy_kobj);
3940	return err;
3941	}
3942
3943	late_initcall(mempolicy_sysfs_init);
3944	#endif /* CONFIG_SYSFS */
3945

source code of linux/mm/mempolicy.c