// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;
nodemask_t numa_nodes_parsed __initdata;

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

static int numa_distance_cnt;
static u8 *numa_distance;

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);
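
/*
 * Illustrative examples of the "numa=" boot options handled above (a
 * sketch, not an exhaustive reference -- the exact "fake=" argument
 * syntax is interpreted by numa_emu_cmdline()):
 *
 *	numa=off	- disable NUMA handling, fall back to a single node
 *	numa=noacpi	- do not parse the ACPI SRAT table
 *	numa=nohmat	- do not parse the ACPI HMAT table
 *	numa=fake=4	- split memory into 4 emulated nodes (CONFIG_NUMA_EMU)
 */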

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
126 | |
127 | static int __init numa_add_memblk_to(int nid, u64 start, u64 end, |
128 | struct numa_meminfo *mi) |
129 | { |
130 | /* ignore zero length blks */ |
131 | if (start == end) |
132 | return 0; |
133 | |
134 | /* whine about and ignore invalid blks */ |
135 | if (start > end || nid < 0 || nid >= MAX_NUMNODES) { |
136 | pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n" , |
137 | nid, start, end - 1); |
138 | return 0; |
139 | } |
140 | |
141 | if (mi->nr_blks >= NR_NODE_MEMBLKS) { |
142 | pr_err("too many memblk ranges\n" ); |
143 | return -EINVAL; |
144 | } |
145 | |
146 | mi->blk[mi->nr_blks].start = start; |
147 | mi->blk[mi->nr_blks].end = end; |
148 | mi->blk[mi->nr_blks].nid = nid; |
149 | mi->nr_blks++; |
150 | return 0; |
151 | } |
152 | |
153 | /** |
154 | * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo |
155 | * @idx: Index of memblk to remove |
156 | * @mi: numa_meminfo to remove memblk from |
157 | * |
158 | * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and |
159 | * decrementing @mi->nr_blks. |
160 | */ |
161 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) |
162 | { |
163 | mi->nr_blks--; |
164 | memmove(&mi->blk[idx], &mi->blk[idx + 1], |
165 | (mi->nr_blks - idx) * sizeof(mi->blk[0])); |
166 | } |
167 | |
168 | /** |
169 | * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another |
170 | * @dst: numa_meminfo to append block to |
171 | * @idx: Index of memblk to remove |
172 | * @src: numa_meminfo to remove memblk from |
173 | */ |
174 | static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, |
175 | struct numa_meminfo *src) |
176 | { |
177 | dst->blk[dst->nr_blks++] = src->blk[idx]; |
178 | numa_remove_memblk_from(idx, mi: src); |
179 | } |
180 | |
181 | /** |
182 | * numa_add_memblk - Add one numa_memblk to numa_meminfo |
183 | * @nid: NUMA node ID of the new memblk |
184 | * @start: Start address of the new memblk |
185 | * @end: End address of the new memblk |
186 | * |
187 | * Add a new memblk to the default numa_meminfo. |
188 | * |
189 | * RETURNS: |
190 | * 0 on success, -errno on failure. |
191 | */ |
192 | int __init numa_add_memblk(int nid, u64 start, u64 end) |
193 | { |
194 | return numa_add_memblk_to(nid, start, end, mi: &numa_meminfo); |
195 | } |
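
/*
 * Illustrative sketch (not taken from any particular firmware parser):
 * a NUMA init callback such as the SRAT or devicetree code registers the
 * ranges it discovers roughly like this, with exclusive end addresses:
 *
 *	node_set(0, numa_nodes_parsed);
 *	numa_add_memblk(0, 0x000000000ULL, 0x100000000ULL);	// node 0: 0-4 GiB
 *	node_set(1, numa_nodes_parsed);
 *	numa_add_memblk(1, 0x100000000ULL, 0x200000000ULL);	// node 1: 4-8 GiB
 *
 * Zero-length blocks are silently ignored and invalid node IDs are only
 * warned about, as implemented in numa_add_memblk_to() above.
 */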

/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;
	int tnid;

	/*
	 * Allocate node data. Try node-local memory and then any node.
	 * Never allocate in DMA zone.
	 */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	/* report and initialize */
	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
	       nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = PFN_PHYS(max_pfn);
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
			       bi->nid, bi->start, bi->end - 1, bj->start,
			       bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}
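
/*
 * Worked example of the merge step above (illustrative addresses only):
 * two node-0 blocks [0x0-0x80000000) and [0x80000000-0x100000000) are
 * joined into one block [0x0-0x100000000), provided the combined span
 * does not overlap a block belonging to a different node; otherwise the
 * blocks are left as they are.
 */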

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
					 PFN_PHYS(max_pfn_mapped));
	if (!phys) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If distance table
 * doesn't exist, one which is large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}
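
/*
 * Illustrative sketch of how a firmware distance parser (e.g. the ACPI
 * SLIT code) fills the table: it calls numa_set_distance() once per
 * (from, to) pair, with LOCAL_DISTANCE (10) on the diagonal and larger
 * values for remote nodes, for instance:
 *
 *	numa_set_distance(0, 0, LOCAL_DISTANCE);
 *	numa_set_distance(0, 1, 21);
 *	numa_set_distance(1, 0, 21);
 *	numa_set_distance(1, 1, LOCAL_DISTANCE);
 *
 * Out-of-range node IDs and nonsensical distances are dropped with a
 * one-time warning, as checked above.
 */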

int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common). Make sure the nodes cover all memory.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
	u64 numaram, e820ram;
	int i;

	numaram = 0;
	for (i = 0; i < mi->nr_blks; i++) {
		u64 s = mi->blk[i].start >> PAGE_SHIFT;
		u64 e = mi->blk[i].end >> PAGE_SHIFT;
		numaram += e - s;
		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
		if ((s64)numaram < 0)
			numaram = 0;
	}

	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);

	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
	if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
		printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
		       (numaram << PAGE_SHIFT) >> 20,
		       (e820ram << PAGE_SHIFT) >> 20);
		return false;
	}
	return true;
}
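
/*
 * Note on the slack used above: assuming 4 KiB pages (PAGE_SHIFT == 12),
 * 1 << (20 - PAGE_SHIFT) is 256 pages, i.e. exactly 1 MiB, so a NUMA
 * layout may fall short of the e820 map by just under 1 MiB before the
 * configuration is rejected.
 */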

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all memory known to the x86 architecture,
	 * and use those ranges to set the nid in memblock.reserved.
	 * This will split up the memblock regions along node
	 * boundaries and will set the node IDs as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (nid != MAX_NUMNODES)
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];
		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early during boot the kernel has to use some memory, e.g.
	 * for loading the kernel image. We cannot prevent this anyway. So
	 * any node the kernel resides in should be un-hotpluggable.
	 *
	 * And by the time we get here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for the pfn -> nid
	 * mapping, check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
				PFN_PHYS(pfn_align) >> 20,
				PFN_PHYS(PAGES_PER_SECTION) >> 20);
			return -EINVAL;
		}
	}
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = PFN_PHYS(max_pfn);
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start >= end)
			continue;

		alloc_node_data(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction here because
	 * if we configured ACPI_NUMA, we have parsed SRAT in init_func().
	 * It is OK to have the reset here even if we didn't configure
	 * ACPI_NUMA, or if ACPI NUMA init fails and falls back to dummy
	 * NUMA init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and apicid_to_node[]
 * have valid entries for a CPU.  This means we skip cpu_to_node[]
 * initialisation for NUMA emulation and the faked-node case (when running
 * a kernel compiled for NUMA on a non-NUMA box), which is OK: cpu_to_node[]
 * has already been initialized in a round-robin manner by numa_init_array()
 * prior to this call, and that is good enough for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(phys_addr_t start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);
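
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * driver managing hot-addable memory, e.g. device-dax or CXL, can ask
 * which node a physical range would land on before the memory is online:
 *
 *	int nid = phys_to_target_node(res->start);
 *	if (nid == NUMA_NO_NODE)
 *		nid = memory_add_physaddr_to_nid(res->start);
 *
 * where "res" stands for some struct resource describing the range.
 */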

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	/*
	 * Compare rather than subtract: the u64 difference would be
	 * truncated to int and could misorder widely separated addresses.
	 */
	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the @start-@end
 * physical address range, such that the first memblk includes
 * @start, the last memblk includes @end, and any gaps in between
 * are filled.
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. Exclude (start == bi->end) since
	 * end addresses in both a CFMWS range and a memblk range
	 * are exclusive.
	 *
	 * This list of pointers is used to make in-place changes
	 * that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (start < bi->end && end >= bi->start) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's end address
	 * and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}
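
/*
 * Worked example of the gap filling above (illustrative addresses): given
 * node memblks [0x0-0x1000) and [0x3000-0x4000), numa_fill_memblks(0x0,
 * 0x4000) leaves the first block alone and pulls the second block's start
 * down to 0x1000, so the two blocks end up covering [0x0-0x1000) and
 * [0x1000-0x4000) with no hole in between.
 */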

#endif