// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/array_size.h>
#include <linux/sort.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/numa_memblks.h>

#include <asm/numa.h>

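/*
 * Flat NUMA distance table: numa_distance_cnt is the number of rows and
 * columns, and the u8 distance from node @from to node @to is stored at
 * numa_distance[from * numa_distance_cnt + to].
 */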
int numa_distance_cnt;
static u8 *numa_distance;

nodemask_t numa_nodes_parsed __initdata;

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/*
 * Set the nodes that have memory in @mi in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

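	/*
	 * cnt ends up as the highest parsed node id plus one, so the
	 * cnt x cnt table below can be indexed directly by node id.
	 */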
	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	numa_distance = memblock_alloc(size, PAGE_SIZE);
	if (!numa_distance) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation, or @distance doesn't make sense, the call
 * is ignored. This allows specific NUMA config implementations to stay
 * simple.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bounds, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

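	/*
	 * Distances are stored as u8, so anything outside 0..255 is
	 * rejected, as is any self-distance other than LOCAL_DISTANCE.
	 */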
	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

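/*
 * Distances for nodes outside the allocated table fall back to the
 * LOCAL_DISTANCE / REMOTE_DISTANCE defaults.
 */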
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/**
 * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the numa_reserved_meminfo.
 *
 * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances
 * against memblock_type information and moves any that intersect reserved
 * ranges to numa_reserved_meminfo. However, when that information is known
 * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
 * to numa_reserved_meminfo directly.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = memblock_start_of_DRAM();
	const u64 high = memblock_end_of_DRAM();
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join blocks on the same node as long as the hole
			 * between them doesn't overlap memory on any other
			 * node.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
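			/*
			 * If the scan above stopped early, a block on a
			 * different node overlaps [start, end); merging
			 * bi and bj would swallow it, so keep them apart.
			 */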
			if (k < mi->nr_blks)
				continue;
			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
				bi->nid, bi->start, bi->end - 1, bj->start,
				bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all parsed memory blocks and use those ranges to
	 * set the nid in memblock.reserved. This will split up the
	 * memblock regions along node boundaries and will set the node IDs
	 * as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start,
					&memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_meminfo(struct numa_meminfo *mi)
{
	int i;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early on, the kernel has to use some memory, e.g. for
	 * loading the kernel image. We cannot prevent this anyway, so
	 * any node the kernel resides on must be un-hotpluggable.
	 *
	 * And by the time we get here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

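		/*
		 * node_map_pfn_alignment() is the largest power-of-two
		 * alignment that can still tell all nodes apart; if it is
		 * smaller than a section, a single section could span more
		 * than one node and a section-based pfn -> nid lookup
		 * cannot represent this configuration.
		 */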
		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M;
			unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M;

			pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
				node_align_mb, sect_align_mb);
			return -EINVAL;
		}
	}

	return 0;
}

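/*
 * Reset all NUMA state, call init_func() to populate numa_meminfo
 * (typically from firmware tables such as SRAT), then sanitize the
 * result and register it with memblock.
 */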
int __init numa_memblks_init(int (*init_func)(void),
			     bool memblock_force_top_down)
{
	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
				  NUMA_NO_NODE));
	/* In case parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, max_addr));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * Reset memblock back to the top-down allocation direction here:
	 * if ACPI_NUMA is configured, SRAT was parsed in init_func().
	 * The reset is fine even if ACPI_NUMA is not configured, or if
	 * ACPI NUMA init fails and we fall back to dummy NUMA init.
	 */
	if (memblock_force_top_down)
		memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	return numa_register_meminfo(&numa_meminfo);
}

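/* Three-way compare of memblk start addresses for sort() below. */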
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's end address
	 * and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

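		/*
		 * Overlapping or adjacent block: just advance prev_end.
		 * Otherwise pull curr->start back to prev_end so the gap
		 * between the two blocks is filled.
		 */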
		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#ifdef CONFIG_NUMA_KEEP_MEMINFO
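/*
 * Look up the node id of the block in @mi that contains @start;
 * block ranges are [start, end). Returns NUMA_NO_NODE if none match.
 */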
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

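	/*
	 * Fall back to the node of the first parsed memblk when the
	 * address is not covered by any known range.
	 */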
	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif /* CONFIG_NUMA_KEEP_MEMINFO */

/* source: linux/mm/numa_memblks.c */