// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/array_size.h>
#include <linux/sort.h>
#include <linux/printk.h>
#include <linux/memblock.h>
#include <linux/numa.h>
#include <linux/numa_memblks.h>

#include <asm/numa.h>

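/*
 * Flat NUMA distance table: numa_distance_cnt is the number of rows and
 * columns, and the u8 distance from node @from to node @to is stored at
 * numa_distance[from * numa_distance_cnt + to].
 */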
int numa_distance_cnt;
static u8 *numa_distance;

nodemask_t numa_nodes_parsed __initdata;

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/*
 * Set the nodes that have memory in @mi in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

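	/*
	 * cnt ends up as the highest parsed node id plus one, so the
	 * cnt x cnt table below can be indexed directly by node id.
	 */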
	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	numa_distance = memblock_alloc(size, PAGE_SIZE);
	if (!numa_distance) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation, or @distance doesn't make sense, the call
 * is ignored. This allows specific NUMA config implementations to stay
 * simple.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bounds, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

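	/*
	 * Distances are stored as u8, so anything outside 0..255 is
	 * rejected, as is any self-distance other than LOCAL_DISTANCE.
	 */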
	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

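/*
 * Distances for nodes outside the allocated table fall back to the
 * LOCAL_DISTANCE / REMOTE_DISTANCE defaults.
 */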
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
			nid, start, end - 1);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
					 struct numa_meminfo *src)
{
	dst->blk[dst->nr_blks++] = src->blk[idx];
	numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/**
 * numa_add_reserved_memblk - Add one numa_memblk to numa_reserved_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the numa_reserved_meminfo.
 *
 * Usage Case: numa_cleanup_meminfo() reconciles all numa_memblk instances
 * against memblock_type information and moves any that intersect reserved
 * ranges to numa_reserved_meminfo. However, when that information is known
 * ahead of time, we use numa_add_reserved_memblk() to add the numa_memblk
 * to numa_reserved_meminfo directly.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_reserved_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_reserved_meminfo);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = memblock_start_of_DRAM();
	const u64 high = memblock_end_of_DRAM();
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join blocks on the same node as long as the hole
			 * between them doesn't overlap memory on any other
			 * node.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
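			/*
			 * If the scan above stopped early, a block on a
			 * different node overlaps [start, end); merging
			 * bi and bj would swallow it, so keep them apart.
			 */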
			if (k < mi->nr_blks)
				continue;
			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
				bi->nid, bi->start, bi->end - 1, bj->start,
				bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all parsed memory blocks and use those ranges to
	 * set the nid in memblock.reserved. This will split up the
	 * memblock regions along node boundaries and will set the node IDs
	 * as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start,
					&memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (numa_valid_node(nid))
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_meminfo(struct numa_meminfo *mi)
{
	int i;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];

		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early on, the kernel has to use some memory, e.g. for
	 * loading the kernel image. We cannot prevent this anyway, so
	 * any node the kernel resides on must be un-hotpluggable.
	 *
	 * And by the time we get here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for pfn -> nid mapping,
	 * check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

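		/*
		 * node_map_pfn_alignment() is the largest power-of-two
		 * alignment that can still tell all nodes apart; if it is
		 * smaller than a section, a single section could span more
		 * than one node and a section-based pfn -> nid lookup
		 * cannot represent this configuration.
		 */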
		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M;
			unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M;

			pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
				node_align_mb, sect_align_mb);
			return -EINVAL;
		}
	}

	return 0;
}

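/*
 * Reset all NUMA state, call init_func() to populate numa_meminfo
 * (typically from firmware tables such as SRAT), then sanitize the
 * result and register it with memblock.
 */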
int __init numa_memblks_init(int (*init_func)(void),
			     bool memblock_force_top_down)
{
	phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
	int ret;

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
	WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
				  NUMA_NO_NODE));
	/* In case parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, max_addr));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * Reset memblock back to the top-down allocation direction here:
	 * if ACPI_NUMA is configured, SRAT was parsed in init_func().
	 * The reset is fine even if ACPI_NUMA is not configured, or if
	 * ACPI NUMA init fails and we fall back to dummy NUMA init.
	 */
	if (memblock_force_top_down)
		memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	return numa_register_meminfo(&numa_meminfo);
}

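/* Three-way compare of memblk start addresses for sort() below. */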
static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the physical
 * address range @start-@end
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. The list is used to make in-place
	 * changes that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (memblock_addrs_overlap(start, end - start, bi->start,
					   bi->end - bi->start)) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's end address
	 * and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

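		/*
		 * Overlapping or adjacent block: just advance prev_end.
		 * Otherwise pull curr->start back to prev_end so the gap
		 * between the two blocks is filled.
		 */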
		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}

#ifdef CONFIG_NUMA_KEEP_MEMINFO
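/*
 * Look up the node id of the block in @mi that contains @start;
 * block ranges are [start, end). Returns NUMA_NO_NODE if none match.
 */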
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

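	/*
	 * Fall back to the node of the first parsed memblk when the
	 * address is not covered by any known range.
	 */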
	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif /* CONFIG_NUMA_KEEP_MEMINFO */

/* source: linux/mm/numa_memblks.c */