// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

int numa_off;
nodemask_t numa_nodes_parsed __initdata;

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

static int numa_distance_cnt;
static u8 *numa_distance;

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);
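
/*
 * Illustrative examples of the "numa=" boot options handled above (a
 * sketch, not an exhaustive reference -- the exact "fake=" argument
 * syntax is interpreted by numa_emu_cmdline()):
 *
 *	numa=off	- disable NUMA handling, fall back to a single node
 *	numa=noacpi	- do not parse the ACPI SRAT table
 *	numa=nohmat	- do not parse the ACPI HMAT table
 *	numa=fake=4	- split memory into 4 emulated nodes (CONFIG_NUMA_EMU)
 */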

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
126 | |
127 | static int __init numa_add_memblk_to(int nid, u64 start, u64 end, |
128 | struct numa_meminfo *mi) |
129 | { |
130 | /* ignore zero length blks */ |
131 | if (start == end) |
132 | return 0; |
133 | |
134 | /* whine about and ignore invalid blks */ |
135 | if (start > end || nid < 0 || nid >= MAX_NUMNODES) { |
136 | pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n" , |
137 | nid, start, end - 1); |
138 | return 0; |
139 | } |
140 | |
141 | if (mi->nr_blks >= NR_NODE_MEMBLKS) { |
142 | pr_err("too many memblk ranges\n" ); |
143 | return -EINVAL; |
144 | } |
145 | |
146 | mi->blk[mi->nr_blks].start = start; |
147 | mi->blk[mi->nr_blks].end = end; |
148 | mi->blk[mi->nr_blks].nid = nid; |
149 | mi->nr_blks++; |
150 | return 0; |
151 | } |
152 | |
153 | /** |
154 | * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo |
155 | * @idx: Index of memblk to remove |
156 | * @mi: numa_meminfo to remove memblk from |
157 | * |
158 | * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and |
159 | * decrementing @mi->nr_blks. |
160 | */ |
161 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) |
162 | { |
163 | mi->nr_blks--; |
164 | memmove(&mi->blk[idx], &mi->blk[idx + 1], |
165 | (mi->nr_blks - idx) * sizeof(mi->blk[0])); |
166 | } |
167 | |
168 | /** |
169 | * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another |
170 | * @dst: numa_meminfo to append block to |
171 | * @idx: Index of memblk to remove |
172 | * @src: numa_meminfo to remove memblk from |
173 | */ |
174 | static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, |
175 | struct numa_meminfo *src) |
176 | { |
177 | dst->blk[dst->nr_blks++] = src->blk[idx]; |
178 | numa_remove_memblk_from(idx, mi: src); |
179 | } |
180 | |
181 | /** |
182 | * numa_add_memblk - Add one numa_memblk to numa_meminfo |
183 | * @nid: NUMA node ID of the new memblk |
184 | * @start: Start address of the new memblk |
185 | * @end: End address of the new memblk |
186 | * |
187 | * Add a new memblk to the default numa_meminfo. |
188 | * |
189 | * RETURNS: |
190 | * 0 on success, -errno on failure. |
191 | */ |
192 | int __init numa_add_memblk(int nid, u64 start, u64 end) |
193 | { |
194 | return numa_add_memblk_to(nid, start, end, mi: &numa_meminfo); |
195 | } |
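
/*
 * Illustrative sketch (not taken from any particular firmware parser):
 * a NUMA init callback such as the SRAT or devicetree code registers the
 * ranges it discovers roughly like this, with exclusive end addresses:
 *
 *	node_set(0, numa_nodes_parsed);
 *	numa_add_memblk(0, 0x000000000ULL, 0x100000000ULL);	// node 0: 0-4 GiB
 *	node_set(1, numa_nodes_parsed);
 *	numa_add_memblk(1, 0x100000000ULL, 0x200000000ULL);	// node 1: 4-8 GiB
 *
 * Zero-length blocks are silently ignored and invalid node IDs are only
 * warned about, as implemented in numa_add_memblk_to() above.
 */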

/* Allocate NODE_DATA for a node on the local memory */
static void __init alloc_node_data(int nid)
{
	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	u64 nd_pa;
	void *nd;
	int tnid;

	/*
	 * Allocate node data. Try node-local memory and then any node.
	 * Never allocate in DMA zone.
	 */
	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
	if (!nd_pa) {
		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
		       nd_size, nid);
		return;
	}
	nd = __va(nd_pa);

	/* report and initialize */
	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
	       nd_pa, nd_pa + nd_size - 1);
	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
	if (tnid != nid)
		printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);

	node_data[nid] = nd;
	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

	node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = PFN_PHYS(max_pfn);
	int i, j, k;

	/* first, trim all entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* move / save reserved memory ranges */
		if (!memblock_overlaps_region(&memblock.memory,
					      bi->start, bi->end - bi->start)) {
			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
			continue;
		}

		/* make sure all non-reserved blocks are inside the limits */
		bi->start = max(bi->start, low);

		/* preserve info for non-RAM areas above 'max_pfn': */
		if (bi->end > high) {
			numa_add_memblk_to(bi->nid, high, bi->end,
					   &numa_reserved_meminfo);
			bi->end = high;
		}

		/* and there's no empty block */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* merge neighboring / overlapping entries */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			u64 start, end;

			/*
			 * See whether there are overlapping blocks. Whine
			 * about but allow overlaps of the same nid. They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
					       bi->nid, bi->start, bi->end - 1,
					       bj->nid, bj->start, bj->end - 1);
					return -EINVAL;
				}
				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
					bi->nid, bi->start, bi->end - 1,
					bj->start, bj->end - 1);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
			       bi->nid, bi->start, bi->end - 1, bj->start,
			       bj->end - 1, start, end - 1);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear unused ones */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}
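
/*
 * Worked example of the merge step above (illustrative addresses only):
 * two node-0 blocks [0x0-0x80000000) and [0x80000000-0x100000000) are
 * joined into one block [0x0-0x100000000), provided the combined span
 * does not overlap a block belonging to a different node; otherwise the
 * blocks are left as they are.
 */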

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed. The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_free(numa_distance, size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
					 PFN_PHYS(max_pfn_mapped));
	if (!phys) {
		pr_warn("Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance. If distance table
 * doesn't exist, one which is large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node or lower than zero
 * at the time of table creation or @distance doesn't make sense, the call
 * is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
	    from < 0 || to < 0) {
		pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}
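
/*
 * Illustrative sketch of how a firmware distance parser (e.g. the ACPI
 * SLIT code) fills the table: it calls numa_set_distance() once per
 * (from, to) pair, with LOCAL_DISTANCE (10) on the diagonal and larger
 * values for remote nodes, for instance:
 *
 *	numa_set_distance(0, 0, LOCAL_DISTANCE);
 *	numa_set_distance(0, 1, 21);
 *	numa_set_distance(1, 0, 21);
 *	numa_set_distance(1, 1, LOCAL_DISTANCE);
 *
 * Out-of-range node IDs and nonsensical distances are dropped with a
 * one-time warning, as checked above.
 */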

int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common). Make sure the nodes cover all memory.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
	u64 numaram, e820ram;
	int i;

	numaram = 0;
	for (i = 0; i < mi->nr_blks; i++) {
		u64 s = mi->blk[i].start >> PAGE_SHIFT;
		u64 e = mi->blk[i].end >> PAGE_SHIFT;
		numaram += e - s;
		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
		if ((s64)numaram < 0)
			numaram = 0;
	}

	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);

	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
	if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
		printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
		       (numaram << PAGE_SHIFT) >> 20,
		       (e820ram << PAGE_SHIFT) >> 20);
		return false;
	}
	return true;
}
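
/*
 * Note on the slack used above: assuming 4 KiB pages (PAGE_SHIFT == 12),
 * 1 << (20 - PAGE_SHIFT) is 256 pages, i.e. exactly 1 MiB, so a NUMA
 * layout may fall short of the e820 map by just under 1 MiB before the
 * configuration is rejected.
 */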

/*
 * Mark all currently memblock-reserved physical memory (which covers the
 * kernel's own memory ranges) as hot-unpluggable.
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
	nodemask_t reserved_nodemask = NODE_MASK_NONE;
	struct memblock_region *mb_region;
	int i;

	/*
	 * We have to do some preprocessing of memblock regions, to
	 * make them suitable for reservation.
	 *
	 * At this time, all memory regions reserved by memblock are
	 * used by the kernel, but those regions are not split up
	 * along node boundaries yet, and don't necessarily have their
	 * node ID set yet either.
	 *
	 * So iterate over all memory known to the x86 architecture,
	 * and use those ranges to set the nid in memblock.reserved.
	 * This will split up the memblock regions along node
	 * boundaries and will set the node IDs as well.
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;
		int ret;

		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
		WARN_ON_ONCE(ret);
	}

	/*
	 * Now go over all reserved memblock regions, to construct a
	 * node mask of all kernel reserved memory areas.
	 *
	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
	 *   numa_meminfo might not include all memblock.reserved
	 *   memory ranges, because quirks such as trim_snb_memory()
	 *   reserve specific pages for Sandy Bridge graphics. ]
	 */
	for_each_reserved_mem_region(mb_region) {
		int nid = memblock_get_region_node(mb_region);

		if (nid != MAX_NUMNODES)
			node_set(nid, reserved_nodemask);
	}

	/*
	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
	 * belonging to the reserved node mask.
	 *
	 * Note that this will include memory regions that reside
	 * on nodes that contain kernel memory - entire nodes
	 * become hot-unpluggable:
	 */
	for (i = 0; i < numa_meminfo.nr_blks; i++) {
		struct numa_memblk *mb = numa_meminfo.blk + i;

		if (!node_isset(mb->nid, reserved_nodemask))
			continue;

		memblock_clear_hotplug(mb->start, mb->end - mb->start);
	}
}

static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *mb = &mi->blk[i];
		memblock_set_node(mb->start, mb->end - mb->start,
				  &memblock.memory, mb->nid);
	}

	/*
	 * Very early during boot the kernel has to use some memory, e.g.
	 * for loading the kernel image. We cannot prevent this anyway. So
	 * any node the kernel resides in should be un-hotpluggable.
	 *
	 * And by the time we get here, allocating node data won't fail.
	 */
	numa_clear_kernel_node_hotplug();

	/*
	 * If the sections array is going to be used for the pfn -> nid
	 * mapping, check whether its granularity is fine enough.
	 */
	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
		unsigned long pfn_align = node_map_pfn_alignment();

		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
				PFN_PHYS(pfn_align) >> 20,
				PFN_PHYS(PAGES_PER_SECTION) >> 20);
			return -EINVAL;
		}
	}
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = PFN_PHYS(max_pfn);
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start >= end)
			continue;

		alloc_node_data(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
				  MAX_NUMNODES));
	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
				  MAX_NUMNODES));
	/* In case that parsing SRAT failed. */
	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;

	/*
	 * We reset memblock back to the top-down direction here because
	 * if we configured ACPI_NUMA, we have parsed SRAT in init_func().
	 * It is OK to have the reset here even if we didn't configure
	 * ACPI_NUMA, or if ACPI NUMA init fails and falls back to dummy
	 * NUMA init.
	 */
	memblock_set_bottom_up(false);

	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and apicid_to_node[]
 * have valid entries for a CPU.  This means we skip cpu_to_node[]
 * initialisation for NUMA emulation and the faked-node case (when running
 * a kernel compiled for NUMA on a non-NUMA box), which is OK: cpu_to_node[]
 * has already been initialized in a round-robin manner by numa_init_array()
 * prior to this call, and that is good enough for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].start <= start && mi->blk[i].end > start)
			return mi->blk[i].nid;
	return NUMA_NO_NODE;
}

int phys_to_target_node(phys_addr_t start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	/*
	 * Prefer online nodes, but if reserved memory might be
	 * hot-added continue the search with reserved ranges.
	 */
	if (nid != NUMA_NO_NODE)
		return nid;

	return meminfo_to_nid(&numa_reserved_meminfo, start);
}
EXPORT_SYMBOL_GPL(phys_to_target_node);
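
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * driver managing hot-addable memory, e.g. device-dax or CXL, can ask
 * which node a physical range would land on before the memory is online:
 *
 *	int nid = phys_to_target_node(res->start);
 *	if (nid == NUMA_NO_NODE)
 *		nid = memory_add_physaddr_to_nid(res->start);
 *
 * where "res" stands for some struct resource describing the range.
 */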

int memory_add_physaddr_to_nid(u64 start)
{
	int nid = meminfo_to_nid(&numa_meminfo, start);

	if (nid == NUMA_NO_NODE)
		nid = numa_meminfo.blk[0].nid;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

static int __init cmp_memblk(const void *a, const void *b)
{
	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
	const struct numa_memblk *mb = *(const struct numa_memblk **)b;

	/*
	 * Compare rather than subtract: the u64 difference would be
	 * truncated to int and could misorder widely separated addresses.
	 */
	return (ma->start > mb->start) - (ma->start < mb->start);
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find and extend numa_meminfo memblks to cover the @start-@end
 * physical address range, such that the first memblk includes
 * @start, the last memblk includes @end, and any gaps in between
 * are filled.
 *
 * RETURNS:
 * 0 : Success
 * NUMA_NO_MEMBLK : No memblk exists in @start-@end range
 */
int __init numa_fill_memblks(u64 start, u64 end)
{
	struct numa_memblk **blk = &numa_memblk_list[0];
	struct numa_meminfo *mi = &numa_meminfo;
	int count = 0;
	u64 prev_end;

	/*
	 * Create a list of pointers to numa_meminfo memblks that
	 * overlap start, end. Exclude (start == bi->end) since
	 * end addresses in both a CFMWS range and a memblk range
	 * are exclusive.
	 *
	 * This list of pointers is used to make in-place changes
	 * that fill out the numa_meminfo memblks.
	 */
	for (int i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		if (start < bi->end && end >= bi->start) {
			blk[count] = &mi->blk[i];
			count++;
		}
	}
	if (!count)
		return NUMA_NO_MEMBLK;

	/* Sort the list of pointers in memblk->start order */
	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);

	/* Make sure the first/last memblks include start/end */
	blk[0]->start = min(blk[0]->start, start);
	blk[count - 1]->end = max(blk[count - 1]->end, end);

	/*
	 * Fill any gaps by tracking the previous memblk's end address
	 * and backfilling to it if needed.
	 */
	prev_end = blk[0]->end;
	for (int i = 1; i < count; i++) {
		struct numa_memblk *curr = blk[i];

		if (prev_end >= curr->start) {
			if (prev_end < curr->end)
				prev_end = curr->end;
		} else {
			curr->start = prev_end;
			prev_end = curr->end;
		}
	}
	return 0;
}
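
/*
 * Worked example of the gap filling above (illustrative addresses): given
 * node memblks [0x0-0x1000) and [0x3000-0x4000), numa_fill_memblks(0x0,
 * 0x4000) leaves the first block alone and pulls the second block's start
 * down to 0x1000, so the two blocks end up covering [0x0-0x1000) and
 * [0x1000-0x4000) with no hole in between.
 */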

#endif