1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * CPU/APIC topology |
4 | * |
5 | * The APIC IDs describe the system topology in multiple domain levels. |
6 | * The CPUID topology parser provides the information which part of the |
7 | * APIC ID is associated to the individual levels: |
8 | * |
9 | * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD] |
10 | * |
11 | * The root space contains the package (socket) IDs. |
12 | * |
13 | * Not enumerated levels consume 0 bits space, but conceptually they are |
14 | * always represented. If e.g. only CORE and THREAD levels are enumerated |
15 | * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE. |
16 | * |
17 | * If SMT is not supported, then the THREAD domain is still used. It then |
18 | * has the same physical ID as the CORE domain and is the only child of |
19 | * the core domain. |
20 | * |
21 | * This allows a unified view on the system independent of the enumerated |
22 | * domain levels without requiring any conditionals in the code. |
23 | */ |
24 | #define pr_fmt(fmt) "CPU topo: " fmt |
25 | #include <linux/cpu.h> |
26 | |
27 | #include <xen/xen.h> |
28 | |
29 | #include <asm/apic.h> |
30 | #include <asm/hypervisor.h> |
31 | #include <asm/io_apic.h> |
32 | #include <asm/mpspec.h> |
33 | #include <asm/smp.h> |
34 | |
35 | #include "cpu.h" |
36 | |
/*
 * Per-CPU lookup from the logical CPU number to the physical APIC ID and
 * to the ACPI ID. Entries which were never set hold BAD_APICID and
 * CPU_ACPIID_INVALID respectively.
 */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);

DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
44 | |
45 | /* Bitmap of physically present CPUs. */ |
46 | DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly; |
47 | |
48 | /* Used for CPU number allocation and parallel CPU bringup */ |
49 | u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, }; |
50 | |
51 | /* Bitmaps to mark registered APICs at each topology domain */ |
52 | static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init; |
53 | |
54 | /* |
55 | * Keep track of assigned, disabled and rejected CPUs. Present assigned |
56 | * with 1 as CPU #0 is reserved for the boot CPU. |
57 | */ |
58 | static struct { |
59 | unsigned int nr_assigned_cpus; |
60 | unsigned int nr_disabled_cpus; |
61 | unsigned int nr_rejected_cpus; |
62 | u32 boot_cpu_apic_id; |
63 | u32 real_bsp_apic_id; |
64 | } topo_info __ro_after_init = { |
65 | .nr_assigned_cpus = 1, |
66 | .boot_cpu_apic_id = BAD_APICID, |
67 | .real_bsp_apic_id = BAD_APICID, |
68 | }; |
69 | |
/* Number of APIC IDs registered in the bitmap of topology domain @_dom */
#define domain_weight(_dom)						\
	bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
71 | |
72 | bool arch_match_cpu_phys_id(int cpu, u64 phys_id) |
73 | { |
74 | return phys_id == (u64)cpuid_to_apicid[cpu]; |
75 | } |
76 | |
#ifdef CONFIG_SMP
/*
 * Mark @cpu in __cpu_primary_thread_mask when the low bits of @apicid
 * selected by (__max_threads_per_core - 1) are all zero.
 *
 * NOTE(review): the mask isolates the thread number only if
 * __max_threads_per_core is a power of two - confirm against the code
 * which computes it.
 */
static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
{
	if (!(apicid & (__max_threads_per_core - 1)))
		cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
#else
/* Without SMP there is no primary thread mask to maintain */
static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
#endif
86 | |
87 | /* |
88 | * Convert the APIC ID to a domain level ID by masking out the low bits |
89 | * below the domain level @dom. |
90 | */ |
91 | static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom) |
92 | { |
93 | if (dom == TOPO_SMT_DOMAIN) |
94 | return apicid; |
95 | return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]); |
96 | } |
97 | |
98 | static int topo_lookup_cpuid(u32 apic_id) |
99 | { |
100 | int i; |
101 | |
102 | /* CPU# to APICID mapping is persistent once it is established */ |
103 | for (i = 0; i < topo_info.nr_assigned_cpus; i++) { |
104 | if (cpuid_to_apicid[i] == apic_id) |
105 | return i; |
106 | } |
107 | return -ENODEV; |
108 | } |
109 | |
110 | static __init int topo_get_cpunr(u32 apic_id) |
111 | { |
112 | int cpu = topo_lookup_cpuid(apic_id); |
113 | |
114 | if (cpu >= 0) |
115 | return cpu; |
116 | |
117 | return topo_info.nr_assigned_cpus++; |
118 | } |
119 | |
120 | static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) |
121 | { |
122 | #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) |
123 | early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; |
124 | early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; |
125 | #endif |
126 | set_cpu_present(cpu, present: true); |
127 | } |
128 | |
129 | static __init bool check_for_real_bsp(u32 apic_id) |
130 | { |
131 | /* |
132 | * There is no real good way to detect whether this a kdump() |
133 | * kernel, but except on the Voyager SMP monstrosity which is not |
134 | * longer supported, the real BSP APIC ID is the first one which is |
135 | * enumerated by firmware. That allows to detect whether the boot |
136 | * CPU is the real BSP. If it is not, then do not register the APIC |
137 | * because sending INIT to the real BSP would reset the whole |
138 | * system. |
139 | * |
140 | * The first APIC ID which is enumerated by firmware is detectable |
141 | * because the boot CPU APIC ID is registered before that without |
142 | * invoking this code. |
143 | */ |
144 | if (topo_info.real_bsp_apic_id != BAD_APICID) |
145 | return false; |
146 | |
147 | if (apic_id == topo_info.boot_cpu_apic_id) { |
148 | topo_info.real_bsp_apic_id = apic_id; |
149 | return false; |
150 | } |
151 | |
152 | pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x > %x\n" , |
153 | topo_info.boot_cpu_apic_id, apic_id); |
154 | pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n" ); |
155 | |
156 | topo_info.real_bsp_apic_id = apic_id; |
157 | return true; |
158 | } |
159 | |
160 | static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level, |
161 | unsigned long *map) |
162 | { |
163 | unsigned int id, end, cnt = 0; |
164 | |
165 | /* Calculate the exclusive end */ |
166 | end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]); |
167 | |
168 | /* Unfortunately there is no bitmap_weight_range() */ |
169 | for (id = find_next_bit(addr: map, size: end, offset: lvlid); id < end; id = find_next_bit(addr: map, size: end, offset: ++id)) |
170 | cnt++; |
171 | return cnt; |
172 | } |
173 | |
174 | static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) |
175 | { |
176 | int cpu, dom; |
177 | |
178 | if (present) { |
179 | set_bit(nr: apic_id, addr: phys_cpu_present_map); |
180 | |
181 | /* |
182 | * Double registration is valid in case of the boot CPU |
183 | * APIC because that is registered before the enumeration |
184 | * of the APICs via firmware parsers or VM guest |
185 | * mechanisms. |
186 | */ |
187 | if (apic_id == topo_info.boot_cpu_apic_id) |
188 | cpu = 0; |
189 | else |
190 | cpu = topo_get_cpunr(apic_id); |
191 | |
192 | cpuid_to_apicid[cpu] = apic_id; |
193 | topo_set_cpuids(cpu, apic_id, acpi_id); |
194 | } else { |
195 | u32 pkgid = topo_apicid(apicid: apic_id, dom: TOPO_PKG_DOMAIN); |
196 | |
197 | /* |
198 | * Check for present APICs in the same package when running |
199 | * on bare metal. Allow the bogosity in a guest. |
200 | */ |
201 | if (hypervisor_is_type(type: X86_HYPER_NATIVE) && |
202 | topo_unit_count(lvlid: pkgid, at_level: TOPO_PKG_DOMAIN, map: phys_cpu_present_map)) { |
203 | pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n" , |
204 | apic_id); |
205 | topo_info.nr_rejected_cpus++; |
206 | return; |
207 | } |
208 | |
209 | topo_info.nr_disabled_cpus++; |
210 | } |
211 | |
212 | /* |
213 | * Register present and possible CPUs in the domain |
214 | * maps. cpu_possible_map will be updated in |
215 | * topology_init_possible_cpus() after enumeration is done. |
216 | */ |
217 | for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) |
218 | set_bit(nr: topo_apicid(apicid: apic_id, dom), addr: apic_maps[dom].map); |
219 | } |
220 | |
221 | /** |
222 | * topology_register_apic - Register an APIC in early topology maps |
223 | * @apic_id: The APIC ID to set up |
224 | * @acpi_id: The ACPI ID associated to the APIC |
225 | * @present: True if the corresponding CPU is present |
226 | */ |
227 | void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present) |
228 | { |
229 | if (apic_id >= MAX_LOCAL_APIC) { |
230 | pr_err_once("APIC ID %x exceeds kernel limit of: %x\n" , apic_id, MAX_LOCAL_APIC - 1); |
231 | topo_info.nr_rejected_cpus++; |
232 | return; |
233 | } |
234 | |
235 | if (check_for_real_bsp(apic_id)) { |
236 | topo_info.nr_rejected_cpus++; |
237 | return; |
238 | } |
239 | |
240 | /* CPU numbers exhausted? */ |
241 | if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) { |
242 | pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n" , nr_cpu_ids); |
243 | topo_info.nr_rejected_cpus++; |
244 | return; |
245 | } |
246 | |
247 | topo_register_apic(apic_id, acpi_id, present); |
248 | } |
249 | |
250 | /** |
251 | * topology_register_boot_apic - Register the boot CPU APIC |
252 | * @apic_id: The APIC ID to set up |
253 | * |
254 | * Separate so CPU #0 can be assigned |
255 | */ |
256 | void __init topology_register_boot_apic(u32 apic_id) |
257 | { |
258 | WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID); |
259 | |
260 | topo_info.boot_cpu_apic_id = apic_id; |
261 | topo_register_apic(apic_id, CPU_ACPIID_INVALID, present: true); |
262 | } |
263 | |
264 | /** |
265 | * topology_get_logical_id - Retrieve the logical ID at a given topology domain level |
266 | * @apicid: The APIC ID for which to lookup the logical ID |
267 | * @at_level: The topology domain level to use |
268 | * |
269 | * @apicid must be a full APIC ID, not the normalized variant. It's valid to have |
270 | * all bits below the domain level specified by @at_level to be clear. So both |
271 | * real APIC IDs and backshifted normalized APIC IDs work correctly. |
272 | * |
273 | * Returns: |
274 | * - >= 0: The requested logical ID |
275 | * - -ERANGE: @apicid is out of range |
276 | * - -ENODEV: @apicid is not registered |
277 | */ |
278 | int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) |
279 | { |
280 | /* Remove the bits below @at_level to get the proper level ID of @apicid */ |
281 | unsigned int lvlid = topo_apicid(apicid, dom: at_level); |
282 | |
283 | if (lvlid >= MAX_LOCAL_APIC) |
284 | return -ERANGE; |
285 | if (!test_bit(lvlid, apic_maps[at_level].map)) |
286 | return -ENODEV; |
287 | /* Get the number of set bits before @lvlid. */ |
288 | return bitmap_weight(src: apic_maps[at_level].map, nbits: lvlid); |
289 | } |
290 | EXPORT_SYMBOL_GPL(topology_get_logical_id); |
291 | |
292 | /** |
293 | * topology_unit_count - Retrieve the count of specified units at a given topology domain level |
294 | * @apicid: The APIC ID which specifies the search range |
295 | * @which_units: The domain level specifying the units to count |
296 | * @at_level: The domain level at which @which_units have to be counted |
297 | * |
298 | * This returns the number of possible units according to the enumerated |
299 | * information. |
300 | * |
301 | * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN) |
302 | * counts the number of possible cores in the package to which @apicid |
303 | * belongs. |
304 | * |
305 | * @at_level must obviously be greater than @which_level to produce useful |
306 | * results. If @at_level is equal to @which_units the result is |
307 | * unsurprisingly 1. If @at_level is less than @which_units the results |
308 | * is by definition undefined and the function returns 0. |
309 | */ |
310 | unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, |
311 | enum x86_topology_domains at_level) |
312 | { |
313 | /* Remove the bits below @at_level to get the proper level ID of @apicid */ |
314 | unsigned int lvlid = topo_apicid(apicid, dom: at_level); |
315 | |
316 | if (lvlid >= MAX_LOCAL_APIC) |
317 | return 0; |
318 | if (!test_bit(lvlid, apic_maps[at_level].map)) |
319 | return 0; |
320 | if (which_units > at_level) |
321 | return 0; |
322 | if (which_units == at_level) |
323 | return 1; |
324 | return topo_unit_count(lvlid, at_level, map: apic_maps[which_units].map); |
325 | } |
326 | |
327 | #ifdef CONFIG_ACPI_HOTPLUG_CPU |
328 | /** |
329 | * topology_hotplug_apic - Handle a physical hotplugged APIC after boot |
330 | * @apic_id: The APIC ID to set up |
331 | * @acpi_id: The ACPI ID associated to the APIC |
332 | */ |
333 | int topology_hotplug_apic(u32 apic_id, u32 acpi_id) |
334 | { |
335 | int cpu; |
336 | |
337 | if (apic_id >= MAX_LOCAL_APIC) |
338 | return -EINVAL; |
339 | |
340 | /* Reject if the APIC ID was not registered during enumeration. */ |
341 | if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map)) |
342 | return -ENODEV; |
343 | |
344 | cpu = topo_lookup_cpuid(apic_id); |
345 | if (cpu < 0) |
346 | return -ENOSPC; |
347 | |
348 | set_bit(nr: apic_id, addr: phys_cpu_present_map); |
349 | topo_set_cpuids(cpu, apic_id, acpi_id); |
350 | cpu_mark_primary_thread(cpu, apicid: apic_id); |
351 | return cpu; |
352 | } |
353 | |
354 | /** |
355 | * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot |
356 | * @cpu: The CPU number for which the APIC ID is removed |
357 | */ |
358 | void topology_hotunplug_apic(unsigned int cpu) |
359 | { |
360 | u32 apic_id = cpuid_to_apicid[cpu]; |
361 | |
362 | if (apic_id == BAD_APICID) |
363 | return; |
364 | |
365 | per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; |
366 | clear_bit(nr: apic_id, addr: phys_cpu_present_map); |
367 | set_cpu_present(cpu, present: false); |
368 | } |
369 | #endif |
370 | |
371 | #ifdef CONFIG_X86_LOCAL_APIC |
372 | static unsigned int max_possible_cpus __initdata = NR_CPUS; |
373 | |
374 | /** |
375 | * topology_apply_cmdline_limits_early - Apply topology command line limits early |
376 | * |
377 | * Ensure that command line limits are in effect before firmware parsing |
378 | * takes place. |
379 | */ |
380 | void __init topology_apply_cmdline_limits_early(void) |
381 | { |
382 | unsigned int possible = nr_cpu_ids; |
383 | |
384 | /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */ |
385 | if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled) |
386 | possible = 1; |
387 | |
388 | /* 'possible_cpus=N' */ |
389 | possible = min_t(unsigned int, max_possible_cpus, possible); |
390 | |
391 | if (possible < nr_cpu_ids) { |
392 | pr_info("Limiting to %u possible CPUs\n" , possible); |
393 | set_nr_cpu_ids(possible); |
394 | } |
395 | } |
396 | |
397 | static __init bool restrict_to_up(void) |
398 | { |
399 | if (!smp_found_config || ioapic_is_disabled) |
400 | return true; |
401 | /* |
402 | * XEN PV is special as it does not advertise the local APIC |
403 | * properly, but provides a fake topology for it so that the |
404 | * infrastructure works. So don't apply the restrictions vs. APIC |
405 | * here. |
406 | */ |
407 | if (xen_pv_domain()) |
408 | return false; |
409 | |
410 | return apic_is_disabled; |
411 | } |
412 | |
413 | void __init topology_init_possible_cpus(void) |
414 | { |
415 | unsigned int assigned = topo_info.nr_assigned_cpus; |
416 | unsigned int disabled = topo_info.nr_disabled_cpus; |
417 | unsigned int cnta, cntb, cpu, allowed = 1; |
418 | unsigned int total = assigned + disabled; |
419 | u32 apicid, firstid; |
420 | |
421 | /* |
422 | * If there was no APIC registered, then fake one so that the |
423 | * topology bitmap is populated. That ensures that the code below |
424 | * is valid and the various query interfaces can be used |
425 | * unconditionally. This does not affect the actual APIC code in |
426 | * any way because either the local APIC address has not been |
427 | * registered or the local APIC was disabled on the command line. |
428 | */ |
429 | if (topo_info.boot_cpu_apic_id == BAD_APICID) |
430 | topology_register_boot_apic(apic_id: 0); |
431 | |
432 | if (!restrict_to_up()) { |
433 | if (WARN_ON_ONCE(assigned > nr_cpu_ids)) { |
434 | disabled += assigned - nr_cpu_ids; |
435 | assigned = nr_cpu_ids; |
436 | } |
437 | allowed = min_t(unsigned int, total, nr_cpu_ids); |
438 | } |
439 | |
440 | if (total > allowed) |
441 | pr_warn("%u possible CPUs exceed the limit of %u\n" , total, allowed); |
442 | |
443 | assigned = min_t(unsigned int, allowed, assigned); |
444 | disabled = allowed - assigned; |
445 | |
446 | topo_info.nr_assigned_cpus = assigned; |
447 | topo_info.nr_disabled_cpus = disabled; |
448 | |
449 | total_cpus = allowed; |
450 | set_nr_cpu_ids(allowed); |
451 | |
452 | cnta = domain_weight(TOPO_PKG_DOMAIN); |
453 | cntb = domain_weight(TOPO_DIE_DOMAIN); |
454 | __max_logical_packages = cnta; |
455 | __max_dies_per_package = 1U << (get_count_order(count: cntb) - get_count_order(count: cnta)); |
456 | |
457 | pr_info("Max. logical packages: %3u\n" , cnta); |
458 | pr_info("Max. logical dies: %3u\n" , cntb); |
459 | pr_info("Max. dies per package: %3u\n" , __max_dies_per_package); |
460 | |
461 | cnta = domain_weight(TOPO_CORE_DOMAIN); |
462 | cntb = domain_weight(TOPO_SMT_DOMAIN); |
463 | /* |
464 | * Can't use order delta here as order(cnta) can be equal |
465 | * order(cntb) even if cnta != cntb. |
466 | */ |
467 | __max_threads_per_core = DIV_ROUND_UP(cntb, cnta); |
468 | pr_info("Max. threads per core: %3u\n" , __max_threads_per_core); |
469 | |
470 | firstid = find_first_bit(addr: apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC); |
471 | __num_cores_per_package = topology_unit_count(apicid: firstid, which_units: TOPO_CORE_DOMAIN, at_level: TOPO_PKG_DOMAIN); |
472 | pr_info("Num. cores per package: %3u\n" , __num_cores_per_package); |
473 | __num_threads_per_package = topology_unit_count(apicid: firstid, which_units: TOPO_SMT_DOMAIN, at_level: TOPO_PKG_DOMAIN); |
474 | pr_info("Num. threads per package: %3u\n" , __num_threads_per_package); |
475 | |
476 | pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n" , assigned, disabled); |
477 | if (topo_info.nr_rejected_cpus) |
478 | pr_info("Rejected CPUs %u\n" , topo_info.nr_rejected_cpus); |
479 | |
480 | init_cpu_present(cpumask_of(0)); |
481 | init_cpu_possible(cpumask_of(0)); |
482 | |
483 | /* Assign CPU numbers to non-present CPUs */ |
484 | for (apicid = 0; disabled; disabled--, apicid++) { |
485 | apicid = find_next_andnot_bit(addr1: apic_maps[TOPO_SMT_DOMAIN].map, addr2: phys_cpu_present_map, |
486 | MAX_LOCAL_APIC, offset: apicid); |
487 | if (apicid >= MAX_LOCAL_APIC) |
488 | break; |
489 | cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid; |
490 | } |
491 | |
492 | for (cpu = 0; cpu < allowed; cpu++) { |
493 | apicid = cpuid_to_apicid[cpu]; |
494 | |
495 | set_cpu_possible(cpu, possible: true); |
496 | |
497 | if (apicid == BAD_APICID) |
498 | continue; |
499 | |
500 | cpu_mark_primary_thread(cpu, apicid); |
501 | set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map)); |
502 | } |
503 | } |
504 | |
505 | /* |
506 | * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed. |
507 | */ |
508 | void __init topology_reset_possible_cpus_up(void) |
509 | { |
510 | init_cpu_present(cpumask_of(0)); |
511 | init_cpu_possible(cpumask_of(0)); |
512 | |
513 | bitmap_zero(dst: phys_cpu_present_map, MAX_LOCAL_APIC); |
514 | if (topo_info.boot_cpu_apic_id != BAD_APICID) |
515 | set_bit(nr: topo_info.boot_cpu_apic_id, addr: phys_cpu_present_map); |
516 | } |
517 | |
518 | static int __init setup_possible_cpus(char *str) |
519 | { |
520 | get_option(str: &str, pint: &max_possible_cpus); |
521 | return 0; |
522 | } |
523 | early_param("possible_cpus" , setup_possible_cpus); |
524 | #endif |
525 | |