| 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | /* |
| 3 | * x86 SMP booting functions |
| 4 | * |
| 5 | * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> |
| 6 | * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com> |
| 7 | * Copyright 2001 Andi Kleen, SuSE Labs. |
| 8 | * |
| 9 | * Much of the core SMP work is based on previous work by Thomas Radke, to |
| 10 | * whom a great many thanks are extended. |
| 11 | * |
| 12 | * Thanks to Intel for making available several different Pentium, |
| 13 | * Pentium Pro and Pentium-II/Xeon MP machines. |
| 14 | * Original development of Linux SMP code supported by Caldera. |
| 15 | * |
| 16 | * Fixes |
| 17 | * Felix Koop : NR_CPUS used properly |
| 18 | * Jose Renau : Handle single CPU case. |
| 19 | * Alan Cox : By repeated request 8) - Total BogoMIPS report. |
| 20 | * Greg Wright : Fix for kernel stacks panic. |
| 21 | * Erich Boleyn : MP v1.4 and additional changes. |
| 22 | * Matthias Sattler : Changes for 2.1 kernel map. |
| 23 | * Michel Lespinasse : Changes for 2.1 kernel map. |
| 24 | * Michael Chastain : Change trampoline.S to gnu as. |
| 25 | * Alan Cox : Dumb bug: 'B' step PPro's are fine |
| 26 | * Ingo Molnar : Added APIC timers, based on code |
| 27 | * from Jose Renau |
| 28 | * Ingo Molnar : various cleanups and rewrites |
| 29 | * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. |
| 30 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs |
| 31 | * Andi Kleen : Changed for SMP boot into long mode. |
| 32 | * Martin J. Bligh : Added support for multi-quad systems |
| 33 | * Dave Jones : Report invalid combinations of Athlon CPUs. |
| 34 | * Rusty Russell : Hacked into shape for new "hotplug" boot process. |
| 35 | * Andi Kleen : Converted to new state machine. |
| 36 | * Ashok Raj : CPU hotplug support |
| 37 | * Glauber Costa : i386 and x86_64 integration |
| 38 | */ |
| 39 | |
| 40 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 41 | |
| 42 | #include <linux/init.h> |
| 43 | #include <linux/smp.h> |
| 44 | #include <linux/export.h> |
| 45 | #include <linux/sched.h> |
| 46 | #include <linux/sched/topology.h> |
| 47 | #include <linux/sched/hotplug.h> |
| 48 | #include <linux/sched/task_stack.h> |
| 49 | #include <linux/percpu.h> |
| 50 | #include <linux/memblock.h> |
| 51 | #include <linux/err.h> |
| 52 | #include <linux/nmi.h> |
| 53 | #include <linux/tboot.h> |
| 54 | #include <linux/gfp.h> |
| 55 | #include <linux/cpuidle.h> |
| 56 | #include <linux/kexec.h> |
| 57 | #include <linux/numa.h> |
| 58 | #include <linux/pgtable.h> |
| 59 | #include <linux/overflow.h> |
| 60 | #include <linux/stackprotector.h> |
| 61 | #include <linux/cpuhotplug.h> |
| 62 | #include <linux/mc146818rtc.h> |
| 63 | #include <linux/acpi.h> |
| 64 | |
| 65 | #include <asm/acpi.h> |
| 66 | #include <asm/cacheinfo.h> |
| 67 | #include <asm/cpuid/api.h> |
| 68 | #include <asm/desc.h> |
| 69 | #include <asm/nmi.h> |
| 70 | #include <asm/irq.h> |
| 71 | #include <asm/realmode.h> |
| 72 | #include <asm/cpu.h> |
| 73 | #include <asm/numa.h> |
| 74 | #include <asm/tlbflush.h> |
| 75 | #include <asm/mtrr.h> |
| 76 | #include <asm/mwait.h> |
| 77 | #include <asm/apic.h> |
| 78 | #include <asm/io_apic.h> |
| 79 | #include <asm/fpu/api.h> |
| 80 | #include <asm/setup.h> |
| 81 | #include <asm/uv/uv.h> |
| 82 | #include <asm/microcode.h> |
| 83 | #include <asm/i8259.h> |
| 84 | #include <asm/misc.h> |
| 85 | #include <asm/qspinlock.h> |
| 86 | #include <asm/intel-family.h> |
| 87 | #include <asm/cpu_device_id.h> |
| 88 | #include <asm/spec-ctrl.h> |
| 89 | #include <asm/hw_irq.h> |
| 90 | #include <asm/stackprotector.h> |
| 91 | #include <asm/sev.h> |
| 93 | |
| 94 | /* representing HT siblings of each logical CPU */ |
| 95 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); |
| 96 | EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); |
| 97 | |
| 98 | /* representing HT and core siblings of each logical CPU */ |
| 99 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); |
| 100 | EXPORT_PER_CPU_SYMBOL(cpu_core_map); |
| 101 | |
| 102 | /* representing HT, core, and die siblings of each logical CPU */ |
| 103 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); |
| 104 | EXPORT_PER_CPU_SYMBOL(cpu_die_map); |
| 105 | |
| 106 | /* Representing CPUs for which sibling maps can be computed */ |
| 107 | static cpumask_var_t cpu_sibling_setup_mask; |
| 108 | |
| 109 | struct mwait_cpu_dead { |
| 110 | unsigned int control; |
| 111 | unsigned int status; |
| 112 | }; |
| 113 | |
| 114 | #define CPUDEAD_MWAIT_WAIT 0xDEADBEEF |
| 115 | #define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD |
| 116 | |
| 117 | /* |
| 118 | * Cache line aligned data for mwait_play_dead(). Separate on purpose so |
| 119 | * that it's unlikely to be touched by other CPUs. |
| 120 | */ |
| 121 | static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); |
| 122 | |
| 123 | /* Maximum number of SMT threads on any online core */ |
| 124 | int __read_mostly __max_smt_threads = 1; |
| 125 | |
| 126 | /* Flag to indicate if a complete sched domain rebuild is required */ |
| 127 | bool x86_topology_update; |
| 128 | |
| 129 | int arch_update_cpu_topology(void) |
| 130 | { |
| 131 | int retval = x86_topology_update; |
| 132 | |
| 133 | x86_topology_update = false; |
| 134 | return retval; |
| 135 | } |
| 136 | |
| 137 | static unsigned int smpboot_warm_reset_vector_count; |
| 138 | |
| 139 | static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) |
| 140 | { |
| 141 | unsigned long flags; |
| 142 | |
| 143 | spin_lock_irqsave(&rtc_lock, flags); |
| 144 | if (!smpboot_warm_reset_vector_count++) { |
| 145 | CMOS_WRITE(0xa, 0xf); |
| 146 | *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; |
| 147 | *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; |
| 148 | } |
| 149 | spin_unlock_irqrestore(&rtc_lock, flags); |
| 150 | } |
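
/*
 * Illustrative sketch, not part of the boot path: the warm reset vector in
 * the BIOS data area (TRAMPOLINE_PHYS_HIGH/LOW) is a real-mode segment:offset
 * pair, which is why start_eip is split as above. The address below is a
 * hypothetical example value.
 */
static inline void __maybe_unused warm_reset_vector_example(void)
{
	unsigned long start_eip = 0x9A000;	/* hypothetical trampoline address */
	unsigned short seg = start_eip >> 4;	/* 0x9A00 -> TRAMPOLINE_PHYS_HIGH */
	unsigned short off = start_eip & 0xf;	/* 0x0    -> TRAMPOLINE_PHYS_LOW  */

	pr_debug("warm reset vector would be %04x:%04x\n", seg, off);
}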
| 151 | |
| 152 | static inline void smpboot_restore_warm_reset_vector(void) |
| 153 | { |
| 154 | unsigned long flags; |
| 155 | |
| 156 | /* |
| 157 | * Paranoid: Set warm reset code and vector here back |
| 158 | * to default values. |
| 159 | */ |
| 160 | spin_lock_irqsave(&rtc_lock, flags); |
| 161 | if (!--smpboot_warm_reset_vector_count) { |
| 162 | CMOS_WRITE(0, 0xf); |
| 163 | *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; |
| 164 | } |
| 165 | spin_unlock_irqrestore(&rtc_lock, flags); |
| 166 | |
| 167 | } |
| 168 | |
| 169 | /* Run the next set of setup steps for the upcoming CPU */ |
| 170 | static void ap_starting(void) |
| 171 | { |
| 172 | int cpuid = smp_processor_id(); |
| 173 | |
| 174 | /* Mop up eventual mwait_play_dead() wreckage */ |
| 175 | this_cpu_write(mwait_cpu_dead.status, 0); |
| 176 | this_cpu_write(mwait_cpu_dead.control, 0); |
| 177 | |
| 178 | /* |
| 179 | * If woken up by an INIT in an 82489DX configuration the alive |
| 180 | * synchronization guarantees that the CPU does not reach this |
| 181 | * point before an INIT_deassert IPI reaches the local APIC, so it |
| 182 | * is now safe to touch the local APIC. |
| 183 | * |
| 184 | * Set up this CPU, first the APIC, which is probably redundant on |
| 185 | * most boards. |
| 186 | */ |
| 187 | apic_ap_setup(); |
| 188 | |
| 189 | /* Save the processor parameters. */ |
| 190 | identify_secondary_cpu(cpuid); |
| 191 | |
| 192 | /* |
| 193 | * The topology information must be up to date before |
| 194 | * notify_cpu_starting(). |
| 195 | */ |
| 196 | set_cpu_sibling_map(cpuid); |
| 197 | |
| 198 | ap_init_aperfmperf(); |
| 199 | |
| 200 | pr_debug("Stack at about %p\n", &cpuid); |
| 201 | |
| 202 | wmb(); |
| 203 | |
| 204 | /* |
| 205 | * This runs the AP through all the cpuhp states to its target |
| 206 | * state CPUHP_ONLINE. |
| 207 | */ |
| 208 | notify_cpu_starting(cpuid); |
| 209 | } |
| 210 | |
| 211 | static void ap_calibrate_delay(void) |
| 212 | { |
| 213 | /* |
| 214 | * Calibrate the delay loop and update loops_per_jiffy in cpu_data. |
| 215 | * identify_secondary_cpu() stored a value that is close but not as |
| 216 | * accurate as the value just calculated. |
| 217 | * |
| 218 | * As this is invoked after the TSC synchronization check, |
| 219 | * calibrate_delay_is_known() will skip the calibration routine |
| 220 | * when TSC is synchronized across sockets. |
| 221 | */ |
| 222 | calibrate_delay(); |
| 223 | cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy; |
| 224 | } |
| 225 | |
| 226 | /* |
| 227 | * Activate a secondary processor. |
| 228 | */ |
| 229 | static void notrace __noendbr start_secondary(void *unused) |
| 230 | { |
| 231 | /* |
| 232 | * Don't put *anything* except direct CPU state initialization |
| 233 | * before cpu_init(). SMP booting is fragile enough that we want to |
| 234 | * limit the things done here to the most necessary things. |
| 235 | */ |
| 236 | cr4_init(); |
| 237 | |
| 238 | /* |
| 239 | * 32-bit specific. 64-bit reaches this code with the correct page |
| 240 | * table established. Yet another historical divergence. |
| 241 | */ |
| 242 | if (IS_ENABLED(CONFIG_X86_32)) { |
| 243 | /* switch away from the initial page table */ |
| 244 | load_cr3(swapper_pg_dir); |
| 245 | __flush_tlb_all(); |
| 246 | } |
| 247 | |
| 248 | cpu_init_exception_handling(false); |
| 249 | |
| 250 | /* |
| 251 | * Load the microcode before reaching the AP alive synchronization |
| 252 | * point below so it is not part of the full per CPU serialized |
| 253 | * bringup part when "parallel" bringup is enabled. |
| 254 | * |
| 255 | * That's even safe when hyperthreading is enabled in the CPU as |
| 256 | * the core code starts the primary threads first and leaves the |
| 257 | * secondary threads waiting for SIPI. Loading microcode on |
| 258 | * physical cores concurrently is a safe operation. |
| 259 | * |
| 260 | * This covers both the Intel specific issue that concurrent |
| 261 | * microcode loading on SMT siblings must be prohibited and the |
| 262 | * vendor independent issue that microcode loading which changes |
| 263 | * CPUID, MSRs etc. must be strictly serialized to maintain |
| 264 | * software state correctness. |
| 265 | */ |
| 266 | load_ucode_ap(); |
| 267 | |
| 268 | /* |
| 269 | * Synchronization point with the hotplug core. Sets this CPU's |
| 270 | * synchronization state to ALIVE and spin-waits for the control CPU to |
| 271 | * release this CPU for further bringup. |
| 272 | */ |
| 273 | cpuhp_ap_sync_alive(); |
| 274 | |
| 275 | cpu_init(); |
| 276 | fpu__init_cpu(); |
| 277 | rcutree_report_cpu_starting(raw_smp_processor_id()); |
| 278 | x86_cpuinit.early_percpu_clock_init(); |
| 279 | |
| 280 | ap_starting(); |
| 281 | |
| 282 | /* Check TSC synchronization with the control CPU. */ |
| 283 | check_tsc_sync_target(); |
| 284 | |
| 285 | /* |
| 286 | * Calibrate the delay loop after the TSC synchronization check. |
| 287 | * This allows to skip the calibration when TSC is synchronized |
| 288 | * across sockets. |
| 289 | */ |
| 290 | ap_calibrate_delay(); |
| 291 | |
| 292 | speculative_store_bypass_ht_init(); |
| 293 | |
| 294 | /* |
| 295 | * Lock vector_lock, set CPU online and bring the vector |
| 296 | * allocator online. Online must be set with vector_lock held |
| 297 | * to prevent a concurrent irq setup/teardown from seeing a |
| 298 | * half valid vector space. |
| 299 | */ |
| 300 | lock_vector_lock(); |
| 301 | set_cpu_online(smp_processor_id(), true); |
| 302 | lapic_online(); |
| 303 | unlock_vector_lock(); |
| 304 | x86_platform.nmi_init(); |
| 305 | |
| 306 | /* enable local interrupts */ |
| 307 | local_irq_enable(); |
| 308 | |
| 309 | x86_cpuinit.setup_percpu_clockev(); |
| 310 | |
| 311 | wmb(); |
| 312 | cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); |
| 313 | } |
| 314 | ANNOTATE_NOENDBR_SYM(start_secondary); |
| 315 | |
| 316 | static bool |
| 317 | topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
| 318 | { |
| 319 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
| 320 | |
| 321 | return (cpu_to_node(cpu1) == cpu_to_node(cpu2)); |
| 322 | } |
| 323 | |
| 324 | static bool |
| 325 | topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) |
| 326 | { |
| 327 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
| 328 | |
| 329 | return !WARN_ONCE(!topology_same_node(c, o), |
| 330 | "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " |
| 331 | "[node: %d != %d]. Ignoring dependency.\n", |
| 332 | cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); |
| 333 | } |
| 334 | |
| 335 | #define link_mask(mfunc, c1, c2) \ |
| 336 | do { \ |
| 337 | cpumask_set_cpu((c1), mfunc(c2)); \ |
| 338 | cpumask_set_cpu((c2), mfunc(c1)); \ |
| 339 | } while (0) |
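
/*
 * A minimal usage sketch of link_mask(): linking two hypothetical CPUs (2 and
 * 3) as SMT siblings sets each one in the other's mask, keeping the per-CPU
 * sibling masks symmetric. This mirrors how the macro is used in
 * set_cpu_sibling_map() below.
 */
static inline void __maybe_unused link_mask_example(void)
{
	/*
	 * Expands to:
	 *	cpumask_set_cpu(2, topology_sibling_cpumask(3));
	 *	cpumask_set_cpu(3, topology_sibling_cpumask(2));
	 */
	link_mask(topology_sibling_cpumask, 2, 3);
}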
| 340 | |
| 341 | static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
| 342 | { |
| 343 | if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { |
| 344 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
| 345 | |
| 346 | if (c->topo.pkg_id == o->topo.pkg_id && |
| 347 | c->topo.die_id == o->topo.die_id && |
| 348 | c->topo.amd_node_id == o->topo.amd_node_id && |
| 349 | per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) { |
| 350 | if (c->topo.core_id == o->topo.core_id) |
| 351 | return topology_sane(c, o, "smt"); |
| 352 | |
| 353 | if ((c->topo.cu_id != 0xff) && |
| 354 | (o->topo.cu_id != 0xff) && |
| 355 | (c->topo.cu_id == o->topo.cu_id)) |
| 356 | return topology_sane(c, o, "smt"); |
| 357 | } |
| 358 | |
| 359 | } else if (c->topo.pkg_id == o->topo.pkg_id && |
| 360 | c->topo.die_id == o->topo.die_id && |
| 361 | c->topo.core_id == o->topo.core_id) { |
| 362 | return topology_sane(c, o, "smt"); |
| 363 | } |
| 364 | |
| 365 | return false; |
| 366 | } |
| 367 | |
| 368 | static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
| 369 | { |
| 370 | if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id) |
| 371 | return false; |
| 372 | |
| 373 | if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1) |
| 374 | return c->topo.amd_node_id == o->topo.amd_node_id; |
| 375 | |
| 376 | return true; |
| 377 | } |
| 378 | |
| 379 | static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
| 380 | { |
| 381 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
| 382 | |
| 383 | /* If the arch didn't set up l2c_id, fall back to SMT */ |
| 384 | if (per_cpu_l2c_id(cpu1) == BAD_APICID) |
| 385 | return match_smt(c, o); |
| 386 | |
| 387 | /* Do not match if L2 cache id does not match: */ |
| 388 | if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2)) |
| 389 | return false; |
| 390 | |
| 391 | return topology_sane(c, o, "l2c"); |
| 392 | } |
| 393 | |
| 394 | /* |
| 395 | * Unlike the other levels, we do not enforce keeping a |
| 396 | * multicore group inside a NUMA node. If this happens, we will |
| 397 | * discard the MC level of the topology later. |
| 398 | */ |
| 399 | static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
| 400 | { |
| 401 | if (c->topo.pkg_id == o->topo.pkg_id) |
| 402 | return true; |
| 403 | return false; |
| 404 | } |
| 405 | |
| 406 | /* |
| 407 | * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. |
| 408 | * |
| 409 | * Any Intel CPU that has multiple nodes per package and does not |
| 410 | * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. |
| 411 | * |
| 412 | * When in SNC mode, these CPUs enumerate an LLC that is shared |
| 413 | * by multiple NUMA nodes. The LLC is shared for off-package data |
| 414 | * access but private to the NUMA node (half of the package) for |
| 415 | * on-package access. CPUID (the source of the information about |
| 416 | * the LLC) can only enumerate the cache as shared or unshared, |
| 417 | * but not this particular configuration. |
| 418 | */ |
| 419 | |
| 420 | static const struct x86_cpu_id intel_cod_cpu[] = { |
| 421 | X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */ |
| 422 | X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */ |
| 423 | X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */ |
| 424 | {} |
| 425 | }; |
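
/*
 * A short sketch of how the table above is consumed by match_llc() below:
 * x86_match_cpu() returns the first matching entry and its driver_data
 * distinguishes Cluster-on-Die (0) from Sub-NUMA Cluster (1) parts.
 */
static inline bool __maybe_unused intel_snc_example(void)
{
	const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);

	return id && id->driver_data;	/* true: SNC, false: COD or no match */
}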
| 426 | |
| 427 | static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
| 428 | { |
| 429 | const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); |
| 430 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
| 431 | bool intel_snc = id && id->driver_data; |
| 432 | |
| 433 | /* Do not match if we do not have a valid APICID for cpu: */ |
| 434 | if (per_cpu_llc_id(cpu1) == BAD_APICID) |
| 435 | return false; |
| 436 | |
| 437 | /* Do not match if LLC id does not match: */ |
| 438 | if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2)) |
| 439 | return false; |
| 440 | |
| 441 | /* |
| 442 | * Allow the SNC topology without warning. Return of false |
| 443 | * means 'c' does not share the LLC of 'o'. This will be |
| 444 | * reflected to userspace. |
| 445 | */ |
| 446 | if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) |
| 447 | return false; |
| 448 | |
| 449 | return topology_sane(c, o, "llc"); |
| 450 | } |
| 451 | |
| 452 | |
| 453 | static inline int x86_sched_itmt_flags(void) |
| 454 | { |
| 455 | return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; |
| 456 | } |
| 457 | |
| 458 | #ifdef CONFIG_SCHED_MC |
| 459 | static int x86_core_flags(void) |
| 460 | { |
| 461 | return cpu_core_flags() | x86_sched_itmt_flags(); |
| 462 | } |
| 463 | #endif |
| 464 | #ifdef CONFIG_SCHED_CLUSTER |
| 465 | static int x86_cluster_flags(void) |
| 466 | { |
| 467 | return cpu_cluster_flags() | x86_sched_itmt_flags(); |
| 468 | } |
| 469 | #endif |
| 470 | |
| 471 | /* |
| 472 | * Set if a package/die has multiple NUMA nodes inside. |
| 473 | * AMD Magny-Cours, Intel Cluster-on-Die, and Intel |
| 474 | * Sub-NUMA Clustering have this. |
| 475 | */ |
| 476 | static bool x86_has_numa_in_package; |
| 477 | |
| 478 | static struct sched_domain_topology_level x86_topology[] = { |
| 479 | SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), |
| 480 | #ifdef CONFIG_SCHED_CLUSTER |
| 481 | SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), |
| 482 | #endif |
| 483 | #ifdef CONFIG_SCHED_MC |
| 484 | SDTL_INIT(tl_mc_mask, x86_core_flags, MC), |
| 485 | #endif |
| 486 | SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), |
| 487 | { NULL }, |
| 488 | }; |
| 489 | |
| 490 | static void __init build_sched_topology(void) |
| 491 | { |
| 492 | struct sched_domain_topology_level *topology = x86_topology; |
| 493 | |
| 494 | /* |
| 495 | * When there is NUMA topology inside the package invalidate the |
| 496 | * PKG domain since the NUMA domains will auto-magically create the |
| 497 | * right spanning domains based on the SLIT. |
| 498 | */ |
| 499 | if (x86_has_numa_in_package) { |
| 500 | unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; |
| 501 | |
| 502 | memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); |
| 503 | } |
| 504 | |
| 505 | /* |
| 506 | * Drop the SMT domains if there is only one thread per-core |
| 507 | * since it'll get degenerated by the scheduler anyways. |
| 508 | */ |
| 509 | if (cpu_smt_num_threads <= 1) |
| 510 | ++topology; |
| 511 | |
| 512 | set_sched_topology(topology); |
| 513 | } |
| 514 | |
| 515 | #ifdef CONFIG_NUMA |
| 516 | static int sched_avg_remote_distance; |
| 517 | static int avg_remote_numa_distance(void) |
| 518 | { |
| 519 | int i, j; |
| 520 | int distance, nr_remote, total_distance; |
| 521 | |
| 522 | if (sched_avg_remote_distance > 0) |
| 523 | return sched_avg_remote_distance; |
| 524 | |
| 525 | nr_remote = 0; |
| 526 | total_distance = 0; |
| 527 | for_each_node_state(i, N_CPU) { |
| 528 | for_each_node_state(j, N_CPU) { |
| 529 | distance = node_distance(i, j); |
| 530 | |
| 531 | if (distance >= REMOTE_DISTANCE) { |
| 532 | nr_remote++; |
| 533 | total_distance += distance; |
| 534 | } |
| 535 | } |
| 536 | } |
| 537 | if (nr_remote) |
| 538 | sched_avg_remote_distance = total_distance / nr_remote; |
| 539 | else |
| 540 | sched_avg_remote_distance = REMOTE_DISTANCE; |
| 541 | |
| 542 | return sched_avg_remote_distance; |
| 543 | } |
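
/*
 * Worked example with a hypothetical SLIT for two SNC-enabled packages
 * exposing nodes {0,1} and {2,3}:
 *
 *	     0   1   2   3
 *	0   10  12  21  23
 *	1   12  10  23  21
 *	2   21  23  10  12
 *	3   23  21  12  10
 *
 * Only the eight entries >= REMOTE_DISTANCE (the 21s and 23s) are counted,
 * so avg_remote_numa_distance() returns (4 * 21 + 4 * 23) / 8 = 22.
 */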
| 544 | |
| 545 | int arch_sched_node_distance(int from, int to) |
| 546 | { |
| 547 | int d = node_distance(from, to); |
| 548 | |
| 549 | switch (boot_cpu_data.x86_vfm) { |
| 550 | case INTEL_GRANITERAPIDS_X: |
| 551 | case INTEL_ATOM_DARKMONT_X: |
| 552 | |
| 553 | if (!x86_has_numa_in_package || topology_max_packages() == 1 || |
| 554 | d < REMOTE_DISTANCE) |
| 555 | return d; |
| 556 | |
| 557 | /* |
| 558 | * With SNC enabled, there could be too many levels of remote |
| 559 | * NUMA node distances, creating NUMA domain levels |
| 560 | * including local nodes and partial remote nodes. |
| 561 | * |
| 562 | * Trim finer distance tuning for NUMA nodes in remote package |
| 563 | * for the purpose of building sched domains. Group NUMA nodes |
| 564 | * in the remote package in the same sched group. |
| 565 | * Simplify NUMA domains and avoid extra NUMA levels including |
| 566 | * different remote NUMA nodes and local nodes. |
| 567 | * |
| 568 | * GNR and CWF don't expect systems with more than 2 packages |
| 569 | * and more than 2 hops between packages. Single average remote |
| 570 | * distance won't be appropriate if there are more than 2 |
| 571 | * packages as average distance to different remote packages |
| 572 | * could be different. |
| 573 | */ |
| 574 | WARN_ONCE(topology_max_packages() > 2, |
| 575 | "sched: Expect only up to 2 packages for GNR or CWF, " |
| 576 | "but saw %d packages when building sched domains.", |
| 577 | topology_max_packages()); |
| 578 | |
| 579 | d = avg_remote_numa_distance(); |
| 580 | } |
| 581 | return d; |
| 582 | } |
| 583 | #endif /* CONFIG_NUMA */ |
| 584 | |
| 585 | void set_cpu_sibling_map(int cpu) |
| 586 | { |
| 587 | bool has_smt = __max_threads_per_core > 1; |
| 588 | bool has_mp = has_smt || topology_num_cores_per_package() > 1; |
| 589 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
| 590 | struct cpuinfo_x86 *o; |
| 591 | int i, threads; |
| 592 | |
| 593 | cpumask_set_cpu(cpu, cpu_sibling_setup_mask); |
| 594 | |
| 595 | if (!has_mp) { |
| 596 | cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); |
| 597 | cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); |
| 598 | cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu)); |
| 599 | cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); |
| 600 | cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); |
| 601 | c->booted_cores = 1; |
| 602 | return; |
| 603 | } |
| 604 | |
| 605 | for_each_cpu(i, cpu_sibling_setup_mask) { |
| 606 | o = &cpu_data(i); |
| 607 | |
| 608 | if (match_pkg(c, o) && !topology_same_node(c, o)) |
| 609 | x86_has_numa_in_package = true; |
| 610 | |
| 611 | if ((i == cpu) || (has_smt && match_smt(c, o))) |
| 612 | link_mask(topology_sibling_cpumask, cpu, i); |
| 613 | |
| 614 | if ((i == cpu) || (has_mp && match_llc(c, o))) |
| 615 | link_mask(cpu_llc_shared_mask, cpu, i); |
| 616 | |
| 617 | if ((i == cpu) || (has_mp && match_l2c(c, o))) |
| 618 | link_mask(cpu_l2c_shared_mask, cpu, i); |
| 619 | |
| 620 | if ((i == cpu) || (has_mp && match_die(c, o))) |
| 621 | link_mask(topology_die_cpumask, cpu, i); |
| 622 | } |
| 623 | |
| 624 | threads = cpumask_weight(topology_sibling_cpumask(cpu)); |
| 625 | if (threads > __max_smt_threads) |
| 626 | __max_smt_threads = threads; |
| 627 | |
| 628 | for_each_cpu(i, topology_sibling_cpumask(cpu)) |
| 629 | cpu_data(i).smt_active = threads > 1; |
| 630 | |
| 631 | /* |
| 632 | * This needs a separate iteration over the cpus because we rely on all |
| 633 | * topology_sibling_cpumask links to be set-up. |
| 634 | */ |
| 635 | for_each_cpu(i, cpu_sibling_setup_mask) { |
| 636 | o = &cpu_data(i); |
| 637 | |
| 638 | if ((i == cpu) || (has_mp && match_pkg(c, o))) { |
| 639 | link_mask(topology_core_cpumask, cpu, i); |
| 640 | |
| 641 | /* |
| 642 | * Does this new cpu bringup a new core? |
| 643 | */ |
| 644 | if (threads == 1) { |
| 645 | /* |
| 646 | * for each core in package, increment |
| 647 | * the booted_cores for this new cpu |
| 648 | */ |
| 649 | if (cpumask_first( |
| 650 | topology_sibling_cpumask(i)) == i) |
| 651 | c->booted_cores++; |
| 652 | /* |
| 653 | * increment the core count for all |
| 654 | * the other cpus in this package |
| 655 | */ |
| 656 | if (i != cpu) |
| 657 | cpu_data(i).booted_cores++; |
| 658 | } else if (i != cpu && !c->booted_cores) |
| 659 | c->booted_cores = cpu_data(i).booted_cores; |
| 660 | } |
| 661 | } |
| 662 | } |
| 663 | |
| 664 | /* maps the cpu to the sched domain representing multi-core */ |
| 665 | const struct cpumask *cpu_coregroup_mask(int cpu) |
| 666 | { |
| 667 | return cpu_llc_shared_mask(cpu); |
| 668 | } |
| 669 | |
| 670 | const struct cpumask *cpu_clustergroup_mask(int cpu) |
| 671 | { |
| 672 | return cpu_l2c_shared_mask(cpu); |
| 673 | } |
| 674 | EXPORT_SYMBOL_GPL(cpu_clustergroup_mask); |
| 675 | |
| 676 | static void impress_friends(void) |
| 677 | { |
| 678 | int cpu; |
| 679 | unsigned long bogosum = 0; |
| 680 | /* |
| 681 | * Allow the user to impress friends. |
| 682 | */ |
| 683 | pr_debug("Before bogomips\n"); |
| 684 | for_each_online_cpu(cpu) |
| 685 | bogosum += cpu_data(cpu).loops_per_jiffy; |
| 686 | |
| 687 | pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", |
| 688 | num_online_cpus(), |
| 689 | bogosum/(500000/HZ), |
| 690 | (bogosum/(5000/HZ))%100); |
| 691 | |
| 692 | pr_debug("Before bogocount - setting activated=1\n"); |
| 693 | } |
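
/*
 * A small sketch of the arithmetic above, with made-up numbers: a CPU's
 * BogoMIPS value is loops_per_jiffy * HZ / 500000, and the summary line
 * prints the sum over all online CPUs with two decimal places.
 */
static inline void __maybe_unused bogosum_example(void)
{
	unsigned long lpj = 2000000;		/* hypothetical loops_per_jiffy */
	unsigned long bogosum = 8 * lpj;	/* pretend eight online CPUs */

	/* With HZ == 250 this prints "8000.00" */
	pr_debug("%lu.%02lu BogoMIPS\n",
		 bogosum / (500000 / HZ), (bogosum / (5000 / HZ)) % 100);
}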
| 694 | |
| 695 | /* |
| 696 | * The Multiprocessor Specification 1.4 (1997) example code suggests |
| 697 | * that there should be a 10ms delay between the BSP asserting INIT |
| 698 | * and de-asserting INIT, when starting a remote processor. |
| 699 | * But that slows boot and resume on modern processors, which include |
| 700 | * many cores and don't require that delay. |
| 701 | * |
| 702 | * Cmdline "cpu_init_udelay=" is available to override this delay. |
| 703 | */ |
| 704 | #define UDELAY_10MS_LEGACY 10000 |
| 705 | |
| 706 | static unsigned int init_udelay = UINT_MAX; |
| 707 | |
| 708 | static int __init cpu_init_udelay(char *str) |
| 709 | { |
| 710 | get_option(&str, &init_udelay); |
| 711 | |
| 712 | return 0; |
| 713 | } |
| 714 | early_param("cpu_init_udelay", cpu_init_udelay); |
| 715 | |
| 716 | static void __init smp_set_init_udelay(void) |
| 717 | { |
| 718 | /* if cmdline changed it from default, leave it alone */ |
| 719 | if (init_udelay != UINT_MAX) |
| 720 | return; |
| 721 | |
| 722 | /* if modern processor, use no delay */ |
| 723 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) || |
| 724 | (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) || |
| 725 | (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) { |
| 726 | init_udelay = 0; |
| 727 | return; |
| 728 | } |
| 729 | /* else, use legacy delay */ |
| 730 | init_udelay = UDELAY_10MS_LEGACY; |
| 731 | } |
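
/*
 * Usage sketch for the early parameter above: booting with
 * "cpu_init_udelay=10000" restores the legacy 10ms INIT deassert delay even
 * on modern CPUs, while "cpu_init_udelay=0" forces the no-delay path. The
 * handler just parses the integer, as in this hypothetical example:
 */
static inline void __maybe_unused cpu_init_udelay_example(void)
{
	char arg[] = "10000";	/* hypothetical command line value */
	char *p = arg;
	int val = 0;

	get_option(&p, &val);	/* val becomes 10000 */
	pr_debug("init_udelay would be %d usec\n", val);
}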
| 732 | |
| 733 | /* |
| 734 | * Wake up AP by INIT, INIT, STARTUP sequence. |
| 735 | */ |
| 736 | static void send_init_sequence(u32 phys_apicid) |
| 737 | { |
| 738 | int maxlvt = lapic_get_maxlvt(); |
| 739 | |
| 740 | /* Be paranoid about clearing APIC errors. */ |
| 741 | if (APIC_INTEGRATED(boot_cpu_apic_version)) { |
| 742 | /* Due to the Pentium erratum 3AP. */ |
| 743 | if (maxlvt > 3) |
| 744 | apic_write(APIC_ESR, 0); |
| 745 | apic_read(APIC_ESR); |
| 746 | } |
| 747 | |
| 748 | /* Assert INIT on the target CPU */ |
| 749 | apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); |
| 750 | safe_apic_wait_icr_idle(); |
| 751 | |
| 752 | udelay(init_udelay); |
| 753 | |
| 754 | /* Deassert INIT on the target CPU */ |
| 755 | apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); |
| 756 | safe_apic_wait_icr_idle(); |
| 757 | } |
| 758 | |
| 759 | /* |
| 760 | * Wake up AP by INIT, INIT, STARTUP sequence. |
| 761 | */ |
| 762 | static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip, unsigned int cpu) |
| 763 | { |
| 764 | unsigned long send_status = 0, accept_status = 0; |
| 765 | int num_starts, j, maxlvt; |
| 766 | |
| 767 | preempt_disable(); |
| 768 | maxlvt = lapic_get_maxlvt(); |
| 769 | send_init_sequence(phys_apicid); |
| 770 | |
| 771 | mb(); |
| 772 | |
| 773 | /* |
| 774 | * Should we send STARTUP IPIs ? |
| 775 | * |
| 776 | * Determine this based on the APIC version. |
| 777 | * If we don't have an integrated APIC, don't send the STARTUP IPIs. |
| 778 | */ |
| 779 | if (APIC_INTEGRATED(boot_cpu_apic_version)) |
| 780 | num_starts = 2; |
| 781 | else |
| 782 | num_starts = 0; |
| 783 | |
| 784 | /* |
| 785 | * Run STARTUP IPI loop. |
| 786 | */ |
| 787 | pr_debug("#startup loops: %d\n", num_starts); |
| 788 | |
| 789 | for (j = 1; j <= num_starts; j++) { |
| 790 | pr_debug("Sending STARTUP #%d\n", j); |
| 791 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
| 792 | apic_write(APIC_ESR, 0); |
| 793 | apic_read(APIC_ESR); |
| 794 | pr_debug("After apic_write\n"); |
| 795 | |
| 796 | /* |
| 797 | * STARTUP IPI |
| 798 | */ |
| 799 | |
| 800 | /* Target chip */ |
| 801 | /* Boot on the stack */ |
| 802 | /* Kick the second */ |
| 803 | apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), |
| 804 | phys_apicid); |
| 805 | |
| 806 | /* |
| 807 | * Give the other CPU some time to accept the IPI. |
| 808 | */ |
| 809 | if (init_udelay == 0) |
| 810 | udelay(10); |
| 811 | else |
| 812 | udelay(300); |
| 813 | |
| 814 | pr_debug("Startup point 1\n"); |
| 815 | |
| 816 | pr_debug("Waiting for send to finish...\n"); |
| 817 | send_status = safe_apic_wait_icr_idle(); |
| 818 | |
| 819 | /* |
| 820 | * Give the other CPU some time to accept the IPI. |
| 821 | */ |
| 822 | if (init_udelay == 0) |
| 823 | udelay(10); |
| 824 | else |
| 825 | udelay(200); |
| 826 | |
| 827 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
| 828 | apic_write(APIC_ESR, 0); |
| 829 | accept_status = (apic_read(APIC_ESR) & 0xEF); |
| 830 | if (send_status || accept_status) |
| 831 | break; |
| 832 | } |
| 833 | pr_debug("After Startup\n"); |
| 834 | |
| 835 | if (send_status) |
| 836 | pr_err("APIC never delivered???\n"); |
| 837 | if (accept_status) |
| 838 | pr_err("APIC delivery error (%lx)\n", accept_status); |
| 839 | |
| 840 | preempt_enable(); |
| 841 | return (send_status | accept_status); |
| 842 | } |
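
/*
 * Illustrative sketch of the STARTUP IPI encoding used above: the SIPI
 * vector is the physical page number of the real-mode trampoline, so
 * start_eip must be 4K aligned and below 1MB. The address below is a
 * hypothetical example.
 */
static inline void __maybe_unused sipi_vector_example(void)
{
	unsigned long start_eip = 0x9A000;	/* hypothetical, 4K aligned, < 1MB */
	unsigned int vector = start_eip >> 12;	/* SIPI vector 0x9A */

	/* The AP begins real-mode execution at vector << 12, i.e. 0x9A000 */
	pr_debug("STARTUP IPI vector would be 0x%x\n", vector);
}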
| 843 | |
| 844 | /* reduce the number of lines printed when booting a large cpu count system */ |
| 845 | static void announce_cpu(int cpu, int apicid) |
| 846 | { |
| 847 | static int width, node_width, first = 1; |
| 848 | static int current_node = NUMA_NO_NODE; |
| 849 | int node = early_cpu_to_node(cpu); |
| 850 | |
| 851 | if (!width) |
| 852 | width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */ |
| 853 | |
| 854 | if (!node_width) |
| 855 | node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */ |
| 856 | |
| 857 | if (system_state < SYSTEM_RUNNING) { |
| 858 | if (first) |
| 859 | pr_info("x86: Booting SMP configuration:\n"); |
| 860 | |
| 861 | if (node != current_node) { |
| 862 | if (current_node > (-1)) |
| 863 | pr_cont("\n"); |
| 864 | current_node = node; |
| 865 | |
| 866 | printk(KERN_INFO ".... node %*s#%d, CPUs: ", |
| 867 | node_width - num_digits(node), " ", node); |
| 868 | } |
| 869 | |
| 870 | /* Add padding for the BSP */ |
| 871 | if (first) |
| 872 | pr_cont("%*s", width + 1, " "); |
| 873 | first = 0; |
| 874 | |
| 875 | pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); |
| 876 | } else |
| 877 | pr_info("Booting Node %d Processor %d APIC 0x%x\n", |
| 878 | node, cpu, apicid); |
| 879 | } |
| 880 | |
| 881 | int common_cpu_up(unsigned int cpu, struct task_struct *idle) |
| 882 | { |
| 883 | int ret; |
| 884 | |
| 885 | /* Just in case we booted with a single CPU. */ |
| 886 | alternatives_enable_smp(); |
| 887 | |
| 888 | per_cpu(current_task, cpu) = idle; |
| 889 | cpu_init_stack_canary(cpu, idle); |
| 890 | |
| 891 | /* Initialize the interrupt stack(s) */ |
| 892 | ret = irq_init_percpu_irqstack(cpu); |
| 893 | if (ret) |
| 894 | return ret; |
| 895 | |
| 896 | #ifdef CONFIG_X86_32 |
| 897 | /* Stack for startup_32 can be just as for start_secondary onwards */ |
| 898 | per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); |
| 899 | #endif |
| 900 | return 0; |
| 901 | } |
| 902 | |
| 903 | /* |
| 904 | * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad |
| 905 | * (ie clustered apic addressing mode), this is a LOGICAL apic ID. |
| 906 | * Returns zero if startup was successfully sent, else error code from |
| 907 | * ->wakeup_secondary_cpu. |
| 908 | */ |
| 909 | static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle) |
| 910 | { |
| 911 | unsigned long start_ip = real_mode_header->trampoline_start; |
| 912 | int ret; |
| 913 | |
| 914 | #ifdef CONFIG_X86_64 |
| 915 | /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ |
| 916 | if (apic->wakeup_secondary_cpu_64) |
| 917 | start_ip = real_mode_header->trampoline_start64; |
| 918 | #endif |
| 919 | idle->thread.sp = (unsigned long)task_pt_regs(idle); |
| 920 | initial_code = (unsigned long)start_secondary; |
| 921 | |
| 922 | if (IS_ENABLED(CONFIG_X86_32)) { |
| 923 | early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); |
| 924 | initial_stack = idle->thread.sp; |
| 925 | } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) { |
| 926 | smpboot_control = cpu; |
| 927 | } |
| 928 | |
| 929 | /* Enable the espfix hack for this CPU */ |
| 930 | init_espfix_ap(cpu); |
| 931 | |
| 932 | /* So we see what's up */ |
| 933 | announce_cpu(cpu, apicid); |
| 934 | |
| 935 | /* |
| 936 | * This grunge runs the startup process for |
| 937 | * the targeted processor. |
| 938 | */ |
| 939 | if (x86_platform.legacy.warm_reset) { |
| 940 | |
| 941 | pr_debug("Setting warm reset code and vector.\n"); |
| 942 | |
| 943 | smpboot_setup_warm_reset_vector(start_ip); |
| 944 | /* |
| 945 | * Be paranoid about clearing APIC errors. |
| 946 | */ |
| 947 | if (APIC_INTEGRATED(boot_cpu_apic_version)) { |
| 948 | apic_write(APIC_ESR, 0); |
| 949 | apic_read(APIC_ESR); |
| 950 | } |
| 951 | } |
| 952 | |
| 953 | smp_mb(); |
| 954 | |
| 955 | /* |
| 956 | * Wake up a CPU in different cases: |
| 957 | * - Use a method from the APIC driver if one defined, with wakeup |
| 958 | * straight to 64-bit mode preferred over wakeup to RM. |
| 959 | * Otherwise, |
| 960 | * - Use an INIT boot APIC message |
| 961 | */ |
| 962 | if (apic->wakeup_secondary_cpu_64) |
| 963 | ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu); |
| 964 | else if (apic->wakeup_secondary_cpu) |
| 965 | ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu); |
| 966 | else |
| 967 | ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu); |
| 968 | |
| 969 | /* If the wakeup mechanism failed, cleanup the warm reset vector */ |
| 970 | if (ret) |
| 971 | arch_cpuhp_cleanup_kick_cpu(cpu); |
| 972 | return ret; |
| 973 | } |
| 974 | |
| 975 | int native_kick_ap(unsigned int cpu, struct task_struct *tidle) |
| 976 | { |
| 977 | u32 apicid = apic->cpu_present_to_apicid(cpu); |
| 978 | int err; |
| 979 | |
| 980 | lockdep_assert_irqs_enabled(); |
| 981 | |
| 982 | pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); |
| 983 | |
| 984 | if (apicid == BAD_APICID || !apic_id_valid(apicid)) { |
| 985 | pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid); |
| 986 | return -EINVAL; |
| 987 | } |
| 988 | |
| 989 | if (!test_bit(apicid, phys_cpu_present_map)) { |
| 990 | pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid); |
| 991 | return -EINVAL; |
| 992 | } |
| 993 | |
| 994 | /* |
| 995 | * Save current MTRR state in case it was changed since early boot |
| 996 | * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: |
| 997 | */ |
| 998 | mtrr_save_state(); |
| 999 | |
| 1000 | /* the FPU context is blank, nobody can own it */ |
| 1001 | per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; |
| 1002 | |
| 1003 | err = common_cpu_up(cpu, tidle); |
| 1004 | if (err) |
| 1005 | return err; |
| 1006 | |
| 1007 | err = do_boot_cpu(apicid, cpu, tidle); |
| 1008 | if (err) |
| 1009 | pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); |
| 1010 | |
| 1011 | return err; |
| 1012 | } |
| 1013 | |
| 1014 | int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle) |
| 1015 | { |
| 1016 | return smp_ops.kick_ap_alive(cpu, tidle); |
| 1017 | } |
| 1018 | |
| 1019 | void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) |
| 1020 | { |
| 1021 | /* Cleanup possible dangling ends... */ |
| 1022 | if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset) |
| 1023 | smpboot_restore_warm_reset_vector(); |
| 1024 | } |
| 1025 | |
| 1026 | void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) |
| 1027 | { |
| 1028 | if (smp_ops.cleanup_dead_cpu) |
| 1029 | smp_ops.cleanup_dead_cpu(cpu); |
| 1030 | |
| 1031 | if (system_state == SYSTEM_RUNNING) |
| 1032 | pr_info("CPU %u is now offline\n", cpu); |
| 1033 | } |
| 1034 | |
| 1035 | void arch_cpuhp_sync_state_poll(void) |
| 1036 | { |
| 1037 | if (smp_ops.poll_sync_state) |
| 1038 | smp_ops.poll_sync_state(); |
| 1039 | } |
| 1040 | |
| 1041 | /** |
| 1042 | * arch_disable_smp_support() - Disables SMP support for x86 at boottime |
| 1043 | */ |
| 1044 | void __init arch_disable_smp_support(void) |
| 1045 | { |
| 1046 | disable_ioapic_support(); |
| 1047 | } |
| 1048 | |
| 1049 | /* |
| 1050 | * Fall back to non SMP mode after errors. |
| 1051 | * |
| 1052 | * RED-PEN audit/test this more. I bet there is more state messed up here. |
| 1053 | */ |
| 1054 | static __init void disable_smp(void) |
| 1055 | { |
| 1056 | pr_info("SMP disabled\n"); |
| 1057 | |
| 1058 | disable_ioapic_support(); |
| 1059 | topology_reset_possible_cpus_up(); |
| 1060 | |
| 1061 | cpumask_set_cpu(0, topology_sibling_cpumask(0)); |
| 1062 | cpumask_set_cpu(0, topology_core_cpumask(0)); |
| 1063 | cpumask_set_cpu(0, topology_die_cpumask(0)); |
| 1064 | } |
| 1065 | |
| 1066 | void __init smp_prepare_cpus_common(void) |
| 1067 | { |
| 1068 | unsigned int cpu, node; |
| 1069 | |
| 1070 | /* Mark all except the boot CPU as hotpluggable */ |
| 1071 | for_each_possible_cpu(cpu) { |
| 1072 | if (cpu) |
| 1073 | per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids; |
| 1074 | } |
| 1075 | |
| 1076 | for_each_possible_cpu(cpu) { |
| 1077 | node = cpu_to_node(cpu); |
| 1078 | |
| 1079 | zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node); |
| 1080 | zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node); |
| 1081 | zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node); |
| 1082 | zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node); |
| 1083 | zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node); |
| 1084 | } |
| 1085 | |
| 1086 | set_cpu_sibling_map(0); |
| 1087 | } |
| 1088 | |
| 1089 | void __init smp_prepare_boot_cpu(void) |
| 1090 | { |
| 1091 | smp_ops.smp_prepare_boot_cpu(); |
| 1092 | } |
| 1093 | |
| 1094 | #ifdef CONFIG_X86_64 |
| 1095 | /* Establish whether parallel bringup can be supported. */ |
| 1096 | bool __init arch_cpuhp_init_parallel_bringup(void) |
| 1097 | { |
| 1098 | if (!x86_cpuinit.parallel_bringup) { |
| 1099 | pr_info("Parallel CPU startup disabled by the platform\n"); |
| 1100 | return false; |
| 1101 | } |
| 1102 | |
| 1103 | smpboot_control = STARTUP_READ_APICID; |
| 1104 | pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control); |
| 1105 | return true; |
| 1106 | } |
| 1107 | #endif |
| 1108 | |
| 1109 | /* |
| 1110 | * Prepare for SMP bootup. |
| 1111 | * @max_cpus: configured maximum number of CPUs; it is a legacy parameter |
| 1112 | * for common interface support. |
| 1113 | */ |
| 1114 | void __init native_smp_prepare_cpus(unsigned int max_cpus) |
| 1115 | { |
| 1116 | smp_prepare_cpus_common(); |
| 1117 | |
| 1118 | switch (apic_intr_mode) { |
| 1119 | case APIC_PIC: |
| 1120 | case APIC_VIRTUAL_WIRE_NO_CONFIG: |
| 1121 | disable_smp(); |
| 1122 | return; |
| 1123 | case APIC_SYMMETRIC_IO_NO_ROUTING: |
| 1124 | disable_smp(); |
| 1125 | /* Setup local timer */ |
| 1126 | x86_init.timers.setup_percpu_clockev(); |
| 1127 | return; |
| 1128 | case APIC_VIRTUAL_WIRE: |
| 1129 | case APIC_SYMMETRIC_IO: |
| 1130 | break; |
| 1131 | } |
| 1132 | |
| 1133 | /* Setup local timer */ |
| 1134 | x86_init.timers.setup_percpu_clockev(); |
| 1135 | |
| 1136 | pr_info("CPU0: "); |
| 1137 | print_cpu_info(&cpu_data(0)); |
| 1138 | |
| 1139 | uv_system_init(); |
| 1140 | |
| 1141 | smp_set_init_udelay(); |
| 1142 | |
| 1143 | speculative_store_bypass_ht_init(); |
| 1144 | |
| 1145 | snp_set_wakeup_secondary_cpu(); |
| 1146 | } |
| 1147 | |
| 1148 | void arch_thaw_secondary_cpus_begin(void) |
| 1149 | { |
| 1150 | set_cache_aps_delayed_init(true); |
| 1151 | } |
| 1152 | |
| 1153 | void arch_thaw_secondary_cpus_end(void) |
| 1154 | { |
| 1155 | cache_aps_init(); |
| 1156 | } |
| 1157 | |
| 1158 | /* |
| 1159 | * Early setup to make printk work. |
| 1160 | */ |
| 1161 | void __init native_smp_prepare_boot_cpu(void) |
| 1162 | { |
| 1163 | int me = smp_processor_id(); |
| 1164 | |
| 1165 | /* SMP handles this from setup_per_cpu_areas() */ |
| 1166 | if (!IS_ENABLED(CONFIG_SMP)) |
| 1167 | switch_gdt_and_percpu_base(me); |
| 1168 | |
| 1169 | native_pv_lock_init(); |
| 1170 | } |
| 1171 | |
| 1172 | void __init native_smp_cpus_done(unsigned int max_cpus) |
| 1173 | { |
| 1174 | pr_debug("Boot done\n"); |
| 1175 | |
| 1176 | build_sched_topology(); |
| 1177 | nmi_selftest(); |
| 1178 | impress_friends(); |
| 1179 | cache_aps_init(); |
| 1180 | } |
| 1181 | |
| 1182 | /* correctly size the local cpu masks */ |
| 1183 | void __init setup_cpu_local_masks(void) |
| 1184 | { |
| 1185 | alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); |
| 1186 | } |
| 1187 | |
| 1188 | #ifdef CONFIG_HOTPLUG_CPU |
| 1189 | |
| 1190 | /* Recompute SMT state for all CPUs on offline */ |
| 1191 | static void recompute_smt_state(void) |
| 1192 | { |
| 1193 | int max_threads, cpu; |
| 1194 | |
| 1195 | max_threads = 0; |
| 1196 | for_each_online_cpu (cpu) { |
| 1197 | int threads = cpumask_weight(topology_sibling_cpumask(cpu)); |
| 1198 | |
| 1199 | if (threads > max_threads) |
| 1200 | max_threads = threads; |
| 1201 | } |
| 1202 | __max_smt_threads = max_threads; |
| 1203 | } |
| 1204 | |
| 1205 | static void remove_siblinginfo(int cpu) |
| 1206 | { |
| 1207 | int sibling; |
| 1208 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
| 1209 | |
| 1210 | for_each_cpu(sibling, topology_core_cpumask(cpu)) { |
| 1211 | cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); |
| 1212 | /* |
| 1213 | * last thread sibling in this cpu core going down |
| 1214 | */ |
| 1215 | if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1) |
| 1216 | cpu_data(sibling).booted_cores--; |
| 1217 | } |
| 1218 | |
| 1219 | for_each_cpu(sibling, topology_die_cpumask(cpu)) |
| 1220 | cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); |
| 1221 | |
| 1222 | for_each_cpu(sibling, topology_sibling_cpumask(cpu)) { |
| 1223 | cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); |
| 1224 | if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1) |
| 1225 | cpu_data(sibling).smt_active = false; |
| 1226 | } |
| 1227 | |
| 1228 | for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) |
| 1229 | cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); |
| 1230 | for_each_cpu(sibling, cpu_l2c_shared_mask(cpu)) |
| 1231 | cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling)); |
| 1232 | cpumask_clear(cpu_llc_shared_mask(cpu)); |
| 1233 | cpumask_clear(cpu_l2c_shared_mask(cpu)); |
| 1234 | cpumask_clear(topology_sibling_cpumask(cpu)); |
| 1235 | cpumask_clear(topology_core_cpumask(cpu)); |
| 1236 | cpumask_clear(topology_die_cpumask(cpu)); |
| 1237 | c->topo.core_id = 0; |
| 1238 | c->booted_cores = 0; |
| 1239 | cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); |
| 1240 | recompute_smt_state(); |
| 1241 | } |
| 1242 | |
| 1243 | static void remove_cpu_from_maps(int cpu) |
| 1244 | { |
| 1245 | set_cpu_online(cpu, false); |
| 1246 | numa_remove_cpu(cpu); |
| 1247 | } |
| 1248 | |
| 1249 | void cpu_disable_common(void) |
| 1250 | { |
| 1251 | int cpu = smp_processor_id(); |
| 1252 | |
| 1253 | remove_siblinginfo(cpu); |
| 1254 | |
| 1255 | /* |
| 1256 | * Stop allowing kernel-mode FPU. This is needed so that if the CPU is |
| 1257 | * brought online again, the initial state is not allowed: |
| 1258 | */ |
| 1259 | this_cpu_write(kernel_fpu_allowed, false); |
| 1260 | |
| 1261 | /* It's now safe to remove this processor from the online map */ |
| 1262 | lock_vector_lock(); |
| 1263 | remove_cpu_from_maps(cpu); |
| 1264 | unlock_vector_lock(); |
| 1265 | fixup_irqs(); |
| 1266 | lapic_offline(); |
| 1267 | } |
| 1268 | |
| 1269 | int native_cpu_disable(void) |
| 1270 | { |
| 1271 | int ret; |
| 1272 | |
| 1273 | ret = lapic_can_unplug_cpu(); |
| 1274 | if (ret) |
| 1275 | return ret; |
| 1276 | |
| 1277 | cpu_disable_common(); |
| 1278 | |
| 1279 | /* |
| 1280 | * Disable the local APIC. Otherwise IPI broadcasts will reach |
| 1281 | * it. It still responds normally to INIT, NMI, SMI, and SIPI |
| 1282 | * messages. |
| 1283 | * |
| 1284 | * Disabling the APIC must happen after cpu_disable_common() |
| 1285 | * which invokes fixup_irqs(). |
| 1286 | * |
| 1287 | * Disabling the APIC preserves already set bits in IRR, but |
| 1288 | * an interrupt arriving after disabling the local APIC does not |
| 1289 | * set the corresponding IRR bit. |
| 1290 | * |
| 1291 | * fixup_irqs() scans IRR for set bits so it can raise a not |
| 1292 | * yet handled interrupt on the new destination CPU via an IPI |
| 1293 | * but obviously it can't do so for IRR bits which are not set. |
| 1294 | * IOW, interrupts arriving after disabling the local APIC will |
| 1295 | * be lost. |
| 1296 | */ |
| 1297 | apic_soft_disable(); |
| 1298 | |
| 1299 | return 0; |
| 1300 | } |
| 1301 | |
| 1302 | void play_dead_common(void) |
| 1303 | { |
| 1304 | idle_task_exit(); |
| 1305 | |
| 1306 | cpuhp_ap_report_dead(); |
| 1307 | |
| 1308 | local_irq_disable(); |
| 1309 | } |
| 1310 | |
| 1311 | /* |
| 1312 | * We need to flush the caches before going to sleep, lest we have |
| 1313 | * dirty data in our caches when we come back up. |
| 1314 | */ |
| 1315 | void __noreturn mwait_play_dead(unsigned int eax_hint) |
| 1316 | { |
| 1317 | struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); |
| 1318 | |
| 1319 | /* Set up state for the kexec() hack below */ |
| 1320 | md->status = CPUDEAD_MWAIT_WAIT; |
| 1321 | md->control = CPUDEAD_MWAIT_WAIT; |
| 1322 | |
| 1323 | wbinvd(); |
| 1324 | |
| 1325 | while (1) { |
| 1326 | /* |
| 1327 | * The CLFLUSH is a workaround for erratum AAI65 for |
| 1328 | * the Xeon 7400 series. It's not clear it is actually |
| 1329 | * needed, but it should be harmless in either case. |
| 1330 | * The WBINVD is insufficient due to the spurious-wakeup |
| 1331 | * case where we return around the loop. |
| 1332 | */ |
| 1333 | mb(); |
| 1334 | clflush(md); |
| 1335 | mb(); |
| 1336 | __monitor(md, 0, 0); |
| 1337 | mb(); |
| 1338 | __mwait(eax_hint, 0); |
| 1339 | |
| 1340 | if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { |
| 1341 | /* |
| 1342 | * Kexec is about to happen. Don't go back into mwait() as |
| 1343 | * the kexec kernel might overwrite text and data including |
| 1344 | * page tables and stack. So mwait() would resume when the |
| 1345 | * monitor cache line is written to and then the CPU goes |
| 1346 | * south due to overwritten text, page tables and stack. |
| 1347 | * |
| 1348 | * Note: This does _NOT_ protect against a stray MCE, NMI, |
| 1349 | * SMI. They will resume execution at the instruction |
| 1350 | * following the HLT instruction and run into the problem |
| 1351 | * which this is trying to prevent. |
| 1352 | */ |
| 1353 | WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT); |
| 1354 | while(1) |
| 1355 | native_halt(); |
| 1356 | } |
| 1357 | } |
| 1358 | } |
| 1359 | |
| 1360 | /* |
| 1361 | * Kick all "offline" CPUs out of mwait on kexec(). See comment in |
| 1362 | * mwait_play_dead(). |
| 1363 | */ |
| 1364 | void smp_kick_mwait_play_dead(void) |
| 1365 | { |
| 1366 | u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT; |
| 1367 | struct mwait_cpu_dead *md; |
| 1368 | unsigned int cpu, i; |
| 1369 | |
| 1370 | for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) { |
| 1371 | md = per_cpu_ptr(&mwait_cpu_dead, cpu); |
| 1372 | |
| 1373 | /* Does it sit in mwait_play_dead() ? */ |
| 1374 | if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT) |
| 1375 | continue; |
| 1376 | |
| 1377 | /* Wait up to 5ms */ |
| 1378 | for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) { |
| 1379 | /* Bring it out of mwait */ |
| 1380 | WRITE_ONCE(md->control, newstate); |
| 1381 | udelay(5); |
| 1382 | } |
| 1383 | |
| 1384 | if (READ_ONCE(md->status) != newstate) |
| 1385 | pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu); |
| 1386 | } |
| 1387 | } |
| 1388 | |
| 1389 | void __noreturn hlt_play_dead(void) |
| 1390 | { |
| 1391 | if (__this_cpu_read(cpu_info.x86) >= 4) |
| 1392 | wbinvd(); |
| 1393 | |
| 1394 | while (1) |
| 1395 | native_halt(); |
| 1396 | } |
| 1397 | |
| 1398 | void __noreturn native_play_dead(void) |
| 1399 | { |
| 1400 | if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) |
| 1401 | __update_spec_ctrl(0); |
| 1402 | |
| 1403 | play_dead_common(); |
| 1404 | tboot_shutdown(TB_SHUTDOWN_WFS); |
| 1405 | |
| 1406 | /* Below returns only on error. */ |
| 1407 | cpuidle_play_dead(); |
| 1408 | hlt_play_dead(); |
| 1409 | } |
| 1410 | |
| 1411 | #else /* ... !CONFIG_HOTPLUG_CPU */ |
| 1412 | int native_cpu_disable(void) |
| 1413 | { |
| 1414 | return -ENOSYS; |
| 1415 | } |
| 1416 | |
| 1417 | void __noreturn native_play_dead(void) |
| 1418 | { |
| 1419 | BUG(); |
| 1420 | } |
| 1421 | |
| 1422 | #endif |
| 1423 | |