setup_64.c source code [linux/arch/powerpc/kernel/setup_64.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	*
4	* Common boot and setup code.
5	*
6	* Copyright (C) 2001 PPC64 Team, IBM Corp
7	*/
8
9	#include <linux/export.h>
10	#include <linux/string.h>
11	#include <linux/sched.h>
12	#include <linux/init.h>
13	#include <linux/kernel.h>
14	#include <linux/reboot.h>
15	#include <linux/delay.h>
16	#include <linux/initrd.h>
17	#include <linux/seq_file.h>
18	#include <linux/ioport.h>
19	#include <linux/console.h>
20	#include <linux/utsname.h>
21	#include <linux/tty.h>
22	#include <linux/root_dev.h>
23	#include <linux/notifier.h>
24	#include <linux/cpu.h>
25	#include <linux/unistd.h>
26	#include <linux/serial.h>
27	#include <linux/serial_8250.h>
28	#include <linux/memblock.h>
29	#include <linux/pci.h>
30	#include <linux/lockdep.h>
31	#include <linux/memory.h>
32	#include <linux/nmi.h>
33	#include <linux/pgtable.h>
34	#include <linux/of.h>
35	#include <linux/of_fdt.h>
36
37	#include <asm/asm-prototypes.h>
38	#include <asm/kvm_guest.h>
39	#include <asm/io.h>
40	#include <asm/kdump.h>
41	#include <asm/processor.h>
42	#include <asm/smp.h>
43	#include <asm/elf.h>
44	#include <asm/machdep.h>
45	#include <asm/paca.h>
46	#include <asm/time.h>
47	#include <asm/cputable.h>
48	#include <asm/dt_cpu_ftrs.h>
49	#include <asm/sections.h>
50	#include <asm/btext.h>
51	#include <asm/nvram.h>
52	#include <asm/setup.h>
53	#include <asm/rtas.h>
54	#include <asm/iommu.h>
55	#include <asm/serial.h>
56	#include <asm/cache.h>
57	#include <asm/page.h>
58	#include <asm/mmu.h>
59	#include <asm/firmware.h>
60	#include <asm/xmon.h>
61	#include <asm/udbg.h>
62	#include <asm/kexec.h>
63	#include <asm/code-patching.h>
64	#include <asm/ftrace.h>
65	#include <asm/opal.h>
66	#include <asm/cputhreads.h>
67	#include <asm/hw_irq.h>
68	#include <asm/feature-fixups.h>
69	#include <asm/kup.h>
70	#include <asm/early_ioremap.h>
71	#include <asm/pgalloc.h>
72
73	#include "setup.h"
74
75	int spinning_secondaries;
76	u64 ppc64_pft_size;
77
78	struct ppc64_caches ppc64_caches = {
79	.l1d = {
80	.block_size = `0x40`,
81	.log_block_size = `6`,
82	},
83	.l1i = {
84	.block_size = `0x40`,
85	.log_block_size = `6`
86	},
87	};
88	EXPORT_SYMBOL_GPL(ppc64_caches);
89
90	#if defined(CONFIG_PPC_BOOK3E_64) && defined(CONFIG_SMP)
91	void __init setup_tlb_core_data(void)
92	{
93	int cpu;
94
95	BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != `0`);
96
97	for_each_possible_cpu(cpu) {
98	int first = cpu_first_thread_sibling(cpu);
99
100	/*
101	* If we boot via kdump on a non-primary thread,
102	* make sure we point at the thread that actually
103	* set up this TLB.
104	*/
105	if (cpu_first_thread_sibling(boot_cpuid) == first)
106	first = boot_cpuid;
107
108	paca_ptrs[cpu]->tcd_ptr = &paca_ptrs[first]->tcd;
109
110	/*
111	* If we have threads, we need either tlbsrx.
112	* or e6500 tablewalk mode, or else TLB handlers
113	* will be racy and could produce duplicate entries.
114	* Should we panic instead?
115	*/
116	WARN_ONCE(smt_enabled_at_boot >= `2` &&
117	book3e_htw_mode != PPC_HTW_E6500,
118	"%s: unsupported MMU configuration\n", __func__);
119	}
120	}
121	#endif
122
123	#ifdef CONFIG_SMP
124
125	static char *smt_enabled_cmdline;
126
127	/ Look for ibm,smt-enabled OF option /
128	void __init check_smt_enabled(void)
129	{
130	struct device_node *dn;
131	const char *smt_option;
132
133	/ Default to enabling all threads /
134	smt_enabled_at_boot = threads_per_core;
135
136	/ Allow the command line to overrule the OF option /
137	if (smt_enabled_cmdline) {
138	if (!strcmp(smt_enabled_cmdline, "on"))
139	smt_enabled_at_boot = threads_per_core;
140	else if (!strcmp(smt_enabled_cmdline, "off"))
141	smt_enabled_at_boot = `0`;
142	else {
143	int smt;
144	int rc;
145
146	rc = kstrtoint(s: smt_enabled_cmdline, base: `10`, res: &smt);
147	if (!rc)
148	smt_enabled_at_boot =
149	min(threads_per_core, smt);
150	}
151	} else {
152	dn = of_find_node_by_path(path: "/options");
153	if (dn) {
154	smt_option = of_get_property(node: dn, name: "ibm,smt-enabled",
155	NULL);
156
157	if (smt_option) {
158	if (!strcmp(smt_option, "on"))
159	smt_enabled_at_boot = threads_per_core;
160	else if (!strcmp(smt_option, "off"))
161	smt_enabled_at_boot = `0`;
162	}
163
164	of_node_put(node: dn);
165	}
166	}
167	}
168
169	/ Look for smt-enabled= cmdline option /
170	static int __init early_smt_enabled(char *p)
171	{
172	smt_enabled_cmdline = p;
173	return `0`;
174	}
175	early_param("smt-enabled", early_smt_enabled);
176
177	#endif /* CONFIG_SMP */
178
179	/* Fix up paca fields required for the boot cpu /
180	static void __init fixup_boot_paca(struct paca_struct *boot_paca)
181	{
182	/ The boot cpu is started /
183	boot_paca->cpu_start = `1`;
184	#ifdef CONFIG_PPC_BOOK3S_64
185	/*
186	* Give the early boot machine check stack somewhere to use, use
187	* half of the init stack. This is a bit hacky but there should not be
188	* deep stack usage in early init so shouldn't overflow it or overwrite
189	* things.
190	*/
191	boot_paca->mc_emergency_sp = (void *)&init_thread_union +
192	(THREAD_SIZE/`2`);
193	#endif
194	/ Allow percpu accesses to work until we setup percpu data /
195	boot_paca->data_offset = `0`;
196	/ Mark interrupts soft and hard disabled in PACA /
197	boot_paca->irq_soft_mask = IRQS_DISABLED;
198	boot_paca->irq_happened = PACA_IRQ_HARD_DIS;
199	WARN_ON(mfmsr() & MSR_EE);
200	}
201
202	static void __init configure_exceptions(void)
203	{
204	/*
205	* Setup the trampolines from the lowmem exception vectors
206	* to the kdump kernel when not using a relocatable kernel.
207	*/
208	setup_kdump_trampoline();
209
210	/ Under a PAPR hypervisor, we need hypercalls /
211	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
212	/*
213	* - PR KVM does not support AIL mode interrupts in the host
214	* while a PR guest is running.
215	*
216	* - SCV system call interrupt vectors are only implemented for
217	* AIL mode interrupts.
218	*
219	* - On pseries, AIL mode can only be enabled and disabled
220	* system-wide so when a PR VM is created on a pseries host,
221	* all CPUs of the host are set to AIL=0 mode.
222	*
223	* - Therefore host CPUs must not execute scv while a PR VM
224	* exists.
225	*
226	* - SCV support can not be disabled dynamically because the
227	* feature is advertised to host userspace. Disabling the
228	* facility and emulating it would be possible but is not
229	* implemented.
230	*
231	* - So SCV support is blanket disabled if PR KVM could possibly
232	* run. That is, PR support compiled in, booting on pseries
233	* with hash MMU.
234	*/
235	if (IS_ENABLED(CONFIG_KVM_BOOK3S_PR_POSSIBLE) && !radix_enabled()) {
236	init_task.thread.fscr &= ~FSCR_SCV;
237	cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_SCV;
238	}
239
240	/ Enable AIL if possible /
241	if (!pseries_enable_reloc_on_exc()) {
242	init_task.thread.fscr &= ~FSCR_SCV;
243	cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_SCV;
244	}
245
246	/*
247	* Tell the hypervisor that we want our exceptions to
248	* be taken in little endian mode.
249	*
250	* We don't call this for big endian as our calling convention
251	* makes us always enter in BE, and the call may fail under
252	* some circumstances with kdump.
253	*/
254	#ifdef __LITTLE_ENDIAN__
255	pseries_little_endian_exceptions();
256	#endif
257	} else {
258	/ Set endian mode using OPAL /
259	if (firmware_has_feature(FW_FEATURE_OPAL))
260	opal_configure_cores();
261
262	/ AIL on native is done in cpu_ready_for_interrupts() /
263	}
264	}
265
266	static void cpu_ready_for_interrupts(void)
267	{
268	/*
269	* Enable AIL if supported, and we are in hypervisor mode. This
270	* is called once for every processor.
271	*
272	* If we are not in hypervisor mode the job is done once for
273	* the whole partition in configure_exceptions().
274	*/
275	if (cpu_has_feature(CPU_FTR_HVMODE)) {
276	unsigned long lpcr = mfspr(SPRN_LPCR);
277	unsigned long new_lpcr = lpcr;
278
279	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
280	/ P10 DD1 does not have HAIL /
281	if (pvr_version_is(PVR_POWER10) &&
282	(mfspr(SPRN_PVR) & `0xf00`) == `0x100`)
283	new_lpcr \|= LPCR_AIL_3;
284	else
285	new_lpcr \|= LPCR_HAIL;
286	} else if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
287	new_lpcr \|= LPCR_AIL_3;
288	}
289
290	if (new_lpcr != lpcr)
291	mtspr(SPRN_LPCR, new_lpcr);
292	}
293
294	/*
295	* Set HFSCR:TM based on CPU features:
296	* In the special case of TM no suspend (P9N DD2.1), Linux is
297	* told TM is off via the dt-ftrs but told to (partially) use
298	* it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM]
299	* will be off from dt-ftrs but we need to turn it on for the
300	* no suspend case.
301	*/
302	if (cpu_has_feature(CPU_FTR_HVMODE)) {
303	if (cpu_has_feature(CPU_FTR_TM_COMP))
304	mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) \| HFSCR_TM);
305	else
306	mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM);
307	}
308
309	/ Set IR and DR in PACA MSR /
310	get_paca()->kernel_msr = MSR_KERNEL;
311	}
312
313	unsigned long spr_default_dscr = `0`;
314
315	static void __init record_spr_defaults(void)
316	{
317	if (early_cpu_has_feature(CPU_FTR_DSCR))
318	spr_default_dscr = mfspr(SPRN_DSCR);
319	}
320
321	/*
322	* Early initialization entry point. This is called by head.S
323	* with MMU translation disabled. We rely on the "feature" of
324	* the CPU that ignores the top 2 bits of the address in real
325	* mode so we can access kernel globals normally provided we
326	* only toy with things in the RMO region. From here, we do
327	* some early parsing of the device-tree to setup out MEMBLOCK
328	* data structures, and allocate & initialize the hash table
329	* and segment tables so we can start running with translation
330	* enabled.
331	*
332	* It is this function which will call the probe() callback of
333	* the various platform types and copy the matching one to the
334	* global ppc_md structure. Your platform can eventually do
335	* some very early initializations from the probe() routine, but
336	* this is not recommended, be very careful as, for example, the
337	* device-tree is not accessible via normal means at this point.
338	*/
339
340	void __init early_setup(unsigned long dt_ptr)
341	{
342	static __initdata struct paca_struct boot_paca;
343
344	/ -------- printk is _NOT_ safe to use here ! ------- /
345
346	/*
347	* Assume we're on cpu 0 for now.
348	*
349	* We need to load a PACA very early for a few reasons.
350	*
351	* The stack protector canary is stored in the paca, so as soon as we
352	* call any stack protected code we need r13 pointing somewhere valid.
353	*
354	* If we are using kcov it will call in_task() in its instrumentation,
355	* which relies on the current task from the PACA.
356	*
357	* dt_cpu_ftrs_init() calls into generic OF/fdt code, as well as
358	* printk(), which can trigger both stack protector and kcov.
359	*
360	* percpu variables and spin locks also use the paca.
361	*
362	* So set up a temporary paca. It will be replaced below once we know
363	* what CPU we are on.
364	*/
365	initialise_paca(&boot_paca, `0`);
366	fixup_boot_paca(boot_paca: &boot_paca);
367	WARN_ON(local_paca);
368	setup_paca(&boot_paca); / install the paca into registers /
369
370	/ -------- printk is now safe to use ------- /
371
372	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && (mfmsr() & MSR_HV))
373	enable_machine_check();
374
375	/ Try new device tree based feature discovery ... /
376	if (!dt_cpu_ftrs_init(__va(dt_ptr)))
377	/ Otherwise use the old style CPU table /
378	identify_cpu(`0`, mfspr(SPRN_PVR));
379
380	/ Enable early debugging if any specified (see udbg.h) /
381	udbg_early_init();
382
383	udbg_printf(" -> %s(), dt_ptr: 0x%lx\n", __func__, dt_ptr);
384
385	/*
386	* Do early initialization using the flattened device
387	* tree, such as retrieving the physical memory map or
388	* calculating/retrieving the hash table size, discover
389	* boot_cpuid and boot_cpu_hwid.
390	*/
391	early_init_devtree(__va(dt_ptr));
392
393	allocate_paca_ptrs();
394	allocate_paca(boot_cpuid);
395	set_hard_smp_processor_id(boot_cpuid, boot_cpu_hwid);
396	fixup_boot_paca(paca_ptrs[boot_cpuid]);
397	setup_paca(paca_ptrs[boot_cpuid]); / install the paca into registers /
398	// smp_processor_id() now reports boot_cpuid
399
400	#ifdef CONFIG_SMP
401	task_thread_info(current)->cpu = boot_cpuid; // fix task_cpu(current)
402	#endif
403
404	/*
405	* Configure exception handlers. This include setting up trampolines
406	* if needed, setting exception endian mode, etc...
407	*/
408	configure_exceptions();
409
410	/*
411	* Configure Kernel Userspace Protection. This needs to happen before
412	* feature fixups for platforms that implement this using features.
413	*/
414	setup_kup();
415
416	/ Apply all the dynamic patching /
417	apply_feature_fixups();
418	setup_feature_keys();
419
420	/ Initialize the hash table or TLB handling /
421	early_init_mmu();
422
423	early_ioremap_setup();
424
425	/*
426	* After firmware and early platform setup code has set things up,
427	* we note the SPR values for configurable control/performance
428	* registers, and use those as initial defaults.
429	*/
430	record_spr_defaults();
431
432	/*
433	* At this point, we can let interrupts switch to virtual mode
434	* (the MMU has been setup), so adjust the MSR in the PACA to
435	* have IR and DR set and enable AIL if it exists
436	*/
437	cpu_ready_for_interrupts();
438
439	/*
440	* We enable ftrace here, but since we only support DYNAMIC_FTRACE, it
441	* will only actually get enabled on the boot cpu much later once
442	* ftrace itself has been initialized.
443	*/
444	this_cpu_enable_ftrace();
445
446	udbg_printf(" <- %s()\n", __func__);
447
448	#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
449	/*
450	* This needs to be done last (after the above udbg_printf() even)
451	*
452	* Right after we return from this function, we turn on the MMU
453	* which means the real-mode access trick that btext does will
454	* no longer work, it needs to switch to using a real MMU
455	* mapping. This call will ensure that it does
456	*/
457	btext_map();
458	#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
459	}
460
461	#ifdef CONFIG_SMP
462	void early_setup_secondary(void)
463	{
464	/ Mark interrupts disabled in PACA /
465	irq_soft_mask_set(IRQS_DISABLED);
466
467	/ Initialize the hash table or TLB handling /
468	early_init_mmu_secondary();
469
470	/ Perform any KUP setup that is per-cpu /
471	setup_kup();
472
473	/*
474	* At this point, we can let interrupts switch to virtual mode
475	* (the MMU has been setup), so adjust the MSR in the PACA to
476	* have IR and DR set.
477	*/
478	cpu_ready_for_interrupts();
479	}
480
481	#endif /* CONFIG_SMP */
482
483	void __noreturn panic_smp_self_stop(void)
484	{
485	hard_irq_disable();
486	spin_begin();
487	while (`1`)
488	spin_cpu_relax();
489	}
490
491	#if defined(CONFIG_SMP) \|\| defined(CONFIG_KEXEC_CORE)
492	static bool use_spinloop(void)
493	{
494	if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
495	/*
496	* See comments in head_64.S -- not all platforms insert
497	* secondaries at __secondary_hold and wait at the spin
498	* loop.
499	*/
500	if (firmware_has_feature(FW_FEATURE_OPAL))
501	return false;
502	return true;
503	}
504
505	/*
506	* When book3e boots from kexec, the ePAPR spin table does
507	* not get used.
508	*/
509	return of_property_read_bool(np: of_chosen, propname: "linux,booted-from-kexec");
510	}
511
512	void smp_release_cpus(void)
513	{
514	unsigned long *ptr;
515	int i;
516
517	if (!use_spinloop())
518	return;
519
520	/ All secondary cpus are spinning on a common spinloop, release them*
521	* all now so they can start to spin on their individual paca
522	* spinloops. For non SMP kernels, the secondary cpus never get out
523	* of the common spinloop.
524	*/
525
526	ptr = (unsigned long )((unsigned* long)&__secondary_hold_spinloop
527	- PHYSICAL_START);
528	*ptr = ppc_function_entry(generic_secondary_smp_init);
529
530	/ And wait a bit for them to catch up /
531	for (i = `0`; i < `100000`; i++) {
532	mb();
533	HMT_low();
534	if (spinning_secondaries == `0`)
535	break;
536	udelay(`1`);
537	}
538	pr_debug("spinning_secondaries = %d\n", spinning_secondaries);
539	}
540	#endif /* CONFIG_SMP \|\| CONFIG_KEXEC_CORE */
541
542	/*
543	* Initialize some remaining members of the ppc64_caches and systemcfg
544	* structures
545	* (at least until we get rid of them completely). This is mostly some
546	* cache informations about the CPU that will be used by cache flush
547	* routines and/or provided to userland
548	*/
549
550	static void __init init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize,
551	u32 bsize, u32 sets)
552	{
553	info->size = size;
554	info->sets = sets;
555	info->line_size = lsize;
556	info->block_size = bsize;
557	info->log_block_size = __ilog2(bsize);
558	if (bsize)
559	info->blocks_per_page = PAGE_SIZE / bsize;
560	else
561	info->blocks_per_page = `0`;
562
563	if (sets == `0`)
564	info->assoc = `0xffff`;
565	else
566	info->assoc = size / (sets * lsize);
567	}
568
569	static bool __init parse_cache_info(struct device_node *np,
570	bool icache,
571	struct ppc_cache_info *info)
572	{
573	static const char *ipropnames[] __initdata = {
574	"i-cache-size",
575	"i-cache-sets",
576	"i-cache-block-size",
577	"i-cache-line-size",
578	};
579	static const char *dpropnames[] __initdata = {
580	"d-cache-size",
581	"d-cache-sets",
582	"d-cache-block-size",
583	"d-cache-line-size",
584	};
585	const char **propnames = icache ? ipropnames : dpropnames;
586	const __be32 sizep, lsizep, bsizep, setsp;
587	u32 size, lsize, bsize, sets;
588	bool success = true;
589
590	size = `0`;
591	sets = -`1u`;
592	lsize = bsize = cur_cpu_spec->dcache_bsize;
593	sizep = of_get_property(node: np, name: propnames[`0`], NULL);
594	if (sizep != NULL)
595	size = be32_to_cpu(*sizep);
596	setsp = of_get_property(node: np, name: propnames[`1`], NULL);
597	if (setsp != NULL)
598	sets = be32_to_cpu(*setsp);
599	bsizep = of_get_property(node: np, name: propnames[`2`], NULL);
600	lsizep = of_get_property(node: np, name: propnames[`3`], NULL);
601	if (bsizep == NULL)
602	bsizep = lsizep;
603	if (lsizep == NULL)
604	lsizep = bsizep;
605	if (lsizep != NULL)
606	lsize = be32_to_cpu(*lsizep);
607	if (bsizep != NULL)
608	bsize = be32_to_cpu(*bsizep);
609	if (sizep == NULL \|\| bsizep == NULL \|\| lsizep == NULL)
610	success = false;
611
612	/*
613	* OF is weird .. it represents fully associative caches
614	* as "1 way" which doesn't make much sense and doesn't
615	* leave room for direct mapped. We'll assume that 0
616	* in OF means direct mapped for that reason.
617	*/
618	if (sets == `1`)
619	sets = `0`;
620	else if (sets == `0`)
621	sets = `1`;
622
623	init_cache_info(info, size, lsize, bsize, sets);
624
625	return success;
626	}
627
628	void __init initialize_cache_info(void)
629	{
630	struct device_node cpu = NULL, l2, *l3 = NULL;
631	u32 pvr;
632
633	/*
634	* All shipping POWER8 machines have a firmware bug that
635	* puts incorrect information in the device-tree. This will
636	* be (hopefully) fixed for future chips but for now hard
637	* code the values if we are running on one of these
638	*/
639	pvr = PVR_VER(mfspr(SPRN_PVR));
640	if (pvr == PVR_POWER8 \|\| pvr == PVR_POWER8E \|\|
641	pvr == PVR_POWER8NVL) {
642	/ size lsize blk sets /
643	init_cache_info(info: &ppc64_caches.l1i, size: `0x8000`, lsize: `128`, bsize: `128`, sets: `32`);
644	init_cache_info(info: &ppc64_caches.l1d, size: `0x10000`, lsize: `128`, bsize: `128`, sets: `64`);
645	init_cache_info(info: &ppc64_caches.l2, size: `0x80000`, lsize: `128`, bsize: `0`, sets: `512`);
646	init_cache_info(info: &ppc64_caches.l3, size: `0x800000`, lsize: `128`, bsize: `0`, sets: `8192`);
647	} else
648	cpu = of_find_node_by_type(NULL, type: "cpu");
649
650	/*
651	* We're assuming all of the CPUs have the same
652	* d-cache and i-cache sizes... -Peter
653	*/
654	if (cpu) {
655	if (!parse_cache_info(np: cpu, icache: false, info: &ppc64_caches.l1d))
656	pr_warn("Argh, can't find dcache properties !\n");
657
658	if (!parse_cache_info(np: cpu, icache: true, info: &ppc64_caches.l1i))
659	pr_warn("Argh, can't find icache properties !\n");
660
661	/*
662	* Try to find the L2 and L3 if any. Assume they are
663	* unified and use the D-side properties.
664	*/
665	l2 = of_find_next_cache_node(cpu);
666	of_node_put(node: cpu);
667	if (l2) {
668	parse_cache_info(np: l2, icache: false, info: &ppc64_caches.l2);
669	l3 = of_find_next_cache_node(l2);
670	of_node_put(node: l2);
671	}
672	if (l3) {
673	parse_cache_info(np: l3, icache: false, info: &ppc64_caches.l3);
674	of_node_put(node: l3);
675	}
676	}
677
678	/ For use by binfmt_elf /
679	dcache_bsize = ppc64_caches.l1d.block_size;
680	icache_bsize = ppc64_caches.l1i.block_size;
681
682	cur_cpu_spec->dcache_bsize = dcache_bsize;
683	cur_cpu_spec->icache_bsize = icache_bsize;
684	}
685
686	/*
687	* This returns the limit below which memory accesses to the linear
688	* mapping are guarnateed not to cause an architectural exception (e.g.,
689	* TLB or SLB miss fault).
690	*
691	* This is used to allocate PACAs and various interrupt stacks that
692	* that are accessed early in interrupt handlers that must not cause
693	* re-entrant interrupts.
694	*/
695	__init u64 ppc64_bolted_size(void)
696	{
697	#ifdef CONFIG_PPC_BOOK3E_64
698	/ Freescale BookE bolts the entire linear mapping /
699	/ XXX: BookE ppc64_rma_limit setup seems to disagree? /
700	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
701	return linear_map_top;
702	/ Other BookE, we assume the first GB is bolted /
703	return `1ul` << `30`;
704	#else
705	/ BookS radix, does not take faults on linear mapping /
706	if (early_radix_enabled())
707	return ULONG_MAX;
708
709	/ BookS hash, the first segment is bolted /
710	if (early_mmu_has_feature(MMU_FTR_1T_SEGMENT))
711	return `1UL` << SID_SHIFT_1T;
712	return `1UL` << SID_SHIFT;
713	#endif
714	}
715
716	static void __init alloc_stack(unsigned* long limit, int cpu)
717	{
718	void *ptr;
719
720	BUILD_BUG_ON(STACK_INT_FRAME_SIZE % `16`);
721
722	ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_ALIGN,
723	MEMBLOCK_LOW_LIMIT, max_addr: limit,
724	nid: early_cpu_to_node(cpu));
725	if (!ptr)
726	panic(fmt: "cannot allocate stacks");
727
728	return ptr;
729	}
730
731	void __init irqstack_early_init(void)
732	{
733	u64 limit = ppc64_bolted_size();
734	unsigned int i;
735
736	/*
737	* Interrupt stacks must be in the first segment since we
738	* cannot afford to take SLB misses on them. They are not
739	* accessed in realmode.
740	*/
741	for_each_possible_cpu(i) {
742	softirq_ctx[i] = alloc_stack(limit, i);
743	hardirq_ctx[i] = alloc_stack(limit, i);
744	}
745	}
746
747	#ifdef CONFIG_PPC_BOOK3E_64
748	void __init exc_lvl_early_init(void)
749	{
750	unsigned int i;
751
752	for_each_possible_cpu(i) {
753	void *sp;
754
755	sp = alloc_stack(ULONG_MAX, i);
756	critirq_ctx[i] = sp;
757	paca_ptrs[i]->crit_kstack = sp + THREAD_SIZE;
758
759	sp = alloc_stack(ULONG_MAX, i);
760	dbgirq_ctx[i] = sp;
761	paca_ptrs[i]->dbg_kstack = sp + THREAD_SIZE;
762
763	sp = alloc_stack(ULONG_MAX, i);
764	mcheckirq_ctx[i] = sp;
765	paca_ptrs[i]->mc_kstack = sp + THREAD_SIZE;
766	}
767
768	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
769	patch_exception(`0x040`, exc_debug_debug_book3e);
770	}
771	#endif
772
773	/*
774	* Stack space used when we detect a bad kernel stack pointer, and
775	* early in SMP boots before relocation is enabled. Exclusive emergency
776	* stack for machine checks.
777	*/
778	void __init emergency_stack_init(void)
779	{
780	u64 limit, mce_limit;
781	unsigned int i;
782
783	/*
784	* Emergency stacks must be under 256MB, we cannot afford to take
785	* SLB misses on them. The ABI also requires them to be 128-byte
786	* aligned.
787	*
788	* Since we use these as temporary stacks during secondary CPU
789	* bringup, machine check, system reset, and HMI, we need to get
790	* at them in real mode. This means they must also be within the RMO
791	* region.
792	*
793	* The IRQ stacks allocated elsewhere in this file are zeroed and
794	* initialized in kernel/irq.c. These are initialized here in order
795	* to have emergency stacks available as early as possible.
796	*/
797	limit = mce_limit = min(ppc64_bolted_size(), ppc64_rma_size);
798
799	/*
800	* Machine check on pseries calls rtas, but can't use the static
801	* rtas_args due to a machine check hitting while the lock is held.
802	* rtas args have to be under 4GB, so the machine check stack is
803	* limited to 4GB so args can be put on stack.
804	*/
805	if (firmware_has_feature(FW_FEATURE_LPAR) && mce_limit > SZ_4G)
806	mce_limit = SZ_4G;
807
808	for_each_possible_cpu(i) {
809	paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
810
811	#ifdef CONFIG_PPC_BOOK3S_64
812	/ emergency stack for NMI exception handling. /
813	paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;
814
815	/ emergency stack for machine check exception handling. /
816	paca_ptrs[i]->mc_emergency_sp = alloc_stack(mce_limit, i) + THREAD_SIZE;
817	#endif
818	}
819	}
820
821	#ifdef CONFIG_SMP
822	static int pcpu_cpu_distance(unsigned int from, unsigned int to)
823	{
824	if (early_cpu_to_node(cpu: from) == early_cpu_to_node(cpu: to))
825	return LOCAL_DISTANCE;
826	else
827	return REMOTE_DISTANCE;
828	}
829
830	static __init int pcpu_cpu_to_node(int cpu)
831	{
832	return early_cpu_to_node(cpu);
833	}
834
835	unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
836	EXPORT_SYMBOL(__per_cpu_offset);
837
838	void __init setup_per_cpu_areas(void)
839	{
840	const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
841	size_t atom_size;
842	unsigned long delta;
843	unsigned int cpu;
844	int rc = -EINVAL;
845
846	/*
847	* BookE and BookS radix are historical values and should be revisited.
848	*/
849	if (IS_ENABLED(CONFIG_PPC_BOOK3E_64)) {
850	atom_size = SZ_1M;
851	} else if (radix_enabled()) {
852	atom_size = PAGE_SIZE;
853	} else if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) {
854	/*
855	* Linear mapping is one of 4K, 1M and 16M. For 4K, no need
856	* to group units. For larger mappings, use 1M atom which
857	* should be large enough to contain a number of units.
858	*/
859	if (mmu_linear_psize == MMU_PAGE_4K)
860	atom_size = PAGE_SIZE;
861	else
862	atom_size = SZ_1M;
863	}
864
865	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
866	rc = pcpu_embed_first_chunk(reserved_size: `0`, dyn_size, atom_size, cpu_distance_fn: pcpu_cpu_distance,
867	cpu_to_nd_fn: pcpu_cpu_to_node);
868	if (rc)
869	pr_warn("PERCPU: %s allocator failed (%d), "
870	"falling back to page size\n",
871	pcpu_fc_names[pcpu_chosen_fc], rc);
872	}
873
874	if (rc < `0`)
875	rc = pcpu_page_first_chunk(reserved_size: `0`, cpu_to_nd_fn: pcpu_cpu_to_node);
876	if (rc < `0`)
877	panic(fmt: "cannot initialize percpu area (err=%d)", rc);
878
879	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
880	for_each_possible_cpu(cpu) {
881	__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
882	paca_ptrs[cpu]->data_offset = __per_cpu_offset[cpu];
883	}
884	}
885	#endif
886
887	#ifdef CONFIG_MEMORY_HOTPLUG
888	unsigned long memory_block_size_bytes(void)
889	{
890	if (ppc_md.memory_block_size)
891	return ppc_md.memory_block_size();
892
893	return MIN_MEMORY_BLOCK_SIZE;
894	}
895	#endif
896
#if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
/* Exported table; only built when indirect PIO or MMIO is configured. */
struct ppc_pci_io ppc_pci_io;
EXPORT_SYMBOL(ppc_pci_io);
#endif
901
902	#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
903	u64 hw_nmi_get_sample_period(int watchdog_thresh)
904	{
905	return ppc_proc_freq * watchdog_thresh;
906	}
907	#endif
908
909	/*
910	* The perf based hardlockup detector breaks PMU event based branches, so
911	* disable it by default. Book3S has a soft-nmi hardlockup detector based
912	* on the decrementer interrupt, so it does not suffer from this problem.
913	*
914	* It is likely to get false positives in KVM guests, so disable it there
915	* by default too. PowerVM will not stop or arbitrarily oversubscribe
916	* CPUs, but give a minimum regular allotment even with SPLPAR, so enable
917	* the detector for non-KVM guests, assume PowerVM.
918	*/
919	static int __init disable_hardlockup_detector(void)
920	{
921	#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
922	hardlockup_detector_disable();
923	#else
924	if (firmware_has_feature(FW_FEATURE_LPAR)) {
925	if (is_kvm_guest())
926	hardlockup_detector_disable();
927	}
928	#endif
929
930	return `0`;
931	}
932	early_initcall(disable_hardlockup_detector);
933

source code of linux/arch/powerpc/kernel/setup_64.c