core.c source code [linux/arch/x86/kernel/cpu/mce/core.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Machine check handler.
4	*
5	* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
6	* Rest from unknown author(s).
7	* 2004 Andi Kleen. Rewrote most of it.
8	* Copyright 2008 Intel Corporation
9	* Author: Andi Kleen
10	*/
11
12	#include <linux/thread_info.h>
13	#include <linux/capability.h>
14	#include <linux/miscdevice.h>
15	#include <linux/ratelimit.h>
16	#include <linux/rcupdate.h>
17	#include <linux/kobject.h>
18	#include <linux/uaccess.h>
19	#include <linux/kdebug.h>
20	#include <linux/kernel.h>
21	#include <linux/percpu.h>
22	#include <linux/string.h>
23	#include <linux/device.h>
24	#include <linux/syscore_ops.h>
25	#include <linux/delay.h>
26	#include <linux/ctype.h>
27	#include <linux/sched.h>
28	#include <linux/sysfs.h>
29	#include <linux/types.h>
30	#include <linux/slab.h>
31	#include <linux/init.h>
32	#include <linux/kmod.h>
33	#include <linux/poll.h>
34	#include <linux/nmi.h>
35	#include <linux/cpu.h>
36	#include <linux/ras.h>
37	#include <linux/smp.h>
38	#include <linux/fs.h>
39	#include <linux/mm.h>
40	#include <linux/debugfs.h>
41	#include <linux/irq_work.h>
42	#include <linux/export.h>
43	#include <linux/set_memory.h>
44	#include <linux/sync_core.h>
45	#include <linux/task_work.h>
46	#include <linux/hardirq.h>
47
48	#include <asm/intel-family.h>
49	#include <asm/processor.h>
50	#include <asm/traps.h>
51	#include <asm/tlbflush.h>
52	#include <asm/mce.h>
53	#include <asm/msr.h>
54	#include <asm/reboot.h>
55
56	#include "internal.h"
57
/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);
60
61	#define CREATE_TRACE_POINTS
62	#include <trace/events/mce.h>
63
/*
 * Step, in nanoseconds, by which the rendezvous spin loops delay
 * (ndelay(SPINUNIT)) and by which their timeout budget is decremented.
 */
#define SPINUNIT 100 /* 100ns */

/*
 * Per-CPU counter; presumably the number of #MC exceptions taken.
 * NOTE(review): not updated in the visible part of the file - confirm.
 */
DEFINE_PER_CPU(unsigned, mce_exception_count);

/* Per-CPU number of MCA banks; upper bound of every bank-scanning loop. */
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

/* Per-CPU bank descriptors; machine_check_poll() skips banks whose ->ctl is 0. */
DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
71
72	#define ATTR_LEN 16
73	/ One object for each MCE bank, shared by all CPUs /
74	struct mce_bank_dev {
75	struct device_attribute attr; / device attribute /
76	char attrname[ATTR_LEN]; / attribute name /
77	u8 bank; / bank number /
78	};
79	static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
80
81	struct mce_vendor_flags mce_flags __read_mostly;
82
83	struct mca_config mca_cfg __read_mostly = {
84	.bootlog = -`1`,
85	.monarch_timeout = -`1`
86	};
87
88	static DEFINE_PER_CPU(struct mce, mces_seen);
89	static unsigned long mce_need_notify;
90
91	/*
92	* MCA banks polled by the period polling timer for corrected events.
93	* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
94	*/
95	DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
96	[`0` ... BITS_TO_LONGS(MAX_NR_BANKS)-`1`] = ~`0UL`
97	};
98
99	/*
100	* MCA banks controlled through firmware first for corrected errors.
101	* This is a global list of banks for which we won't enable CMCI and we
102	* won't poll. Firmware controls these banks and is responsible for
103	* reporting corrected errors through GHES. Uncorrected/recoverable
104	* errors are still notified through a machine check.
105	*/
106	mce_banks_t mce_banks_ce_disabled;
107
/* Work item scheduled by mce_schedule_work() when the MCE pool is not empty. */
static struct work_struct mce_work;
/* irq_work queued by mce_log(); its callback calls mce_schedule_work(). */
static struct irq_work mce_irq_work;
110
111	/*
112	* CPU/chipset specific EDAC code can register a notifier call here to print
113	* MCE errors in a human-readable form.
114	*/
115	BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
116
117	/ Do initial initialization of a struct mce /
118	void mce_setup(struct mce *m)
119	{
120	memset(m, `0`, sizeof(struct mce));
121	m->cpu = m->extcpu = smp_processor_id();
122	/ need the internal __ version to avoid deadlocks /
123	m->time = __ktime_get_real_seconds();
124	m->cpuvendor = boot_cpu_data.x86_vendor;
125	m->cpuid = cpuid_eax(op: `1`);
126	m->socketid = cpu_data(m->extcpu).topo.pkg_id;
127	m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
128	m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
129	m->ppin = cpu_data(m->extcpu).ppin;
130	m->microcode = boot_cpu_data.microcode;
131	}
132
/*
 * Per-CPU record used for software error injection: while its ->finished
 * field is set, mce_rdmsrl()/mce_wrmsrl() access this record instead of the
 * hardware MSRs.
 */
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
135
136	void mce_log(struct mce *m)
137	{
138	if (!mce_gen_pool_add(mce: m))
139	irq_work_queue(work: &mce_irq_work);
140	}
141	EXPORT_SYMBOL_GPL(mce_log);
142
143	void mce_register_decode_chain(struct notifier_block *nb)
144	{
145	if (WARN_ON(nb->priority < MCE_PRIO_LOWEST \|\|
146	nb->priority > MCE_PRIO_HIGHEST))
147	return;
148
149	blocking_notifier_chain_register(nh: &x86_mce_decoder_chain, nb);
150	}
151	EXPORT_SYMBOL_GPL(mce_register_decode_chain);
152
153	void mce_unregister_decode_chain(struct notifier_block *nb)
154	{
155	blocking_notifier_chain_unregister(nh: &x86_mce_decoder_chain, nb);
156	}
157	EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
158
159	static void __print_mce(struct mce *m)
160	{
161	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
162	m->extcpu,
163	(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
164	m->mcgstatus, m->bank, m->status);
165
166	if (m->ip) {
167	pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
168	!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
169	m->cs, m->ip);
170
171	if (m->cs == __KERNEL_CS)
172	pr_cont("{%pS}", (void )(unsigned* long)m->ip);
173	pr_cont("\n");
174	}
175
176	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
177	if (m->addr)
178	pr_cont("ADDR %llx ", m->addr);
179	if (m->misc)
180	pr_cont("MISC %llx ", m->misc);
181	if (m->ppin)
182	pr_cont("PPIN %llx ", m->ppin);
183
184	if (mce_flags.smca) {
185	if (m->synd)
186	pr_cont("SYND %llx ", m->synd);
187	if (m->ipid)
188	pr_cont("IPID %llx ", m->ipid);
189	}
190
191	pr_cont("\n");
192
193	/*
194	* Note this output is parsed by external tools and old fields
195	* should not be changed.
196	*/
197	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
198	m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
199	m->microcode);
200	}
201
202	static void print_mce(struct mce *m)
203	{
204	__print_mce(m);
205
206	if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
207	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
208	}
209
/* Seconds wait_for_panic() spins before it panics on its own. */
#define PANIC_TIMEOUT 5 /* 5 seconds */

/* Incremented by each CPU entering mce_panic(); only the first one proceeds. */
static atomic_t mce_panicked;

/* When non-zero, mce_panic() logs the records but does not call panic(). */
static int fake_panic;
/* Incremented per fake panic so that only the first one is logged. */
static atomic_t mce_fake_panicked;
216
217	/ Panic in progress. Enable interrupts and wait for final IPI /
218	static void wait_for_panic(void)
219	{
220	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
221
222	preempt_disable();
223	local_irq_enable();
224	while (timeout-- > `0`)
225	udelay(`1`);
226	if (panic_timeout == `0`)
227	panic_timeout = mca_cfg.panic_timeout;
228	panic(fmt: "Panicing machine check CPU died");
229	}
230
231	static noinstr void mce_panic(const char msg, struct* mce final, char* *exp)
232	{
233	struct llist_node *pending;
234	struct mce_evt_llist *l;
235	int apei_err = `0`;
236
237	/*
238	* Allow instrumentation around external facilities usage. Not that it
239	* matters a whole lot since the machine is going to panic anyway.
240	*/
241	instrumentation_begin();
242
243	if (!fake_panic) {
244	/*
245	* Make sure only one CPU runs in machine check panic
246	*/
247	if (atomic_inc_return(v: &mce_panicked) > `1`)
248	wait_for_panic();
249	barrier();
250
251	bust_spinlocks(yes: `1`);
252	console_verbose();
253	} else {
254	/ Don't log too much for fake panic /
255	if (atomic_inc_return(v: &mce_fake_panicked) > `1`)
256	goto out;
257	}
258	pending = mce_gen_pool_prepare_records();
259	/ First print corrected ones that are still unlogged /
260	llist_for_each_entry(l, pending, llnode) {
261	struct mce *m = &l->mce;
262	if (!(m->status & MCI_STATUS_UC)) {
263	print_mce(m);
264	if (!apei_err)
265	apei_err = apei_write_mce(m);
266	}
267	}
268	/ Now print uncorrected but with the final one last /
269	llist_for_each_entry(l, pending, llnode) {
270	struct mce *m = &l->mce;
271	if (!(m->status & MCI_STATUS_UC))
272	continue;
273	if (!final \|\| mce_cmp(m1: m, m2: final)) {
274	print_mce(m);
275	if (!apei_err)
276	apei_err = apei_write_mce(m);
277	}
278	}
279	if (final) {
280	print_mce(m: final);
281	if (!apei_err)
282	apei_err = apei_write_mce(m: final);
283	}
284	if (exp)
285	pr_emerg(HW_ERR "Machine check: %s\n", exp);
286	if (!fake_panic) {
287	if (panic_timeout == `0`)
288	panic_timeout = mca_cfg.panic_timeout;
289	panic(fmt: msg);
290	} else
291	pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
292
293	out:
294	instrumentation_end();
295	}
296
297	/ Support code for software error injection /
298
299	static int msr_to_offset(u32 msr)
300	{
301	unsigned bank = __this_cpu_read(injectm.bank);
302
303	if (msr == mca_cfg.rip_msr)
304	return offsetof(struct mce, ip);
305	if (msr == mca_msr_reg(bank, reg: MCA_STATUS))
306	return offsetof(struct mce, status);
307	if (msr == mca_msr_reg(bank, reg: MCA_ADDR))
308	return offsetof(struct mce, addr);
309	if (msr == mca_msr_reg(bank, reg: MCA_MISC))
310	return offsetof(struct mce, misc);
311	if (msr == MSR_IA32_MCG_STATUS)
312	return offsetof(struct mce, mcgstatus);
313	return -`1`;
314	}
315
316	void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
317	{
318	if (wrmsr) {
319	pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
320	(unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
321	regs->ip, (void *)regs->ip);
322	} else {
323	pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
324	(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
325	}
326
327	show_stack_regs(regs);
328
329	panic(fmt: "MCA architectural violation!\n");
330
331	while (true)
332	cpu_relax();
333	}
334
335	/ MSR access wrappers used for error injection /
336	noinstr u64 mce_rdmsrl(u32 msr)
337	{
338	DECLARE_ARGS(val, low, high);
339
340	if (__this_cpu_read(injectm.finished)) {
341	int offset;
342	u64 ret;
343
344	instrumentation_begin();
345
346	offset = msr_to_offset(msr);
347	if (offset < `0`)
348	ret = `0`;
349	else
350	ret = (u64 )((char *)this_cpu_ptr(&injectm) + offset);
351
352	instrumentation_end();
353
354	return ret;
355	}
356
357	/*
358	* RDMSR on MCA MSRs should not fault. If they do, this is very much an
359	* architectural violation and needs to be reported to hw vendor. Panic
360	* the box to not allow any further progress.
361	*/
362	asm volatile("1: rdmsr\n"
363	"2:\n"
364	_ASM_EXTABLE_TYPE(`1b`, `2b`, EX_TYPE_RDMSR_IN_MCE)
365	: EAX_EDX_RET(val, low, high) : "c" (msr));
366
367
368	return EAX_EDX_VAL(val, low, high);
369	}
370
371	static noinstr void mce_wrmsrl(u32 msr, u64 v)
372	{
373	u32 low, high;
374
375	if (__this_cpu_read(injectm.finished)) {
376	int offset;
377
378	instrumentation_begin();
379
380	offset = msr_to_offset(msr);
381	if (offset >= `0`)
382	(u64 )((char *)this_cpu_ptr(&injectm) + offset) = v;
383
384	instrumentation_end();
385
386	return;
387	}
388
389	low = (u32)v;
390	high = (u32)(v >> `32`);
391
392	/ See comment in mce_rdmsrl() /
393	asm volatile("1: wrmsr\n"
394	"2:\n"
395	_ASM_EXTABLE_TYPE(`1b`, `2b`, EX_TYPE_WRMSR_IN_MCE)
396	: : "c" (msr), "a"(low), "d" (high) : "memory");
397	}
398
399	/*
400	* Collect all global (w.r.t. this processor) status about this machine
401	* check into our "mce" struct so that we can use it later to assess
402	* the severity of the problem as we read per-bank specific details.
403	*/
404	static noinstr void mce_gather_info(struct mce m, struct* pt_regs *regs)
405	{
406	/*
407	* Enable instrumentation around mce_setup() which calls external
408	* facilities.
409	*/
410	instrumentation_begin();
411	mce_setup(m);
412	instrumentation_end();
413
414	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
415	if (regs) {
416	/*
417	* Get the address of the instruction at the time of
418	* the machine check error.
419	*/
420	if (m->mcgstatus & (MCG_STATUS_RIPV\|MCG_STATUS_EIPV)) {
421	m->ip = regs->ip;
422	m->cs = regs->cs;
423
424	/*
425	* When in VM86 mode make the cs look like ring 3
426	* always. This is a lie, but it's better than passing
427	* the additional vm86 bit around everywhere.
428	*/
429	if (v8086_mode(regs))
430	m->cs \|= `3`;
431	}
432	/ Use accurate RIP reporting if available. /
433	if (mca_cfg.rip_msr)
434	m->ip = mce_rdmsrl(msr: mca_cfg.rip_msr);
435	}
436	}
437
438	int mce_available(struct cpuinfo_x86 *c)
439	{
440	if (mca_cfg.disabled)
441	return `0`;
442	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
443	}
444
445	static void mce_schedule_work(void)
446	{
447	if (!mce_gen_pool_empty())
448	schedule_work(work: &mce_work);
449	}
450
/*
 * irq_work callback: forwards to mce_schedule_work(). @entry is unused.
 */
static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_schedule_work();
}
455
456	bool mce_usable_address(struct mce *m)
457	{
458	if (!(m->status & MCI_STATUS_ADDRV))
459	return false;
460
461	switch (m->cpuvendor) {
462	case X86_VENDOR_AMD:
463	return amd_mce_usable_address(m);
464
465	case X86_VENDOR_INTEL:
466	case X86_VENDOR_ZHAOXIN:
467	return intel_mce_usable_address(m);
468
469	default:
470	return true;
471	}
472	}
473	EXPORT_SYMBOL_GPL(mce_usable_address);
474
475	bool mce_is_memory_error(struct mce *m)
476	{
477	switch (m->cpuvendor) {
478	case X86_VENDOR_AMD:
479	case X86_VENDOR_HYGON:
480	return amd_mce_is_memory_error(m);
481
482	case X86_VENDOR_INTEL:
483	case X86_VENDOR_ZHAOXIN:
484	/*
485	* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
486	*
487	* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
488	* indicating a memory error. Bit 8 is used for indicating a
489	* cache hierarchy error. The combination of bit 2 and bit 3
490	* is used for indicating a `generic' cache hierarchy error
491	* But we can't just blindly check the above bits, because if
492	* bit 11 is set, then it is a bus/interconnect error - and
493	* either way the above bits just gives more detail on what
494	* bus/interconnect error happened. Note that bit 12 can be
495	* ignored, as it's the "filter" bit.
496	*/
497	return (m->status & `0xef80`) == BIT(`7`) \|\|
498	(m->status & `0xef00`) == BIT(`8`) \|\|
499	(m->status & `0xeffc`) == `0xc`;
500
501	default:
502	return false;
503	}
504	}
505	EXPORT_SYMBOL_GPL(mce_is_memory_error);
506
507	static bool whole_page(struct mce *m)
508	{
509	if (!mca_cfg.ser \|\| !(m->status & MCI_STATUS_MISCV))
510	return true;
511
512	return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
513	}
514
515	bool mce_is_correctable(struct mce *m)
516	{
517	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
518	return false;
519
520	if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
521	return false;
522
523	if (m->status & MCI_STATUS_UC)
524	return false;
525
526	return true;
527	}
528	EXPORT_SYMBOL_GPL(mce_is_correctable);
529
530	static int mce_early_notifier(struct notifier_block nb, unsigned* long val,
531	void *data)
532	{
533	struct mce m = (struct* mce *)data;
534
535	if (!m)
536	return NOTIFY_DONE;
537
538	/ Emit the trace record: /
539	trace_mce_record(m);
540
541	set_bit(nr: `0`, addr: &mce_need_notify);
542
543	mce_notify_irq();
544
545	return NOTIFY_DONE;
546	}
547
/* Notifier block that runs mce_early_notifier() at MCE_PRIO_EARLY. */
static struct notifier_block early_nb = {
	.notifier_call	= mce_early_notifier,
	.priority	= MCE_PRIO_EARLY,
};
552
553	static int uc_decode_notifier(struct notifier_block nb, unsigned* long val,
554	void *data)
555	{
556	struct mce mce = (struct* mce *)data;
557	unsigned long pfn;
558
559	if (!mce \|\| !mce_usable_address(mce))
560	return NOTIFY_DONE;
561
562	if (mce->severity != MCE_AO_SEVERITY &&
563	mce->severity != MCE_DEFERRED_SEVERITY)
564	return NOTIFY_DONE;
565
566	pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
567	if (!memory_failure(pfn, flags: `0`)) {
568	set_mce_nospec(pfn);
569	mce->kflags \|= MCE_HANDLED_UC;
570	}
571
572	return NOTIFY_OK;
573	}
574
/* Notifier block that runs uc_decode_notifier() at MCE_PRIO_UC. */
static struct notifier_block mce_uc_nb = {
	.notifier_call	= uc_decode_notifier,
	.priority	= MCE_PRIO_UC,
};
579
580	static int mce_default_notifier(struct notifier_block nb, unsigned* long val,
581	void *data)
582	{
583	struct mce m = (struct* mce *)data;
584
585	if (!m)
586	return NOTIFY_DONE;
587
588	if (mca_cfg.print_all \|\| !m->kflags)
589	__print_mce(m);
590
591	return NOTIFY_DONE;
592	}
593
594	static struct notifier_block mce_default_nb = {
595	.notifier_call = mce_default_notifier,
596	/ lowest prio, we want it to run last. /
597	.priority = MCE_PRIO_LOWEST,
598	};
599
600	/*
601	* Read ADDR and MISC registers.
602	*/
603	static noinstr void mce_read_aux(struct mce m, int* i)
604	{
605	if (m->status & MCI_STATUS_MISCV)
606	m->misc = mce_rdmsrl(msr: mca_msr_reg(bank: i, reg: MCA_MISC));
607
608	if (m->status & MCI_STATUS_ADDRV) {
609	m->addr = mce_rdmsrl(msr: mca_msr_reg(bank: i, reg: MCA_ADDR));
610
611	/*
612	* Mask the reported address by the reported granularity.
613	*/
614	if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
615	u8 shift = MCI_MISC_ADDR_LSB(m->misc);
616	m->addr >>= shift;
617	m->addr <<= shift;
618	}
619
620	smca_extract_err_addr(m);
621	}
622
623	if (mce_flags.smca) {
624	m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
625
626	if (m->status & MCI_STATUS_SYNDV)
627	m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
628	}
629	}
630
/* Per-CPU count of machine_check_poll() invocations. */
DEFINE_PER_CPU(unsigned, mce_poll_count);
632
633	/*
634	* Poll for corrected events or events that happened before reset.
635	* Those are just logged through /dev/mcelog.
636	*
637	* This is executed in standard interrupt context.
638	*
639	* Note: spec recommends to panic for fatal unsignalled
640	* errors here. However this would be quite problematic --
641	* we would need to reimplement the Monarch handling and
642	* it would mess up the exclusion between exception handler
643	* and poll handler -- * so we skip this for now.
644	* These cases should not happen anyways, or only when the CPU
645	* is already totally * confused. In this case it's likely it will
646	* not fully execute the machine check handler either.
647	*/
648	bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
649	{
650	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
651	bool error_seen = false;
652	struct mce m;
653	int i;
654
655	this_cpu_inc(mce_poll_count);
656
657	mce_gather_info(m: &m, NULL);
658
659	if (flags & MCP_TIMESTAMP)
660	m.tsc = rdtsc();
661
662	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
663	if (!mce_banks[i].ctl \|\| !test_bit(i, *b))
664	continue;
665
666	m.misc = `0`;
667	m.addr = `0`;
668	m.bank = i;
669
670	barrier();
671	m.status = mce_rdmsrl(msr: mca_msr_reg(bank: i, reg: MCA_STATUS));
672
673	/ If this entry is not valid, ignore it /
674	if (!(m.status & MCI_STATUS_VAL))
675	continue;
676
677	/*
678	* If we are logging everything (at CPU online) or this
679	* is a corrected error, then we must log it.
680	*/
681	if ((flags & MCP_UC) \|\| !(m.status & MCI_STATUS_UC))
682	goto log_it;
683
684	/*
685	* Newer Intel systems that support software error
686	* recovery need to make additional checks. Other
687	* CPUs should skip over uncorrected errors, but log
688	* everything else.
689	*/
690	if (!mca_cfg.ser) {
691	if (m.status & MCI_STATUS_UC)
692	continue;
693	goto log_it;
694	}
695
696	/ Log "not enabled" (speculative) errors /
697	if (!(m.status & MCI_STATUS_EN))
698	goto log_it;
699
700	/*
701	* Log UCNA (SDM: 15.6.3 "UCR Error Classification")
702	* UC == 1 && PCC == 0 && S == 0
703	*/
704	if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
705	goto log_it;
706
707	/*
708	* Skip anything else. Presumption is that our read of this
709	* bank is racing with a machine check. Leave the log alone
710	* for do_machine_check() to deal with it.
711	*/
712	continue;
713
714	log_it:
715	error_seen = true;
716
717	if (flags & MCP_DONTLOG)
718	goto clear_it;
719
720	mce_read_aux(m: &m, i);
721	m.severity = mce_severity(a: &m, NULL, NULL, is_excp: false);
722	/*
723	* Don't get the IP here because it's unlikely to
724	* have anything to do with the actual error location.
725	*/
726
727	if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
728	goto clear_it;
729
730	if (flags & MCP_QUEUE_LOG)
731	mce_gen_pool_add(mce: &m);
732	else
733	mce_log(&m);
734
735	clear_it:
736	/*
737	* Clear state for this bank.
738	*/
739	mce_wrmsrl(msr: mca_msr_reg(bank: i, reg: MCA_STATUS), v: `0`);
740	}
741
742	/*
743	* Don't clear MCG_STATUS here because it's only defined for
744	* exceptions.
745	*/
746
747	sync_core();
748
749	return error_seen;
750	}
751	EXPORT_SYMBOL_GPL(machine_check_poll);
752
753	/*
754	* During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
755	* EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
756	* Vol 3B Table 15-20). But this confuses both the code that determines
757	* whether the machine check occurred in kernel or user mode, and also
758	* the severity assessment code. Pretend that EIPV was set, and take the
759	* ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
760	*/
761	static __always_inline void
762	quirk_sandybridge_ifu(int bank, struct mce m, struct* pt_regs *regs)
763	{
764	if (bank != `0`)
765	return;
766	if ((m->mcgstatus & (MCG_STATUS_EIPV\|MCG_STATUS_RIPV)) != `0`)
767	return;
768	if ((m->status & (MCI_STATUS_OVER\|MCI_STATUS_UC\|
769	MCI_STATUS_EN\|MCI_STATUS_MISCV\|MCI_STATUS_ADDRV\|
770	MCI_STATUS_PCC\|MCI_STATUS_S\|MCI_STATUS_AR\|
771	MCACOD)) !=
772	(MCI_STATUS_UC\|MCI_STATUS_EN\|
773	MCI_STATUS_MISCV\|MCI_STATUS_ADDRV\|MCI_STATUS_S\|
774	MCI_STATUS_AR\|MCACOD_INSTR))
775	return;
776
777	m->mcgstatus \|= MCG_STATUS_EIPV;
778	m->ip = regs->ip;
779	m->cs = regs->cs;
780	}
781
782	/*
783	* Disable fast string copy and return from the MCE handler upon the first SRAR
784	* MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
785	* CPUs.
786	* The fast string copy instructions ("REP; MOVS*") could consume an
787	* uncorrectable memory error in the cache line _right after_ the desired region
788	* to copy and raise an MCE with RIP pointing to the instruction _after_ the
789	* "REP; MOVS*".
790	* This mitigation addresses the issue completely with the caveat of performance
791	* degradation on the CPU affected. This is still better than the OS crashing on
792	* MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
793	* kernel context (e.g., copy_page).
794	*
795	* Returns true when fast string copy on CPU has been disabled.
796	*/
797	static noinstr bool quirk_skylake_repmov(void)
798	{
799	u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
800	u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
801	u64 mc1_status;
802
803	/*
804	* Apply the quirk only to local machine checks, i.e., no broadcast
805	* sync is needed.
806	*/
807	if (!(mcgstatus & MCG_STATUS_LMCES) \|\|
808	!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
809	return false;
810
811	mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(`1`));
812
813	/ Check for a software-recoverable data fetch error. /
814	if ((mc1_status &
815	(MCI_STATUS_VAL \| MCI_STATUS_OVER \| MCI_STATUS_UC \| MCI_STATUS_EN \|
816	MCI_STATUS_ADDRV \| MCI_STATUS_MISCV \| MCI_STATUS_PCC \|
817	MCI_STATUS_AR \| MCI_STATUS_S)) ==
818	(MCI_STATUS_VAL \| MCI_STATUS_UC \| MCI_STATUS_EN \|
819	MCI_STATUS_ADDRV \| MCI_STATUS_MISCV \|
820	MCI_STATUS_AR \| MCI_STATUS_S)) {
821	misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
822	mce_wrmsrl(MSR_IA32_MISC_ENABLE, v: misc_enable);
823	mce_wrmsrl(MSR_IA32_MCx_STATUS(`1`), v: `0`);
824
825	instrumentation_begin();
826	pr_err_once("Erratum detected, disable fast string copy instructions.\n");
827	instrumentation_end();
828
829	return true;
830	}
831
832	return false;
833	}
834
835	/*
836	* Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
837	* errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
838	*
839	* However, the context is still valid, so save the "cs" register for later use.
840	*
841	* The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
842	*
843	* The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
844	*/
845	static __always_inline void quirk_zen_ifu(int bank, struct mce m, struct* pt_regs *regs)
846	{
847	if (bank != `1`)
848	return;
849	if (!(m->status & MCI_STATUS_POISON))
850	return;
851
852	m->cs = regs->cs;
853	}
854
855	/*
856	* Do a quick check if any of the events requires a panic.
857	* This decides if we keep the events around or clear them.
858	*/
859	static __always_inline int mce_no_way_out(struct mce m, char* *msg, unsigned* long *validp,
860	struct pt_regs *regs)
861	{
862	char tmp = msg;
863	int i;
864
865	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
866	m->status = mce_rdmsrl(msr: mca_msr_reg(bank: i, reg: MCA_STATUS));
867	if (!(m->status & MCI_STATUS_VAL))
868	continue;
869
870	arch___set_bit(nr: i, addr: validp);
871	if (mce_flags.snb_ifu_quirk)
872	quirk_sandybridge_ifu(bank: i, m, regs);
873
874	if (mce_flags.zen_ifu_quirk)
875	quirk_zen_ifu(bank: i, m, regs);
876
877	m->bank = i;
878	if (mce_severity(a: m, regs, msg: &tmp, is_excp: true) >= MCE_PANIC_SEVERITY) {
879	mce_read_aux(m, i);
880	*msg = tmp;
881	return `1`;
882	}
883	}
884	return `0`;
885	}
886
/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU initially spins until mce_executing has reached its own call-in
 * order number; mce_end() resets it to 0 when the rendezvous is over.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 * Incremented once per CPU in mce_start(), reset to 0 in mce_end().
 */
static atomic_t mce_callin;

/*
 * Track which CPUs entered the MCA broadcast synchronization and which not in
 * order to print holdouts. A CPU clears its own bit in mce_start().
 */
static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
903
904	/*
905	* Check if a timeout waiting for other CPUs happened.
906	*/
907	static noinstr int mce_timed_out(u64 t, const* char *msg)
908	{
909	int ret = `0`;
910
911	/ Enable instrumentation around calls to external facilities /
912	instrumentation_begin();
913
914	/*
915	* The others already did panic for some reason.
916	* Bail out like in a timeout.
917	* rmb() to tell the compiler that system_state
918	* might have been modified by someone else.
919	*/
920	rmb();
921	if (atomic_read(v: &mce_panicked))
922	wait_for_panic();
923	if (!mca_cfg.monarch_timeout)
924	goto out;
925	if ((s64)*t < SPINUNIT) {
926	if (cpumask_and(dstp: &mce_missing_cpus, cpu_online_mask, src2p: &mce_missing_cpus))
927	pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
928	cpumask_pr_args(&mce_missing_cpus));
929	mce_panic(msg, NULL, NULL);
930
931	ret = `1`;
932	goto out;
933	}
934	*t -= SPINUNIT;
935
936	out:
937	touch_nmi_watchdog();
938
939	instrumentation_end();
940
941	return ret;
942	}
943
944	/*
945	* The Monarch's reign. The Monarch is the CPU who entered
946	* the machine check handler first. It waits for the others to
947	* raise the exception too and then grades them. When any
948	* error is fatal panic. Only then let the others continue.
949	*
950	* The other CPUs entering the MCE handler will be controlled by the
951	* Monarch. They are called Subjects.
952	*
953	* This way we prevent any potential data corruption in a unrecoverable case
954	* and also makes sure always all CPU's errors are examined.
955	*
956	* Also this detects the case of a machine check event coming from outer
957	* space (not detected by any CPUs) In this case some external agent wants
958	* us to shut down, so panic too.
959	*
960	* The other CPUs might still decide to panic if the handler happens
961	* in a unrecoverable place, but in this case the system is in a semi-stable
962	* state and won't corrupt anything by itself. It's ok to let the others
963	* continue for a bit first.
964	*
965	* All the spin loops have timeouts; when a timeout happens a CPU
966	* typically elects itself to be Monarch.
967	*/
968	static void mce_reign(void)
969	{
970	int cpu;
971	struct mce *m = NULL;
972	int global_worst = `0`;
973	char *msg = NULL;
974
975	/*
976	* This CPU is the Monarch and the other CPUs have run
977	* through their handlers.
978	* Grade the severity of the errors of all the CPUs.
979	*/
980	for_each_possible_cpu(cpu) {
981	struct mce *mtmp = &per_cpu(mces_seen, cpu);
982
983	if (mtmp->severity > global_worst) {
984	global_worst = mtmp->severity;
985	m = &per_cpu(mces_seen, cpu);
986	}
987	}
988
989	/*
990	* Cannot recover? Panic here then.
991	* This dumps all the mces in the log buffer and stops the
992	* other CPUs.
993	*/
994	if (m && global_worst >= MCE_PANIC_SEVERITY) {
995	/ call mce_severity() to get "msg" for panic /
996	mce_severity(a: m, NULL, msg: &msg, is_excp: true);
997	mce_panic(msg: "Fatal machine check", final: m, exp: msg);
998	}
999
1000	/*
1001	* For UC somewhere we let the CPU who detects it handle it.
1002	* Also must let continue the others, otherwise the handling
1003	* CPU could deadlock on a lock.
1004	*/
1005
1006	/*
1007	* No machine check event found. Must be some external
1008	* source or one CPU is hung. Panic.
1009	*/
1010	if (global_worst <= MCE_KEEP_SEVERITY)
1011	mce_panic(msg: "Fatal machine check from unknown source", NULL, NULL);
1012
1013	/*
1014	* Now clear all the mces_seen so that they don't reappear on
1015	* the next mce.
1016	*/
1017	for_each_possible_cpu(cpu)
1018	memset(&per_cpu(mces_seen, cpu), `0`, sizeof(struct mce));
1019	}
1020
/*
 * Sum of the per-CPU no_way_out values added in mce_start(); read back by
 * every CPU there and reset to 0 on timeout and in mce_end().
 */
static atomic_t global_nwo;
1022
1023	/*
1024	* Start of Monarch synchronization. This waits until all CPUs have
1025	* entered the exception handler and then determines if any of them
1026	* saw a fatal event that requires panic. Then it executes them
1027	* in the entry order.
1028	* TBD double check parallel CPU hotunplug
1029	*/
1030	static noinstr int mce_start(int *no_way_out)
1031	{
1032	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1033	int order, ret = -`1`;
1034
1035	if (!timeout)
1036	return ret;
1037
1038	raw_atomic_add(i: *no_way_out, v: &global_nwo);
1039	/*
1040	* Rely on the implied barrier below, such that global_nwo
1041	* is updated before mce_callin.
1042	*/
1043	order = raw_atomic_inc_return(v: &mce_callin);
1044	arch_cpumask_clear_cpu(smp_processor_id(), dstp: &mce_missing_cpus);
1045
1046	/ Enable instrumentation around calls to external facilities /
1047	instrumentation_begin();
1048
1049	/*
1050	* Wait for everyone.
1051	*/
1052	while (raw_atomic_read(v: &mce_callin) != num_online_cpus()) {
1053	if (mce_timed_out(t: &timeout,
1054	msg: "Timeout: Not all CPUs entered broadcast exception handler")) {
1055	raw_atomic_set(v: &global_nwo, i: `0`);
1056	goto out;
1057	}
1058	ndelay(SPINUNIT);
1059	}
1060
1061	/*
1062	* mce_callin should be read before global_nwo
1063	*/
1064	smp_rmb();
1065
1066	if (order == `1`) {
1067	/*
1068	* Monarch: Starts executing now, the others wait.
1069	*/
1070	raw_atomic_set(v: &mce_executing, i: `1`);
1071	} else {
1072	/*
1073	* Subject: Now start the scanning loop one by one in
1074	* the original callin order.
1075	* This way when there are any shared banks it will be
1076	* only seen by one CPU before cleared, avoiding duplicates.
1077	*/
1078	while (raw_atomic_read(v: &mce_executing) < order) {
1079	if (mce_timed_out(t: &timeout,
1080	msg: "Timeout: Subject CPUs unable to finish machine check processing")) {
1081	raw_atomic_set(v: &global_nwo, i: `0`);
1082	goto out;
1083	}
1084	ndelay(SPINUNIT);
1085	}
1086	}
1087
1088	/*
1089	* Cache the global no_way_out state.
1090	*/
1091	*no_way_out = raw_atomic_read(v: &global_nwo);
1092
1093	ret = order;
1094
1095	out:
1096	instrumentation_end();
1097
1098	return ret;
1099	}
1100
1101	/*
1102	* Synchronize between CPUs after main scanning loop.
1103	* This invokes the bulk of the Monarch processing.
1104	*/
1105	static noinstr int mce_end(int order)
1106	{
1107	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1108	int ret = -`1`;
1109
1110	/ Allow instrumentation around external facilities. /
1111	instrumentation_begin();
1112
1113	if (!timeout)
1114	goto reset;
1115	if (order < `0`)
1116	goto reset;
1117
1118	/*
1119	* Allow others to run.
1120	*/
1121	atomic_inc(v: &mce_executing);
1122
1123	if (order == `1`) {
1124	/*
1125	* Monarch: Wait for everyone to go through their scanning
1126	* loops.
1127	*/
1128	while (atomic_read(v: &mce_executing) <= num_online_cpus()) {
1129	if (mce_timed_out(t: &timeout,
1130	msg: "Timeout: Monarch CPU unable to finish machine check processing"))
1131	goto reset;
1132	ndelay(SPINUNIT);
1133	}
1134
1135	mce_reign();
1136	barrier();
1137	ret = `0`;
1138	} else {
1139	/*
1140	* Subject: Wait for Monarch to finish.
1141	*/
1142	while (atomic_read(v: &mce_executing) != `0`) {
1143	if (mce_timed_out(t: &timeout,
1144	msg: "Timeout: Monarch CPU did not finish machine check processing"))
1145	goto reset;
1146	ndelay(SPINUNIT);
1147	}
1148
1149	/*
1150	* Don't reset anything. That's done by the Monarch.
1151	*/
1152	ret = `0`;
1153	goto out;
1154	}
1155
1156	/*
1157	* Reset all global state.
1158	*/
1159	reset:
1160	atomic_set(v: &global_nwo, i: `0`);
1161	atomic_set(v: &mce_callin, i: `0`);
1162	cpumask_setall(dstp: &mce_missing_cpus);
1163	barrier();
1164
1165	/*
1166	* Let others run again.
1167	*/
1168	atomic_set(v: &mce_executing, i: `0`);
1169
1170	out:
1171	instrumentation_end();
1172
1173	return ret;
1174	}
1175
1176	static __always_inline void mce_clear_state(unsigned long *toclear)
1177	{
1178	int i;
1179
1180	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
1181	if (arch_test_bit(nr: i, addr: toclear))
1182	mce_wrmsrl(msr: mca_msr_reg(bank: i, reg: MCA_STATUS), v: `0`);
1183	}
1184	}
1185
1186	/*
1187	* Cases where we avoid rendezvous handler timeout:
1188	* 1) If this CPU is offline.
1189	*
1190	* 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1191	* skip those CPUs which remain looping in the 1st kernel - see
1192	* crash_nmi_callback().
1193	*
1194	* Note: there still is a small window between kexec-ing and the new,
1195	* kdump kernel establishing a new #MC handler where a broadcasted MCE
1196	* might not get handled properly.
1197	*/
1198	static noinstr bool mce_check_crashing_cpu(void)
1199	{
1200	unsigned int cpu = smp_processor_id();
1201
1202	if (arch_cpu_is_offline(cpu) \|\|
1203	(crashing_cpu != -`1` && crashing_cpu != cpu)) {
1204	u64 mcgstatus;
1205
1206	mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
1207
1208	if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
1209	if (mcgstatus & MCG_STATUS_LMCES)
1210	return false;
1211	}
1212
1213	if (mcgstatus & MCG_STATUS_RIPV) {
1214	__wrmsr(MSR_IA32_MCG_STATUS, low: `0`, high: `0`);
1215	return true;
1216	}
1217	}
1218	return false;
1219	}
1220
1221	static __always_inline int
1222	__mc_scan_banks(struct mce m, struct* pt_regs regs, struct* mce *final,
1223	unsigned long toclear, unsigned* long valid_banks, int* no_way_out,
1224	int *worst)
1225	{
1226	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1227	struct mca_config *cfg = &mca_cfg;
1228	int severity, i, taint = `0`;
1229
1230	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
1231	arch___clear_bit(nr: i, addr: toclear);
1232	if (!arch_test_bit(nr: i, addr: valid_banks))
1233	continue;
1234
1235	if (!mce_banks[i].ctl)
1236	continue;
1237
1238	m->misc = `0`;
1239	m->addr = `0`;
1240	m->bank = i;
1241
1242	m->status = mce_rdmsrl(msr: mca_msr_reg(bank: i, reg: MCA_STATUS));
1243	if (!(m->status & MCI_STATUS_VAL))
1244	continue;
1245
1246	/*
1247	* Corrected or non-signaled errors are handled by
1248	* machine_check_poll(). Leave them alone, unless this panics.
1249	*/
1250	if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1251	!no_way_out)
1252	continue;
1253
1254	/ Set taint even when machine check was not enabled. /
1255	taint++;
1256
1257	severity = mce_severity(a: m, regs, NULL, is_excp: true);
1258
1259	/*
1260	* When machine check was for corrected/deferred handler don't
1261	* touch, unless we're panicking.
1262	*/
1263	if ((severity == MCE_KEEP_SEVERITY \|\|
1264	severity == MCE_UCNA_SEVERITY) && !no_way_out)
1265	continue;
1266
1267	arch___set_bit(nr: i, addr: toclear);
1268
1269	/ Machine check event was not enabled. Clear, but ignore. /
1270	if (severity == MCE_NO_SEVERITY)
1271	continue;
1272
1273	mce_read_aux(m, i);
1274
1275	/ assuming valid severity level != 0 /
1276	m->severity = severity;
1277
1278	/*
1279	* Enable instrumentation around the mce_log() call which is
1280	* done in #MC context, where instrumentation is disabled.
1281	*/
1282	instrumentation_begin();
1283	mce_log(m);
1284	instrumentation_end();
1285
1286	if (severity > *worst) {
1287	final = m;
1288	*worst = severity;
1289	}
1290	}
1291
1292	/ mce_clear_state will clear final, save locally for use later /*
1293	m = final;
1294
1295	return taint;
1296	}
1297
1298	static void kill_me_now(struct callback_head *ch)
1299	{
1300	struct task_struct p = container_of(ch, struct* task_struct, mce_kill_me);
1301
1302	p->mce_count = `0`;
1303	force_sig(SIGBUS);
1304	}
1305
1306	static void kill_me_maybe(struct callback_head *cb)
1307	{
1308	struct task_struct p = container_of(cb, struct* task_struct, mce_kill_me);
1309	int flags = MF_ACTION_REQUIRED;
1310	unsigned long pfn;
1311	int ret;
1312
1313	p->mce_count = `0`;
1314	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
1315
1316	if (!p->mce_ripv)
1317	flags \|= MF_MUST_KILL;
1318
1319	pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
1320	ret = memory_failure(pfn, flags);
1321	if (!ret) {
1322	set_mce_nospec(pfn);
1323	sync_core();
1324	return;
1325	}
1326
1327	/*
1328	* -EHWPOISON from memory_failure() means that it already sent SIGBUS
1329	* to the current process with the proper error info,
1330	* -EOPNOTSUPP means hwpoison_filter() filtered the error event,
1331	*
1332	* In both cases, no further processing is required.
1333	*/
1334	if (ret == -EHWPOISON \|\| ret == -EOPNOTSUPP)
1335	return;
1336
1337	pr_err("Memory error not recovered");
1338	kill_me_now(ch: cb);
1339	}
1340
1341	static void kill_me_never(struct callback_head *cb)
1342	{
1343	struct task_struct p = container_of(cb, struct* task_struct, mce_kill_me);
1344	unsigned long pfn;
1345
1346	p->mce_count = `0`;
1347	pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
1348	pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
1349	if (!memory_failure(pfn, flags: `0`))
1350	set_mce_nospec(pfn);
1351	}
1352
1353	static void queue_task_work(struct mce m, char* msg, void* (func)(struct* callback_head *))
1354	{
1355	int count = ++current->mce_count;
1356
1357	/ First call, save all the details /
1358	if (count == `1`) {
1359	current->mce_addr = m->addr;
1360	current->mce_kflags = m->kflags;
1361	current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
1362	current->mce_whole_page = whole_page(m);
1363	current->mce_kill_me.func = func;
1364	}
1365
1366	/ Ten is likely overkill. Don't expect more than two faults before task_work() /
1367	if (count > `10`)
1368	mce_panic(msg: "Too many consecutive machine checks while accessing user data", final: m, exp: msg);
1369
1370	/ Second or later call, make sure page address matches the one from first call /
1371	if (count > `1` && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
1372	mce_panic(msg: "Consecutive machine checks to different user pages", final: m, exp: msg);
1373
1374	/ Do not call task_work_add() more than once /
1375	if (count > `1`)
1376	return;
1377
1378	task_work_add(current, twork: &current->mce_kill_me, mode: TWA_RESUME);
1379	}
1380
1381	/ Handle unconfigured int18 (should never happen) /
1382	static noinstr void unexpected_machine_check(struct pt_regs *regs)
1383	{
1384	instrumentation_begin();
1385	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1386	smp_processor_id());
1387	instrumentation_end();
1388	}
1389
1390	/*
1391	* The actual machine check handler. This only handles real exceptions when
1392	* something got corrupted coming in through int 18.
1393	*
1394	* This is executed in #MC context not subject to normal locking rules.
1395	* This implies that most kernel services cannot be safely used. Don't even
1396	* think about putting a printk in there!
1397	*
1398	* On Intel systems this is entered on all CPUs in parallel through
1399	* MCE broadcast. However some CPUs might be broken beyond repair,
1400	* so be always careful when synchronizing with others.
1401	*
1402	* Tracing and kprobes are disabled: if we interrupted a kernel context
1403	* with IF=1, we need to minimize stack usage. There are also recursion
1404	* issues: if the machine check was due to a failure of the memory
1405	* backing the user stack, tracing that reads the user stack will cause
1406	* potentially infinite recursion.
1407	*
1408	* Currently, the #MC handler calls out to a number of external facilities
1409	* and, therefore, allows instrumentation around them. The optimal thing to
1410	* have would be to do the absolutely minimal work required in #MC context
1411	* and have instrumentation disabled only around that. Further processing can
1412	* then happen in process context where instrumentation is allowed. Achieving
1413	* that requires careful auditing and modifications. Until then, the code
1414	* allows instrumentation temporarily, where required. *
1415	*/
1416	noinstr void do_machine_check(struct pt_regs *regs)
1417	{
1418	int worst = `0`, order, no_way_out, kill_current_task, lmce, taint = `0`;
1419	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { `0` };
1420	DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { `0` };
1421	struct mce m, *final;
1422	char *msg = NULL;
1423
1424	if (unlikely(mce_flags.p5))
1425	return pentium_machine_check(regs);
1426	else if (unlikely(mce_flags.winchip))
1427	return winchip_machine_check(regs);
1428	else if (unlikely(!mca_cfg.initialized))
1429	return unexpected_machine_check(regs);
1430
1431	if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
1432	goto clear;
1433
1434	/*
1435	* Establish sequential order between the CPUs entering the machine
1436	* check handler.
1437	*/
1438	order = -`1`;
1439
1440	/*
1441	* If no_way_out gets set, there is no safe way to recover from this
1442	* MCE.
1443	*/
1444	no_way_out = `0`;
1445
1446	/*
1447	* If kill_current_task is not set, there might be a way to recover from this
1448	* error.
1449	*/
1450	kill_current_task = `0`;
1451
1452	/*
1453	* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1454	* on Intel.
1455	*/
1456	lmce = `1`;
1457
1458	this_cpu_inc(mce_exception_count);
1459
1460	mce_gather_info(m: &m, regs);
1461	m.tsc = rdtsc();
1462
1463	final = this_cpu_ptr(&mces_seen);
1464	*final = m;
1465
1466	no_way_out = mce_no_way_out(m: &m, msg: &msg, validp: valid_banks, regs);
1467
1468	barrier();
1469
1470	/*
1471	* When no restart IP might need to kill or panic.
1472	* Assume the worst for now, but if we find the
1473	* severity is MCE_AR_SEVERITY we have other options.
1474	*/
1475	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1476	kill_current_task = `1`;
1477	/*
1478	* Check if this MCE is signaled to only this logical processor,
1479	* on Intel, Zhaoxin only.
1480	*/
1481	if (m.cpuvendor == X86_VENDOR_INTEL \|\|
1482	m.cpuvendor == X86_VENDOR_ZHAOXIN)
1483	lmce = m.mcgstatus & MCG_STATUS_LMCES;
1484
1485	/*
1486	* Local machine check may already know that we have to panic.
1487	* Broadcast machine check begins rendezvous in mce_start()
1488	* Go through all banks in exclusion of the other CPUs. This way we
1489	* don't report duplicated events on shared banks because the first one
1490	* to see it will clear it.
1491	*/
1492	if (lmce) {
1493	if (no_way_out)
1494	mce_panic(msg: "Fatal local machine check", final: &m, exp: msg);
1495	} else {
1496	order = mce_start(no_way_out: &no_way_out);
1497	}
1498
1499	taint = __mc_scan_banks(m: &m, regs, final, toclear, valid_banks, no_way_out, worst: &worst);
1500
1501	if (!no_way_out)
1502	mce_clear_state(toclear);
1503
1504	/*
1505	* Do most of the synchronization with other CPUs.
1506	* When there's any problem use only local no_way_out state.
1507	*/
1508	if (!lmce) {
1509	if (mce_end(order) < `0`) {
1510	if (!no_way_out)
1511	no_way_out = worst >= MCE_PANIC_SEVERITY;
1512
1513	if (no_way_out)
1514	mce_panic(msg: "Fatal machine check on current CPU", final: &m, exp: msg);
1515	}
1516	} else {
1517	/*
1518	* If there was a fatal machine check we should have
1519	* already called mce_panic earlier in this function.
1520	* Since we re-read the banks, we might have found
1521	* something new. Check again to see if we found a
1522	* fatal error. We call "mce_severity()" again to
1523	* make sure we have the right "msg".
1524	*/
1525	if (worst >= MCE_PANIC_SEVERITY) {
1526	mce_severity(a: &m, regs, msg: &msg, is_excp: true);
1527	mce_panic(msg: "Local fatal machine check!", final: &m, exp: msg);
1528	}
1529	}
1530
1531	/*
1532	* Enable instrumentation around the external facilities like task_work_add()
1533	* (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
1534	* properly would need a lot more involved reorganization.
1535	*/
1536	instrumentation_begin();
1537
1538	if (taint)
1539	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1540
1541	if (worst != MCE_AR_SEVERITY && !kill_current_task)
1542	goto out;
1543
1544	/ Fault was in user mode and we need to take some action /
1545	if ((m.cs & `3`) == `3`) {
1546	/ If this triggers there is no way to recover. Die hard. /
1547	BUG_ON(!on_thread_stack() \|\| !user_mode(regs));
1548
1549	if (!mce_usable_address(&m))
1550	queue_task_work(m: &m, msg, func: kill_me_now);
1551	else
1552	queue_task_work(m: &m, msg, func: kill_me_maybe);
1553
1554	} else {
1555	/*
1556	* Handle an MCE which has happened in kernel space but from
1557	* which the kernel can recover: ex_has_fault_handler() has
1558	* already verified that the rIP at which the error happened is
1559	* a rIP from which the kernel can recover (by jumping to
1560	* recovery code specified in _ASM_EXTABLE_FAULT()) and the
1561	* corresponding exception handler which would do that is the
1562	* proper one.
1563	*/
1564	if (m.kflags & MCE_IN_KERNEL_RECOV) {
1565	if (!fixup_exception(regs, X86_TRAP_MC, error_code: `0`, fault_addr: `0`))
1566	mce_panic(msg: "Failed kernel mode recovery", final: &m, exp: msg);
1567	}
1568
1569	if (m.kflags & MCE_IN_KERNEL_COPYIN)
1570	queue_task_work(m: &m, msg, func: kill_me_never);
1571	}
1572
1573	out:
1574	instrumentation_end();
1575
1576	clear:
1577	mce_wrmsrl(MSR_IA32_MCG_STATUS, v: `0`);
1578	}
1579	EXPORT_SYMBOL_GPL(do_machine_check);
1580
1581	#ifndef CONFIG_MEMORY_FAILURE
1582	int memory_failure(unsigned long pfn, int flags)
1583	{
1584	/ mce_severity() should not hand us an ACTION_REQUIRED error /
1585	BUG_ON(flags & MF_ACTION_REQUIRED);
1586	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1587	"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1588	pfn);
1589
1590	return `0`;
1591	}
1592	#endif
1593
1594	/*
1595	* Periodic polling timer for "silent" machine check errors. If the
1596	* poller finds an MCE, poll 2x faster. When the poller finds no more
1597	* errors, poll 2x slower (up to check_interval seconds).
1598	*/
1599	static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1600
1601	static DEFINE_PER_CPU(unsigned long, mce_next_interval); / in jiffies /
1602	static DEFINE_PER_CPU(struct timer_list, mce_timer);
1603
/* Default interval adjustment: hand @interval back unchanged. */
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
	unsigned long adjusted = interval;

	return adjusted;
}
1608
1609	static unsigned long (mce_adjust_timer)(unsigned* long interval) = mce_adjust_timer_default;
1610
1611	static void __start_timer(struct timer_list t, unsigned* long interval)
1612	{
1613	unsigned long when = jiffies + interval;
1614	unsigned long flags;
1615
1616	local_irq_save(flags);
1617
1618	if (!timer_pending(timer: t) \|\| time_before(when, t->expires))
1619	mod_timer(timer: t, expires: round_jiffies(j: when));
1620
1621	local_irq_restore(flags);
1622	}
1623
1624	static void mc_poll_banks_default(void)
1625	{
1626	machine_check_poll(`0`, this_cpu_ptr(&mce_poll_banks));
1627	}
1628
1629	void (mc_poll_banks)(void*) = mc_poll_banks_default;
1630
1631	static void mce_timer_fn(struct timer_list *t)
1632	{
1633	struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1634	unsigned long iv;
1635
1636	WARN_ON(cpu_t != t);
1637
1638	iv = __this_cpu_read(mce_next_interval);
1639
1640	if (mce_available(this_cpu_ptr(&cpu_info))) {
1641	mc_poll_banks();
1642
1643	if (mce_intel_cmci_poll()) {
1644	iv = mce_adjust_timer(iv);
1645	goto done;
1646	}
1647	}
1648
1649	/*
1650	* Alert userspace if needed. If we logged an MCE, reduce the polling
1651	* interval, otherwise increase the polling interval.
1652	*/
1653	if (mce_notify_irq())
1654	iv = max(iv / `2`, (unsigned long) HZ/`100`);
1655	else
1656	iv = min(iv * `2`, round_jiffies_relative(check_interval * HZ));
1657
1658	done:
1659	__this_cpu_write(mce_next_interval, iv);
1660	__start_timer(t, interval: iv);
1661	}
1662
1663	/*
1664	* Ensure that the timer is firing in @interval from now.
1665	*/
1666	void mce_timer_kick(unsigned long interval)
1667	{
1668	struct timer_list *t = this_cpu_ptr(&mce_timer);
1669	unsigned long iv = __this_cpu_read(mce_next_interval);
1670
1671	__start_timer(t, interval);
1672
1673	if (interval < iv)
1674	__this_cpu_write(mce_next_interval, interval);
1675	}
1676
1677	/ Must not be called in IRQ context where del_timer_sync() can deadlock /
1678	static void mce_timer_delete_all(void)
1679	{
1680	int cpu;
1681
1682	for_each_online_cpu(cpu)
1683	del_timer_sync(timer: &per_cpu(mce_timer, cpu));
1684	}
1685
1686	/*
1687	* Notify the user(s) about new machine check events.
1688	* Can be called from interrupt context, but not from machine check/NMI
1689	* context.
1690	*/
1691	int mce_notify_irq(void)
1692	{
1693	/ Not more than two messages every minute /
1694	static DEFINE_RATELIMIT_STATE(ratelimit, `60`*HZ, `2`);
1695
1696	if (test_and_clear_bit(nr: `0`, addr: &mce_need_notify)) {
1697	mce_work_trigger();
1698
1699	if (__ratelimit(&ratelimit))
1700	pr_info(HW_ERR "Machine check events logged\n");
1701
1702	return `1`;
1703	}
1704	return `0`;
1705	}
1706	EXPORT_SYMBOL_GPL(mce_notify_irq);
1707
1708	static void __mcheck_cpu_mce_banks_init(void)
1709	{
1710	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1711	u8 n_banks = this_cpu_read(mce_num_banks);
1712	int i;
1713
1714	for (i = `0`; i < n_banks; i++) {
1715	struct mce_bank *b = &mce_banks[i];
1716
1717	/*
1718	* Init them all, __mcheck_cpu_apply_quirks() is going to apply
1719	* the required vendor quirks before
1720	* __mcheck_cpu_init_clear_banks() does the final bank setup.
1721	*/
1722	b->ctl = -`1ULL`;
1723	b->init = true;
1724	}
1725	}
1726
1727	/*
1728	* Initialize Machine Checks for a CPU.
1729	*/
1730	static void __mcheck_cpu_cap_init(void)
1731	{
1732	u64 cap;
1733	u8 b;
1734
1735	rdmsrl(MSR_IA32_MCG_CAP, cap);
1736
1737	b = cap & MCG_BANKCNT_MASK;
1738
1739	if (b > MAX_NR_BANKS) {
1740	pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1741	smp_processor_id(), MAX_NR_BANKS, b);
1742	b = MAX_NR_BANKS;
1743	}
1744
1745	this_cpu_write(mce_num_banks, b);
1746
1747	__mcheck_cpu_mce_banks_init();
1748
1749	/ Use accurate RIP reporting if available. /
1750	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= `9`)
1751	mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1752
1753	if (cap & MCG_SER_P)
1754	mca_cfg.ser = `1`;
1755	}
1756
1757	static void __mcheck_cpu_init_generic(void)
1758	{
1759	enum mcp_flags m_fl = `0`;
1760	mce_banks_t all_banks;
1761	u64 cap;
1762
1763	if (!mca_cfg.bootlog)
1764	m_fl = MCP_DONTLOG;
1765
1766	/*
1767	* Log the machine checks left over from the previous reset. Log them
1768	* only, do not start processing them. That will happen in mcheck_late_init()
1769	* when all consumers have been registered on the notifier chain.
1770	*/
1771	bitmap_fill(dst: all_banks, MAX_NR_BANKS);
1772	machine_check_poll(MCP_UC \| MCP_QUEUE_LOG \| m_fl, &all_banks);
1773
1774	cr4_set_bits(X86_CR4_MCE);
1775
1776	rdmsrl(MSR_IA32_MCG_CAP, cap);
1777	if (cap & MCG_CTL_P)
1778	wrmsr(MSR_IA32_MCG_CTL, `0xffffffff`, `0xffffffff`);
1779	}
1780
1781	static void __mcheck_cpu_init_clear_banks(void)
1782	{
1783	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1784	int i;
1785
1786	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
1787	struct mce_bank *b = &mce_banks[i];
1788
1789	if (!b->init)
1790	continue;
1791	wrmsrl(msr: mca_msr_reg(bank: i, reg: MCA_CTL), val: b->ctl);
1792	wrmsrl(msr: mca_msr_reg(bank: i, reg: MCA_STATUS), val: `0`);
1793	}
1794	}
1795
1796	/*
1797	* Do a final check to see if there are any unused/RAZ banks.
1798	*
1799	* This must be done after the banks have been initialized and any quirks have
1800	* been applied.
1801	*
1802	* Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
1803	* Otherwise, a user who disables a bank will not be able to re-enable it
1804	* without a system reboot.
1805	*/
1806	static void __mcheck_cpu_check_banks(void)
1807	{
1808	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1809	u64 msrval;
1810	int i;
1811
1812	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
1813	struct mce_bank *b = &mce_banks[i];
1814
1815	if (!b->init)
1816	continue;
1817
1818	rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
1819	b->init = !!msrval;
1820	}
1821	}
1822
1823	/ Add per CPU specific workarounds here /
1824	static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1825	{
1826	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1827	struct mca_config *cfg = &mca_cfg;
1828
1829	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1830	pr_info("unknown CPU type - not enabling MCE support\n");
1831	return -EOPNOTSUPP;
1832	}
1833
1834	/ This should be disabled by the BIOS, but isn't always /
1835	if (c->x86_vendor == X86_VENDOR_AMD) {
1836	if (c->x86 == `15` && this_cpu_read(mce_num_banks) > `4`) {
1837	/*
1838	* disable GART TBL walk error reporting, which
1839	* trips off incorrectly with the IOMMU & 3ware
1840	* & Cerberus:
1841	*/
1842	clear_bit(nr: `10`, addr: (unsigned long *)&mce_banks[`4`].ctl);
1843	}
1844	if (c->x86 < `0x11` && cfg->bootlog < `0`) {
1845	/*
1846	* Lots of broken BIOS around that don't clear them
1847	* by default and leave crap in there. Don't log:
1848	*/
1849	cfg->bootlog = `0`;
1850	}
1851	/*
1852	* Various K7s with broken bank 0 around. Always disable
1853	* by default.
1854	*/
1855	if (c->x86 == `6` && this_cpu_read(mce_num_banks) > `0`)
1856	mce_banks[`0`].ctl = `0`;
1857
1858	/*
1859	* overflow_recov is supported for F15h Models 00h-0fh
1860	* even though we don't have a CPUID bit for it.
1861	*/
1862	if (c->x86 == `0x15` && c->x86_model <= `0xf`)
1863	mce_flags.overflow_recov = `1`;
1864
1865	if (c->x86 >= `0x17` && c->x86 <= `0x1A`)
1866	mce_flags.zen_ifu_quirk = `1`;
1867
1868	}
1869
1870	if (c->x86_vendor == X86_VENDOR_INTEL) {
1871	/*
1872	* SDM documents that on family 6 bank 0 should not be written
1873	* because it aliases to another special BIOS controlled
1874	* register.
1875	* But it's not aliased anymore on model 0x1a+
1876	* Don't ignore bank 0 completely because there could be a
1877	* valid event later, merely don't write CTL0.
1878	*/
1879
1880	if (c->x86 == `6` && c->x86_model < `0x1A` && this_cpu_read(mce_num_banks) > `0`)
1881	mce_banks[`0`].init = false;
1882
1883	/*
1884	* All newer Intel systems support MCE broadcasting. Enable
1885	* synchronization with a one second timeout.
1886	*/
1887	if ((c->x86 > `6` \|\| (c->x86 == `6` && c->x86_model >= `0xe`)) &&
1888	cfg->monarch_timeout < `0`)
1889	cfg->monarch_timeout = USEC_PER_SEC;
1890
1891	/*
1892	* There are also broken BIOSes on some Pentium M and
1893	* earlier systems:
1894	*/
1895	if (c->x86 == `6` && c->x86_model <= `13` && cfg->bootlog < `0`)
1896	cfg->bootlog = `0`;
1897
1898	if (c->x86 == `6` && c->x86_model == `45`)
1899	mce_flags.snb_ifu_quirk = `1`;
1900
1901	/*
1902	* Skylake, Cascacde Lake and Cooper Lake require a quirk on
1903	* rep movs.
1904	*/
1905	if (c->x86 == `6` && c->x86_model == INTEL_FAM6_SKYLAKE_X)
1906	mce_flags.skx_repmov_quirk = `1`;
1907	}
1908
1909	if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
1910	/*
1911	* All newer Zhaoxin CPUs support MCE broadcasting. Enable
1912	* synchronization with a one second timeout.
1913	*/
1914	if (c->x86 > `6` \|\| (c->x86_model == `0x19` \|\| c->x86_model == `0x1f`)) {
1915	if (cfg->monarch_timeout < `0`)
1916	cfg->monarch_timeout = USEC_PER_SEC;
1917	}
1918	}
1919
1920	if (cfg->monarch_timeout < `0`)
1921	cfg->monarch_timeout = `0`;
1922	if (cfg->bootlog != `0`)
1923	cfg->panic_timeout = `30`;
1924
1925	return `0`;
1926	}
1927
1928	static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1929	{
1930	if (c->x86 != `5`)
1931	return `0`;
1932
1933	switch (c->x86_vendor) {
1934	case X86_VENDOR_INTEL:
1935	intel_p5_mcheck_init(c);
1936	mce_flags.p5 = `1`;
1937	return `1`;
1938	case X86_VENDOR_CENTAUR:
1939	winchip_mcheck_init(c);
1940	mce_flags.winchip = `1`;
1941	return `1`;
1942	default:
1943	return `0`;
1944	}
1945
1946	return `0`;
1947	}
1948
1949	/*
1950	* Init basic CPU features needed for early decoding of MCEs.
1951	*/
1952	static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1953	{
1954	if (c->x86_vendor == X86_VENDOR_AMD \|\| c->x86_vendor == X86_VENDOR_HYGON) {
1955	mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1956	mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
1957	mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
1958	mce_flags.amd_threshold = `1`;
1959	}
1960	}
1961
1962	static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1963	{
1964	struct mca_config *cfg = &mca_cfg;
1965
1966	/*
1967	* All newer Centaur CPUs support MCE broadcasting. Enable
1968	* synchronization with a one second timeout.
1969	*/
1970	if ((c->x86 == `6` && c->x86_model == `0xf` && c->x86_stepping >= `0xe`) \|\|
1971	c->x86 > `6`) {
1972	if (cfg->monarch_timeout < `0`)
1973	cfg->monarch_timeout = USEC_PER_SEC;
1974	}
1975	}
1976
1977	static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
1978	{
1979	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1980
1981	/*
1982	* These CPUs have MCA bank 8 which reports only one error type called
1983	* SVAD (System View Address Decoder). The reporting of that error is
1984	* controlled by IA32_MC8.CTL.0.
1985	*
1986	* If enabled, prefetching on these CPUs will cause SVAD MCE when
1987	* virtual machines start and result in a system panic. Always disable
1988	* bank 8 SVAD error by default.
1989	*/
1990	if ((c->x86 == `7` && c->x86_model == `0x1b`) \|\|
1991	(c->x86_model == `0x19` \|\| c->x86_model == `0x1f`)) {
1992	if (this_cpu_read(mce_num_banks) > `8`)
1993	mce_banks[`8`].ctl = `0`;
1994	}
1995
1996	intel_init_cmci();
1997	intel_init_lmce();
1998	mce_adjust_timer = cmci_intel_adjust_timer;
1999	}
2000
/* Zhaoxin specific teardown: only LMCE is cleared, via the Intel helper. */
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{
	intel_clear_lmce();
}
2005
2006	static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
2007	{
2008	switch (c->x86_vendor) {
2009	case X86_VENDOR_INTEL:
2010	mce_intel_feature_init(c);
2011	mce_adjust_timer = cmci_intel_adjust_timer;
2012	break;
2013
2014	case X86_VENDOR_AMD: {
2015	mce_amd_feature_init(c);
2016	break;
2017	}
2018
2019	case X86_VENDOR_HYGON:
2020	mce_hygon_feature_init(c);
2021	break;
2022
2023	case X86_VENDOR_CENTAUR:
2024	mce_centaur_feature_init(c);
2025	break;
2026
2027	case X86_VENDOR_ZHAOXIN:
2028	mce_zhaoxin_feature_init(c);
2029	break;
2030
2031	default:
2032	break;
2033	}
2034	}
2035
2036	static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2037	{
2038	switch (c->x86_vendor) {
2039	case X86_VENDOR_INTEL:
2040	mce_intel_feature_clear(c);
2041	break;
2042
2043	case X86_VENDOR_ZHAOXIN:
2044	mce_zhaoxin_feature_clear(c);
2045	break;
2046
2047	default:
2048	break;
2049	}
2050	}
2051
2052	static void mce_start_timer(struct timer_list *t)
2053	{
2054	unsigned long iv = check_interval * HZ;
2055
2056	if (mca_cfg.ignore_ce \|\| !iv)
2057	return;
2058
2059	this_cpu_write(mce_next_interval, iv);
2060	__start_timer(t, interval: iv);
2061	}
2062
2063	static void __mcheck_cpu_setup_timer(void)
2064	{
2065	struct timer_list *t = this_cpu_ptr(&mce_timer);
2066
2067	timer_setup(t, mce_timer_fn, TIMER_PINNED);
2068	}
2069
2070	static void __mcheck_cpu_init_timer(void)
2071	{
2072	struct timer_list *t = this_cpu_ptr(&mce_timer);
2073
2074	timer_setup(t, mce_timer_fn, TIMER_PINNED);
2075	mce_start_timer(t);
2076	}
2077
2078	bool filter_mce(struct mce *m)
2079	{
2080	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
2081	return amd_filter_mce(m);
2082	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2083	return intel_filter_mce(m);
2084
2085	return false;
2086	}
2087
2088	static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
2089	{
2090	irqentry_state_t irq_state;
2091
2092	WARN_ON_ONCE(user_mode(regs));
2093
2094	/*
2095	* Only required when from kernel mode. See
2096	* mce_check_crashing_cpu() for details.
2097	*/
2098	if (mca_cfg.initialized && mce_check_crashing_cpu())
2099	return;
2100
2101	irq_state = irqentry_nmi_enter(regs);
2102
2103	do_machine_check(regs);
2104
2105	irqentry_nmi_exit(regs, irq_state);
2106	}
2107
2108	static __always_inline void exc_machine_check_user(struct pt_regs *regs)
2109	{
2110	irqentry_enter_from_user_mode(regs);
2111
2112	do_machine_check(regs);
2113
2114	irqentry_exit_to_user_mode(regs);
2115	}
2116
2117	#ifdef CONFIG_X86_64
2118	/ MCE hit kernel mode /
2119	DEFINE_IDTENTRY_MCE(exc_machine_check)
2120	{
2121	unsigned long dr7;
2122
2123	dr7 = local_db_save();
2124	exc_machine_check_kernel(regs);
2125	local_db_restore(dr7);
2126	}
2127
2128	/ The user mode variant. /
2129	DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
2130	{
2131	unsigned long dr7;
2132
2133	dr7 = local_db_save();
2134	exc_machine_check_user(regs);
2135	local_db_restore(dr7);
2136	}
2137	#else
2138	/ 32bit unified entry point /
2139	DEFINE_IDTENTRY_RAW(exc_machine_check)
2140	{
2141	unsigned long dr7;
2142
2143	dr7 = local_db_save();
2144	if (user_mode(regs))
2145	exc_machine_check_user(regs);
2146	else
2147	exc_machine_check_kernel(regs);
2148	local_db_restore(dr7);
2149	}
2150	#endif
2151
2152	/*
2153	* Called for each booted CPU to set up machine checks.
2154	* Must be called with preempt off:
2155	*/
2156	void mcheck_cpu_init(struct cpuinfo_x86 *c)
2157	{
2158	if (mca_cfg.disabled)
2159	return;
2160
2161	if (__mcheck_cpu_ancient_init(c))
2162	return;
2163
2164	if (!mce_available(c))
2165	return;
2166
2167	__mcheck_cpu_cap_init();
2168
2169	if (__mcheck_cpu_apply_quirks(c) < `0`) {
2170	mca_cfg.disabled = `1`;
2171	return;
2172	}
2173
2174	if (mce_gen_pool_init()) {
2175	mca_cfg.disabled = `1`;
2176	pr_emerg("Couldn't allocate MCE records pool!\n");
2177	return;
2178	}
2179
2180	mca_cfg.initialized = `1`;
2181
2182	__mcheck_cpu_init_early(c);
2183	__mcheck_cpu_init_generic();
2184	__mcheck_cpu_init_vendor(c);
2185	__mcheck_cpu_init_clear_banks();
2186	__mcheck_cpu_check_banks();
2187	__mcheck_cpu_setup_timer();
2188	}
2189
2190	/*
2191	* Called for each booted CPU to clear some machine checks opt-ins
2192	*/
2193	void mcheck_cpu_clear(struct cpuinfo_x86 *c)
2194	{
2195	if (mca_cfg.disabled)
2196	return;
2197
2198	if (!mce_available(c))
2199	return;
2200
2201	/*
2202	* Possibly to clear general settings generic to x86
2203	* __mcheck_cpu_clear_generic(c);
2204	*/
2205	__mcheck_cpu_clear_vendor(c);
2206
2207	}
2208
2209	static void __mce_disable_bank(void *arg)
2210	{
2211	int bank = ((int* *)arg);
2212	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2213	cmci_disable_bank(bank);
2214	}
2215
2216	void mce_disable_bank(int bank)
2217	{
2218	if (bank >= this_cpu_read(mce_num_banks)) {
2219	pr_warn(FW_BUG
2220	"Ignoring request to disable invalid MCA bank %d.\n",
2221	bank);
2222	return;
2223	}
2224	set_bit(nr: bank, addr: mce_banks_ce_disabled);
2225	on_each_cpu(func: __mce_disable_bank, info: &bank, wait: `1`);
2226	}
2227
2228	/*
2229	* mce=off Disables machine check
2230	* mce=no_cmci Disables CMCI
2231	* mce=no_lmce Disables LMCE
2232	* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2233	* mce=print_all Print all machine check logs to console
2234	* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2235	* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2236	* monarchtimeout is how long to wait for other CPUs on machine
2237	* check, or 0 to not wait
2238	* mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
2239	and older.
2240	* mce=nobootlog Don't log MCEs from before booting.
2241	* mce=bios_cmci_threshold Don't program the CMCI threshold
2242	* mce=recovery force enable copy_mc_fragile()
2243	*/
2244	static int __init mcheck_enable(char *str)
2245	{
2246	struct mca_config *cfg = &mca_cfg;
2247
2248	if (*str == `0`) {
2249	enable_p5_mce();
2250	return `1`;
2251	}
2252	if (*str == `'='`)
2253	str++;
2254	if (!strcmp(str, "off"))
2255	cfg->disabled = `1`;
2256	else if (!strcmp(str, "no_cmci"))
2257	cfg->cmci_disabled = true;
2258	else if (!strcmp(str, "no_lmce"))
2259	cfg->lmce_disabled = `1`;
2260	else if (!strcmp(str, "dont_log_ce"))
2261	cfg->dont_log_ce = true;
2262	else if (!strcmp(str, "print_all"))
2263	cfg->print_all = true;
2264	else if (!strcmp(str, "ignore_ce"))
2265	cfg->ignore_ce = true;
2266	else if (!strcmp(str, "bootlog") \|\| !strcmp(str, "nobootlog"))
2267	cfg->bootlog = (str[`0`] == `'b'`);
2268	else if (!strcmp(str, "bios_cmci_threshold"))
2269	cfg->bios_cmci_threshold = `1`;
2270	else if (!strcmp(str, "recovery"))
2271	cfg->recovery = `1`;
2272	else if (isdigit(c: str[`0`]))
2273	get_option(str: &str, pint: &(cfg->monarch_timeout));
2274	else {
2275	pr_info("mce argument %s ignored. Please use /sys\n", str);
2276	return `0`;
2277	}
2278	return `1`;
2279	}
2280	__setup("mce", mcheck_enable);
2281
2282	int __init mcheck_init(void)
2283	{
2284	mce_register_decode_chain(&early_nb);
2285	mce_register_decode_chain(&mce_uc_nb);
2286	mce_register_decode_chain(&mce_default_nb);
2287
2288	INIT_WORK(&mce_work, mce_gen_pool_process);
2289	init_irq_work(work: &mce_irq_work, func: mce_irq_work_cb);
2290
2291	return `0`;
2292	}
2293
/*
 * mce_syscore: PM support
 */
2297
2298	/*
2299	* Disable machine checks on suspend and shutdown. We can't really handle
2300	* them later.
2301	*/
2302	static void mce_disable_error_reporting(void)
2303	{
2304	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2305	int i;
2306
2307	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
2308	struct mce_bank *b = &mce_banks[i];
2309
2310	if (b->init)
2311	wrmsrl(msr: mca_msr_reg(bank: i, reg: MCA_CTL), val: `0`);
2312	}
2313	return;
2314	}
2315
2316	static void vendor_disable_error_reporting(void)
2317	{
2318	/*
2319	* Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
2320	* MSRs are socket-wide. Disabling them for just a single offlined CPU
2321	* is bad, since it will inhibit reporting for all shared resources on
2322	* the socket like the last level cache (LLC), the integrated memory
2323	* controller (iMC), etc.
2324	*/
2325	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL \|\|
2326	boot_cpu_data.x86_vendor == X86_VENDOR_HYGON \|\|
2327	boot_cpu_data.x86_vendor == X86_VENDOR_AMD \|\|
2328	boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
2329	return;
2330
2331	mce_disable_error_reporting();
2332	}
2333
/* syscore .suspend hook: disable error reporting where the vendor allows it; always returns 0. */
static int mce_syscore_suspend(void)
{
	vendor_disable_error_reporting();
	return 0;
}
2339
/* syscore .shutdown hook: same action as the suspend hook, without a return value. */
static void mce_syscore_shutdown(void)
{
	vendor_disable_error_reporting();
}
2344
2345	/*
2346	* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2347	* Only one CPU is active at this time, the others get re-added later using
2348	* CPU hotplug:
2349	*/
2350	static void mce_syscore_resume(void)
2351	{
2352	__mcheck_cpu_init_generic();
2353	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2354	__mcheck_cpu_init_clear_banks();
2355	}
2356
2357	static struct syscore_ops mce_syscore_ops = {
2358	.suspend = mce_syscore_suspend,
2359	.shutdown = mce_syscore_shutdown,
2360	.resume = mce_syscore_resume,
2361	};
2362
/*
 * mce_device: Sysfs support
 */
2366
2367	static void mce_cpu_restart(void *data)
2368	{
2369	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2370	return;
2371	__mcheck_cpu_init_generic();
2372	__mcheck_cpu_init_clear_banks();
2373	__mcheck_cpu_init_timer();
2374	}
2375
2376	/ Reinit MCEs after user configuration changes /
2377	static void mce_restart(void)
2378	{
2379	mce_timer_delete_all();
2380	on_each_cpu(func: mce_cpu_restart, NULL, wait: `1`);
2381	mce_schedule_work();
2382	}
2383
2384	/ Toggle features for corrected errors /
2385	static void mce_disable_cmci(void *data)
2386	{
2387	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2388	return;
2389	cmci_clear();
2390	}
2391
2392	static void mce_enable_ce(void *all)
2393	{
2394	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2395	return;
2396	cmci_reenable();
2397	cmci_recheck();
2398	if (all)
2399	__mcheck_cpu_init_timer();
2400	}
2401
2402	static struct bus_type mce_subsys = {
2403	.name = "machinecheck",
2404	.dev_name = "machinecheck",
2405	};
2406
2407	DEFINE_PER_CPU(struct device *, mce_device);
2408
2409	static inline struct mce_bank_dev attr_to_bank(struct* device_attribute *attr)
2410	{
2411	return container_of(attr, struct mce_bank_dev, attr);
2412	}
2413
2414	static ssize_t show_bank(struct device s, struct* device_attribute *attr,
2415	char *buf)
2416	{
2417	u8 bank = attr_to_bank(attr)->bank;
2418	struct mce_bank *b;
2419
2420	if (bank >= per_cpu(mce_num_banks, s->id))
2421	return -EINVAL;
2422
2423	b = &per_cpu(mce_banks_array, s->id)[bank];
2424
2425	if (!b->init)
2426	return -ENODEV;
2427
2428	return sprintf(buf, fmt: "%llx\n", b->ctl);
2429	}
2430
2431	static ssize_t set_bank(struct device s, struct* device_attribute *attr,
2432	const char *buf, size_t size)
2433	{
2434	u8 bank = attr_to_bank(attr)->bank;
2435	struct mce_bank *b;
2436	u64 new;
2437
2438	if (kstrtou64(s: buf, base: `0`, res: &new) < `0`)
2439	return -EINVAL;
2440
2441	if (bank >= per_cpu(mce_num_banks, s->id))
2442	return -EINVAL;
2443
2444	b = &per_cpu(mce_banks_array, s->id)[bank];
2445
2446	if (!b->init)
2447	return -ENODEV;
2448
2449	b->ctl = new;
2450	mce_restart();
2451
2452	return size;
2453	}
2454
2455	static ssize_t set_ignore_ce(struct device *s,
2456	struct device_attribute *attr,
2457	const char *buf, size_t size)
2458	{
2459	u64 new;
2460
2461	if (kstrtou64(s: buf, base: `0`, res: &new) < `0`)
2462	return -EINVAL;
2463
2464	mutex_lock(&mce_sysfs_mutex);
2465	if (mca_cfg.ignore_ce ^ !!new) {
2466	if (new) {
2467	/ disable ce features /
2468	mce_timer_delete_all();
2469	on_each_cpu(func: mce_disable_cmci, NULL, wait: `1`);
2470	mca_cfg.ignore_ce = true;
2471	} else {
2472	/ enable ce features /
2473	mca_cfg.ignore_ce = false;
2474	on_each_cpu(func: mce_enable_ce, info: (void *)`1`, wait: `1`);
2475	}
2476	}
2477	mutex_unlock(lock: &mce_sysfs_mutex);
2478
2479	return size;
2480	}
2481
2482	static ssize_t set_cmci_disabled(struct device *s,
2483	struct device_attribute *attr,
2484	const char *buf, size_t size)
2485	{
2486	u64 new;
2487
2488	if (kstrtou64(s: buf, base: `0`, res: &new) < `0`)
2489	return -EINVAL;
2490
2491	mutex_lock(&mce_sysfs_mutex);
2492	if (mca_cfg.cmci_disabled ^ !!new) {
2493	if (new) {
2494	/ disable cmci /
2495	on_each_cpu(func: mce_disable_cmci, NULL, wait: `1`);
2496	mca_cfg.cmci_disabled = true;
2497	} else {
2498	/ enable cmci /
2499	mca_cfg.cmci_disabled = false;
2500	on_each_cpu(func: mce_enable_ce, NULL, wait: `1`);
2501	}
2502	}
2503	mutex_unlock(lock: &mce_sysfs_mutex);
2504
2505	return size;
2506	}
2507
2508	static ssize_t store_int_with_restart(struct device *s,
2509	struct device_attribute *attr,
2510	const char *buf, size_t size)
2511	{
2512	unsigned long old_check_interval = check_interval;
2513	ssize_t ret = device_store_ulong(dev: s, attr, buf, count: size);
2514
2515	if (check_interval == old_check_interval)
2516	return ret;
2517
2518	mutex_lock(&mce_sysfs_mutex);
2519	mce_restart();
2520	mutex_unlock(lock: &mce_sysfs_mutex);
2521
2522	return ret;
2523	}
2524
2525	static DEVICE_INT_ATTR(monarch_timeout, `0644`, mca_cfg.monarch_timeout);
2526	static DEVICE_BOOL_ATTR(dont_log_ce, `0644`, mca_cfg.dont_log_ce);
2527	static DEVICE_BOOL_ATTR(print_all, `0644`, mca_cfg.print_all);
2528
2529	static struct dev_ext_attribute dev_attr_check_interval = {
2530	__ATTR(check_interval, `0644`, device_show_int, store_int_with_restart),
2531	&check_interval
2532	};
2533
2534	static struct dev_ext_attribute dev_attr_ignore_ce = {
2535	__ATTR(ignore_ce, `0644`, device_show_bool, set_ignore_ce),
2536	&mca_cfg.ignore_ce
2537	};
2538
2539	static struct dev_ext_attribute dev_attr_cmci_disabled = {
2540	__ATTR(cmci_disabled, `0644`, device_show_bool, set_cmci_disabled),
2541	&mca_cfg.cmci_disabled
2542	};
2543
2544	static struct device_attribute *mce_device_attrs[] = {
2545	&dev_attr_check_interval.attr,
2546	#ifdef CONFIG_X86_MCELOG_LEGACY
2547	&dev_attr_trigger,
2548	#endif
2549	&dev_attr_monarch_timeout.attr,
2550	&dev_attr_dont_log_ce.attr,
2551	&dev_attr_print_all.attr,
2552	&dev_attr_ignore_ce.attr,
2553	&dev_attr_cmci_disabled.attr,
2554	NULL
2555	};
2556
2557	static cpumask_var_t mce_device_initialized;
2558
2559	static void mce_device_release(struct device *dev)
2560	{
2561	kfree(objp: dev);
2562	}
2563
2564	/ Per CPU device init. All of the CPUs still share the same bank device: /
2565	static int mce_device_create(unsigned int cpu)
2566	{
2567	struct device *dev;
2568	int err;
2569	int i, j;
2570
2571	if (!mce_available(c: &boot_cpu_data))
2572	return -EIO;
2573
2574	dev = per_cpu(mce_device, cpu);
2575	if (dev)
2576	return `0`;
2577
2578	dev = kzalloc(size: sizeof(*dev), GFP_KERNEL);
2579	if (!dev)
2580	return -ENOMEM;
2581	dev->id = cpu;
2582	dev->bus = &mce_subsys;
2583	dev->release = &mce_device_release;
2584
2585	err = device_register(dev);
2586	if (err) {
2587	put_device(dev);
2588	return err;
2589	}
2590
2591	for (i = `0`; mce_device_attrs[i]; i++) {
2592	err = device_create_file(device: dev, entry: mce_device_attrs[i]);
2593	if (err)
2594	goto error;
2595	}
2596	for (j = `0`; j < per_cpu(mce_num_banks, cpu); j++) {
2597	err = device_create_file(device: dev, entry: &mce_bank_devs[j].attr);
2598	if (err)
2599	goto error2;
2600	}
2601	cpumask_set_cpu(cpu, dstp: mce_device_initialized);
2602	per_cpu(mce_device, cpu) = dev;
2603
2604	return `0`;
2605	error2:
2606	while (--j >= `0`)
2607	device_remove_file(dev, attr: &mce_bank_devs[j].attr);
2608	error:
2609	while (--i >= `0`)
2610	device_remove_file(dev, attr: mce_device_attrs[i]);
2611
2612	device_unregister(dev);
2613
2614	return err;
2615	}
2616
2617	static void mce_device_remove(unsigned int cpu)
2618	{
2619	struct device *dev = per_cpu(mce_device, cpu);
2620	int i;
2621
2622	if (!cpumask_test_cpu(cpu, cpumask: mce_device_initialized))
2623	return;
2624
2625	for (i = `0`; mce_device_attrs[i]; i++)
2626	device_remove_file(dev, attr: mce_device_attrs[i]);
2627
2628	for (i = `0`; i < per_cpu(mce_num_banks, cpu); i++)
2629	device_remove_file(dev, attr: &mce_bank_devs[i].attr);
2630
2631	device_unregister(dev);
2632	cpumask_clear_cpu(cpu, dstp: mce_device_initialized);
2633	per_cpu(mce_device, cpu) = NULL;
2634	}
2635
2636	/ Make sure there are no machine checks on offlined CPUs. /
2637	static void mce_disable_cpu(void)
2638	{
2639	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2640	return;
2641
2642	if (!cpuhp_tasks_frozen)
2643	cmci_clear();
2644
2645	vendor_disable_error_reporting();
2646	}
2647
2648	static void mce_reenable_cpu(void)
2649	{
2650	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2651	int i;
2652
2653	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2654	return;
2655
2656	if (!cpuhp_tasks_frozen)
2657	cmci_reenable();
2658	for (i = `0`; i < this_cpu_read(mce_num_banks); i++) {
2659	struct mce_bank *b = &mce_banks[i];
2660
2661	if (b->init)
2662	wrmsrl(msr: mca_msr_reg(bank: i, reg: MCA_CTL), val: b->ctl);
2663	}
2664	}
2665
2666	static int mce_cpu_dead(unsigned int cpu)
2667	{
2668	mce_intel_hcpu_update(cpu);
2669
2670	/ intentionally ignoring frozen here /
2671	if (!cpuhp_tasks_frozen)
2672	cmci_rediscover();
2673	return `0`;
2674	}
2675
2676	static int mce_cpu_online(unsigned int cpu)
2677	{
2678	struct timer_list *t = this_cpu_ptr(&mce_timer);
2679	int ret;
2680
2681	mce_device_create(cpu);
2682
2683	ret = mce_threshold_create_device(cpu);
2684	if (ret) {
2685	mce_device_remove(cpu);
2686	return ret;
2687	}
2688	mce_reenable_cpu();
2689	mce_start_timer(t);
2690	return `0`;
2691	}
2692
2693	static int mce_cpu_pre_down(unsigned int cpu)
2694	{
2695	struct timer_list *t = this_cpu_ptr(&mce_timer);
2696
2697	mce_disable_cpu();
2698	del_timer_sync(timer: t);
2699	mce_threshold_remove_device(cpu);
2700	mce_device_remove(cpu);
2701	return `0`;
2702	}
2703
2704	static __init void mce_init_banks(void)
2705	{
2706	int i;
2707
2708	for (i = `0`; i < MAX_NR_BANKS; i++) {
2709	struct mce_bank_dev *b = &mce_bank_devs[i];
2710	struct device_attribute *a = &b->attr;
2711
2712	b->bank = i;
2713
2714	sysfs_attr_init(&a->attr);
2715	a->attr.name = b->attrname;
2716	snprintf(buf: b->attrname, ATTR_LEN, fmt: "bank%d", i);
2717
2718	a->attr.mode = `0644`;
2719	a->show = show_bank;
2720	a->store = set_bank;
2721	}
2722	}
2723
2724	/*
2725	* When running on XEN, this initcall is ordered against the XEN mcelog
2726	* initcall:
2727	*
2728	* device_initcall(xen_late_init_mcelog);
2729	* device_initcall_sync(mcheck_init_device);
2730	*/
2731	static __init int mcheck_init_device(void)
2732	{
2733	int err;
2734
2735	/*
2736	* Check if we have a spare virtual bit. This will only become
2737	* a problem if/when we move beyond 5-level page tables.
2738	*/
2739	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= `63`);
2740
2741	if (!mce_available(c: &boot_cpu_data)) {
2742	err = -EIO;
2743	goto err_out;
2744	}
2745
2746	if (!zalloc_cpumask_var(mask: &mce_device_initialized, GFP_KERNEL)) {
2747	err = -ENOMEM;
2748	goto err_out;
2749	}
2750
2751	mce_init_banks();
2752
2753	err = subsys_system_register(subsys: &mce_subsys, NULL);
2754	if (err)
2755	goto err_out_mem;
2756
2757	err = cpuhp_setup_state(state: CPUHP_X86_MCE_DEAD, name: "x86/mce:dead", NULL,
2758	teardown: mce_cpu_dead);
2759	if (err)
2760	goto err_out_mem;
2761
2762	/*
2763	* Invokes mce_cpu_online() on all CPUs which are online when
2764	* the state is installed.
2765	*/
2766	err = cpuhp_setup_state(state: CPUHP_AP_ONLINE_DYN, name: "x86/mce:online",
2767	startup: mce_cpu_online, teardown: mce_cpu_pre_down);
2768	if (err < `0`)
2769	goto err_out_online;
2770
2771	register_syscore_ops(ops: &mce_syscore_ops);
2772
2773	return `0`;
2774
2775	err_out_online:
2776	cpuhp_remove_state(state: CPUHP_X86_MCE_DEAD);
2777
2778	err_out_mem:
2779	free_cpumask_var(mask: mce_device_initialized);
2780
2781	err_out:
2782	pr_err("Unable to init MCE device (rc: %d)\n", err);
2783
2784	return err;
2785	}
2786	device_initcall_sync(mcheck_init_device);
2787
2788	/*
2789	* Old style boot options parsing. Only for compatibility.
2790	*/
2791	static int __init mcheck_disable(char *str)
2792	{
2793	mca_cfg.disabled = `1`;
2794	return `1`;
2795	}
2796	__setup("nomce", mcheck_disable);
2797
2798	#ifdef CONFIG_DEBUG_FS
2799	struct dentry mce_get_debugfs_dir(void*)
2800	{
2801	static struct dentry *dmce;
2802
2803	if (!dmce)
2804	dmce = debugfs_create_dir(name: "mce", NULL);
2805
2806	return dmce;
2807	}
2808
2809	static void mce_reset(void)
2810	{
2811	atomic_set(v: &mce_fake_panicked, i: `0`);
2812	atomic_set(v: &mce_executing, i: `0`);
2813	atomic_set(v: &mce_callin, i: `0`);
2814	atomic_set(v: &global_nwo, i: `0`);
2815	cpumask_setall(dstp: &mce_missing_cpus);
2816	}
2817
2818	static int fake_panic_get(void data, u64 val)
2819	{
2820	*val = fake_panic;
2821	return `0`;
2822	}
2823
2824	static int fake_panic_set(void *data, u64 val)
2825	{
2826	mce_reset();
2827	fake_panic = val;
2828	return `0`;
2829	}
2830
2831	DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2832	"%llu\n");
2833
2834	static void __init mcheck_debugfs_init(void)
2835	{
2836	struct dentry *dmce;
2837
2838	dmce = mce_get_debugfs_dir();
2839	debugfs_create_file_unsafe(name: "fake_panic", mode: `0444`, parent: dmce, NULL,
2840	fops: &fake_panic_fops);
2841	}
2842	#else
2843	static void __init mcheck_debugfs_init(void) { }
2844	#endif
2845
2846	static int __init mcheck_late_init(void)
2847	{
2848	if (mca_cfg.recovery)
2849	enable_copy_mc_fragile();
2850
2851	mcheck_debugfs_init();
2852
2853	/*
2854	* Flush out everything that has been logged during early boot, now that
2855	* everything has been initialized (workqueues, decoders, ...).
2856	*/
2857	mce_schedule_work();
2858
2859	return `0`;
2860	}
2861	late_initcall(mcheck_late_init);
2862

source code of linux/arch/x86/kernel/cpu/mce/core.c