1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Machine check handler.
4 *
5 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
6 * Rest from unknown author(s).
7 * 2004 Andi Kleen. Rewrote most of it.
8 * Copyright 2008 Intel Corporation
9 * Author: Andi Kleen
10 */
11
12#include <linux/thread_info.h>
13#include <linux/capability.h>
14#include <linux/miscdevice.h>
15#include <linux/ratelimit.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/device.h>
24#include <linux/syscore_ops.h>
25#include <linux/delay.h>
26#include <linux/ctype.h>
27#include <linux/sched.h>
28#include <linux/sysfs.h>
29#include <linux/types.h>
30#include <linux/slab.h>
31#include <linux/init.h>
32#include <linux/kmod.h>
33#include <linux/poll.h>
34#include <linux/nmi.h>
35#include <linux/cpu.h>
36#include <linux/ras.h>
37#include <linux/smp.h>
38#include <linux/fs.h>
39#include <linux/mm.h>
40#include <linux/debugfs.h>
41#include <linux/irq_work.h>
42#include <linux/export.h>
43#include <linux/set_memory.h>
44#include <linux/sync_core.h>
45#include <linux/task_work.h>
46#include <linux/hardirq.h>
47
48#include <asm/intel-family.h>
49#include <asm/processor.h>
50#include <asm/traps.h>
51#include <asm/tlbflush.h>
52#include <asm/mce.h>
53#include <asm/msr.h>
54#include <asm/reboot.h>
55
56#include "internal.h"
57
58/* sysfs synchronization */
59static DEFINE_MUTEX(mce_sysfs_mutex);
60
61#define CREATE_TRACE_POINTS
62#include <trace/events/mce.h>
63
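/*
 * Spin-wait granularity used by the rendezvous loops below: the Monarch
 * timeout is decremented in SPINUNIT steps between ndelay(SPINUNIT) spins.
 */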
64#define SPINUNIT 100 /* 100ns */
65
66DEFINE_PER_CPU(unsigned, mce_exception_count);
67
68DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
69
70DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
71
72#define ATTR_LEN 16
73/* One object for each MCE bank, shared by all CPUs */
74struct mce_bank_dev {
75 struct device_attribute attr; /* device attribute */
76 char attrname[ATTR_LEN]; /* attribute name */
77 u8 bank; /* bank number */
78};
79static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
80
81struct mce_vendor_flags mce_flags __read_mostly;
82
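/*
 * Note: the -1 defaults below mean "not set on the command line"; the vendor
 * quirk code in __mcheck_cpu_apply_quirks() picks the final values.
 */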
83struct mca_config mca_cfg __read_mostly = {
84 .bootlog = -1,
85 .monarch_timeout = -1
86};
87
88static DEFINE_PER_CPU(struct mce, mces_seen);
89static unsigned long mce_need_notify;
90
91/*
92 * MCA banks polled by the period polling timer for corrected events.
93 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
94 */
95DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
96 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
97};
98
99/*
100 * MCA banks controlled through firmware first for corrected errors.
101 * This is a global list of banks for which we won't enable CMCI and we
102 * won't poll. Firmware controls these banks and is responsible for
103 * reporting corrected errors through GHES. Uncorrected/recoverable
104 * errors are still notified through a machine check.
105 */
106mce_banks_t mce_banks_ce_disabled;
107
108static struct work_struct mce_work;
109static struct irq_work mce_irq_work;
110
111/*
112 * CPU/chipset specific EDAC code can register a notifier call here to print
113 * MCE errors in a human-readable form.
114 */
115BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
116
117/* Do initial initialization of a struct mce */
118void mce_setup(struct mce *m)
119{
120 memset(m, 0, sizeof(struct mce));
121 m->cpu = m->extcpu = smp_processor_id();
122 /* need the internal __ version to avoid deadlocks */
123 m->time = __ktime_get_real_seconds();
124 m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
126 m->socketid = cpu_data(m->extcpu).topo.pkg_id;
127 m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
128 m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
129 m->ppin = cpu_data(m->extcpu).ppin;
130 m->microcode = boot_cpu_data.microcode;
131}
132
133DEFINE_PER_CPU(struct mce, injectm);
134EXPORT_PER_CPU_SYMBOL_GPL(injectm);
135
136void mce_log(struct mce *m)
137{
	if (!mce_gen_pool_add(m))
		irq_work_queue(&mce_irq_work);
140}
141EXPORT_SYMBOL_GPL(mce_log);
142
143void mce_register_decode_chain(struct notifier_block *nb)
144{
145 if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
146 nb->priority > MCE_PRIO_HIGHEST))
147 return;
148
	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
150}
151EXPORT_SYMBOL_GPL(mce_register_decode_chain);
152
153void mce_unregister_decode_chain(struct notifier_block *nb)
154{
	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
156}
157EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
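
/*
 * A minimal usage sketch (hypothetical "foo" decoder module, not part of this
 * file): a decoder registers a notifier_block whose priority lies between
 * MCE_PRIO_LOWEST and MCE_PRIO_HIGHEST, e.g.:
 *
 *	static struct notifier_block foo_mce_dec_nb = {
 *		.notifier_call	= foo_decode_mce,
 *		.priority	= MCE_PRIO_EDAC,
 *	};
 *
 *	mce_register_decode_chain(&foo_mce_dec_nb);
 *	...
 *	mce_unregister_decode_chain(&foo_mce_dec_nb);
 */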
158
159static void __print_mce(struct mce *m)
160{
161 pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
162 m->extcpu,
163 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
164 m->mcgstatus, m->bank, m->status);
165
166 if (m->ip) {
167 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
168 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
169 m->cs, m->ip);
170
171 if (m->cs == __KERNEL_CS)
172 pr_cont("{%pS}", (void *)(unsigned long)m->ip);
173 pr_cont("\n");
174 }
175
176 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
177 if (m->addr)
178 pr_cont("ADDR %llx ", m->addr);
179 if (m->misc)
180 pr_cont("MISC %llx ", m->misc);
181 if (m->ppin)
182 pr_cont("PPIN %llx ", m->ppin);
183
184 if (mce_flags.smca) {
185 if (m->synd)
186 pr_cont("SYND %llx ", m->synd);
187 if (m->ipid)
188 pr_cont("IPID %llx ", m->ipid);
189 }
190
191 pr_cont("\n");
192
193 /*
194 * Note this output is parsed by external tools and old fields
195 * should not be changed.
196 */
197 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
198 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
199 m->microcode);
200}
201
202static void print_mce(struct mce *m)
203{
204 __print_mce(m);
205
206 if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
207 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
208}
209
210#define PANIC_TIMEOUT 5 /* 5 seconds */
211
212static atomic_t mce_panicked;
213
214static int fake_panic;
215static atomic_t mce_fake_panicked;
216
217/* Panic in progress. Enable interrupts and wait for final IPI */
218static void wait_for_panic(void)
219{
220 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
221
222 preempt_disable();
223 local_irq_enable();
224 while (timeout-- > 0)
225 udelay(1);
226 if (panic_timeout == 0)
227 panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking machine check CPU died");
229}
230
231static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
232{
233 struct llist_node *pending;
234 struct mce_evt_llist *l;
235 int apei_err = 0;
236
237 /*
238 * Allow instrumentation around external facilities usage. Not that it
239 * matters a whole lot since the machine is going to panic anyway.
240 */
241 instrumentation_begin();
242
243 if (!fake_panic) {
244 /*
245 * Make sure only one CPU runs in machine check panic
246 */
		if (atomic_inc_return(&mce_panicked) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_panicked) > 1)
256 goto out;
257 }
258 pending = mce_gen_pool_prepare_records();
259 /* First print corrected ones that are still unlogged */
260 llist_for_each_entry(l, pending, llnode) {
261 struct mce *m = &l->mce;
262 if (!(m->status & MCI_STATUS_UC)) {
263 print_mce(m);
264 if (!apei_err)
265 apei_err = apei_write_mce(m);
266 }
267 }
268 /* Now print uncorrected but with the final one last */
269 llist_for_each_entry(l, pending, llnode) {
270 struct mce *m = &l->mce;
271 if (!(m->status & MCI_STATUS_UC))
272 continue;
		if (!final || mce_cmp(m, final)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
283 }
284 if (exp)
285 pr_emerg(HW_ERR "Machine check: %s\n", exp);
286 if (!fake_panic) {
287 if (panic_timeout == 0)
288 panic_timeout = mca_cfg.panic_timeout;
		panic(msg);
290 } else
291 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
292
293out:
294 instrumentation_end();
295}
296
297/* Support code for software error injection */
298
299static int msr_to_offset(u32 msr)
300{
301 unsigned bank = __this_cpu_read(injectm.bank);
302
303 if (msr == mca_cfg.rip_msr)
304 return offsetof(struct mce, ip);
	if (msr == mca_msr_reg(bank, MCA_STATUS))
		return offsetof(struct mce, status);
	if (msr == mca_msr_reg(bank, MCA_ADDR))
		return offsetof(struct mce, addr);
	if (msr == mca_msr_reg(bank, MCA_MISC))
310 return offsetof(struct mce, misc);
311 if (msr == MSR_IA32_MCG_STATUS)
312 return offsetof(struct mce, mcgstatus);
313 return -1;
314}
315
316void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
317{
318 if (wrmsr) {
319 pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
320 (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
321 regs->ip, (void *)regs->ip);
322 } else {
323 pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
324 (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
325 }
326
327 show_stack_regs(regs);
328
	panic("MCA architectural violation!\n");
330
331 while (true)
332 cpu_relax();
333}
334
335/* MSR access wrappers used for error injection */
336noinstr u64 mce_rdmsrl(u32 msr)
337{
338 DECLARE_ARGS(val, low, high);
339
340 if (__this_cpu_read(injectm.finished)) {
341 int offset;
342 u64 ret;
343
344 instrumentation_begin();
345
346 offset = msr_to_offset(msr);
347 if (offset < 0)
348 ret = 0;
349 else
350 ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
351
352 instrumentation_end();
353
354 return ret;
355 }
356
357 /*
358 * RDMSR on MCA MSRs should not fault. If they do, this is very much an
359 * architectural violation and needs to be reported to hw vendor. Panic
360 * the box to not allow any further progress.
361 */
362 asm volatile("1: rdmsr\n"
363 "2:\n"
364 _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
365 : EAX_EDX_RET(val, low, high) : "c" (msr));
366
367
368 return EAX_EDX_VAL(val, low, high);
369}
370
371static noinstr void mce_wrmsrl(u32 msr, u64 v)
372{
373 u32 low, high;
374
375 if (__this_cpu_read(injectm.finished)) {
376 int offset;
377
378 instrumentation_begin();
379
380 offset = msr_to_offset(msr);
381 if (offset >= 0)
382 *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
383
384 instrumentation_end();
385
386 return;
387 }
388
389 low = (u32)v;
390 high = (u32)(v >> 32);
391
392 /* See comment in mce_rdmsrl() */
393 asm volatile("1: wrmsr\n"
394 "2:\n"
395 _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
396 : : "c" (msr), "a"(low), "d" (high) : "memory");
397}
398
399/*
400 * Collect all global (w.r.t. this processor) status about this machine
401 * check into our "mce" struct so that we can use it later to assess
402 * the severity of the problem as we read per-bank specific details.
403 */
404static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
405{
406 /*
407 * Enable instrumentation around mce_setup() which calls external
408 * facilities.
409 */
410 instrumentation_begin();
411 mce_setup(m);
412 instrumentation_end();
413
414 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
415 if (regs) {
416 /*
417 * Get the address of the instruction at the time of
418 * the machine check error.
419 */
420 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
421 m->ip = regs->ip;
422 m->cs = regs->cs;
423
424 /*
425 * When in VM86 mode make the cs look like ring 3
426 * always. This is a lie, but it's better than passing
427 * the additional vm86 bit around everywhere.
428 */
429 if (v8086_mode(regs))
430 m->cs |= 3;
431 }
432 /* Use accurate RIP reporting if available. */
433 if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
435 }
436}
437
438int mce_available(struct cpuinfo_x86 *c)
439{
440 if (mca_cfg.disabled)
441 return 0;
442 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
443}
444
445static void mce_schedule_work(void)
446{
447 if (!mce_gen_pool_empty())
		schedule_work(&mce_work);
449}
450
451static void mce_irq_work_cb(struct irq_work *entry)
452{
453 mce_schedule_work();
454}
455
456bool mce_usable_address(struct mce *m)
457{
458 if (!(m->status & MCI_STATUS_ADDRV))
459 return false;
460
461 switch (m->cpuvendor) {
462 case X86_VENDOR_AMD:
463 return amd_mce_usable_address(m);
464
465 case X86_VENDOR_INTEL:
466 case X86_VENDOR_ZHAOXIN:
467 return intel_mce_usable_address(m);
468
469 default:
470 return true;
471 }
472}
473EXPORT_SYMBOL_GPL(mce_usable_address);
474
475bool mce_is_memory_error(struct mce *m)
476{
477 switch (m->cpuvendor) {
478 case X86_VENDOR_AMD:
479 case X86_VENDOR_HYGON:
480 return amd_mce_is_memory_error(m);
481
482 case X86_VENDOR_INTEL:
483 case X86_VENDOR_ZHAOXIN:
484 /*
485 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
486 *
487 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
488 * indicating a memory error. Bit 8 is used for indicating a
489 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error.
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just give more detail on what
494 * bus/interconnect error happened. Note that bit 12 can be
495 * ignored, as it's the "filter" bit.
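		 *
		 * For reference, the SDM compound error code formats this is
		 * roughly matching against are:
		 *   Memory controller errors:	0000 0000 1MMM CCCC  (bit 7)
		 *   Cache hierarchy errors:	0000 0001 RRRR TTLL  (bit 8)
		 *   Generic cache errors:	0000 0000 0000 11LL  (bits 3:2)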
496 */
497 return (m->status & 0xef80) == BIT(7) ||
498 (m->status & 0xef00) == BIT(8) ||
499 (m->status & 0xeffc) == 0xc;
500
501 default:
502 return false;
503 }
504}
505EXPORT_SYMBOL_GPL(mce_is_memory_error);
506
507static bool whole_page(struct mce *m)
508{
509 if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
510 return true;
511
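	/*
	 * MCI_MISC[5:0] is the least significant valid bit of the recoverable
	 * address, i.e. the poison granularity is 2^LSB bytes. For example,
	 * an LSB of 6 (one 64-byte cache line) is smaller than a page, while
	 * an LSB of PAGE_SHIFT (12) or more covers the whole page.
	 */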
512 return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
513}
514
515bool mce_is_correctable(struct mce *m)
516{
517 if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
518 return false;
519
520 if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
521 return false;
522
523 if (m->status & MCI_STATUS_UC)
524 return false;
525
526 return true;
527}
528EXPORT_SYMBOL_GPL(mce_is_correctable);
529
530static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
531 void *data)
532{
533 struct mce *m = (struct mce *)data;
534
535 if (!m)
536 return NOTIFY_DONE;
537
538 /* Emit the trace record: */
539 trace_mce_record(m);
540
	set_bit(0, &mce_need_notify);
542
543 mce_notify_irq();
544
545 return NOTIFY_DONE;
546}
547
548static struct notifier_block early_nb = {
549 .notifier_call = mce_early_notifier,
550 .priority = MCE_PRIO_EARLY,
551};
552
553static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
554 void *data)
555{
556 struct mce *mce = (struct mce *)data;
557 unsigned long pfn;
558
559 if (!mce || !mce_usable_address(mce))
560 return NOTIFY_DONE;
561
562 if (mce->severity != MCE_AO_SEVERITY &&
563 mce->severity != MCE_DEFERRED_SEVERITY)
564 return NOTIFY_DONE;
565
566 pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
	if (!memory_failure(pfn, 0)) {
568 set_mce_nospec(pfn);
569 mce->kflags |= MCE_HANDLED_UC;
570 }
571
572 return NOTIFY_OK;
573}
574
575static struct notifier_block mce_uc_nb = {
576 .notifier_call = uc_decode_notifier,
577 .priority = MCE_PRIO_UC,
578};
579
580static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
581 void *data)
582{
583 struct mce *m = (struct mce *)data;
584
585 if (!m)
586 return NOTIFY_DONE;
587
588 if (mca_cfg.print_all || !m->kflags)
589 __print_mce(m);
590
591 return NOTIFY_DONE;
592}
593
594static struct notifier_block mce_default_nb = {
595 .notifier_call = mce_default_notifier,
596 /* lowest prio, we want it to run last. */
597 .priority = MCE_PRIO_LOWEST,
598};
599
600/*
601 * Read ADDR and MISC registers.
602 */
603static noinstr void mce_read_aux(struct mce *m, int i)
604{
605 if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));

	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
610
611 /*
612 * Mask the reported address by the reported granularity.
613 */
614 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
615 u8 shift = MCI_MISC_ADDR_LSB(m->misc);
616 m->addr >>= shift;
617 m->addr <<= shift;
618 }
619
620 smca_extract_err_addr(m);
621 }
622
623 if (mce_flags.smca) {
624 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
625
626 if (m->status & MCI_STATUS_SYNDV)
627 m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
628 }
629}
630
631DEFINE_PER_CPU(unsigned, mce_poll_count);
632
633/*
634 * Poll for corrected events or events that happened before reset.
635 * Those are just logged through /dev/mcelog.
636 *
637 * This is executed in standard interrupt context.
638 *
 * Note: spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
646 * not fully execute the machine check handler either.
647 */
648bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
649{
650 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
651 bool error_seen = false;
652 struct mce m;
653 int i;
654
655 this_cpu_inc(mce_poll_count);
656
	mce_gather_info(&m, NULL);
658
659 if (flags & MCP_TIMESTAMP)
660 m.tsc = rdtsc();
661
662 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
663 if (!mce_banks[i].ctl || !test_bit(i, *b))
664 continue;
665
666 m.misc = 0;
667 m.addr = 0;
668 m.bank = i;
669
670 barrier();
		m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
672
673 /* If this entry is not valid, ignore it */
674 if (!(m.status & MCI_STATUS_VAL))
675 continue;
676
677 /*
678 * If we are logging everything (at CPU online) or this
679 * is a corrected error, then we must log it.
680 */
681 if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
682 goto log_it;
683
684 /*
685 * Newer Intel systems that support software error
686 * recovery need to make additional checks. Other
687 * CPUs should skip over uncorrected errors, but log
688 * everything else.
689 */
690 if (!mca_cfg.ser) {
691 if (m.status & MCI_STATUS_UC)
692 continue;
693 goto log_it;
694 }
695
696 /* Log "not enabled" (speculative) errors */
697 if (!(m.status & MCI_STATUS_EN))
698 goto log_it;
699
700 /*
701 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
702 * UC == 1 && PCC == 0 && S == 0
703 */
704 if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
705 goto log_it;
706
707 /*
708 * Skip anything else. Presumption is that our read of this
709 * bank is racing with a machine check. Leave the log alone
710 * for do_machine_check() to deal with it.
711 */
712 continue;
713
714log_it:
715 error_seen = true;
716
717 if (flags & MCP_DONTLOG)
718 goto clear_it;
719
		mce_read_aux(&m, i);
		m.severity = mce_severity(&m, NULL, NULL, false);
722 /*
723 * Don't get the IP here because it's unlikely to
724 * have anything to do with the actual error location.
725 */
726
727 if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
728 goto clear_it;
729
730 if (flags & MCP_QUEUE_LOG)
			mce_gen_pool_add(&m);
732 else
733 mce_log(&m);
734
735clear_it:
736 /*
737 * Clear state for this bank.
738 */
		mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
740 }
741
742 /*
743 * Don't clear MCG_STATUS here because it's only defined for
744 * exceptions.
745 */
746
747 sync_core();
748
749 return error_seen;
750}
751EXPORT_SYMBOL_GPL(machine_check_poll);
752
753/*
754 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
755 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
756 * Vol 3B Table 15-20). But this confuses both the code that determines
757 * whether the machine check occurred in kernel or user mode, and also
758 * the severity assessment code. Pretend that EIPV was set, and take the
759 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
760 */
761static __always_inline void
762quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
763{
764 if (bank != 0)
765 return;
766 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
767 return;
768 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
769 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
770 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
771 MCACOD)) !=
772 (MCI_STATUS_UC|MCI_STATUS_EN|
773 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
774 MCI_STATUS_AR|MCACOD_INSTR))
775 return;
776
777 m->mcgstatus |= MCG_STATUS_EIPV;
778 m->ip = regs->ip;
779 m->cs = regs->cs;
780}
781
782/*
783 * Disable fast string copy and return from the MCE handler upon the first SRAR
784 * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
785 * CPUs.
786 * The fast string copy instructions ("REP; MOVS*") could consume an
787 * uncorrectable memory error in the cache line _right after_ the desired region
788 * to copy and raise an MCE with RIP pointing to the instruction _after_ the
789 * "REP; MOVS*".
790 * This mitigation addresses the issue completely with the caveat of performance
791 * degradation on the CPU affected. This is still better than the OS crashing on
792 * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
793 * kernel context (e.g., copy_page).
794 *
795 * Returns true when fast string copy on CPU has been disabled.
796 */
797static noinstr bool quirk_skylake_repmov(void)
798{
799 u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
800 u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
801 u64 mc1_status;
802
803 /*
804 * Apply the quirk only to local machine checks, i.e., no broadcast
805 * sync is needed.
806 */
807 if (!(mcgstatus & MCG_STATUS_LMCES) ||
808 !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
809 return false;
810
811 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
812
813 /* Check for a software-recoverable data fetch error. */
814 if ((mc1_status &
815 (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
816 MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
817 MCI_STATUS_AR | MCI_STATUS_S)) ==
818 (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
819 MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
820 MCI_STATUS_AR | MCI_STATUS_S)) {
821 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
		mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
		mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
824
825 instrumentation_begin();
826 pr_err_once("Erratum detected, disable fast string copy instructions.\n");
827 instrumentation_end();
828
829 return true;
830 }
831
832 return false;
833}
834
835/*
836 * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
837 * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
838 *
839 * However, the context is still valid, so save the "cs" register for later use.
840 *
841 * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
842 *
843 * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
844 */
845static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
846{
847 if (bank != 1)
848 return;
849 if (!(m->status & MCI_STATUS_POISON))
850 return;
851
852 m->cs = regs->cs;
853}
854
855/*
856 * Do a quick check if any of the events requires a panic.
857 * This decides if we keep the events around or clear them.
858 */
859static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
860 struct pt_regs *regs)
861{
862 char *tmp = *msg;
863 int i;
864
865 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
867 if (!(m->status & MCI_STATUS_VAL))
868 continue;
869
		arch___set_bit(i, validp);
		if (mce_flags.snb_ifu_quirk)
			quirk_sandybridge_ifu(i, m, regs);

		if (mce_flags.zen_ifu_quirk)
			quirk_zen_ifu(i, m, regs);

		m->bank = i;
		if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
879 mce_read_aux(m, i);
880 *msg = tmp;
881 return 1;
882 }
883 }
884 return 0;
885}
886
887/*
888 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until mce_executing equals its number.
890 */
891static atomic_t mce_executing;
892
893/*
894 * Defines order of CPUs on entry. First CPU becomes Monarch.
895 */
896static atomic_t mce_callin;
897
898/*
 * Track which CPUs entered the MCA broadcast synchronization and which did
 * not, in order to print holdouts.
901 */
902static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
903
904/*
905 * Check if a timeout waiting for other CPUs happened.
906 */
907static noinstr int mce_timed_out(u64 *t, const char *msg)
908{
909 int ret = 0;
910
911 /* Enable instrumentation around calls to external facilities */
912 instrumentation_begin();
913
914 /*
915 * The others already did panic for some reason.
916 * Bail out like in a timeout.
917 * rmb() to tell the compiler that system_state
918 * might have been modified by someone else.
919 */
920 rmb();
	if (atomic_read(&mce_panicked))
922 wait_for_panic();
923 if (!mca_cfg.monarch_timeout)
924 goto out;
925 if ((s64)*t < SPINUNIT) {
		if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
927 pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
928 cpumask_pr_args(&mce_missing_cpus));
929 mce_panic(msg, NULL, NULL);
930
931 ret = 1;
932 goto out;
933 }
934 *t -= SPINUNIT;
935
936out:
937 touch_nmi_watchdog();
938
939 instrumentation_end();
940
941 return ret;
942}
943
944/*
945 * The Monarch's reign. The Monarch is the CPU who entered
946 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
949 *
950 * The other CPUs entering the MCE handler will be controlled by the
951 * Monarch. They are called Subjects.
952 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are always examined.
955 *
956 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
958 * us to shut down, so panic too.
959 *
960 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
962 * state and won't corrupt anything by itself. It's ok to let the others
963 * continue for a bit first.
964 *
965 * All the spin loops have timeouts; when a timeout happens a CPU
966 * typically elects itself to be Monarch.
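 *
 * In short: every CPU increments mce_callin on entry (mce_start()), the CPUs
 * then scan their banks one by one in callin order, and finally they
 * synchronize again in mce_end(), where the Monarch grades all mces_seen in
 * mce_reign().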
967 */
968static void mce_reign(void)
969{
970 int cpu;
971 struct mce *m = NULL;
972 int global_worst = 0;
973 char *msg = NULL;
974
975 /*
976 * This CPU is the Monarch and the other CPUs have run
977 * through their handlers.
978 * Grade the severity of the errors of all the CPUs.
979 */
980 for_each_possible_cpu(cpu) {
981 struct mce *mtmp = &per_cpu(mces_seen, cpu);
982
983 if (mtmp->severity > global_worst) {
984 global_worst = mtmp->severity;
985 m = &per_cpu(mces_seen, cpu);
986 }
987 }
988
989 /*
990 * Cannot recover? Panic here then.
991 * This dumps all the mces in the log buffer and stops the
992 * other CPUs.
993 */
994 if (m && global_worst >= MCE_PANIC_SEVERITY) {
995 /* call mce_severity() to get "msg" for panic */
		mce_severity(m, NULL, &msg, true);
		mce_panic("Fatal machine check", m, msg);
998 }
999
1000 /*
1001 * For UC somewhere we let the CPU who detects it handle it.
	 * We also must let the others continue, otherwise the handling
1003 * CPU could deadlock on a lock.
1004 */
1005
1006 /*
1007 * No machine check event found. Must be some external
1008 * source or one CPU is hung. Panic.
1009 */
1010 if (global_worst <= MCE_KEEP_SEVERITY)
		mce_panic("Fatal machine check from unknown source", NULL, NULL);
1012
1013 /*
1014 * Now clear all the mces_seen so that they don't reappear on
1015 * the next mce.
1016 */
1017 for_each_possible_cpu(cpu)
1018 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
1019}
1020
1021static atomic_t global_nwo;
1022
1023/*
1024 * Start of Monarch synchronization. This waits until all CPUs have
1025 * entered the exception handler and then determines if any of them
1026 * saw a fatal event that requires panic. Then it executes them
1027 * in the entry order.
1028 * TBD double check parallel CPU hotunplug
1029 */
1030static noinstr int mce_start(int *no_way_out)
1031{
1032 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1033 int order, ret = -1;
1034
1035 if (!timeout)
1036 return ret;
1037
	raw_atomic_add(*no_way_out, &global_nwo);
	/*
	 * Rely on the implied barrier below, such that global_nwo
	 * is updated before mce_callin.
	 */
	order = raw_atomic_inc_return(&mce_callin);
	arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
1045
1046 /* Enable instrumentation around calls to external facilities */
1047 instrumentation_begin();
1048
1049 /*
1050 * Wait for everyone.
1051 */
	while (raw_atomic_read(&mce_callin) != num_online_cpus()) {
		if (mce_timed_out(&timeout,
				  "Timeout: Not all CPUs entered broadcast exception handler")) {
			raw_atomic_set(&global_nwo, 0);
1056 goto out;
1057 }
1058 ndelay(SPINUNIT);
1059 }
1060
1061 /*
1062 * mce_callin should be read before global_nwo
1063 */
1064 smp_rmb();
1065
1066 if (order == 1) {
1067 /*
1068 * Monarch: Starts executing now, the others wait.
1069 */
		raw_atomic_set(&mce_executing, 1);
1071 } else {
1072 /*
1073 * Subject: Now start the scanning loop one by one in
1074 * the original callin order.
		 * This way, when there are any shared banks, an error will be
		 * seen by only one CPU before it is cleared, avoiding duplicates.
1077 */
		while (raw_atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout,
					  "Timeout: Subject CPUs unable to finish machine check processing")) {
				raw_atomic_set(&global_nwo, 0);
1082 goto out;
1083 }
1084 ndelay(SPINUNIT);
1085 }
1086 }
1087
1088 /*
1089 * Cache the global no_way_out state.
1090 */
	*no_way_out = raw_atomic_read(&global_nwo);
1092
1093 ret = order;
1094
1095out:
1096 instrumentation_end();
1097
1098 return ret;
1099}
1100
1101/*
1102 * Synchronize between CPUs after main scanning loop.
1103 * This invokes the bulk of the Monarch processing.
1104 */
1105static noinstr int mce_end(int order)
1106{
1107 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1108 int ret = -1;
1109
1110 /* Allow instrumentation around external facilities. */
1111 instrumentation_begin();
1112
1113 if (!timeout)
1114 goto reset;
1115 if (order < 0)
1116 goto reset;
1117
1118 /*
1119 * Allow others to run.
1120 */
	atomic_inc(&mce_executing);
1122
1123 if (order == 1) {
1124 /*
1125 * Monarch: Wait for everyone to go through their scanning
1126 * loops.
1127 */
		while (atomic_read(&mce_executing) <= num_online_cpus()) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU unable to finish machine check processing"))
1131 goto reset;
1132 ndelay(SPINUNIT);
1133 }
1134
1135 mce_reign();
1136 barrier();
1137 ret = 0;
1138 } else {
1139 /*
1140 * Subject: Wait for Monarch to finish.
1141 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU did not finish machine check processing"))
1145 goto reset;
1146 ndelay(SPINUNIT);
1147 }
1148
1149 /*
1150 * Don't reset anything. That's done by the Monarch.
1151 */
1152 ret = 0;
1153 goto out;
1154 }
1155
1156 /*
1157 * Reset all global state.
1158 */
1159reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	cpumask_setall(&mce_missing_cpus);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
1169
1170out:
1171 instrumentation_end();
1172
1173 return ret;
1174}
1175
1176static __always_inline void mce_clear_state(unsigned long *toclear)
1177{
1178 int i;
1179
1180 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		if (arch_test_bit(i, toclear))
			mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
1183 }
1184}
1185
1186/*
1187 * Cases where we avoid rendezvous handler timeout:
1188 * 1) If this CPU is offline.
1189 *
1190 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1191 * skip those CPUs which remain looping in the 1st kernel - see
1192 * crash_nmi_callback().
1193 *
1194 * Note: there still is a small window between kexec-ing and the new,
1195 * kdump kernel establishing a new #MC handler where a broadcasted MCE
1196 * might not get handled properly.
1197 */
1198static noinstr bool mce_check_crashing_cpu(void)
1199{
1200 unsigned int cpu = smp_processor_id();
1201
1202 if (arch_cpu_is_offline(cpu) ||
1203 (crashing_cpu != -1 && crashing_cpu != cpu)) {
1204 u64 mcgstatus;
1205
1206 mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
1207
1208 if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
1209 if (mcgstatus & MCG_STATUS_LMCES)
1210 return false;
1211 }
1212
1213 if (mcgstatus & MCG_STATUS_RIPV) {
			__wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
1215 return true;
1216 }
1217 }
1218 return false;
1219}
1220
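/*
 * Scan all MCA banks of this CPU: log every valid error that needs handling,
 * record the worst severity in *worst and the corresponding record in *final,
 * and mark the banks to be cleared in *toclear. Returns the number of errors
 * that should taint the kernel.
 */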
1221static __always_inline int
1222__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
1223 unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
1224 int *worst)
1225{
1226 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1227 struct mca_config *cfg = &mca_cfg;
1228 int severity, i, taint = 0;
1229
1230 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		arch___clear_bit(i, toclear);
		if (!arch_test_bit(i, valid_banks))
1233 continue;
1234
1235 if (!mce_banks[i].ctl)
1236 continue;
1237
1238 m->misc = 0;
1239 m->addr = 0;
1240 m->bank = i;
1241
		m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
1243 if (!(m->status & MCI_STATUS_VAL))
1244 continue;
1245
1246 /*
1247 * Corrected or non-signaled errors are handled by
1248 * machine_check_poll(). Leave them alone, unless this panics.
1249 */
1250 if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1251 !no_way_out)
1252 continue;
1253
1254 /* Set taint even when machine check was not enabled. */
1255 taint++;
1256
		severity = mce_severity(m, regs, NULL, true);
1258
1259 /*
		 * When the machine check was for a corrected/deferred error, don't
		 * touch it, unless we're panicking.
1262 */
1263 if ((severity == MCE_KEEP_SEVERITY ||
1264 severity == MCE_UCNA_SEVERITY) && !no_way_out)
1265 continue;
1266
		arch___set_bit(i, toclear);
1268
1269 /* Machine check event was not enabled. Clear, but ignore. */
1270 if (severity == MCE_NO_SEVERITY)
1271 continue;
1272
1273 mce_read_aux(m, i);
1274
1275 /* assuming valid severity level != 0 */
1276 m->severity = severity;
1277
1278 /*
1279 * Enable instrumentation around the mce_log() call which is
1280 * done in #MC context, where instrumentation is disabled.
1281 */
1282 instrumentation_begin();
1283 mce_log(m);
1284 instrumentation_end();
1285
1286 if (severity > *worst) {
1287 *final = *m;
1288 *worst = severity;
1289 }
1290 }
1291
1292 /* mce_clear_state will clear *final, save locally for use later */
1293 *m = *final;
1294
1295 return taint;
1296}
1297
1298static void kill_me_now(struct callback_head *ch)
1299{
1300 struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
1301
1302 p->mce_count = 0;
1303 force_sig(SIGBUS);
1304}
1305
1306static void kill_me_maybe(struct callback_head *cb)
1307{
1308 struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
1309 int flags = MF_ACTION_REQUIRED;
1310 unsigned long pfn;
1311 int ret;
1312
1313 p->mce_count = 0;
1314 pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
1315
1316 if (!p->mce_ripv)
1317 flags |= MF_MUST_KILL;
1318
1319 pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
1320 ret = memory_failure(pfn, flags);
1321 if (!ret) {
1322 set_mce_nospec(pfn);
1323 sync_core();
1324 return;
1325 }
1326
1327 /*
1328 * -EHWPOISON from memory_failure() means that it already sent SIGBUS
	 * to the current process with the proper error info;
	 * -EOPNOTSUPP means hwpoison_filter() filtered the error event.
1331 *
1332 * In both cases, no further processing is required.
1333 */
1334 if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
1335 return;
1336
1337 pr_err("Memory error not recovered");
	kill_me_now(cb);
1339}
1340
1341static void kill_me_never(struct callback_head *cb)
1342{
1343 struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
1344 unsigned long pfn;
1345
1346 p->mce_count = 0;
1347 pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
1348 pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
	if (!memory_failure(pfn, 0))
1350 set_mce_nospec(pfn);
1351}
1352
1353static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
1354{
1355 int count = ++current->mce_count;
1356
1357 /* First call, save all the details */
1358 if (count == 1) {
1359 current->mce_addr = m->addr;
1360 current->mce_kflags = m->kflags;
1361 current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
1362 current->mce_whole_page = whole_page(m);
1363 current->mce_kill_me.func = func;
1364 }
1365
1366 /* Ten is likely overkill. Don't expect more than two faults before task_work() */
1367 if (count > 10)
		mce_panic("Too many consecutive machine checks while accessing user data", m, msg);

	/* Second or later call, make sure page address matches the one from first call */
	if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
		mce_panic("Consecutive machine checks to different user pages", m, msg);
1373
1374 /* Do not call task_work_add() more than once */
1375 if (count > 1)
1376 return;
1377
	task_work_add(current, &current->mce_kill_me, TWA_RESUME);
1379}
1380
1381/* Handle unconfigured int18 (should never happen) */
1382static noinstr void unexpected_machine_check(struct pt_regs *regs)
1383{
1384 instrumentation_begin();
1385 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1386 smp_processor_id());
1387 instrumentation_end();
1388}
1389
1390/*
1391 * The actual machine check handler. This only handles real exceptions when
1392 * something got corrupted coming in through int 18.
1393 *
1394 * This is executed in #MC context not subject to normal locking rules.
1395 * This implies that most kernel services cannot be safely used. Don't even
1396 * think about putting a printk in there!
1397 *
1398 * On Intel systems this is entered on all CPUs in parallel through
1399 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so always be careful when synchronizing with others.
1401 *
1402 * Tracing and kprobes are disabled: if we interrupted a kernel context
1403 * with IF=1, we need to minimize stack usage. There are also recursion
1404 * issues: if the machine check was due to a failure of the memory
1405 * backing the user stack, tracing that reads the user stack will cause
1406 * potentially infinite recursion.
1407 *
1408 * Currently, the #MC handler calls out to a number of external facilities
1409 * and, therefore, allows instrumentation around them. The optimal thing to
1410 * have would be to do the absolutely minimal work required in #MC context
1411 * and have instrumentation disabled only around that. Further processing can
1412 * then happen in process context where instrumentation is allowed. Achieving
1413 * that requires careful auditing and modifications. Until then, the code
 * allows instrumentation temporarily, where required.
1415 */
1416noinstr void do_machine_check(struct pt_regs *regs)
1417{
1418 int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
1419 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
1420 DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
1421 struct mce m, *final;
1422 char *msg = NULL;
1423
1424 if (unlikely(mce_flags.p5))
1425 return pentium_machine_check(regs);
1426 else if (unlikely(mce_flags.winchip))
1427 return winchip_machine_check(regs);
1428 else if (unlikely(!mca_cfg.initialized))
1429 return unexpected_machine_check(regs);
1430
1431 if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
1432 goto clear;
1433
1434 /*
1435 * Establish sequential order between the CPUs entering the machine
1436 * check handler.
1437 */
1438 order = -1;
1439
1440 /*
1441 * If no_way_out gets set, there is no safe way to recover from this
1442 * MCE.
1443 */
1444 no_way_out = 0;
1445
1446 /*
1447 * If kill_current_task is not set, there might be a way to recover from this
1448 * error.
1449 */
1450 kill_current_task = 0;
1451
1452 /*
1453 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1454 * on Intel.
1455 */
1456 lmce = 1;
1457
1458 this_cpu_inc(mce_exception_count);
1459
	mce_gather_info(&m, regs);
	m.tsc = rdtsc();

	final = this_cpu_ptr(&mces_seen);
	*final = m;

	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1467
1468 barrier();
1469
1470 /*
	 * When there is no restart IP we might need to kill the task or panic.
1472 * Assume the worst for now, but if we find the
1473 * severity is MCE_AR_SEVERITY we have other options.
1474 */
1475 if (!(m.mcgstatus & MCG_STATUS_RIPV))
1476 kill_current_task = 1;
1477 /*
1478 * Check if this MCE is signaled to only this logical processor,
1479 * on Intel, Zhaoxin only.
1480 */
1481 if (m.cpuvendor == X86_VENDOR_INTEL ||
1482 m.cpuvendor == X86_VENDOR_ZHAOXIN)
1483 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1484
1485 /*
1486 * Local machine check may already know that we have to panic.
1487 * Broadcast machine check begins rendezvous in mce_start()
1488 * Go through all banks in exclusion of the other CPUs. This way we
1489 * don't report duplicated events on shared banks because the first one
1490 * to see it will clear it.
1491 */
1492 if (lmce) {
1493 if (no_way_out)
			mce_panic("Fatal local machine check", &m, msg);
	} else {
		order = mce_start(&no_way_out);
	}

	taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
1500
1501 if (!no_way_out)
1502 mce_clear_state(toclear);
1503
1504 /*
1505 * Do most of the synchronization with other CPUs.
1506 * When there's any problem use only local no_way_out state.
1507 */
1508 if (!lmce) {
1509 if (mce_end(order) < 0) {
1510 if (!no_way_out)
1511 no_way_out = worst >= MCE_PANIC_SEVERITY;
1512
1513 if (no_way_out)
				mce_panic("Fatal machine check on current CPU", &m, msg);
1515 }
1516 } else {
1517 /*
1518 * If there was a fatal machine check we should have
1519 * already called mce_panic earlier in this function.
1520 * Since we re-read the banks, we might have found
1521 * something new. Check again to see if we found a
1522 * fatal error. We call "mce_severity()" again to
1523 * make sure we have the right "msg".
1524 */
1525 if (worst >= MCE_PANIC_SEVERITY) {
			mce_severity(&m, regs, &msg, true);
			mce_panic("Local fatal machine check!", &m, msg);
1528 }
1529 }
1530
1531 /*
1532 * Enable instrumentation around the external facilities like task_work_add()
1533 * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
1534 * properly would need a lot more involved reorganization.
1535 */
1536 instrumentation_begin();
1537
1538 if (taint)
1539 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1540
1541 if (worst != MCE_AR_SEVERITY && !kill_current_task)
1542 goto out;
1543
1544 /* Fault was in user mode and we need to take some action */
1545 if ((m.cs & 3) == 3) {
1546 /* If this triggers there is no way to recover. Die hard. */
1547 BUG_ON(!on_thread_stack() || !user_mode(regs));
1548
1549 if (!mce_usable_address(&m))
			queue_task_work(&m, msg, kill_me_now);
		else
			queue_task_work(&m, msg, kill_me_maybe);
1553
1554 } else {
1555 /*
1556 * Handle an MCE which has happened in kernel space but from
1557 * which the kernel can recover: ex_has_fault_handler() has
1558 * already verified that the rIP at which the error happened is
1559 * a rIP from which the kernel can recover (by jumping to
1560 * recovery code specified in _ASM_EXTABLE_FAULT()) and the
1561 * corresponding exception handler which would do that is the
1562 * proper one.
1563 */
1564 if (m.kflags & MCE_IN_KERNEL_RECOV) {
			if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
				mce_panic("Failed kernel mode recovery", &m, msg);
		}

		if (m.kflags & MCE_IN_KERNEL_COPYIN)
			queue_task_work(&m, msg, kill_me_never);
1571 }
1572
1573out:
1574 instrumentation_end();
1575
1576clear:
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1578}
1579EXPORT_SYMBOL_GPL(do_machine_check);
1580
1581#ifndef CONFIG_MEMORY_FAILURE
1582int memory_failure(unsigned long pfn, int flags)
1583{
1584 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1585 BUG_ON(flags & MF_ACTION_REQUIRED);
1586 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1587 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1588 pfn);
1589
1590 return 0;
1591}
1592#endif
1593
1594/*
1595 * Periodic polling timer for "silent" machine check errors. If the
1596 * poller finds an MCE, poll 2x faster. When the poller finds no more
1597 * errors, poll 2x slower (up to check_interval seconds).
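 * For example, assuming the default check_interval of 5 minutes, the
 * interval halves on each round that logs an error (down to HZ/100 jiffies)
 * and doubles again once the machine is quiet.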
1598 */
1599static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1600
1601static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1602static DEFINE_PER_CPU(struct timer_list, mce_timer);
1603
1604static unsigned long mce_adjust_timer_default(unsigned long interval)
1605{
1606 return interval;
1607}
1608
1609static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1610
1611static void __start_timer(struct timer_list *t, unsigned long interval)
1612{
1613 unsigned long when = jiffies + interval;
1614 unsigned long flags;
1615
1616 local_irq_save(flags);
1617
	if (!timer_pending(t) || time_before(when, t->expires))
		mod_timer(t, round_jiffies(when));
1620
1621 local_irq_restore(flags);
1622}
1623
1624static void mc_poll_banks_default(void)
1625{
1626 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1627}
1628
1629void (*mc_poll_banks)(void) = mc_poll_banks_default;
1630
1631static void mce_timer_fn(struct timer_list *t)
1632{
1633 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1634 unsigned long iv;
1635
1636 WARN_ON(cpu_t != t);
1637
1638 iv = __this_cpu_read(mce_next_interval);
1639
1640 if (mce_available(this_cpu_ptr(&cpu_info))) {
1641 mc_poll_banks();
1642
1643 if (mce_intel_cmci_poll()) {
1644 iv = mce_adjust_timer(iv);
1645 goto done;
1646 }
1647 }
1648
1649 /*
1650 * Alert userspace if needed. If we logged an MCE, reduce the polling
1651 * interval, otherwise increase the polling interval.
1652 */
1653 if (mce_notify_irq())
1654 iv = max(iv / 2, (unsigned long) HZ/100);
1655 else
1656 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1657
1658done:
1659 __this_cpu_write(mce_next_interval, iv);
	__start_timer(t, iv);
1661}
1662
1663/*
1664 * Ensure that the timer is firing in @interval from now.
1665 */
1666void mce_timer_kick(unsigned long interval)
1667{
1668 struct timer_list *t = this_cpu_ptr(&mce_timer);
1669 unsigned long iv = __this_cpu_read(mce_next_interval);
1670
1671 __start_timer(t, interval);
1672
1673 if (interval < iv)
1674 __this_cpu_write(mce_next_interval, interval);
1675}
1676
1677/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1678static void mce_timer_delete_all(void)
1679{
1680 int cpu;
1681
1682 for_each_online_cpu(cpu)
		del_timer_sync(&per_cpu(mce_timer, cpu));
1684}
1685
1686/*
1687 * Notify the user(s) about new machine check events.
1688 * Can be called from interrupt context, but not from machine check/NMI
1689 * context.
1690 */
1691int mce_notify_irq(void)
1692{
1693 /* Not more than two messages every minute */
1694 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1695
	if (test_and_clear_bit(0, &mce_need_notify)) {
1697 mce_work_trigger();
1698
1699 if (__ratelimit(&ratelimit))
1700 pr_info(HW_ERR "Machine check events logged\n");
1701
1702 return 1;
1703 }
1704 return 0;
1705}
1706EXPORT_SYMBOL_GPL(mce_notify_irq);
1707
1708static void __mcheck_cpu_mce_banks_init(void)
1709{
1710 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1711 u8 n_banks = this_cpu_read(mce_num_banks);
1712 int i;
1713
1714 for (i = 0; i < n_banks; i++) {
1715 struct mce_bank *b = &mce_banks[i];
1716
1717 /*
1718 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
1719 * the required vendor quirks before
1720 * __mcheck_cpu_init_clear_banks() does the final bank setup.
1721 */
1722 b->ctl = -1ULL;
1723 b->init = true;
1724 }
1725}
1726
1727/*
1728 * Initialize Machine Checks for a CPU.
1729 */
1730static void __mcheck_cpu_cap_init(void)
1731{
1732 u64 cap;
1733 u8 b;
1734
1735 rdmsrl(MSR_IA32_MCG_CAP, cap);
1736
1737 b = cap & MCG_BANKCNT_MASK;
1738
1739 if (b > MAX_NR_BANKS) {
1740 pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1741 smp_processor_id(), MAX_NR_BANKS, b);
1742 b = MAX_NR_BANKS;
1743 }
1744
1745 this_cpu_write(mce_num_banks, b);
1746
1747 __mcheck_cpu_mce_banks_init();
1748
1749 /* Use accurate RIP reporting if available. */
1750 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1751 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1752
1753 if (cap & MCG_SER_P)
1754 mca_cfg.ser = 1;
1755}
1756
1757static void __mcheck_cpu_init_generic(void)
1758{
1759 enum mcp_flags m_fl = 0;
1760 mce_banks_t all_banks;
1761 u64 cap;
1762
1763 if (!mca_cfg.bootlog)
1764 m_fl = MCP_DONTLOG;
1765
1766 /*
1767 * Log the machine checks left over from the previous reset. Log them
1768 * only, do not start processing them. That will happen in mcheck_late_init()
1769 * when all consumers have been registered on the notifier chain.
1770 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
1772 machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
1773
1774 cr4_set_bits(X86_CR4_MCE);
1775
1776 rdmsrl(MSR_IA32_MCG_CAP, cap);
1777 if (cap & MCG_CTL_P)
1778 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1779}
1780
1781static void __mcheck_cpu_init_clear_banks(void)
1782{
1783 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1784 int i;
1785
1786 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1787 struct mce_bank *b = &mce_banks[i];
1788
1789 if (!b->init)
1790 continue;
		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
1793 }
1794}
1795
1796/*
1797 * Do a final check to see if there are any unused/RAZ banks.
1798 *
1799 * This must be done after the banks have been initialized and any quirks have
1800 * been applied.
1801 *
1802 * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
1803 * Otherwise, a user who disables a bank will not be able to re-enable it
1804 * without a system reboot.
1805 */
1806static void __mcheck_cpu_check_banks(void)
1807{
1808 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1809 u64 msrval;
1810 int i;
1811
1812 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1813 struct mce_bank *b = &mce_banks[i];
1814
1815 if (!b->init)
1816 continue;
1817
1818 rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
1819 b->init = !!msrval;
1820 }
1821}
1822
1823/* Add per CPU specific workarounds here */
1824static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1825{
1826 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1827 struct mca_config *cfg = &mca_cfg;
1828
1829 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1830 pr_info("unknown CPU type - not enabling MCE support\n");
1831 return -EOPNOTSUPP;
1832 }
1833
1834 /* This should be disabled by the BIOS, but isn't always */
1835 if (c->x86_vendor == X86_VENDOR_AMD) {
1836 if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
1837 /*
1838 * disable GART TBL walk error reporting, which
1839 * trips off incorrectly with the IOMMU & 3ware
1840 * & Cerberus:
1841 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1843 }
1844 if (c->x86 < 0x11 && cfg->bootlog < 0) {
1845 /*
			 * Lots of broken BIOSes around that don't clear them
1847 * by default and leave crap in there. Don't log:
1848 */
1849 cfg->bootlog = 0;
1850 }
1851 /*
1852 * Various K7s with broken bank 0 around. Always disable
1853 * by default.
1854 */
1855 if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
1856 mce_banks[0].ctl = 0;
1857
1858 /*
1859 * overflow_recov is supported for F15h Models 00h-0fh
1860 * even though we don't have a CPUID bit for it.
1861 */
1862 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1863 mce_flags.overflow_recov = 1;
1864
1865 if (c->x86 >= 0x17 && c->x86 <= 0x1A)
1866 mce_flags.zen_ifu_quirk = 1;
1867
1868 }
1869
1870 if (c->x86_vendor == X86_VENDOR_INTEL) {
1871 /*
1872 * SDM documents that on family 6 bank 0 should not be written
1873 * because it aliases to another special BIOS controlled
1874 * register.
		 * But it's not aliased anymore on model 0x1a+.
1876 * Don't ignore bank 0 completely because there could be a
1877 * valid event later, merely don't write CTL0.
1878 */
1879
1880 if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
1881 mce_banks[0].init = false;
1882
1883 /*
1884 * All newer Intel systems support MCE broadcasting. Enable
1885 * synchronization with a one second timeout.
1886 */
1887 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1888 cfg->monarch_timeout < 0)
1889 cfg->monarch_timeout = USEC_PER_SEC;
1890
1891 /*
1892 * There are also broken BIOSes on some Pentium M and
1893 * earlier systems:
1894 */
1895 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1896 cfg->bootlog = 0;
1897
1898 if (c->x86 == 6 && c->x86_model == 45)
1899 mce_flags.snb_ifu_quirk = 1;
1900
1901 /*
		 * Skylake, Cascade Lake and Cooper Lake require a quirk on
1903 * rep movs.
1904 */
1905 if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
1906 mce_flags.skx_repmov_quirk = 1;
1907 }
1908
1909 if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
1910 /*
1911 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
1912 * synchronization with a one second timeout.
1913 */
1914 if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1915 if (cfg->monarch_timeout < 0)
1916 cfg->monarch_timeout = USEC_PER_SEC;
1917 }
1918 }
1919
1920 if (cfg->monarch_timeout < 0)
1921 cfg->monarch_timeout = 0;
1922 if (cfg->bootlog != 0)
1923 cfg->panic_timeout = 30;
1924
1925 return 0;
1926}
1927
1928static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1929{
1930 if (c->x86 != 5)
1931 return 0;
1932
1933 switch (c->x86_vendor) {
1934 case X86_VENDOR_INTEL:
1935 intel_p5_mcheck_init(c);
1936 mce_flags.p5 = 1;
1937 return 1;
1938 case X86_VENDOR_CENTAUR:
1939 winchip_mcheck_init(c);
1940 mce_flags.winchip = 1;
1941 return 1;
1942 default:
1943 return 0;
1944 }
1945
1946 return 0;
1947}
1948
1949/*
1950 * Init basic CPU features needed for early decoding of MCEs.
1951 */
1952static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1953{
1954 if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
1955 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1956 mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
1957 mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
1958 mce_flags.amd_threshold = 1;
1959 }
1960}
1961
1962static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1963{
1964 struct mca_config *cfg = &mca_cfg;
1965
1966 /*
1967 * All newer Centaur CPUs support MCE broadcasting. Enable
1968 * synchronization with a one second timeout.
1969 */
1970 if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1971 c->x86 > 6) {
1972 if (cfg->monarch_timeout < 0)
1973 cfg->monarch_timeout = USEC_PER_SEC;
1974 }
1975}
1976
1977static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
1978{
1979 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1980
1981 /*
1982 * These CPUs have MCA bank 8 which reports only one error type called
1983 * SVAD (System View Address Decoder). The reporting of that error is
1984 * controlled by IA32_MC8.CTL.0.
1985 *
1986 * If enabled, prefetching on these CPUs will cause SVAD MCE when
1987 * virtual machines start and result in a system panic. Always disable
1988 * bank 8 SVAD error by default.
1989 */
1990 if ((c->x86 == 7 && c->x86_model == 0x1b) ||
1991 (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1992 if (this_cpu_read(mce_num_banks) > 8)
1993 mce_banks[8].ctl = 0;
1994 }
1995
1996 intel_init_cmci();
1997 intel_init_lmce();
1998 mce_adjust_timer = cmci_intel_adjust_timer;
1999}
2000
2001static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
2002{
2003 intel_clear_lmce();
2004}
2005
2006static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
2007{
2008 switch (c->x86_vendor) {
2009 case X86_VENDOR_INTEL:
2010 mce_intel_feature_init(c);
2011 mce_adjust_timer = cmci_intel_adjust_timer;
2012 break;
2013
2014 case X86_VENDOR_AMD: {
2015 mce_amd_feature_init(c);
2016 break;
2017 }
2018
2019 case X86_VENDOR_HYGON:
2020 mce_hygon_feature_init(c);
2021 break;
2022
2023 case X86_VENDOR_CENTAUR:
2024 mce_centaur_feature_init(c);
2025 break;
2026
2027 case X86_VENDOR_ZHAOXIN:
2028 mce_zhaoxin_feature_init(c);
2029 break;
2030
2031 default:
2032 break;
2033 }
2034}
2035
2036static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2037{
2038 switch (c->x86_vendor) {
2039 case X86_VENDOR_INTEL:
2040 mce_intel_feature_clear(c);
2041 break;
2042
2043 case X86_VENDOR_ZHAOXIN:
2044 mce_zhaoxin_feature_clear(c);
2045 break;
2046
2047 default:
2048 break;
2049 }
2050}
2051
2052static void mce_start_timer(struct timer_list *t)
2053{
2054 unsigned long iv = check_interval * HZ;
2055
2056 if (mca_cfg.ignore_ce || !iv)
2057 return;
2058
2059 this_cpu_write(mce_next_interval, iv);
2060 __start_timer(t, iv);
2061}
2062
2063static void __mcheck_cpu_setup_timer(void)
2064{
2065 struct timer_list *t = this_cpu_ptr(&mce_timer);
2066
2067 timer_setup(t, mce_timer_fn, TIMER_PINNED);
2068}
2069
2070static void __mcheck_cpu_init_timer(void)
2071{
2072 struct timer_list *t = this_cpu_ptr(&mce_timer);
2073
2074 timer_setup(t, mce_timer_fn, TIMER_PINNED);
2075 mce_start_timer(t);
2076}
2077
2078bool filter_mce(struct mce *m)
2079{
2080 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
2081 return amd_filter_mce(m);
2082 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2083 return intel_filter_mce(m);
2084
2085 return false;
2086}
2087
2088static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
2089{
2090 irqentry_state_t irq_state;
2091
2092 WARN_ON_ONCE(user_mode(regs));
2093
2094 /*
2095 * Only required when from kernel mode. See
2096 * mce_check_crashing_cpu() for details.
2097 */
2098 if (mca_cfg.initialized && mce_check_crashing_cpu())
2099 return;
2100
2101 irq_state = irqentry_nmi_enter(regs);
2102
2103 do_machine_check(regs);
2104
2105 irqentry_nmi_exit(regs, irq_state);
2106}
2107
2108static __always_inline void exc_machine_check_user(struct pt_regs *regs)
2109{
2110 irqentry_enter_from_user_mode(regs);
2111
2112 do_machine_check(regs);
2113
2114 irqentry_exit_to_user_mode(regs);
2115}
2116
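/*
 * The entry points below save and clear DR7 via local_db_save() before
 * handling the exception and restore it afterwards, so that an armed
 * hardware breakpoint cannot raise #DB while the machine check is being
 * processed.
 */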
2117#ifdef CONFIG_X86_64
2118/* MCE hit kernel mode */
2119DEFINE_IDTENTRY_MCE(exc_machine_check)
2120{
2121 unsigned long dr7;
2122
2123 dr7 = local_db_save();
2124 exc_machine_check_kernel(regs);
2125 local_db_restore(dr7);
2126}
2127
2128/* The user mode variant. */
2129DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
2130{
2131 unsigned long dr7;
2132
2133 dr7 = local_db_save();
2134 exc_machine_check_user(regs);
2135 local_db_restore(dr7);
2136}
2137#else
2138/* 32bit unified entry point */
2139DEFINE_IDTENTRY_RAW(exc_machine_check)
2140{
2141 unsigned long dr7;
2142
2143 dr7 = local_db_save();
2144 if (user_mode(regs))
2145 exc_machine_check_user(regs);
2146 else
2147 exc_machine_check_kernel(regs);
2148 local_db_restore(dr7);
2149}
2150#endif
2151
2152/*
2153 * Called for each booted CPU to set up machine checks.
2154 * Must be called with preempt off:
2155 */
2156void mcheck_cpu_init(struct cpuinfo_x86 *c)
2157{
2158 if (mca_cfg.disabled)
2159 return;
2160
2161 if (__mcheck_cpu_ancient_init(c))
2162 return;
2163
2164 if (!mce_available(c))
2165 return;
2166
2167 __mcheck_cpu_cap_init();
2168
2169 if (__mcheck_cpu_apply_quirks(c) < 0) {
2170 mca_cfg.disabled = 1;
2171 return;
2172 }
2173
2174 if (mce_gen_pool_init()) {
2175 mca_cfg.disabled = 1;
2176 pr_emerg("Couldn't allocate MCE records pool!\n");
2177 return;
2178 }
2179
2180 mca_cfg.initialized = 1;
2181
2182 __mcheck_cpu_init_early(c);
2183 __mcheck_cpu_init_generic();
2184 __mcheck_cpu_init_vendor(c);
2185 __mcheck_cpu_init_clear_banks();
2186 __mcheck_cpu_check_banks();
2187 __mcheck_cpu_setup_timer();
2188}
2189
2190/*
2191 * Called for each booted CPU to clear some machine check opt-ins.
2192 */
2193void mcheck_cpu_clear(struct cpuinfo_x86 *c)
2194{
2195 if (mca_cfg.disabled)
2196 return;
2197
2198 if (!mce_available(c))
2199 return;
2200
2201 /*
2202 * Possibly to clear general settings generic to x86
2203 * __mcheck_cpu_clear_generic(c);
2204 */
2205 __mcheck_cpu_clear_vendor(c);
2206
2207}
2208
2209static void __mce_disable_bank(void *arg)
2210{
2211 int bank = *((int *)arg);
2212 __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2213 cmci_disable_bank(bank);
2214}
2215
2216void mce_disable_bank(int bank)
2217{
2218 if (bank >= this_cpu_read(mce_num_banks)) {
2219 pr_warn(FW_BUG
2220 "Ignoring request to disable invalid MCA bank %d.\n",
2221 bank);
2222 return;
2223 }
2224 set_bit(bank, mce_banks_ce_disabled);
2225 on_each_cpu(__mce_disable_bank, &bank, 1);
2226}
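/*
 * Example (illustrative, not taken from an in-tree caller): a platform
 * driver that wants firmware to own corrected-error reporting for a bank
 * could hand it over like this:
 *
 *	if (bank_is_firmware_first(i))		// hypothetical helper
 *		mce_disable_bank(i);
 *
 * which drops the bank from the poll list and disables CMCI for it on
 * every CPU via the IPI above.
 */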
2227
2228/*
2229 * mce=off Disables machine check
2230 * mce=no_cmci Disables CMCI
2231 * mce=no_lmce Disables LMCE
2232 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2233 * mce=print_all Print all machine check logs to console
2234 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2235 * mce=monarchtimeout (number)
2236 * monarchtimeout is how long (in usecs) to wait for other CPUs on a
2237 * machine check, or 0 to not wait
2238 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
2239 * and older.
2240 * mce=nobootlog Don't log MCEs from before booting.
2241 * mce=bios_cmci_threshold Don't program the CMCI threshold
2242 * mce=recovery force enable copy_mc_fragile()
2243 */
2244static int __init mcheck_enable(char *str)
2245{
2246 struct mca_config *cfg = &mca_cfg;
2247
2248 if (*str == 0) {
2249 enable_p5_mce();
2250 return 1;
2251 }
2252 if (*str == '=')
2253 str++;
2254 if (!strcmp(str, "off"))
2255 cfg->disabled = 1;
2256 else if (!strcmp(str, "no_cmci"))
2257 cfg->cmci_disabled = true;
2258 else if (!strcmp(str, "no_lmce"))
2259 cfg->lmce_disabled = 1;
2260 else if (!strcmp(str, "dont_log_ce"))
2261 cfg->dont_log_ce = true;
2262 else if (!strcmp(str, "print_all"))
2263 cfg->print_all = true;
2264 else if (!strcmp(str, "ignore_ce"))
2265 cfg->ignore_ce = true;
2266 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2267 cfg->bootlog = (str[0] == 'b');
2268 else if (!strcmp(str, "bios_cmci_threshold"))
2269 cfg->bios_cmci_threshold = 1;
2270 else if (!strcmp(str, "recovery"))
2271 cfg->recovery = 1;
2272 else if (isdigit(str[0]))
2273 get_option(&str, &(cfg->monarch_timeout));
2274 else {
2275 pr_info("mce argument %s ignored. Please use /sys\n", str);
2276 return 0;
2277 }
2278 return 1;
2279}
2280__setup("mce", mcheck_enable);
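/*
 * Example command lines (illustrative):
 *
 *	mce=off			disable machine check handling entirely
 *	mce=no_cmci		use polling instead of CMCI for corrected errors
 *	mce=1000000		wait up to 1s (value in usecs) for other CPUs
 *				during a broadcast machine check
 */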
2281
2282int __init mcheck_init(void)
2283{
2284 mce_register_decode_chain(&early_nb);
2285 mce_register_decode_chain(&mce_uc_nb);
2286 mce_register_decode_chain(&mce_default_nb);
2287
2288 INIT_WORK(&mce_work, mce_gen_pool_process);
2289 init_irq_work(&mce_irq_work, mce_irq_work_cb);
2290
2291 return 0;
2292}
2293
2294/*
2295 * mce_syscore: PM support
2296 */
2297
2298/*
2299 * Disable machine checks on suspend and shutdown. We can't really handle
2300 * them later.
2301 */
2302static void mce_disable_error_reporting(void)
2303{
2304 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2305 int i;
2306
2307 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2308 struct mce_bank *b = &mce_banks[i];
2309
2310 if (b->init)
2311 wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
2312 }
2313 return;
2314}
2315
2316static void vendor_disable_error_reporting(void)
2317{
2318 /*
2319 * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
2320 * MSRs are socket-wide. Disabling them for just a single offlined CPU
2321 * is bad, since it will inhibit reporting for all shared resources on
2322 * the socket like the last level cache (LLC), the integrated memory
2323 * controller (iMC), etc.
2324 */
2325 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
2326 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
2327 boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
2328 boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
2329 return;
2330
2331 mce_disable_error_reporting();
2332}
2333
2334static int mce_syscore_suspend(void)
2335{
2336 vendor_disable_error_reporting();
2337 return 0;
2338}
2339
2340static void mce_syscore_shutdown(void)
2341{
2342 vendor_disable_error_reporting();
2343}
2344
2345/*
2346 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2347 * Only one CPU is active at this time; the others get re-added later using
2348 * CPU hotplug:
2349 */
2350static void mce_syscore_resume(void)
2351{
2352 __mcheck_cpu_init_generic();
2353 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2354 __mcheck_cpu_init_clear_banks();
2355}
2356
2357static struct syscore_ops mce_syscore_ops = {
2358 .suspend = mce_syscore_suspend,
2359 .shutdown = mce_syscore_shutdown,
2360 .resume = mce_syscore_resume,
2361};
2362
2363/*
2364 * mce_device: Sysfs support
2365 */
2366
2367static void mce_cpu_restart(void *data)
2368{
2369 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2370 return;
2371 __mcheck_cpu_init_generic();
2372 __mcheck_cpu_init_clear_banks();
2373 __mcheck_cpu_init_timer();
2374}
2375
2376/* Reinit MCEs after user configuration changes */
2377static void mce_restart(void)
2378{
2379 mce_timer_delete_all();
2380 on_each_cpu(mce_cpu_restart, NULL, 1);
2381 mce_schedule_work();
2382}
2383
2384/* Toggle features for corrected errors */
2385static void mce_disable_cmci(void *data)
2386{
2387 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2388 return;
2389 cmci_clear();
2390}
2391
2392static void mce_enable_ce(void *all)
2393{
2394 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2395 return;
2396 cmci_reenable();
2397 cmci_recheck();
2398 if (all)
2399 __mcheck_cpu_init_timer();
2400}
2401
2402static struct bus_type mce_subsys = {
2403 .name = "machinecheck",
2404 .dev_name = "machinecheck",
2405};
2406
2407DEFINE_PER_CPU(struct device *, mce_device);
2408
2409static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
2410{
2411 return container_of(attr, struct mce_bank_dev, attr);
2412}
2413
2414static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2415 char *buf)
2416{
2417 u8 bank = attr_to_bank(attr)->bank;
2418 struct mce_bank *b;
2419
2420 if (bank >= per_cpu(mce_num_banks, s->id))
2421 return -EINVAL;
2422
2423 b = &per_cpu(mce_banks_array, s->id)[bank];
2424
2425 if (!b->init)
2426 return -ENODEV;
2427
2428 return sprintf(buf, "%llx\n", b->ctl);
2429}
2430
2431static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2432 const char *buf, size_t size)
2433{
2434 u8 bank = attr_to_bank(attr)->bank;
2435 struct mce_bank *b;
2436 u64 new;
2437
2438 if (kstrtou64(buf, 0, &new) < 0)
2439 return -EINVAL;
2440
2441 if (bank >= per_cpu(mce_num_banks, s->id))
2442 return -EINVAL;
2443
2444 b = &per_cpu(mce_banks_array, s->id)[bank];
2445
2446 if (!b->init)
2447 return -ENODEV;
2448
2449 b->ctl = new;
2450 mce_restart();
2451
2452 return size;
2453}
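/*
 * The bank attributes appear per CPU as
 * /sys/devices/system/machinecheck/machinecheck<cpu>/bank<N>. Reads return
 * the control value in hex; writes accept any base kstrtou64() understands
 * and re-program the banks via mce_restart(). Example (illustrative):
 *
 *	# echo 0x0 > /sys/devices/system/machinecheck/machinecheck0/bank4
 */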
2454
2455static ssize_t set_ignore_ce(struct device *s,
2456 struct device_attribute *attr,
2457 const char *buf, size_t size)
2458{
2459 u64 new;
2460
2461 if (kstrtou64(buf, 0, &new) < 0)
2462 return -EINVAL;
2463
2464 mutex_lock(&mce_sysfs_mutex);
2465 if (mca_cfg.ignore_ce ^ !!new) {
2466 if (new) {
2467 /* disable ce features */
2468 mce_timer_delete_all();
2469 on_each_cpu(mce_disable_cmci, NULL, 1);
2470 mca_cfg.ignore_ce = true;
2471 } else {
2472 /* enable ce features */
2473 mca_cfg.ignore_ce = false;
2474 on_each_cpu(mce_enable_ce, (void *)1, 1);
2475 }
2476 }
2477 mutex_unlock(&mce_sysfs_mutex);
2478
2479 return size;
2480}
2481
2482static ssize_t set_cmci_disabled(struct device *s,
2483 struct device_attribute *attr,
2484 const char *buf, size_t size)
2485{
2486 u64 new;
2487
2488 if (kstrtou64(buf, 0, &new) < 0)
2489 return -EINVAL;
2490
2491 mutex_lock(&mce_sysfs_mutex);
2492 if (mca_cfg.cmci_disabled ^ !!new) {
2493 if (new) {
2494 /* disable cmci */
2495 on_each_cpu(mce_disable_cmci, NULL, 1);
2496 mca_cfg.cmci_disabled = true;
2497 } else {
2498 /* enable cmci */
2499 mca_cfg.cmci_disabled = false;
2500 on_each_cpu(mce_enable_ce, NULL, 1);
2501 }
2502 }
2503 mutex_unlock(&mce_sysfs_mutex);
2504
2505 return size;
2506}
2507
2508static ssize_t store_int_with_restart(struct device *s,
2509 struct device_attribute *attr,
2510 const char *buf, size_t size)
2511{
2512 unsigned long old_check_interval = check_interval;
2513 ssize_t ret = device_store_ulong(s, attr, buf, size);
2514
2515 if (check_interval == old_check_interval)
2516 return ret;
2517
2518 mutex_lock(&mce_sysfs_mutex);
2519 mce_restart();
2520 mutex_unlock(&mce_sysfs_mutex);
2521
2522 return ret;
2523}
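/*
 * check_interval is exposed the same way (machinecheck<cpu>/check_interval)
 * and is a single global value in seconds: mce_start_timer() multiplies it
 * by HZ. Example (illustrative):
 *
 *	# echo 60 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 *
 * The new value takes effect through mce_restart() above.
 */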
2524
2525static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2526static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2527static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
2528
2529static struct dev_ext_attribute dev_attr_check_interval = {
2530 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2531 &check_interval
2532};
2533
2534static struct dev_ext_attribute dev_attr_ignore_ce = {
2535 __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2536 &mca_cfg.ignore_ce
2537};
2538
2539static struct dev_ext_attribute dev_attr_cmci_disabled = {
2540 __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2541 &mca_cfg.cmci_disabled
2542};
2543
2544static struct device_attribute *mce_device_attrs[] = {
2545 &dev_attr_check_interval.attr,
2546#ifdef CONFIG_X86_MCELOG_LEGACY
2547 &dev_attr_trigger,
2548#endif
2549 &dev_attr_monarch_timeout.attr,
2550 &dev_attr_dont_log_ce.attr,
2551 &dev_attr_print_all.attr,
2552 &dev_attr_ignore_ce.attr,
2553 &dev_attr_cmci_disabled.attr,
2554 NULL
2555};
2556
2557static cpumask_var_t mce_device_initialized;
2558
2559static void mce_device_release(struct device *dev)
2560{
2561 kfree(dev);
2562}
2563
2564/* Per CPU device init. All of the CPUs still share the same bank device: */
2565static int mce_device_create(unsigned int cpu)
2566{
2567 struct device *dev;
2568 int err;
2569 int i, j;
2570
2571 if (!mce_available(&boot_cpu_data))
2572 return -EIO;
2573
2574 dev = per_cpu(mce_device, cpu);
2575 if (dev)
2576 return 0;
2577
2578 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2579 if (!dev)
2580 return -ENOMEM;
2581 dev->id = cpu;
2582 dev->bus = &mce_subsys;
2583 dev->release = &mce_device_release;
2584
2585 err = device_register(dev);
2586 if (err) {
2587 put_device(dev);
2588 return err;
2589 }
2590
2591 for (i = 0; mce_device_attrs[i]; i++) {
2592 err = device_create_file(dev, mce_device_attrs[i]);
2593 if (err)
2594 goto error;
2595 }
2596 for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
2597 err = device_create_file(dev, &mce_bank_devs[j].attr);
2598 if (err)
2599 goto error2;
2600 }
2601 cpumask_set_cpu(cpu, mce_device_initialized);
2602 per_cpu(mce_device, cpu) = dev;
2603
2604 return 0;
2605error2:
2606 while (--j >= 0)
2607 device_remove_file(dev, &mce_bank_devs[j].attr);
2608error:
2609 while (--i >= 0)
2610 device_remove_file(dev, mce_device_attrs[i]);
2611
2612 device_unregister(dev);
2613
2614 return err;
2615}
2616
2617static void mce_device_remove(unsigned int cpu)
2618{
2619 struct device *dev = per_cpu(mce_device, cpu);
2620 int i;
2621
2622 if (!cpumask_test_cpu(cpu, mce_device_initialized))
2623 return;
2624
2625 for (i = 0; mce_device_attrs[i]; i++)
2626 device_remove_file(dev, mce_device_attrs[i]);
2627
2628 for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
2629 device_remove_file(dev, &mce_bank_devs[i].attr);
2630
2631 device_unregister(dev);
2632 cpumask_clear_cpu(cpu, mce_device_initialized);
2633 per_cpu(mce_device, cpu) = NULL;
2634}
2635
2636/* Make sure there are no machine checks on offlined CPUs. */
2637static void mce_disable_cpu(void)
2638{
2639 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2640 return;
2641
2642 if (!cpuhp_tasks_frozen)
2643 cmci_clear();
2644
2645 vendor_disable_error_reporting();
2646}
2647
2648static void mce_reenable_cpu(void)
2649{
2650 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2651 int i;
2652
2653 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2654 return;
2655
2656 if (!cpuhp_tasks_frozen)
2657 cmci_reenable();
2658 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2659 struct mce_bank *b = &mce_banks[i];
2660
2661 if (b->init)
2662 wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
2663 }
2664}
2665
2666static int mce_cpu_dead(unsigned int cpu)
2667{
2668 mce_intel_hcpu_update(cpu);
2669
2670 /* intentionally ignoring frozen here */
2671 if (!cpuhp_tasks_frozen)
2672 cmci_rediscover();
2673 return 0;
2674}
2675
2676static int mce_cpu_online(unsigned int cpu)
2677{
2678 struct timer_list *t = this_cpu_ptr(&mce_timer);
2679 int ret;
2680
2681 mce_device_create(cpu);
2682
2683 ret = mce_threshold_create_device(cpu);
2684 if (ret) {
2685 mce_device_remove(cpu);
2686 return ret;
2687 }
2688 mce_reenable_cpu();
2689 mce_start_timer(t);
2690 return 0;
2691}
2692
2693static int mce_cpu_pre_down(unsigned int cpu)
2694{
2695 struct timer_list *t = this_cpu_ptr(&mce_timer);
2696
2697 mce_disable_cpu();
2698 del_timer_sync(t);
2699 mce_threshold_remove_device(cpu);
2700 mce_device_remove(cpu);
2701 return 0;
2702}
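/*
 * mce_cpu_online() and mce_cpu_pre_down() form the startup/teardown pair for
 * the dynamic hotplug state registered in mcheck_init_device() below, while
 * mce_cpu_dead() is hooked to the separate CPUHP_X86_MCE_DEAD state.
 */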
2703
2704static __init void mce_init_banks(void)
2705{
2706 int i;
2707
2708 for (i = 0; i < MAX_NR_BANKS; i++) {
2709 struct mce_bank_dev *b = &mce_bank_devs[i];
2710 struct device_attribute *a = &b->attr;
2711
2712 b->bank = i;
2713
2714 sysfs_attr_init(&a->attr);
2715 a->attr.name = b->attrname;
2716 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2717
2718 a->attr.mode = 0644;
2719 a->show = show_bank;
2720 a->store = set_bank;
2721 }
2722}
2723
2724/*
2725 * When running on XEN, this initcall is ordered against the XEN mcelog
2726 * initcall:
2727 *
2728 * device_initcall(xen_late_init_mcelog);
2729 * device_initcall_sync(mcheck_init_device);
2730 */
2731static __init int mcheck_init_device(void)
2732{
2733 int err;
2734
2735 /*
2736 * Check if we have a spare virtual bit. This will only become
2737 * a problem if/when we move beyond 5-level page tables.
2738 */
2739 MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2740
2741 if (!mce_available(&boot_cpu_data)) {
2742 err = -EIO;
2743 goto err_out;
2744 }
2745
2746 if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2747 err = -ENOMEM;
2748 goto err_out;
2749 }
2750
2751 mce_init_banks();
2752
2753 err = subsys_system_register(&mce_subsys, NULL);
2754 if (err)
2755 goto err_out_mem;
2756
2757 err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2758 mce_cpu_dead);
2759 if (err)
2760 goto err_out_mem;
2761
2762 /*
2763 * Invokes mce_cpu_online() on all CPUs which are online when
2764 * the state is installed.
2765 */
2766 err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2767 mce_cpu_online, mce_cpu_pre_down);
2768 if (err < 0)
2769 goto err_out_online;
2770
2771 register_syscore_ops(&mce_syscore_ops);
2772
2773 return 0;
2774
2775err_out_online:
2776 cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2777
2778err_out_mem:
2779 free_cpumask_var(mce_device_initialized);
2780
2781err_out:
2782 pr_err("Unable to init MCE device (rc: %d)\n", err);
2783
2784 return err;
2785}
2786device_initcall_sync(mcheck_init_device);
2787
2788/*
2789 * Old style boot options parsing. Only for compatibility.
2790 */
2791static int __init mcheck_disable(char *str)
2792{
2793 mca_cfg.disabled = 1;
2794 return 1;
2795}
2796__setup("nomce", mcheck_disable);
2797
2798#ifdef CONFIG_DEBUG_FS
2799struct dentry *mce_get_debugfs_dir(void)
2800{
2801 static struct dentry *dmce;
2802
2803 if (!dmce)
2804 dmce = debugfs_create_dir("mce", NULL);
2805
2806 return dmce;
2807}
2808
2809static void mce_reset(void)
2810{
2811 atomic_set(&mce_fake_panicked, 0);
2812 atomic_set(&mce_executing, 0);
2813 atomic_set(&mce_callin, 0);
2814 atomic_set(&global_nwo, 0);
2815 cpumask_setall(&mce_missing_cpus);
2816}
2817
2818static int fake_panic_get(void *data, u64 *val)
2819{
2820 *val = fake_panic;
2821 return 0;
2822}
2823
2824static int fake_panic_set(void *data, u64 val)
2825{
2826 mce_reset();
2827 fake_panic = val;
2828 return 0;
2829}
2830
2831DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2832 "%llu\n");
2833
2834static void __init mcheck_debugfs_init(void)
2835{
2836 struct dentry *dmce;
2837
2838 dmce = mce_get_debugfs_dir();
2839 debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
2840 &fake_panic_fops);
2841}
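/*
 * With debugfs mounted in the usual place, the knob created above is
 * /sys/kernel/debug/mce/fake_panic. Writing it resets the rendezvous state
 * via mce_reset() and records the value in fake_panic, which the panic path
 * in this file consults so that test-induced machine checks log instead of
 * panicking. Note the file is created with mode 0444 above.
 */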
2842#else
2843static void __init mcheck_debugfs_init(void) { }
2844#endif
2845
2846static int __init mcheck_late_init(void)
2847{
2848 if (mca_cfg.recovery)
2849 enable_copy_mc_fragile();
2850
2851 mcheck_debugfs_init();
2852
2853 /*
2854 * Flush out everything that has been logged during early boot, now that
2855 * everything has been initialized (workqueues, decoders, ...).
2856 */
2857 mce_schedule_work();
2858
2859 return 0;
2860}
2861late_initcall(mcheck_late_init);
2862

Source: linux/arch/x86/kernel/cpu/mce/core.c