1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * GHES/EDAC Linux driver
4 *
5 * Copyright (c) 2013 by Mauro Carvalho Chehab
6 *
7 * Red Hat Inc. https://www.redhat.com
8 */
9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12#include <acpi/ghes.h>
13#include <linux/edac.h>
14#include <linux/dmi.h>
15#include "edac_module.h"
16#include <ras/ras_event.h>
17#include <linux/notifier.h>
18#include <linux/string.h>
19
20#define OTHER_DETAIL_LEN 400
21
/* Per-registration private state, hung off the EDAC mem_ctl_info. */
struct ghes_pvt {
	struct mem_ctl_info *mci;	/* the single logical MC this driver registers */

	/* Buffers for the error handling routine */
	char other_detail[OTHER_DETAIL_LEN];	/* free-form detail passed to the EDAC core */
	char msg[80];				/* short CPER error-type string */
};
29
/* Number of GHES devices registered against the single logical MC. */
static refcount_t ghes_refcount = REFCOUNT_INIT(0);

/*
 * Access to ghes_pvt must be protected by ghes_lock. The spinlock
 * also provides the necessary (implicit) memory barrier for the SMP
 * case to make the pointer visible on another CPU.
 */
static struct ghes_pvt *ghes_pvt;

/*
 * This driver's representation of the system hardware, as collected
 * from DMI.
 */
static struct ghes_hw_desc {
	int num_dimms;			/* number of valid entries in @dimms */
	struct dimm_info *dimms;	/* grown by enumerate_dimms(), freed after MC registration */
} ghes_hw;

/* GHES registration mutex */
static DEFINE_MUTEX(ghes_reg_mutex);

/*
 * Sync with other, potentially concurrent callers of
 * ghes_edac_report_mem_error(). We don't know what the
 * "inventive" firmware would do.
 */
static DEFINE_SPINLOCK(ghes_lock);

/* true once the DMI tables have been walked; reset on unregister */
static bool system_scanned;

/* List of GHES devices obtained from ghes_get_devices() at init time. */
static struct list_head *ghes_devs;
61
/*
 * Memory Device - Type 17 of SMBIOS spec.
 *
 * Field layout mirrors the on-disk SMBIOS record, hence the packed
 * attribute — no padding may be inserted between members.
 */
struct memdev_dmi_entry {
	u8 type;
	u8 length;
	u16 handle;
	u16 phys_mem_array_handle;
	u16 mem_err_info_handle;
	u16 total_width;
	u16 data_width;
	u16 size;
	u8 form_factor;
	u8 device_set;
	u8 device_locator;
	u8 bank_locator;
	u8 memory_type;
	u16 type_detail;
	u16 speed;
	u8 manufacturer;
	u8 serial_number;
	u8 asset_tag;
	u8 part_number;
	u8 attributes;
	u32 extended_size;
	u16 conf_mem_clk_speed;
} __attribute__((__packed__));
87
88static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
89{
90 struct dimm_info *dimm;
91
92 mci_for_each_dimm(mci, dimm) {
93 if (dimm->smbios_handle == handle)
94 return dimm;
95 }
96
97 return NULL;
98}
99
100static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
101{
102 const char *bank = NULL, *device = NULL;
103
104 dmi_memdev_name(handle, bank: &bank, device: &device);
105
106 /*
107 * Set to a NULL string when both bank and device are zero. In this case,
108 * the label assigned by default will be preserved.
109 */
110 snprintf(buf: dimm->label, size: sizeof(dimm->label), fmt: "%s%s%s",
111 (bank && *bank) ? bank : "",
112 (bank && *bank && device && *device) ? " " : "",
113 (device && *device) ? device : "");
114}
115
116static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry)
117{
118 u16 rdr_mask = BIT(7) | BIT(13);
119
120 if (entry->size == 0xffff) {
121 pr_info("Can't get DIMM%i size\n", dimm->idx);
122 dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
123 } else if (entry->size == 0x7fff) {
124 dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
125 } else {
126 if (entry->size & BIT(15))
127 dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10);
128 else
129 dimm->nr_pages = MiB_TO_PAGES(entry->size);
130 }
131
132 switch (entry->memory_type) {
133 case 0x12:
134 if (entry->type_detail & BIT(13))
135 dimm->mtype = MEM_RDDR;
136 else
137 dimm->mtype = MEM_DDR;
138 break;
139 case 0x13:
140 if (entry->type_detail & BIT(13))
141 dimm->mtype = MEM_RDDR2;
142 else
143 dimm->mtype = MEM_DDR2;
144 break;
145 case 0x14:
146 dimm->mtype = MEM_FB_DDR2;
147 break;
148 case 0x18:
149 if (entry->type_detail & BIT(12))
150 dimm->mtype = MEM_NVDIMM;
151 else if (entry->type_detail & BIT(13))
152 dimm->mtype = MEM_RDDR3;
153 else
154 dimm->mtype = MEM_DDR3;
155 break;
156 case 0x1a:
157 if (entry->type_detail & BIT(12))
158 dimm->mtype = MEM_NVDIMM;
159 else if (entry->type_detail & BIT(13))
160 dimm->mtype = MEM_RDDR4;
161 else
162 dimm->mtype = MEM_DDR4;
163 break;
164 default:
165 if (entry->type_detail & BIT(6))
166 dimm->mtype = MEM_RMBS;
167 else if ((entry->type_detail & rdr_mask) == rdr_mask)
168 dimm->mtype = MEM_RDR;
169 else if (entry->type_detail & BIT(7))
170 dimm->mtype = MEM_SDR;
171 else if (entry->type_detail & BIT(9))
172 dimm->mtype = MEM_EDO;
173 else
174 dimm->mtype = MEM_UNKNOWN;
175 }
176
177 /*
178 * Actually, we can only detect if the memory has bits for
179 * checksum or not
180 */
181 if (entry->total_width == entry->data_width)
182 dimm->edac_mode = EDAC_NONE;
183 else
184 dimm->edac_mode = EDAC_SECDED;
185
186 dimm->dtype = DEV_UNKNOWN;
187 dimm->grain = 128; /* Likely, worse case */
188
189 dimm_setup_label(dimm, handle: entry->handle);
190
191 if (dimm->nr_pages) {
192 edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
193 dimm->idx, edac_mem_types[dimm->mtype],
194 PAGES_TO_MiB(dimm->nr_pages),
195 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
196 edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
197 entry->memory_type, entry->type_detail,
198 entry->total_width, entry->data_width);
199 }
200
201 dimm->smbios_handle = entry->handle;
202}
203
204static void enumerate_dimms(const struct dmi_header *dh, void *arg)
205{
206 struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
207 struct ghes_hw_desc *hw = (struct ghes_hw_desc *)arg;
208 struct dimm_info *d;
209
210 if (dh->type != DMI_ENTRY_MEM_DEVICE)
211 return;
212
213 /* Enlarge the array with additional 16 */
214 if (!hw->num_dimms || !(hw->num_dimms % 16)) {
215 struct dimm_info *new;
216
217 new = krealloc_array(hw->dimms, hw->num_dimms + 16,
218 sizeof(struct dimm_info), GFP_KERNEL);
219 if (!new) {
220 WARN_ON_ONCE(1);
221 return;
222 }
223
224 hw->dimms = new;
225 }
226
227 d = &hw->dimms[hw->num_dimms];
228 d->idx = hw->num_dimms;
229
230 assign_dmi_dimm_info(dimm: d, entry);
231
232 hw->num_dimms++;
233}
234
235static void ghes_scan_system(void)
236{
237 if (system_scanned)
238 return;
239
240 dmi_walk(decode: enumerate_dimms, private_data: &ghes_hw);
241
242 system_scanned = true;
243}
244
245static int print_mem_error_other_detail(const struct cper_sec_mem_err *mem, char *msg,
246 const char *location, unsigned int len)
247{
248 u32 n;
249
250 if (!msg)
251 return 0;
252
253 n = 0;
254 len -= 1;
255
256 n += scnprintf(buf: msg + n, size: len - n, fmt: "APEI location: %s ", location);
257
258 if (!(mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS))
259 goto out;
260
261 n += scnprintf(buf: msg + n, size: len - n, fmt: "status(0x%016llx): ", mem->error_status);
262 n += scnprintf(buf: msg + n, size: len - n, fmt: "%s ", cper_mem_err_status_str(status: mem->error_status));
263
264out:
265 msg[n] = '\0';
266
267 return n;
268}
269
270static int ghes_edac_report_mem_error(struct notifier_block *nb,
271 unsigned long val, void *data)
272{
273 struct cper_sec_mem_err *mem_err = (struct cper_sec_mem_err *)data;
274 struct cper_mem_err_compact cmem;
275 struct edac_raw_error_desc *e;
276 struct mem_ctl_info *mci;
277 unsigned long sev = val;
278 struct ghes_pvt *pvt;
279 unsigned long flags;
280 char *p;
281
282 /*
283 * We can do the locking below because GHES defers error processing
284 * from NMI to IRQ context. Whenever that changes, we'd at least
285 * know.
286 */
287 if (WARN_ON_ONCE(in_nmi()))
288 return NOTIFY_OK;
289
290 spin_lock_irqsave(&ghes_lock, flags);
291
292 pvt = ghes_pvt;
293 if (!pvt)
294 goto unlock;
295
296 mci = pvt->mci;
297 e = &mci->error_desc;
298
299 /* Cleans the error report buffer */
300 memset(e, 0, sizeof (*e));
301 e->error_count = 1;
302 e->grain = 1;
303 e->msg = pvt->msg;
304 e->other_detail = pvt->other_detail;
305 e->top_layer = -1;
306 e->mid_layer = -1;
307 e->low_layer = -1;
308 *pvt->other_detail = '\0';
309 *pvt->msg = '\0';
310
311 switch (sev) {
312 case GHES_SEV_CORRECTED:
313 e->type = HW_EVENT_ERR_CORRECTED;
314 break;
315 case GHES_SEV_RECOVERABLE:
316 e->type = HW_EVENT_ERR_UNCORRECTED;
317 break;
318 case GHES_SEV_PANIC:
319 e->type = HW_EVENT_ERR_FATAL;
320 break;
321 default:
322 case GHES_SEV_NO:
323 e->type = HW_EVENT_ERR_INFO;
324 }
325
326 edac_dbg(1, "error validation_bits: 0x%08llx\n",
327 (long long)mem_err->validation_bits);
328
329 /* Error type, mapped on e->msg */
330 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
331 u8 etype = mem_err->error_type;
332
333 p = pvt->msg;
334 p += snprintf(buf: p, size: sizeof(pvt->msg), fmt: "%s", cper_mem_err_type_str(etype));
335 } else {
336 strscpy(pvt->msg, "unknown error");
337 }
338
339 /* Error address */
340 if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
341 e->page_frame_number = PHYS_PFN(mem_err->physical_addr);
342 e->offset_in_page = offset_in_page(mem_err->physical_addr);
343 }
344
345 /* Error grain */
346 if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
347 e->grain = ~mem_err->physical_addr_mask + 1;
348
349 /* Memory error location, mapped on e->location */
350 p = e->location;
351 cper_mem_err_pack(mem_err, &cmem);
352 p += cper_mem_err_location(mem: &cmem, msg: p);
353
354 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
355 struct dimm_info *dimm;
356
357 p += cper_dimm_err_location(mem: &cmem, msg: p);
358 dimm = find_dimm_by_handle(mci, handle: mem_err->mem_dev_handle);
359 if (dimm) {
360 e->top_layer = dimm->idx;
361 strscpy(e->label, dimm->label);
362 }
363 }
364 if (p > e->location)
365 *(p - 1) = '\0';
366
367 if (!*e->label)
368 strscpy(e->label, "unknown memory");
369
370 /* All other fields are mapped on e->other_detail */
371 p = pvt->other_detail;
372 p += print_mem_error_other_detail(mem: mem_err, msg: p, location: e->location, OTHER_DETAIL_LEN);
373 if (p > pvt->other_detail)
374 *(p - 1) = '\0';
375
376 edac_raw_mc_handle_error(e);
377
378unlock:
379 spin_unlock_irqrestore(lock: &ghes_lock, flags);
380
381 return NOTIFY_OK;
382}
383
/* Notifier hooked into the GHES report chain for memory error records. */
static struct notifier_block ghes_edac_mem_err_nb = {
	.notifier_call = ghes_edac_report_mem_error,
	.priority = 0,
};
388
389static int ghes_edac_register(struct device *dev)
390{
391 bool fake = false;
392 struct mem_ctl_info *mci;
393 struct ghes_pvt *pvt;
394 struct edac_mc_layer layers[1];
395 unsigned long flags;
396 int rc = 0;
397
398 /* finish another registration/unregistration instance first */
399 mutex_lock(&ghes_reg_mutex);
400
401 /*
402 * We have only one logical memory controller to which all DIMMs belong.
403 */
404 if (refcount_inc_not_zero(r: &ghes_refcount))
405 goto unlock;
406
407 ghes_scan_system();
408
409 /* Check if we've got a bogus BIOS */
410 if (!ghes_hw.num_dimms) {
411 fake = true;
412 ghes_hw.num_dimms = 1;
413 }
414
415 layers[0].type = EDAC_MC_LAYER_ALL_MEM;
416 layers[0].size = ghes_hw.num_dimms;
417 layers[0].is_virt_csrow = true;
418
419 mci = edac_mc_alloc(mc_num: 0, ARRAY_SIZE(layers), layers, sz_pvt: sizeof(struct ghes_pvt));
420 if (!mci) {
421 pr_info("Can't allocate memory for EDAC data\n");
422 rc = -ENOMEM;
423 goto unlock;
424 }
425
426 pvt = mci->pvt_info;
427 pvt->mci = mci;
428
429 mci->pdev = dev;
430 mci->mtype_cap = MEM_FLAG_EMPTY;
431 mci->edac_ctl_cap = EDAC_FLAG_NONE;
432 mci->edac_cap = EDAC_FLAG_NONE;
433 mci->mod_name = "ghes_edac.c";
434 mci->ctl_name = "ghes_edac";
435 mci->dev_name = "ghes";
436
437 if (fake) {
438 pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
439 pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
440 pr_info("work on such system. Use this driver with caution\n");
441 }
442
443 pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms);
444
445 if (!fake) {
446 struct dimm_info *src, *dst;
447 int i = 0;
448
449 mci_for_each_dimm(mci, dst) {
450 src = &ghes_hw.dimms[i];
451
452 dst->idx = src->idx;
453 dst->smbios_handle = src->smbios_handle;
454 dst->nr_pages = src->nr_pages;
455 dst->mtype = src->mtype;
456 dst->edac_mode = src->edac_mode;
457 dst->dtype = src->dtype;
458 dst->grain = src->grain;
459
460 /*
461 * If no src->label, preserve default label assigned
462 * from EDAC core.
463 */
464 if (strlen(src->label))
465 memcpy(dst->label, src->label, sizeof(src->label));
466
467 i++;
468 }
469
470 } else {
471 struct dimm_info *dimm = edac_get_dimm(mci, layer0: 0, layer1: 0, layer2: 0);
472
473 dimm->nr_pages = 1;
474 dimm->grain = 128;
475 dimm->mtype = MEM_UNKNOWN;
476 dimm->dtype = DEV_UNKNOWN;
477 dimm->edac_mode = EDAC_SECDED;
478 }
479
480 rc = edac_mc_add_mc(mci);
481 if (rc < 0) {
482 pr_info("Can't register with the EDAC core\n");
483 edac_mc_free(mci);
484 rc = -ENODEV;
485 goto unlock;
486 }
487
488 spin_lock_irqsave(&ghes_lock, flags);
489 ghes_pvt = pvt;
490 spin_unlock_irqrestore(lock: &ghes_lock, flags);
491
492 ghes_register_report_chain(nb: &ghes_edac_mem_err_nb);
493
494 /* only set on success */
495 refcount_set(r: &ghes_refcount, n: 1);
496
497unlock:
498
499 /* Not needed anymore */
500 kfree(objp: ghes_hw.dimms);
501 ghes_hw.dimms = NULL;
502
503 mutex_unlock(lock: &ghes_reg_mutex);
504
505 return rc;
506}
507
508static void ghes_edac_unregister(struct ghes *ghes)
509{
510 struct mem_ctl_info *mci;
511 unsigned long flags;
512
513 mutex_lock(&ghes_reg_mutex);
514
515 system_scanned = false;
516 memset(&ghes_hw, 0, sizeof(struct ghes_hw_desc));
517
518 if (!refcount_dec_and_test(r: &ghes_refcount))
519 goto unlock;
520
521 /*
522 * Wait for the irq handler being finished.
523 */
524 spin_lock_irqsave(&ghes_lock, flags);
525 mci = ghes_pvt ? ghes_pvt->mci : NULL;
526 ghes_pvt = NULL;
527 spin_unlock_irqrestore(lock: &ghes_lock, flags);
528
529 if (!mci)
530 goto unlock;
531
532 mci = edac_mc_del_mc(dev: mci->pdev);
533 if (mci)
534 edac_mc_free(mci);
535
536 ghes_unregister_report_chain(nb: &ghes_edac_mem_err_nb);
537
538unlock:
539 mutex_unlock(lock: &ghes_reg_mutex);
540}
541
542static int __init ghes_edac_init(void)
543{
544 struct ghes *g, *g_tmp;
545
546 ghes_devs = ghes_get_devices();
547 if (!ghes_devs)
548 return -ENODEV;
549
550 if (list_empty(head: ghes_devs)) {
551 pr_info("GHES probing device list is empty\n");
552 return -ENODEV;
553 }
554
555 list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
556 ghes_edac_register(dev: g->dev);
557 }
558
559 return 0;
560}
561module_init(ghes_edac_init);
562
563static void __exit ghes_edac_exit(void)
564{
565 struct ghes *g, *g_tmp;
566
567 list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
568 ghes_edac_unregister(ghes: g);
569 }
570}
571module_exit(ghes_edac_exit);
572
573MODULE_LICENSE("GPL");
574MODULE_DESCRIPTION("Output ACPI APEI/GHES BIOS detected errors via EDAC");
575

source code of linux/drivers/edac/ghes_edac.c