1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Per-family decode callbacks; allocated and populated in mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;
/* Mask to extract the erring core number from NBSH (narrowed to 0x3 on F14h). */
8 static u8 nb_err_cpumask = 0xf;
/* When false (the default), GART TLB errors are dropped by amd_filter_mce(). */
10 static bool report_gart_errors;
/* Optional extra NB decoder hook (presumably an EDAC driver) for NB MCEs. */
11 static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
/*
 * Enable/disable reporting of GART TLB NB errors; when disabled they are
 * filtered out in amd_filter_mce().
 */
13 void amd_report_gart_errors(bool v)
15 report_gart_errors = v;
17 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * Install @f as the extra NB error decoder callback. Body elided in this
 * excerpt; presumably stores @f in nb_bus_decoder (see the matching
 * unregister routine) — NOTE(review): confirm against full source.
 */
19 void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
23 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * Remove the NB decoder callback; warns if @f is not the currently
 * installed decoder.
 */
25 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
28 WARN_ON(nb_bus_decoder != f);
30 nb_bus_decoder = NULL;
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
36 * string representation for the different MCA reported error types, see F3x48
40 /* transaction type */
41 const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
42 EXPORT_SYMBOL_GPL(tt_msgs);
/* cache level */
45 const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
46 EXPORT_SYMBOL_GPL(ll_msgs);
48 /* memory transaction type */
49 const char *rrrr_msgs[] = {
50 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
52 EXPORT_SYMBOL_GPL(rrrr_msgs);
54 /* participating processor */
55 const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
56 EXPORT_SYMBOL_GPL(pp_msgs);
/* request timeout */
59 const char *to_msgs[] = { "no timeout", "timed out" };
60 EXPORT_SYMBOL_GPL(to_msgs);
/* memory or i/o */
63 const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
64 EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Descriptions for F10h-specific NB MCE extended error codes; indexed with
 * (xec - offset) in f10h_nb_mce().
 */
66 static const char *f10h_nb_mce_desc[] = {
68 "Protocol error (link, L3, probe filter, etc.)",
69 "Parity error in NB-internal arrays",
70 "Link Retry due to IO link transmission error",
71 "L3 ECC data cache error",
72 "ECC error in L3 cache tag",
73 "L3 LRU parity bits error",
74 "ECC Error in the Probe Filter directory"
/*
 * Decode a family 10h Data Cache MCE from the low 16 bits of MCi_STATUS.
 * Returns false when the signature is not recognized (the caller then
 * reports corrupted MCE info). Interior branches are elided in this excerpt.
 */
77 static bool f10h_dc_mce(u16 ec)
/* R4: memory transaction type field, EC[7:4]. */
79 u8 r4 = (ec >> 4) & 0xf;
83 pr_cont("during data scrub.\n");
92 pr_cont("during L1 linefill from L2.\n");
94 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
/*
 * K8 Data Cache MCE decode: handles the K8-specific "system linefill" case
 * (guarding condition elided here), otherwise defers to the shared
 * f10h_dc_mce() decoder.
 */
101 static bool k8_dc_mce(u16 ec)
104 pr_cont("during system linefill.\n");
108 return f10h_dc_mce(ec);
/*
 * Decode a family 14h Data Cache MCE. Only DATA/L1 signatures are valid
 * (the early reject below); distinguishes parity errors by transaction
 * type and handles the bus-error "system read data" case. Interior lines
 * are elided in this excerpt.
 */
111 static bool f14h_dc_mce(u16 ec)
/* R4: memory transaction type, EC[7:4]. */
113 u8 r4 = (ec >> 4) & 0xf;
/* TT: transaction type, EC[3:2]; ll extraction elided in this view. */
115 u8 tt = (ec >> 2) & 0x3;
121 if (tt != TT_DATA || ll != LL_L1)
127 pr_cont("Data/Tag parity error due to %s.\n",
128 (r4 == R4_DRD ? "load/hw prf" : "store"));
131 pr_cont("Copyback parity error on a tag miss.\n");
134 pr_cont("Tag parity error during snoop.\n");
139 } else if (BUS_ERROR(ec)) {
/* Bus errors must target MEM or IO at the LG cache level. */
141 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
144 pr_cont("System read data error on a ");
148 pr_cont("TLB reload.\n");
/*
 * Top-level Data Cache MCE decoder: handles the family-independent TLB
 * signature inline, then hands everything else to the per-family
 * fam_ops->dc_mce() callback; an unrecognized signature is reported as
 * corrupted MCE info.
 */
166 static void amd_decode_dc_mce(struct mce *m)
/* EC: MCA error code, STATUS[15:0]; XEC: extended error code, STATUS[19:16]. */
168 u16 ec = m->status & 0xffff;
169 u8 xec = (m->status >> 16) & 0xf;
171 pr_emerg(HW_ERR "Data Cache Error: ");
173 /* TLB error signatures are the same across families */
175 u8 tt = (ec >> 2) & 0x3;
178 pr_cont("%s TLB %s.\n", LL_MSG(ec),
179 (xec ? "multimatch" : "parity error"));
186 if (!fam_ops->dc_mce(ec))
192 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
/*
 * Decode a K8 Instruction Cache MCE. Guarding conditions for the visible
 * branches are elided in this excerpt; ll == 0x1 selects the L1-level
 * parity/copyback/snoop cases.
 */
195 static bool k8_ic_mce(u16 ec)
/* R4: memory transaction type, EC[7:4]. */
198 u8 r4 = (ec >> 4) & 0xf;
205 pr_cont("during a linefill from L2.\n");
206 else if (ll == 0x1) {
209 pr_cont("Parity error during data load.\n");
213 pr_cont("Copyback Parity/Victim error.\n");
217 pr_cont("Tag Snoop error.\n");
/*
 * Decode a family 14h Instruction Cache MCE: only tt == 0 at cache level 1
 * is valid; distinguishes a tag-hit parity error from a snoop/victimization
 * tag error via R4.
 */
230 static bool f14h_ic_mce(u16 ec)
/* TT: transaction type, EC[3:2]; R4: memory transaction type, EC[7:4]. */
233 u8 tt = (ec >> 2) & 0x3;
234 u8 r4 = (ec >> 4) & 0xf;
238 if (tt != 0 || ll != 1)
242 pr_cont("Data/tag array parity error for a tag hit.\n");
243 else if (r4 == R4_SNOOP)
244 pr_cont("Tag error during snoop/victimization.\n");
/*
 * Top-level Instruction Cache MCE decoder: TLB and bus-error signatures are
 * handled inline (BIT(58) distinguishes the K8 system-linefill wording),
 * everything else goes to the per-family fam_ops->ic_mce() callback.
 */
251 static void amd_decode_ic_mce(struct mce *m)
/* EC: MCA error code, STATUS[15:0]; XEC: extended error code, STATUS[19:16]. */
253 u16 ec = m->status & 0xffff;
254 u8 xec = (m->status >> 16) & 0xf;
256 pr_emerg(HW_ERR "Instruction Cache Error: ");
259 pr_cont("%s TLB %s.\n", LL_MSG(ec),
260 (xec ? "multimatch" : "parity error"));
261 else if (BUS_ERROR(ec)) {
262 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58)));
264 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
265 } else if (fam_ops->ic_mce(ec))
268 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
/*
 * Decode a Bus Unit MCE, dispatching on the extended error code (xec) and
 * the error class of EC. Several guarding conditions are elided in this
 * excerpt; an unmatched signature falls through to the corrupted-info
 * message.
 */
271 static void amd_decode_bu_mce(struct mce *m)
/* EC: MCA error code, STATUS[15:0]; XEC: extended error code, STATUS[19:16]. */
273 u32 ec = m->status & 0xffff;
274 u32 xec = (m->status >> 16) & 0xf;
276 pr_emerg(HW_ERR "Bus Unit Error");
279 pr_cont(" in the write data buffers.\n");
281 pr_cont(" in the victim data buffers.\n");
282 else if (xec == 0x2 && MEM_ERROR(ec))
283 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
284 else if (xec == 0x0) {
286 pr_cont(": %s error in a Page Descriptor Cache or "
287 "Guest TLB.\n", TT_MSG(ec));
288 else if (BUS_ERROR(ec))
289 pr_cont(": %s/ECC error in data read from NB: %s.\n",
290 RRRR_MSG(ec), PP_MSG(ec));
291 else if (MEM_ERROR(ec)) {
/* RRRR: memory transaction type, EC[7:4]. */
292 u8 rrrr = (ec >> 4) & 0xf;
295 pr_cont(": %s error during data copyback.\n",
297 else if (rrrr <= 0x1)
298 pr_cont(": %s parity/ECC error during data "
299 "access from L2.\n", RRRR_MSG(ec));
310 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
/*
 * Decode a Load/Store MCE. Family 14h is not expected to signal LS MCEs at
 * all, so that case only emits a report-to-LKML message. Valid signatures
 * must be bus errors with R4 == DRD or DWR.
 */
313 static void amd_decode_ls_mce(struct mce *m)
/* EC: MCA error code, STATUS[15:0]; XEC: extended error code, STATUS[19:16]. */
315 u16 ec = m->status & 0xffff;
316 u8 xec = (m->status >> 16) & 0xf;
318 if (boot_cpu_data.x86 == 0x14) {
319 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
320 " please report on LKML.\n");
324 pr_emerg(HW_ERR "Load Store Error");
/* R4: memory transaction type, EC[7:4]. */
327 u8 r4 = (ec >> 4) & 0xf;
329 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
332 pr_cont(" during %s.\n", RRRR_MSG(ec));
339 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
/*
 * Decode K8 Northbridge MCE signatures (HT link CRC, GART PTE, atomic RMW,
 * DRAM ECC, DRAM addr/ctl parity). Guarding conditions on @ec/@xec are
 * elided in this excerpt; returns whether the signature was recognized.
 */
342 static bool k8_nb_mce(u16 ec, u8 xec)
348 pr_cont("CRC error detected on HT link.\n");
352 pr_cont("Invalid GART PTE entry during GART table walk.\n");
356 pr_cont("Unsupported atomic RMW received from an IO link.\n");
361 pr_cont("DRAM ECC error detected on the NB.\n");
365 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * Decode a family 10h Northbridge MCE: first try the shared K8 signatures,
 * then the F10h-specific cases (GART walk data vs. DEV table walk), and
 * finally look up the extended code in f10h_nb_mce_desc[]. The offset
 * computation is elided in this excerpt.
 */
376 static bool f10h_nb_mce(u16 ec, u8 xec)
381 if (k8_nb_mce(ec, xec))
395 pr_cont("GART Table Walk data error.\n");
396 else if (BUS_ERROR(ec))
397 pr_cont("DMA Exclusion Vector Table Walk error.\n");
415 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
/*
 * Family 14h NB MCE decoder (body elided in this excerpt) — installed as
 * fam_ops->nb_mce for family 0x14 in mce_amd_init().
 */
421 static bool f14h_nb_mce(u16 ec, u8 xec)
/*
 * Decode a Northbridge MCE for @node_id and, on K8/F10h GART or DRAM ECC
 * codes, forward it to the registered nb_bus_decoder callback. Exported so
 * EDAC drivers can call it directly. Several branches are elided in this
 * excerpt.
 */
426 void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
/* XEC: extended error code STATUS[20:16] (5 bits for NB); EC: STATUS[15:0]. */
428 u8 xec = (m->status >> 16) & 0x1f;
429 u16 ec = m->status & 0xffff;
/* NBSH: high half of MCi_STATUS, carries the erring-core information. */
430 u32 nbsh = (u32)(m->status >> 32);
432 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
435 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
436 * value encoding has changed so interpret those differently
438 if ((boot_cpu_data.x86 == 0x10) &&
439 (boot_cpu_data.x86_model > 7)) {
440 if (nbsh & K8_NBSH_ERR_CPU_VAL)
441 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask))
443 u8 assoc_cpus = nbsh & nb_err_cpumask;
/* Older encoding: a bitmask of associated cores; report the highest set bit. */
446 pr_cont(", core: %d", fls(assoc_cpus) - 1);
451 pr_cont("Sync error (sync packets on HT link detected).\n");
455 pr_cont("HT Master abort.\n");
459 pr_cont("HT Target abort.\n");
463 pr_cont("NB Watchdog timeout.\n");
467 pr_cont("SVM DMA Exclusion Vector error.\n");
474 if (!fam_ops->nb_mce(ec, xec))
/* GART (0x8) and DRAM ECC (0x0) errors get handed to the EDAC hook, if any. */
477 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
478 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
479 nb_bus_decoder(node_id, m, nbcfg);
484 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
486 EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * Decode an FR (execution unit) MCE. K8 (family 0xf) is handled specially
 * (branch body elided here); otherwise the only known signature is
 * EC == 0x0f0f, the CPU watchdog timer expiry.
 */
488 static void amd_decode_fr_mce(struct mce *m)
490 if (boot_cpu_data.x86 == 0xf)
493 /* we have only one error signature so match all fields at once. */
494 if ((m->status & 0xffff) == 0x0f0f) {
495 pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
500 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
/*
 * Print the generic breakdown of the MCA error code: transaction type,
 * cache level, and (for mem/bus errors) the memory transaction type and
 * participants. The guard of the first branch is elided in this excerpt
 * (presumably the TLB error class — NOTE(review): confirm).
 */
503 static inline void amd_decode_err_code(u16 ec)
506 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
507 TT_MSG(ec), LL_MSG(ec));
508 } else if (MEM_ERROR(ec)) {
509 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
510 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
511 } else if (BUS_ERROR(ec)) {
512 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
513 "Participating Processor: %s\n",
514 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
517 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
521 * Filter out unwanted MCE signatures here.
/*
 * Returns true for MCEs that should be silently dropped: currently only
 * NB (bank 4) GART TLB errors (xec 0x5) when reporting is disabled.
 */
523 static bool amd_filter_mce(struct mce *m)
/* NB extended error code, STATUS[20:16]. */
525 u8 xec = (m->status >> 16) & 0x1f;
528 * NB GART TLB error reporting is disabled by default.
530 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * Notifier callback on the x86 MCE decoder chain (see amd_mce_dec_nb):
 * prints the generic status bits, then dispatches to the per-bank decoder
 * (DC/IC/BU/LS/NB/FR) and finally the generic error-code breakdown. The
 * bank dispatch construct itself is elided in this excerpt.
 */
536 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
538 struct mce *m = (struct mce *)data;
/* Unwanted signatures (e.g. filtered GART errors) are dropped early. */
541 if (amd_filter_mce(m))
544 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
546 pr_cont("%sorrected error, other errors lost: %s, "
547 "CPU context corrupt: %s",
548 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
549 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
550 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
552 /* do the two bits[14:13] together */
553 ecc = (m->status >> 45) & 0x3;
555 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
561 amd_decode_dc_mce(m);
565 amd_decode_ic_mce(m);
569 amd_decode_bu_mce(m);
573 amd_decode_ls_mce(m);
/* NB errors additionally need the node id of the erring CPU. */
577 node = amd_get_nb_id(m->extcpu);
578 amd_decode_nb_mce(node, m, 0);
582 amd_decode_fr_mce(m);
589 amd_decode_err_code(m->status & 0xffff);
593 EXPORT_SYMBOL_GPL(amd_decode_mce);
/* Notifier registered on x86_mce_decoder_chain in mce_amd_init(). */
595 static struct notifier_block amd_mce_dec_nb = {
596 .notifier_call = amd_decode_mce,
/*
 * Module init: bail out unless running on a supported AMD family (0xf,
 * 0x10, or 0x14 with model <= 0xf), allocate fam_ops, install the
 * per-family decode callbacks, and register the MCE decoder notifier.
 * Error-path lines (return codes, freeing fam_ops) are elided in this
 * excerpt.
 */
599 static int __init mce_amd_init(void)
601 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
604 if (boot_cpu_data.x86 != 0xf &&
605 boot_cpu_data.x86 != 0x10 &&
606 (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
609 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
613 switch (boot_cpu_data.x86) {
/* K8: shares the IC decoder with F10h below. */
615 fam_ops->dc_mce = k8_dc_mce;
616 fam_ops->ic_mce = k8_ic_mce;
617 fam_ops->nb_mce = k8_nb_mce;
621 fam_ops->dc_mce = f10h_dc_mce;
622 fam_ops->ic_mce = k8_ic_mce;
623 fam_ops->nb_mce = f10h_nb_mce;
/* F14h reports the erring core in only two NBSH bits. */
627 nb_err_cpumask = 0x3;
628 fam_ops->dc_mce = f14h_dc_mce;
629 fam_ops->ic_mce = f14h_ic_mce;
630 fam_ops->nb_mce = f14h_nb_mce;
634 printk(KERN_WARNING "Huh? What family is that: %d?!\n",
640 pr_info("MCE: In-kernel MCE decoding enabled.\n");
642 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
646 early_initcall(mce_amd_init);
/* Module exit: unhook the decoder from the MCE notifier chain. */
649 static void __exit mce_amd_exit(void)
651 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
/* Module metadata. NOTE(review): no kfree(fam_ops) visible in the exit
 * path of this excerpt — confirm cleanup exists in the full source. */
655 MODULE_DESCRIPTION("AMD MCE decoder")
656 MODULE_ALIAS("edac-mce-amd");
657 MODULE_LICENSE("GPL");
658 module_exit(mce_amd_exit);