1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Per-family decode callbacks, selected once in mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;

/* Mask applied to the extended error code taken from MCi_STATUS[20:16]. */
8 static u8 xec_mask = 0xf;
/* Mask for the error-core field in NBSH; narrowed to 0x3 for F14h in init. */
9 static u8 nb_err_cpumask = 0xf;

/* GART TLB errors are suppressed unless set (see amd_filter_mce()). */
11 static bool report_gart_errors;
/* Optional callback (registered by an EDAC driver) for NB/DRAM ECC errors. */
12 static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
/*
 * Enable/disable reporting of the (frequent, mostly benign) GART TLB
 * errors; consumed by amd_filter_mce().
 */
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * Register the EDAC DRAM-ECC decode callback invoked from
 * amd_decode_nb_mce(). NOTE(review): the assignment body is missing from
 * this extraction — presumably "nb_bus_decoder = f"; confirm against the
 * full source.
 */
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * Unregister the EDAC decode callback. Warns if the caller tries to
 * remove a function other than the one currently registered, then
 * clears the hook unconditionally.
 */
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
/*
 * String tables for the decoded MCi_STATUS bitfields, indexed by the
 * raw field value. Exported for use by the EDAC drivers.
 * (Original block-comment delimiters around the next line were lost in
 * extraction.)
 */
37 * string representation for the different MCA reported error types, see F3x48
41 /* transaction type */
42 const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs);
/* cache level */
46 const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs);
49 /* memory transaction type */
50 const char *rrrr_msgs[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs);
55 /* participating processor */
56 const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs);
/* timeout */
60 const char *to_msgs[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs);
/* memory or I/O */
64 const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * F10h northbridge extended-error-code descriptions, indexed (with an
 * offset applied in f10h_nb_mce()) by xec.
 */
67 static const char *f10h_nb_mce_desc[] = {
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
/*
 * F15h instruction-cache extended-error-code descriptions; indexed with
 * family-specific offsets in f15h_ic_mce().
 */
78 static const char * const f15h_ic_mce_desc[] = {
79 "UC during a demand linefill from L2",
80 "Parity error during data load from IC",
81 "Parity error for IC valid bit",
82 "Main tag parity error",
83 "Parity error in prediction queue",
84 "PFB data/address parity error",
85 "Parity error in the branch status reg",
86 "PFB promotion address error",
87 "Tag error during probe/victimization",
88 "Parity error for IC probe tag valid bit",
89 "PFB non-cacheable bit parity error",
90 "PFB valid bit parity error", /* xec = 0xd */
91 "patch RAM", /* xec = 010 */
/*
 * F15h combined-unit (L2/WCC) extended-error-code descriptions; indexed
 * with offsets in amd_decode_cu_mce(). NOTE(review): some entries are
 * missing from this extraction (content line numbers jump).
 */
98 static const char * const f15h_cu_mce_desc[] = {
99 "Fill ECC error on data fills", /* xec = 0x4 */
100 "Fill parity error on insn fills",
101 "Prefetcher request FIFO parity error",
102 "PRQ address parity error",
103 "PRQ data parity error",
105 "WCC Data ECC error",
106 "WCB Data parity error",
108 "L2 Tag ECC error", /* xec = 0x10 */
109 "Hard L2 Tag ECC error",
110 "Multiple hits on L2 tag",
112 "PRB address parity error"
/*
 * F12h data-cache decoder; also the common tail for f10h/k8 DC decode.
 * Returns true if the signature was recognized and printed.
 * NOTE(review): the ll extraction and memory-error guard lines are
 * missing from this view.
 */
115 static bool f12h_dc_mce(u16 ec, u8 xec)
124 pr_cont("during L1 linefill from L2.\n");
125 else if (ll == LL_L1)
126 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
/*
 * F10h data-cache decoder: handles the scrub-error signature itself,
 * then defers everything else to the F12h decoder.
 */
133 static bool f10h_dc_mce(u16 ec, u8 xec)
135 u8 r4 = (ec >> 4) & 0xf;
138 if (r4 == R4_GEN && ll == LL_L1) {
139 pr_cont("during data scrub.\n");
142 return f12h_dc_mce(ec, xec);
/*
 * K8 data-cache decoder: handles the system-linefill case (guard
 * condition missing from this extraction), then falls back to the F10h
 * decoder.
 */
145 static bool k8_dc_mce(u16 ec, u8 xec)
148 pr_cont("during system linefill.\n");
152 return f10h_dc_mce(ec, xec);
/*
 * F14h data-cache decoder. Splits on error type: memory errors must be
 * DATA/L1 and are distinguished by the r4 transaction code; bus errors
 * must be MEM-or-IO at the LG level. NOTE(review): several branch
 * conditions and the success/failure returns are missing from this
 * extraction — do not modify without the full source.
 */
155 static bool f14h_dc_mce(u16 ec, u8 xec)
157 u8 r4 = (ec >> 4) & 0xf;
159 u8 tt = (ec >> 2) & 0x3;
165 if (tt != TT_DATA || ll != LL_L1)
171 pr_cont("Data/Tag parity error due to %s.\n",
172 (r4 == R4_DRD ? "load/hw prf" : "store"));
175 pr_cont("Copyback parity error on a tag miss.\n");
178 pr_cont("Tag parity error during snoop.\n");
183 } else if (BUS_ERROR(ec)) {
185 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
188 pr_cont("System read data error on a ");
192 pr_cont("TLB reload.\n");
/*
 * F15h data-cache decoder: per-xec messages for memory errors plus a
 * livelock/deadlock internal-condition case for bus errors (xec == 1
 * selects "livelock"). NOTE(review): the xec switch/guard lines are
 * missing from this extraction.
 */
210 static bool f15h_dc_mce(u16 ec, u8 xec)
218 pr_cont("Data Array access error.\n");
222 pr_cont("UC error during a linefill from L2/NB.\n");
227 pr_cont("STQ access error.\n");
231 pr_cont("SCB access error.\n");
235 pr_cont("Tag error.\n");
239 pr_cont("LDQ access error.\n");
245 } else if (BUS_ERROR(ec)) {
248 pr_cont("during system linefill.\n");
250 pr_cont(" Internal %s condition.\n",
251 ((xec == 1) ? "livelock" : "deadlock"));
/*
 * Top-level data-cache MCE decoder: prints the TLB signatures (which
 * are identical across families) directly, otherwise dispatches to the
 * per-family dc_mce() op; complains if neither recognizes the record.
 */
258 static void amd_decode_dc_mce(struct mce *m)
260 u16 ec = m->status & 0xffff;
261 u8 xec = (m->status >> 16) & xec_mask;
263 pr_emerg(HW_ERR "Data Cache Error: ");
265 /* TLB error signatures are the same across families */
267 u8 tt = (ec >> 2) & 0x3;
270 pr_cont("%s TLB %s.\n", LL_MSG(ec),
271 ((xec == 2) ? "locked miss"
272 : (xec ? "multimatch" : "parity")));
275 } else if (fam_ops->dc_mce(ec, xec))
278 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
/*
 * K8 instruction-cache decoder, keyed on the r4 transaction code within
 * L1 (ll == 0x1). NOTE(review): the outer error-type guard and the
 * per-r4 conditions are missing from this extraction.
 */
281 static bool k8_ic_mce(u16 ec, u8 xec)
284 u8 r4 = (ec >> 4) & 0xf;
291 pr_cont("during a linefill from L2.\n");
292 else if (ll == 0x1) {
295 pr_cont("Parity error during data load.\n");
299 pr_cont("Copyback Parity/Victim error.\n");
303 pr_cont("Tag Snoop error.\n");
/*
 * F14h instruction-cache decoder: only instruction-type (tt == 0) L1
 * (ll == 1) errors are valid; distinguishes tag-hit parity errors from
 * snoop/victimization tag errors via r4.
 */
316 static bool f14h_ic_mce(u16 ec, u8 xec)
319 u8 tt = (ec >> 2) & 0x3;
320 u8 r4 = (ec >> 4) & 0xf;
324 if (tt != 0 || ll != 1)
328 pr_cont("Data/tag array parity error for a tag hit.\n");
329 else if (r4 == R4_SNOOP)
330 pr_cont("Tag error during snoop/victimization.\n");
/*
 * F15h instruction-cache decoder: indexes f15h_ic_mce_desc[] with
 * xec-range-dependent offsets (0, -2, -4). NOTE(review): the xec range
 * checks that make these offsets safe are missing from this extraction.
 */
337 static bool f15h_ic_mce(u16 ec, u8 xec)
346 pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
350 pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
354 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
/*
 * Top-level instruction-cache MCE decoder: TLB errors are printed
 * directly; bus errors differ between K8 (bit 58 of status set on
 * family 0xf => system linefill) and later families (NB data read);
 * everything else goes to the per-family ic_mce() op.
 */
363 static void amd_decode_ic_mce(struct mce *m)
365 u16 ec = m->status & 0xffff;
366 u8 xec = (m->status >> 16) & xec_mask;
368 pr_emerg(HW_ERR "Instruction Cache Error: ");
371 pr_cont("%s TLB %s.\n", LL_MSG(ec),
372 (xec ? "multimatch" : "parity error"));
373 else if (BUS_ERROR(ec)) {
374 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
376 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
377 } else if (fam_ops->ic_mce(ec, xec))
380 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
/*
 * Bus-unit (L2) MCE decoder for pre-F15h families. Decodes by extended
 * error code: buffer parity errors, L2 tag errors, and for xec == 0 a
 * further split on TLB/bus/memory error class, with the memory case
 * keyed on the rrrr transaction code. NOTE(review): several guard
 * conditions are missing from this extraction.
 */
383 static void amd_decode_bu_mce(struct mce *m)
385 u32 ec = m->status & 0xffff;
386 u32 xec = (m->status >> 16) & xec_mask;
388 pr_emerg(HW_ERR "Bus Unit Error");
391 pr_cont(" in the write data buffers.\n");
393 pr_cont(" in the victim data buffers.\n");
394 else if (xec == 0x2 && MEM_ERROR(ec))
395 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
396 else if (xec == 0x0) {
398 pr_cont(": %s error in a Page Descriptor Cache or "
399 "Guest TLB.\n", TT_MSG(ec));
400 else if (BUS_ERROR(ec))
401 pr_cont(": %s/ECC error in data read from NB: %s.\n",
402 RRRR_MSG(ec), PP_MSG(ec));
403 else if (MEM_ERROR(ec)) {
404 u8 rrrr = (ec >> 4) & 0xf;
407 pr_cont(": %s error during data copyback.\n",
409 else if (rrrr <= 0x1)
410 pr_cont(": %s parity/ECC error during data "
411 "access from L2.\n", RRRR_MSG(ec));
422 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
/*
 * F15h combined-unit MCE decoder (replaces the BU decoder on F15h).
 * TLB and NB-read bus errors are printed directly; memory errors index
 * f15h_cu_mce_desc[] with xec-dependent offsets (0x4, 0x7). The range
 * checks guaranteeing in-bounds indexing are missing from this
 * extraction.
 */
425 static void amd_decode_cu_mce(struct mce *m)
427 u16 ec = m->status & 0xffff;
428 u8 xec = (m->status >> 16) & xec_mask;
430 pr_emerg(HW_ERR "Combined Unit Error: ");
434 pr_cont("Data parity TLB read error.\n");
436 pr_cont("Poison data provided for TLB fill.\n");
439 } else if (BUS_ERROR(ec)) {
443 pr_cont("Error during attempted NB data read.\n");
444 } else if (MEM_ERROR(ec)) {
447 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]);
451 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]);
462 pr_emerg(HW_ERR "Corrupted CU MCE info?\n");
/*
 * Load/store-unit MCE decoder. Families >= 0x14 have no LS bank, so
 * seeing one there is reported as a likely bug. Otherwise only bus
 * errors with a DRD/DWR transaction code are valid signatures.
 */
465 static void amd_decode_ls_mce(struct mce *m)
467 u16 ec = m->status & 0xffff;
468 u8 xec = (m->status >> 16) & xec_mask;
470 if (boot_cpu_data.x86 >= 0x14) {
471 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
472 " please report on LKML.\n");
476 pr_emerg(HW_ERR "Load Store Error");
479 u8 r4 = (ec >> 4) & 0xf;
481 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
484 pr_cont(" during %s.\n", RRRR_MSG(ec));
491 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
/*
 * K8 northbridge decoder, shared as a first pass by later families.
 * Handles HT CRC, GART PTE, atomic-RMW, DRAM ECC (skipped on family
 * 0x11, which has no DRAM controller signature here — the guarded
 * branch body is missing from this extraction) and DRAM addr/ctl
 * parity errors. The xec dispatch lines are missing from this view.
 */
494 static bool k8_nb_mce(u16 ec, u8 xec)
500 pr_cont("CRC error detected on HT link.\n");
504 pr_cont("Invalid GART PTE entry during GART table walk.\n");
508 pr_cont("Unsupported atomic RMW received from an IO link.\n");
513 if (boot_cpu_data.x86 == 0x11)
516 pr_cont("DRAM ECC error detected on the NB.\n");
520 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * F10h (and F15h) northbridge decoder: tries the K8 signatures first,
 * then handles GART walk / DEV walk errors, the F15h compute-unit data
 * error, and finally indexes f10h_nb_mce_desc[] with an offset.
 * NOTE(review): the xec dispatch and the offset computation are missing
 * from this extraction.
 */
531 static bool f10h_nb_mce(u16 ec, u8 xec)
536 if (k8_nb_mce(ec, xec))
550 pr_cont("GART Table Walk data error.\n");
551 else if (BUS_ERROR(ec))
552 pr_cont("DMA Exclusion Vector Table Walk error.\n");
560 if (boot_cpu_data.x86 == 0x15)
561 pr_cont("Compute Unit Data Error.\n");
579 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
/*
 * No-op NB decoder for families without decodable NB errors (F12h,
 * F14h); body missing from this extraction — presumably returns false.
 */
585 static bool nb_noop_mce(u16 ec, u8 xec)
/*
 * Decode a northbridge MCE for @node_id. Prints the originating core
 * (two encodings: F10h rev >= D uses an explicit valid bit plus core
 * number; older parts use a bitmask of associated cores), prints the
 * common HT/watchdog/DEV signatures, then dispatches to the per-family
 * nb_mce() op. For DRAM ECC errors (xec 0x0/0x8) on fam 0xf/0x10 the
 * registered EDAC nb_bus_decoder callback is invoked as well.
 */
590 void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
592 u8 xec = (m->status >> 16) & 0x1f;
593 u16 ec = m->status & 0xffff;
594 u32 nbsh = (u32)(m->status >> 32);
596 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
599 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
600 * value encoding has changed so interpret those differently
602 if ((boot_cpu_data.x86 == 0x10) &&
603 (boot_cpu_data.x86_model > 7)) {
604 if (nbsh & K8_NBSH_ERR_CPU_VAL)
605 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
607 u8 assoc_cpus = nbsh & nb_err_cpumask;
610 pr_cont(", core: %d", fls(assoc_cpus) - 1);
615 pr_cont("Sync error (sync packets on HT link detected).\n");
619 pr_cont("HT Master abort.\n");
623 pr_cont("HT Target abort.\n");
627 pr_cont("NB Watchdog timeout.\n");
631 pr_cont("SVM DMA Exclusion Vector error.\n");
638 if (!fam_ops->nb_mce(ec, xec))
641 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
642 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
643 nb_bus_decoder(node_id, m, nbcfg);
648 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
650 EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * FR (fixed/registers) MCE decoder. Families 0xf and 0x11 have no FR
 * signatures (the guarded branch body is missing here). Otherwise only
 * the single 0x0f0f watchdog-expire signature is known.
 */
652 static void amd_decode_fr_mce(struct mce *m)
654 if (boot_cpu_data.x86 == 0xf ||
655 boot_cpu_data.x86 == 0x11)
658 /* we have only one error signature so match all fields at once. */
659 if ((m->status & 0xffff) == 0x0f0f) {
660 pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
665 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
/*
 * Print the generic (family-independent) decode of the low 16 status
 * bits: transaction type, cache level, and — for memory/bus errors —
 * the additional rrrr/ii/timeout/pp fields.
 */
668 static inline void amd_decode_err_code(u16 ec)
671 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
672 TT_MSG(ec), LL_MSG(ec));
673 } else if (MEM_ERROR(ec)) {
674 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
675 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
676 } else if (BUS_ERROR(ec)) {
677 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
678 "Participating Processor: %s\n",
679 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
682 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
/*
 * Filter out unwanted MCE signatures here: returns true to suppress
 * decoding. Currently only drops NB (bank 4) GART TLB errors
 * (xec == 0x5) when GART reporting is disabled.
 */
686 * Filter out unwanted MCE signatures here.
688 static bool amd_filter_mce(struct mce *m)
690 u8 xec = (m->status >> 16) & 0x1f;
693 * NB GART TLB error reporting is disabled by default.
695 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * Notifier-chain entry point: decode one struct mce. Prints the common
 * status flags (UC/OVER/PCC and the two-bit C/U ECC field at status
 * bits [46:45]), dispatches on m->bank to the per-unit decoder (the
 * bank switch labels are missing from this extraction), and finishes
 * with the generic error-code decode.
 */
701 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
703 struct mce *m = (struct mce *)data;
706 if (amd_filter_mce(m))
709 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
711 pr_cont("%sorrected error, other errors lost: %s, "
712 "CPU context corrupt: %s",
713 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
714 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
715 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
717 /* do the two bits[14:13] together */
718 ecc = (m->status >> 45) & 0x3;
720 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
726 amd_decode_dc_mce(m);
730 amd_decode_ic_mce(m);
734 if (boot_cpu_data.x86 == 0x15)
735 amd_decode_cu_mce(m);
737 amd_decode_bu_mce(m);
741 amd_decode_ls_mce(m);
745 node = amd_get_nb_id(m->extcpu);
746 amd_decode_nb_mce(node, m, 0);
750 amd_decode_fr_mce(m);
757 amd_decode_err_code(m->status & 0xffff);
761 EXPORT_SYMBOL_GPL(amd_decode_mce);
/* Notifier block hooked into the x86 MCE decoder chain at init. */
763 static struct notifier_block amd_mce_dec_nb = {
764 .notifier_call = amd_decode_mce,
/*
 * Module init: bail on non-AMD CPUs and on unsupported families
 * (supported: 0xf-0x12, 0x14 models <= 0xf, and — per the switch below —
 * 0x15), allocate fam_ops, wire up the per-family dc/ic/nb decode
 * callbacks (F14h also narrows nb_err_cpumask to 2 bits), and register
 * on the x86 MCE decoder notifier chain. NOTE(review): the case labels,
 * error-path kfree and return statements are missing from this
 * extraction.
 */
767 static int __init mce_amd_init(void)
769 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
772 if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
773 (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
776 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
780 switch (boot_cpu_data.x86) {
782 fam_ops->dc_mce = k8_dc_mce;
783 fam_ops->ic_mce = k8_ic_mce;
784 fam_ops->nb_mce = k8_nb_mce;
788 fam_ops->dc_mce = f10h_dc_mce;
789 fam_ops->ic_mce = k8_ic_mce;
790 fam_ops->nb_mce = f10h_nb_mce;
794 fam_ops->dc_mce = k8_dc_mce;
795 fam_ops->ic_mce = k8_ic_mce;
796 fam_ops->nb_mce = f10h_nb_mce;
800 fam_ops->dc_mce = f12h_dc_mce;
801 fam_ops->ic_mce = k8_ic_mce;
802 fam_ops->nb_mce = nb_noop_mce;
806 nb_err_cpumask = 0x3;
807 fam_ops->dc_mce = f14h_dc_mce;
808 fam_ops->ic_mce = f14h_ic_mce;
809 fam_ops->nb_mce = nb_noop_mce;
814 fam_ops->dc_mce = f15h_dc_mce;
815 fam_ops->ic_mce = f15h_ic_mce;
816 fam_ops->nb_mce = f10h_nb_mce;
820 printk(KERN_WARNING "Huh? What family is that: %d?!\n",
826 pr_info("MCE: In-kernel MCE decoding enabled.\n");
828 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
/*
 * Module exit: detach from the MCE decoder chain (the matching
 * kfree(fam_ops) line is missing from this extraction — confirm it is
 * freed in the full source).
 */
835 static void __exit mce_amd_exit(void)
837 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
/* Standard module metadata; init is registered via early_initcall above. */
841 MODULE_DESCRIPTION("AMD MCE decoder")
842 MODULE_ALIAS("edac-mce-amd");
843 MODULE_LICENSE("GPL");
844 module_exit(mce_amd_exit);