1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Family-specific decoder callbacks; allocated and filled in by mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;
/* Mask handed to XEC() when extracting the extended error code; mce_amd_init() may override the default. */
8 static u8 xec_mask = 0xf;
/* Set through amd_report_gart_errors(); consulted by amd_filter_mce() for bank 4 / xec 0x5 records. */
10 static bool report_gart_errors;
/* Callback invoked from decode_mc4_mce() with the node id and the MCE record; reset to NULL by amd_unregister_ecc_decoder(). */
11 static void (*nb_bus_decoder)(int node_id, struct mce *m);
13 void amd_report_gart_errors(bool v)
15 report_gart_errors = v;
17 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * Exported entry point taking a decoder callback @f with the same
 * signature as nb_bus_decoder.
 * NOTE(review): the function body is not visible in this excerpt (the
 * embedded line numbers skip); presumably it installs @f as
 * nb_bus_decoder - confirm against the full file.
 */
19 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
23 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * Exported entry point that drops the installed decoder callback: it warns
 * when @f is not the currently installed nb_bus_decoder and then resets
 * nb_bus_decoder to NULL.
 * NOTE(review): a line between the signature and the WARN_ON() is elided
 * in this excerpt, so any condition guarding these statements is not
 * visible here.
 */
25 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
28 WARN_ON(nb_bus_decoder != f);
30 nb_bus_decoder = NULL;
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
36 * string representation for the different MCA reported error types, see F3x48
40 /* transaction type */
41 static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
/* cache level names; presumably the table behind LL_MSG() - confirm against the macro definition */
44 static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
46 /* memory transaction type */
47 static const char * const rrrr_msgs[] = {
48 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
51 /* participating processor */
/* Unlike its neighbours this table is not static: it is exported right below. */
52 const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
53 EXPORT_SYMBOL_GPL(pp_msgs);
/* timeout status; presumably the table behind TO_MSG() - confirm against the macro definition */
56 static const char * const to_msgs[] = { "no timeout", "timed out" };
/* memory or I/O; presumably the table behind II_MSG() - confirm against the macro definition */
59 static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
61 /* internal error type */
62 static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
/*
 * Message strings printed by f15h_mc1_mce(), which indexes this table with
 * xec, xec-2 or xec-4 depending on the (elided) case it is handling.
 * NOTE(review): the end of the table is not visible in this excerpt.
 */
64 static const char * const f15h_mc1_mce_desc[] = {
65 "UC during a demand linefill from L2",
66 "Parity error during data load from IC",
67 "Parity error for IC valid bit",
68 "Main tag parity error",
69 "Parity error in prediction queue",
70 "PFB data/address parity error",
71 "Parity error in the branch status reg",
72 "PFB promotion address error",
73 "Tag error during probe/victimization",
74 "Parity error for IC probe tag valid bit",
75 "PFB non-cacheable bit parity error",
76 "PFB valid bit parity error", /* xec = 0xd */
77 "Microcode Patch Buffer", /* xec = 0x10 (index 12, reached through the xec-4 lookup) */
/*
 * Message strings printed by f15h_mc2_mce(), which indexes this table with
 * xec - 0x4 or xec - 0x7.
 * NOTE(review): some entries are elided in this excerpt (the embedded line
 * numbers skip), so the visible position of a string is not its real index.
 */
85 static const char * const f15h_mc2_mce_desc[] = {
86 "Fill ECC error on data fills", /* xec = 0x4 */
87 "Fill parity error on insn fills",
88 "Prefetcher request FIFO parity error",
89 "PRQ address parity error",
90 "PRQ data parity error",
93 "WCB Data parity error",
94 "VB Data ECC or parity error",
95 "L2 Tag ECC error", /* xec = 0x10 */
96 "Hard L2 Tag ECC error",
97 "Multiple hits on L2 tag",
99 "PRB address parity error"
/*
 * Message strings printed by decode_mc4_mce(), which indexes this table
 * either with xec directly or with xec minus a case-dependent offset.
 * NOTE(review): two entries are elided in this excerpt (the embedded line
 * numbers skip), so the visible position of a string is not its real index.
 */
102 static const char * const mc4_mce_desc[] = {
103 "DRAM ECC error detected on the NB",
104 "CRC error detected on HT link",
105 "Link-defined sync error packets detected on HT link",
108 "Invalid GART PTE entry during GART table walk",
109 "Unsupported atomic RMW received from an IO link",
110 "Watchdog timeout due to lack of progress",
111 "DRAM ECC error detected on the NB",
112 "SVM DMA Exclusion Vector error",
113 "HT data error detected on link",
114 "Protocol error (link, L3, probe filter)",
115 "NB internal arrays parity error",
116 "DRAM addr/ctl signals parity error",
117 "IO link transmission error",
118 "L3 data cache ECC error", /* xec = 0x1c */
119 "L3 cache tag error",
120 "L3 LRU parity bits error",
121 "ECC Error in the Probe Filter directory"
/*
 * Message strings printed by decode_mc5_mce(), indexed directly by xec.
 * Most of them are completed there with a " parity error." suffix.
 * NOTE(review): several entries are elided in this excerpt (the embedded
 * line numbers skip), so the visible position of a string is not its real
 * index.
 */
124 static const char * const mc5_mce_desc[] = {
125 "CPU Watchdog timer expire",
126 "Wakeup array dest tag",
130 "Retire dispatch queue",
131 "Mapper checkpoint array",
132 "Physical register file EX0 port",
133 "Physical register file EX1 port",
134 "Physical register file AG0 port",
135 "Physical register file AG1 port",
136 "Flag register file",
138 "Retire status queue"
/*
 * MC0 decoding helper; installed as fam_ops->mc0_mce by mce_amd_init() and
 * used as the fallback of f10h_mc0_mce().
 * Visible outputs: "during L1 linefill from L2." and, when ll == LL_L1,
 * "Data/Tag <R4_MSG(ec)> error.".
 * NOTE(review): most of the body (local declarations, the first condition
 * and the return statements) is elided in this excerpt.
 */
141 static bool f12h_mc0_mce(u16 ec, u8 xec)
150 pr_cont("during L1 linefill from L2.\n");
151 else if (ll == LL_L1)
152 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
/*
 * MC0 decoding helper: reports "during data scrub." for a generic memory
 * transaction (R4_GEN) at cache level L1 and defers to f12h_mc0_mce() in
 * the visible tail of the function.
 * NOTE(review): the lines between the pr_cont() and the final return are
 * elided in this excerpt.
 */
159 static bool f10h_mc0_mce(u16 ec, u8 xec)
161 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
162 pr_cont("during data scrub.\n");
165 return f12h_mc0_mce(ec, xec);
/*
 * MC0 decoding helper: can report "during system linefill." and otherwise
 * chains to f10h_mc0_mce() in the visible tail of the function.
 * NOTE(review): the condition selecting the linefill message is elided in
 * this excerpt.
 */
168 static bool k8_mc0_mce(u16 ec, u8 xec)
171 pr_cont("during system linefill.\n");
175 return f10h_mc0_mce(ec, xec);
/*
 * MC0 decoding helper; mce_amd_init() installs it as fam_ops->mc0_mce in
 * two of its family branches.
 * The visible checks require a data transaction at cache level L1 for the
 * first group of messages, and a MEM or IO access at level LL_LG for the
 * bus-error messages.
 * NOTE(review): the switch/branch structure and the return statements are
 * elided in this excerpt, so the condition tied to each message is not
 * visible.
 */
178 static bool cat_mc0_mce(u16 ec, u8 xec)
185 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
191 pr_cont("Data/Tag parity error due to %s.\n",
192 (r4 == R4_DRD ? "load/hw prf" : "store"));
195 pr_cont("Copyback parity error on a tag miss.\n");
198 pr_cont("Tag parity error during snoop.\n");
203 } else if (BUS_ERROR(ec)) {
205 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
208 pr_cont("System read data error on a ");
212 pr_cont("TLB reload.\n");
/*
 * MC0 decoding helper; mce_amd_init() installs it as fam_ops->mc0_mce
 * together with f15h_mc1_mce() and f15h_mc2_mce().
 * The visible skeleton distinguishes a first (elided) error class, bus
 * errors and internal errors, printing one fixed message per case.
 * NOTE(review): the xec case labels and return statements are elided in
 * this excerpt.
 */
230 static bool f15h_mc0_mce(u16 ec, u8 xec)
238 pr_cont("Data Array access error.\n");
242 pr_cont("UC error during a linefill from L2/NB.\n");
247 pr_cont("STQ access error.\n");
251 pr_cont("SCB access error.\n");
255 pr_cont("Tag error.\n");
259 pr_cont("LDQ access error.\n");
265 } else if (BUS_ERROR(ec)) {
268 pr_cont("System Read Data Error.\n");
270 pr_cont(" Internal error condition type %d.\n", xec);
271 } else if (INT_ERROR(ec)) {
273 pr_cont("Hardware Assert.\n");
/*
 * Decode an MC0 record: pulls the error code and the extended error code
 * (masked with xec_mask) out of m->status, prints the "MC0 Error: " prefix,
 * handles the data-TLB signature itself and otherwise hands off to the
 * family-specific fam_ops->mc0_mce() callback.
 * NOTE(review): the outer TLB condition and the control flow leading to the
 * "Corrupted MC0 MCE info?" message are elided in this excerpt.
 */
283 static void decode_mc0_mce(struct mce *m)
285 u16 ec = EC(m->status);
286 u8 xec = XEC(m->status, xec_mask);
288 pr_emerg(HW_ERR "MC0 Error: ");
290 /* TLB error signatures are the same across families */
292 if (TT(ec) == TT_DATA) {
/* xec 2 -> "locked miss", any other non-zero xec -> "multimatch", 0 -> "parity" */
293 pr_cont("%s TLB %s.\n", LL_MSG(ec),
294 ((xec == 2) ? "locked miss"
295 : (xec ? "multimatch" : "parity")));
298 } else if (fam_ops->mc0_mce(ec, xec))
301 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
/*
 * MC1 decoding helper; mce_amd_init() installs it as fam_ops->mc1_mce in
 * several of its family branches.
 * Visible outputs: a linefill-from-L2 message, and for ll == 0x1 one of
 * the parity / copyback / tag-snoop messages.
 * NOTE(review): local declarations, the inner case labels and the return
 * statements are elided in this excerpt.
 */
304 static bool k8_mc1_mce(u16 ec, u8 xec)
313 pr_cont("during a linefill from L2.\n");
314 else if (ll == 0x1) {
317 pr_cont("Parity error during data load.\n");
321 pr_cont("Copyback Parity/Victim error.\n");
325 pr_cont("Tag Snoop error.\n");
/*
 * MC1 decoding helper; mce_amd_init() installs it as fam_ops->mc1_mce
 * alongside cat_mc0_mce().
 * The visible check requires an instruction transaction (TT_INSTR); the
 * snoop/victimization message is chosen when r4 == R4_SNOOP.
 * NOTE(review): the remaining conditions and the return statements are
 * elided in this excerpt.
 */
338 static bool cat_mc1_mce(u16 ec, u8 xec)
346 if (TT(ec) != TT_INSTR)
350 pr_cont("Data/tag array parity error for a tag hit.\n");
351 else if (r4 == R4_SNOOP)
352 pr_cont("Tag error during snoop/victimization.\n");
354 pr_cont("Tag parity error from victim castout.\n");
356 pr_cont("Microcode patch RAM parity error.\n");
/*
 * MC1 decoding helper: prints an entry of f15h_mc1_mce_desc[] selected
 * by xec, xec-2 or xec-4; the last visible case wraps the entry as
 * "Decoder <entry> parity error.".
 * NOTE(review): the xec case labels that pick each offset, and therefore
 * the bounds of every table access, are elided in this excerpt - confirm
 * them against the table size in the full file.
 */
363 static bool f15h_mc1_mce(u16 ec, u8 xec)
372 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
376 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
380 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
384 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
/*
 * Decode an MC1 record: prints the "MC1 Error: " prefix, then handles TLB,
 * bus and internal errors itself and hands everything else to the
 * family-specific fam_ops->mc1_mce() callback.
 * NOTE(review): the first condition and the control flow leading to the
 * "Corrupted MC1 MCE info?" message are elided in this excerpt.
 */
393 static void decode_mc1_mce(struct mce *m)
395 u16 ec = EC(m->status);
396 u8 xec = XEC(m->status, xec_mask);
398 pr_emerg(HW_ERR "MC1 Error: ");
401 pr_cont("%s TLB %s.\n", LL_MSG(ec),
402 (xec ? "multimatch" : "parity error"));
403 else if (BUS_ERROR(ec)) {
/* "system linefill" wording only on boot CPU family 0xf with status bit 58 set */
404 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
406 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
407 } else if (INT_ERROR(ec)) {
409 pr_cont("Hardware Assert.\n");
412 } else if (fam_ops->mc1_mce(ec, xec))
420 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
/*
 * MC2 decoding helper; mce_amd_init() installs it as fam_ops->mc2_mce in
 * most of its family branches.
 * Visible dispatch: xec 0x2 with a memory error reports an L2 cache tag
 * error; xec 0x0 is split further into (elided condition), bus error and
 * memory error cases.
 * NOTE(review): the leading conditions and the return statements are
 * elided in this excerpt.
 */
423 static bool k8_mc2_mce(u16 ec, u8 xec)
428 pr_cont(" in the write data buffers.\n");
430 pr_cont(" in the victim data buffers.\n");
431 else if (xec == 0x2 && MEM_ERROR(ec))
432 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
433 else if (xec == 0x0) {
435 pr_cont(": %s error in a Page Descriptor Cache or "
436 "Guest TLB.\n", TT_MSG(ec));
437 else if (BUS_ERROR(ec))
438 pr_cont(": %s/ECC error in data read from NB: %s.\n",
439 R4_MSG(ec), PP_MSG(ec));
440 else if (MEM_ERROR(ec)) {
444 pr_cont(": %s error during data copyback.\n",
447 pr_cont(": %s parity/ECC error during data "
448 "access from L2.\n", R4_MSG(ec));
/*
 * MC2 decoding helper: a first (elided) error class prints TLB messages,
 * bus errors print an NB data read message, memory errors print an entry
 * of f15h_mc2_mce_desc[] (index xec - 0x4 or xec - 0x7), and internal
 * errors print "Hardware Assert.".
 * NOTE(review): the xec range checks guarding the table accesses and the
 * return statements are elided in this excerpt - confirm the bounds in the
 * full file.
 */
459 static bool f15h_mc2_mce(u16 ec, u8 xec)
465 pr_cont("Data parity TLB read error.\n");
467 pr_cont("Poison data provided for TLB fill.\n");
470 } else if (BUS_ERROR(ec)) {
474 pr_cont("Error during attempted NB data read.\n");
475 } else if (MEM_ERROR(ec)) {
478 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
482 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
488 } else if (INT_ERROR(ec)) {
490 pr_cont("Hardware Assert.\n");
/*
 * MC2 decoding helper: every visible message is refined by the memory
 * transaction type r4 (e.g. 'I' vs 'O' buffer, "BankReq"/"Prb"/"Fill",
 * "Hit"/"Attr"/"Vict"/"Fill").
 * NOTE(review): the declaration of r4, the xec case labels and the return
 * statements are elided in this excerpt.
 */
498 static bool f16h_mc2_mce(u16 ec, u8 xec)
507 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
512 pr_cont("ECC error in L2 tag (%s).\n",
513 ((r4 == R4_GEN) ? "BankReq" :
514 ((r4 == R4_SNOOP) ? "Prb" : "Fill")));
/* "Hit" additionally requires the two low bits of xec to be clear */
519 pr_cont("ECC error in L2 data array (%s).\n",
520 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
521 ((r4 == R4_GEN) ? "Attr" :
522 ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
527 pr_cont("Parity error in L2 attribute bits (%s).\n",
528 ((r4 == R4_RD) ? "Hit" :
529 ((r4 == R4_GEN) ? "Attr" : "Fill")));
539 static void decode_mc2_mce(struct mce *m)
541 u16 ec = EC(m->status);
542 u8 xec = XEC(m->status, xec_mask);
544 pr_emerg(HW_ERR "MC2 Error: ");
546 if (!fam_ops->mc2_mce(ec, xec))
547 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
/*
 * Decode an MC3 record. On boot CPU families 0x14 and later an MC3 error
 * is not expected and a "please report" message is printed instead.
 * The visible sanity check accepts only bus errors whose memory
 * transaction type is R4_DRD or R4_DWR.
 * NOTE(review): the declaration of r4, the xec handling and the control
 * flow leading to the "Corrupted MC3 MCE info?" message are elided in this
 * excerpt.
 */
550 static void decode_mc3_mce(struct mce *m)
552 u16 ec = EC(m->status);
553 u8 xec = XEC(m->status, xec_mask);
555 if (boot_cpu_data.x86 >= 0x14) {
556 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
557 " please report on LKML.\n");
561 pr_emerg(HW_ERR "MC3 Error");
566 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
569 pr_cont(" during %s.\n", R4_MSG(ec));
576 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
/*
 * Decode an MC4 record. The node id is derived from m->extcpu through
 * amd_get_nb_id() and printed with the "MC4 Error" prefix; most messages
 * come from mc4_mce_desc[].
 * NOTE(review): the switch on xec, the declaration of offset and the
 * control flow leading to the "Corrupted MC4 MCE info?" message are elided
 * in this excerpt. In particular, whether nb_bus_decoder is checked for
 * NULL before being called is not visible here - confirm in the full file.
 */
579 static void decode_mc4_mce(struct mce *m)
581 struct cpuinfo_x86 *c = &boot_cpu_data;
582 int node_id = amd_get_nb_id(m->extcpu);
583 u16 ec = EC(m->status);
/* fixed 0x1f mask here, not the family-dependent xec_mask used by the other decoders */
584 u8 xec = XEC(m->status, 0x1f);
587 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
592 /* special handling for DRAM ECCs */
593 if (xec == 0x0 || xec == 0x8) {
594 /* no ECCs on F11h */
598 pr_cont("%s.\n", mc4_mce_desc[xec]);
601 nb_bus_decoder(node_id, m);
608 pr_cont("GART Table Walk data error.\n");
609 else if (BUS_ERROR(ec))
610 pr_cont("DMA Exclusion Vector Table Walk error.\n");
616 if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
617 pr_cont("Compute Unit Data Error.\n");
630 pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
634 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
/*
 * Decode an MC5 record. Boot CPU families 0xf and 0x11 are special-cased
 * up front (the action taken is elided in this excerpt). Messages come
 * from mc5_mce_desc[] indexed by xec; entries 0x0 and 0xc are printed as
 * is, the others get a " parity error." suffix.
 * NOTE(review): the xec range check guarding the table access and the
 * control flow leading to the "Corrupted MC5 MCE info?" message are elided
 * in this excerpt.
 */
637 static void decode_mc5_mce(struct mce *m)
639 struct cpuinfo_x86 *c = &boot_cpu_data;
640 u16 ec = EC(m->status);
641 u8 xec = XEC(m->status, xec_mask);
643 if (c->x86 == 0xf || c->x86 == 0x11)
646 pr_emerg(HW_ERR "MC5 Error: ");
650 pr_cont("Hardware Assert.\n");
656 if (xec == 0x0 || xec == 0xc)
657 pr_cont("%s.\n", mc5_mce_desc[xec]);
659 pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
666 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
/*
 * Decode an MC6 record: prints the "MC6 Error: " prefix, then a component
 * name chosen from the extended error code, then " parity error.".
 * NOTE(review): the xec case labels and the control flow leading to the
 * "Corrupted MC6 MCE info?" message are elided in this excerpt.
 */
669 static void decode_mc6_mce(struct mce *m)
671 u8 xec = XEC(m->status, xec_mask);
673 pr_emerg(HW_ERR "MC6 Error: ");
677 pr_cont("Hardware Assertion");
681 pr_cont("Free List");
685 pr_cont("Physical Register File");
689 pr_cont("Retire Queue");
693 pr_cont("Scheduler table");
697 pr_cont("Status Register File");
705 pr_cont(" parity error.\n");
710 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
/*
 * Print the individual fields of the 16-bit error code @ec: internal error
 * type, cache level, mem/io, transaction type, memory transaction type and
 * participating processor with its timeout status.
 * amd_decode_mce() calls it with the low 16 bits of the status word.
 * NOTE(review): most of the conditions selecting which fields are printed
 * are elided in this excerpt.
 */
713 static inline void amd_decode_err_code(u16 ec)
716 pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
720 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
723 pr_cont(", mem/io: %s", II_MSG(ec));
725 pr_cont(", tx: %s", TT_MSG(ec));
727 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
728 pr_cont(", mem-tx: %s", R4_MSG(ec));
731 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
738 * Filter out unwanted MCE signatures here.
/*
 * amd_decode_mce() calls this first and tests its result.
 * NOTE(review): the return statements are elided in this excerpt;
 * presumably true means "drop this record" - confirm in the full file.
 */
740 static bool amd_filter_mce(struct mce *m)
/* 5-bit extended error code taken from status bits [20:16] */
742 u8 xec = (m->status >> 16) & 0x1f;
745 * NB GART TLB error reporting is disabled by default.
747 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
753 static const char *decode_error_status(struct mce *m)
755 if (m->status & MCI_STATUS_UC) {
756 if (m->status & MCI_STATUS_PCC)
757 return "System Fatal error.";
758 if (m->mcgstatus & MCG_STATUS_RIPV)
759 return "Uncorrected, software restartable error.";
760 return "Uncorrected, software containable error.";
763 if (m->status & MCI_STATUS_DEFERRED)
764 return "Deferred error.";
766 return "Corrected error, no action required.";
/*
 * Notifier callback (see amd_mce_dec_nb) that pretty-prints an MCE record.
 * @nb:   notifier block, not used in the visible lines.
 * @val:  notifier event value, not used in the visible lines.
 * @data: the struct mce to decode.
 *
 * Visible steps: give amd_filter_mce() a chance to reject the record, print
 * the severity summary, print the CPU identification with the decoded
 * MCx_STATUS flag bits, print MCx_ADDR when the address is valid, and
 * finally decode the low 16 bits of the status word.
 * NOTE(review): the declaration of ecc, the per-bank dispatch and the
 * return statements are elided in this excerpt.
 */
769 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
771 struct mce *m = (struct mce *)data;
772 struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
775 if (amd_filter_mce(m))
778 pr_emerg(HW_ERR "%s\n", decode_error_status(m));
780 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
782 c->x86, c->x86_model, c->x86_mask,
784 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
785 ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
786 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
787 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
788 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
/* Deferred/Poison flags are only printed for families 0x15 and 0x16 */
790 if (c->x86 == 0x15 || c->x86 == 0x16)
792 ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
793 ((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));
795 /* do the two bits[14:13] together */
/* i.e. status bits 46:45, which are bits [14:13] of the upper 32-bit half */
796 ecc = (m->status >> 45) & 0x3;
798 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
800 pr_cont("]: 0x%016llx\n", m->status);
802 if (m->status & MCI_STATUS_ADDRV)
803 pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr);
842 amd_decode_err_code(m->status & 0xffff);
846 EXPORT_SYMBOL_GPL(amd_decode_mce);
/*
 * Notifier block routing events to amd_decode_mce(); registered on the MCE
 * decode chain by mce_amd_init() and removed by mce_amd_exit().
 */
848 static struct notifier_block amd_mce_dec_nb = {
849 .notifier_call = amd_decode_mce,
/*
 * Early initcall that sets up in-kernel MCE decoding on AMD CPUs: checks
 * the boot CPU vendor, allocates fam_ops, fills in the three per-family
 * decoder callbacks (mc0/mc1/mc2), and registers amd_mce_dec_nb on the MCE
 * decode chain.
 * NOTE(review): the switch on the CPU family, its case labels, the error
 * returns and any check of the kzalloc() result are elided in this
 * excerpt - confirm that fam_ops is validated before the assignments below
 * and released on the unknown-family path.
 */
852 static int __init mce_amd_init(void)
854 struct cpuinfo_x86 *c = &boot_cpu_data;
856 if (c->x86_vendor != X86_VENDOR_AMD)
859 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
865 fam_ops->mc0_mce = k8_mc0_mce;
866 fam_ops->mc1_mce = k8_mc1_mce;
867 fam_ops->mc2_mce = k8_mc2_mce;
871 fam_ops->mc0_mce = f10h_mc0_mce;
872 fam_ops->mc1_mce = k8_mc1_mce;
873 fam_ops->mc2_mce = k8_mc2_mce;
877 fam_ops->mc0_mce = k8_mc0_mce;
878 fam_ops->mc1_mce = k8_mc1_mce;
879 fam_ops->mc2_mce = k8_mc2_mce;
883 fam_ops->mc0_mce = f12h_mc0_mce;
884 fam_ops->mc1_mce = k8_mc1_mce;
885 fam_ops->mc2_mce = k8_mc2_mce;
889 fam_ops->mc0_mce = cat_mc0_mce;
890 fam_ops->mc1_mce = cat_mc1_mce;
891 fam_ops->mc2_mce = k8_mc2_mce;
/* wider extended error code mask for this family; model 0x60 uses 6 bits, the rest 5 */
895 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
897 fam_ops->mc0_mce = f15h_mc0_mce;
898 fam_ops->mc1_mce = f15h_mc1_mce;
899 fam_ops->mc2_mce = f15h_mc2_mce;
904 fam_ops->mc0_mce = cat_mc0_mce;
905 fam_ops->mc1_mce = cat_mc1_mce;
906 fam_ops->mc2_mce = f16h_mc2_mce;
910 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
915 pr_info("MCE: In-kernel MCE decoding enabled.\n");
917 mce_register_decode_chain(&amd_mce_dec_nb);
921 early_initcall(mce_amd_init);
/*
 * Module exit handler: removes amd_mce_dec_nb from the MCE decode chain.
 * NOTE(review): the rest of the body is elided in this excerpt; confirm
 * that the fam_ops allocation made by mce_amd_init() is released.
 */
924 static void __exit mce_amd_exit(void)
926 mce_unregister_decode_chain(&amd_mce_dec_nb);
930 MODULE_DESCRIPTION("AMD MCE decoder");
931 MODULE_ALIAS("edac-mce-amd");
932 MODULE_LICENSE("GPL");
933 module_exit(mce_amd_exit);