1 #include <linux/module.h>
2 #include <linux/slab.h>
/*
 * File-scope decoder state:
 *  - fam_ops: per-family decode callbacks (dc_mce/ic_mce/nb_mce), allocated
 *    and populated by mce_amd_init().
 *  - xec_mask: mask handed to XEC() by the per-bank decoders.
 *  - nb_err_cpumask: only written in the visible code (mce_amd_init() sets
 *    it to 0x3 on one path); its readers are not visible here.
 *  - report_gart_errors: tested by amd_filter_mce(); set through
 *    amd_report_gart_errors().
 *  - nb_bus_decoder: optional callback invoked from amd_decode_nb_mce().
 */
6 static struct amd_decoder_ops *fam_ops;
8 static u8 xec_mask = 0xf;
9 static u8 nb_err_cpumask = 0xf;
11 static bool report_gart_errors;
12 static void (*nb_bus_decoder)(int node_id, struct mce *m);
/*
 * amd_report_gart_errors - set the report_gart_errors flag.
 * @v: new value of the flag.
 *
 * amd_filter_mce() tests this flag for bank 4 records whose extended error
 * code is 0x5. Exported (GPL) for use by other modules.
 */
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * amd_register_ecc_decoder - register an ECC decoder callback.
 * @f: callback taking (int, struct mce *).
 *
 * NOTE(review): the function body is not visible here; presumably it stores
 * @f in nb_bus_decoder (the pointer amd_unregister_ecc_decoder() clears) -
 * confirm against the full source. Exported (GPL).
 */
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * amd_unregister_ecc_decoder - drop the registered ECC decoder callback.
 * @f: the callback the caller believes is registered.
 *
 * Emits a WARN_ON if @f differs from the stored nb_bus_decoder and then
 * resets nb_bus_decoder to NULL.
 * NOTE(review): any guard around these two statements is not visible here.
 * Exported (GPL).
 */
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 * string representation for the different MCA reported error types, see F3x48
/*
 * Exported (GPL) string table with four transaction-type names.
 * NOTE(review): presumably what TT_MSG() indexes - the macro is not visible
 * here.
 */
41 /* transaction type */
42 const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs);
/*
 * Exported (GPL) string table with four cache-level names.
 * NOTE(review): presumably what LL_MSG() indexes (amd_decode_err_code()
 * prints LL_MSG(ec) as "cache level") - the macro is not visible here.
 */
46 const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs);
/*
 * Exported (GPL) string table with nine memory-transaction-type names.
 * NOTE(review): presumably what R4_MSG() indexes - the macro is not visible
 * here.
 */
49 /* memory transaction type */
50 const char *rrrr_msgs[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs);
/*
 * Exported (GPL) string table with four participating-processor names.
 * NOTE(review): presumably what PP_MSG() indexes - the macro is not visible
 * here.
 */
55 /* participating processor */
56 const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs);
/*
 * Exported (GPL) string table with the two timeout states.
 * NOTE(review): presumably what TO_MSG() indexes - the macro is not visible
 * here.
 */
60 const char *to_msgs[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs);
/*
 * Exported (GPL) string table with four memory/IO type names.
 * NOTE(review): presumably what II_MSG() indexes (amd_decode_err_code()
 * prints II_MSG(ec) as "mem/io") - the macro is not visible here.
 */
64 const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Northbridge error descriptions; f10h_nb_mce() prints
 * f10h_nb_mce_desc[xec - offset].
 *
 * NOTE(review): the embedded line numbering skips a line right after the
 * declaration, so the first initializer entry may be missing here.
 * NOTE(review): unlike the other description tables in this file, this one
 * is not declared "* const" - consider constifying the pointers as well.
 */
67 static const char *f10h_nb_mce_desc[] = {
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
/*
 * Instruction-cache error descriptions used by f15h_ic_mce(), which indexes
 * this table with xec, xec-2 or xec-4 depending on the extended error code.
 * The first eleven entries sit at index == xec; "PFB valid bit parity error"
 * (xec 0xd) is at index 11 and "Microcode Patch Buffer" at index 12, which
 * corresponds to xec 0x10 with the xec-4 offset.
 * NOTE(review): entries following "Microcode Patch Buffer" are not visible
 * here.
 */
78 static const char * const f15h_ic_mce_desc[] = {
79 "UC during a demand linefill from L2",
80 "Parity error during data load from IC",
81 "Parity error for IC valid bit",
82 "Main tag parity error",
83 "Parity error in prediction queue",
84 "PFB data/address parity error",
85 "Parity error in the branch status reg",
86 "PFB promotion address error",
87 "Tag error during probe/victimization",
88 "Parity error for IC probe tag valid bit",
89 "PFB non-cacheable bit parity error",
90 "PFB valid bit parity error", /* xec = 0xd */
91 "Microcode Patch Buffer", /* xec = 0x10 */
/*
 * Combined-unit error descriptions; amd_decode_cu_mce() prints
 * f15h_cu_mce_desc[xec - 0x4] or f15h_cu_mce_desc[xec - 0x7], which is why
 * the inline comments mark xec 0x4 and xec 0x10.
 * NOTE(review): the embedded line numbering has gaps inside this
 * initializer, so some entries may be missing here.
 */
98 static const char * const f15h_cu_mce_desc[] = {
99 "Fill ECC error on data fills", /* xec = 0x4 */
100 "Fill parity error on insn fills",
101 "Prefetcher request FIFO parity error",
102 "PRQ address parity error",
103 "PRQ data parity error",
105 "WCC Data ECC error",
106 "WCB Data parity error",
107 "VB Data ECC or parity error",
108 "L2 Tag ECC error", /* xec = 0x10 */
109 "Hard L2 Tag ECC error",
110 "Multiple hits on L2 tag",
112 "PRB address parity error"
/*
 * Descriptions printed by amd_decode_fr_mce(), indexed directly by xec.
 * That function prints the entry verbatim for xec 0x0 and 0xc and appends
 * " parity error" in its other visible print statement.
 * NOTE(review): the embedded line numbering has a gap after the second
 * entry, so some entries may be missing here.
 */
115 static const char * const fr_ex_mce_desc[] = {
116 "CPU Watchdog timer expire",
117 "Wakeup array dest tag",
121 "Retire dispatch queue",
122 "Mapper checkpoint array",
123 "Physical register file EX0 port",
124 "Physical register file EX1 port",
125 "Physical register file AG0 port",
126 "Physical register file AG1 port",
127 "Flag register file",
128 "DE correctable error could not be corrected"
/*
 * f12h_dc_mce - data-cache decode helper.
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Installed as fam_ops->dc_mce on one mce_amd_init() path and used as the
 * fallback of f10h_dc_mce(). The visible part continues the "Data Cache
 * Error: " line with either the L1-linefill message or, for ll == LL_L1,
 * a "Data/Tag <R4_MSG> error" message.
 * NOTE(review): the conditions preceding these branches and the return
 * statements are not visible here; presumably true means "signature
 * recognised" - confirm.
 */
131 static bool f12h_dc_mce(u16 ec, u8 xec)
140 pr_cont("during L1 linefill from L2.\n");
141 else if (ll == LL_L1)
142 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
/*
 * f10h_dc_mce - data-cache decode helper.
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Reports a data-scrub error when the transaction is R4_GEN at cache level
 * LL_L1; the visible fall-through defers to f12h_dc_mce().
 * NOTE(review): the return inside the scrub branch is not visible here.
 */
149 static bool f10h_dc_mce(u16 ec, u8 xec)
151 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
152 pr_cont("during data scrub.\n");
155 return f12h_dc_mce(ec, xec);
/*
 * k8_dc_mce - data-cache decode helper.
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Prints "during system linefill." for a condition that is not visible
 * here; the visible fall-through defers to f10h_dc_mce().
 */
158 static bool k8_dc_mce(u16 ec, u8 xec)
161 pr_cont("during system linefill.\n");
165 return f10h_dc_mce(ec, xec);
/*
 * f14h_dc_mce - data-cache decode helper installed together with
 * f14h_ic_mce() by mce_amd_init().
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Visible sanity checks: one path requires TT_DATA at LL_L1; the BUS_ERROR
 * path requires II_MEM or II_IO at level LL_LG. The messages continue the
 * line started by amd_decode_dc_mce().
 * NOTE(review): most branch conditions and all return statements are not
 * visible here.
 */
168 static bool f14h_dc_mce(u16 ec, u8 xec)
175 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
181 pr_cont("Data/Tag parity error due to %s.\n",
182 (r4 == R4_DRD ? "load/hw prf" : "store"));
185 pr_cont("Copyback parity error on a tag miss.\n");
188 pr_cont("Tag parity error during snoop.\n");
193 } else if (BUS_ERROR(ec)) {
195 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
198 pr_cont("System read data error on a ");
202 pr_cont("TLB reload.\n");
/*
 * f15h_dc_mce - data-cache decode helper installed together with
 * f15h_ic_mce() by mce_amd_init().
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Prints one of several access-error messages, and on the BUS_ERROR path
 * either "System Read Data Error." or the internal error condition type
 * (xec printed with %d).
 * NOTE(review): the conditions selecting each message and the return
 * statements are not visible here.
 */
220 static bool f15h_dc_mce(u16 ec, u8 xec)
228 pr_cont("Data Array access error.\n");
232 pr_cont("UC error during a linefill from L2/NB.\n");
237 pr_cont("STQ access error.\n");
241 pr_cont("SCB access error.\n");
245 pr_cont("Tag error.\n");
249 pr_cont("LDQ access error.\n");
255 } else if (BUS_ERROR(ec)) {
258 pr_cont("System Read Data Error.\n");
260 pr_cont(" Internal error condition type %d.\n", xec);
/*
 * amd_decode_dc_mce - decode a data-cache MCE record.
 * @m: the MCE record; only m->status is read in the visible code.
 *
 * Extracts ec and xec (masked with xec_mask) from the status, prints the
 * "Data Cache Error: " header, decodes data-TLB errors inline and otherwise
 * hands off to the family-specific fam_ops->dc_mce() callback.
 * "Corrupted DC MCE info?" is the fallback message.
 * NOTE(review): the outer TLB-error test and the returns are not visible
 * here.
 */
267 static void amd_decode_dc_mce(struct mce *m)
269 u16 ec = EC(m->status);
270 u8 xec = XEC(m->status, xec_mask);
272 pr_emerg(HW_ERR "Data Cache Error: ");
274 /* TLB error signatures are the same across families */
276 if (TT(ec) == TT_DATA) {
277 pr_cont("%s TLB %s.\n", LL_MSG(ec),
278 ((xec == 2) ? "locked miss"
279 : (xec ? "multimatch" : "parity")));
282 } else if (fam_ops->dc_mce(ec, xec))
285 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
/*
 * k8_ic_mce - instruction-cache decode helper; mce_amd_init() installs it
 * as fam_ops->ic_mce on several paths.
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Continues the "Instruction Cache Error: " line with a linefill, data-load
 * parity, copyback/victim or tag-snoop message; the last three sit inside
 * the ll == 0x1 branch as far as the visible lines show.
 * NOTE(review): the selecting conditions and returns are not visible here.
 */
288 static bool k8_ic_mce(u16 ec, u8 xec)
297 pr_cont("during a linefill from L2.\n");
298 else if (ll == 0x1) {
301 pr_cont("Parity error during data load.\n");
305 pr_cont("Copyback Parity/Victim error.\n");
309 pr_cont("Tag Snoop error.\n");
/*
 * f14h_ic_mce - instruction-cache decode helper installed together with
 * f14h_dc_mce() by mce_amd_init().
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Tests TT(ec) != 0 || LL(ec) != 1 (the branch taken on a mismatch is not
 * visible), then prints either the tag-hit parity message or, for
 * r4 == R4_SNOOP, the snoop/victimization tag error.
 * NOTE(review): the first message's condition and the returns are not
 * visible here.
 */
322 static bool f14h_ic_mce(u16 ec, u8 xec)
328 if (TT(ec) != 0 || LL(ec) != 1)
332 pr_cont("Data/tag array parity error for a tag hit.\n");
333 else if (r4 == R4_SNOOP)
334 pr_cont("Tag error during snoop/victimization.\n");
/*
 * f15h_ic_mce - instruction-cache decode helper installed together with
 * f15h_dc_mce() by mce_amd_init().
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Looks the message up in f15h_ic_mce_desc[]. The table is denser than the
 * xec encoding, hence the index offsets of 0, 2 and 4; the last range is
 * printed as "Decoder <entry> parity error".
 * NOTE(review): the case labels selecting each offset, and therefore the
 * bounds on xec before indexing, are not visible here.
 */
341 static bool f15h_ic_mce(u16 ec, u8 xec)
350 pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
354 pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
358 pr_cont("%s.\n", f15h_ic_mce_desc[xec-4]);
362 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
/*
 * amd_decode_ic_mce - decode an instruction-cache MCE record.
 * @m: the MCE record; only m->status is read in the visible code.
 *
 * Prints the "Instruction Cache Error: " header, then handles TLB errors
 * and bus errors inline and defers everything else to the family-specific
 * fam_ops->ic_mce() callback. For bus errors, family 0xf with status bit 58
 * set is reported as a system linefill, anything else as an NB data read.
 * "Corrupted IC MCE info?" is the fallback message.
 * NOTE(review): the TLB-error test and the returns are not visible here.
 */
371 static void amd_decode_ic_mce(struct mce *m)
373 u16 ec = EC(m->status);
374 u8 xec = XEC(m->status, xec_mask);
376 pr_emerg(HW_ERR "Instruction Cache Error: ");
379 pr_cont("%s TLB %s.\n", LL_MSG(ec),
380 (xec ? "multimatch" : "parity error"));
381 else if (BUS_ERROR(ec)) {
382 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
384 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
385 } else if (fam_ops->ic_mce(ec, xec))
388 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
/*
 * amd_decode_bu_mce - decode a bus-unit MCE record.
 * @m: the MCE record; only m->status is read in the visible code.
 *
 * Prints "Bus Unit Error" and continues the line according to xec:
 * write/victim data buffers, L2 cache tags (xec 0x2 with a memory error),
 * and for xec 0x0 a further split into a Page Descriptor Cache/Guest TLB
 * message, bus errors and memory errors. "Corrupted BU MCE info?" is the
 * fallback message.
 * NOTE(review): several branch conditions and the jumps to the fallback
 * are not visible here.
 */
391 static void amd_decode_bu_mce(struct mce *m)
393 u16 ec = EC(m->status);
394 u8 xec = XEC(m->status, xec_mask);
396 pr_emerg(HW_ERR "Bus Unit Error");
399 pr_cont(" in the write data buffers.\n");
401 pr_cont(" in the victim data buffers.\n");
402 else if (xec == 0x2 && MEM_ERROR(ec))
403 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
404 else if (xec == 0x0) {
406 pr_cont(": %s error in a Page Descriptor Cache or "
407 "Guest TLB.\n", TT_MSG(ec));
408 else if (BUS_ERROR(ec))
409 pr_cont(": %s/ECC error in data read from NB: %s.\n",
410 R4_MSG(ec), PP_MSG(ec));
411 else if (MEM_ERROR(ec)) {
415 pr_cont(": %s error during data copyback.\n",
418 pr_cont(": %s parity/ECC error during data "
419 "access from L2.\n", R4_MSG(ec));
430 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
/*
 * amd_decode_cu_mce - decode a combined-unit MCE record.
 * @m: the MCE record; only m->status is read in the visible code.
 *
 * Prints the "Combined Unit Error: " header and then a TLB, bus-error or
 * memory-error message; memory errors are looked up in f15h_cu_mce_desc[]
 * with an index offset of 0x4 or 0x7. "Corrupted CU MCE info?" is the
 * fallback message.
 * NOTE(review): the xec ranges guarding the two table lookups are not
 * visible here.
 */
433 static void amd_decode_cu_mce(struct mce *m)
435 u16 ec = EC(m->status);
436 u8 xec = XEC(m->status, xec_mask);
438 pr_emerg(HW_ERR "Combined Unit Error: ");
442 pr_cont("Data parity TLB read error.\n");
444 pr_cont("Poison data provided for TLB fill.\n");
447 } else if (BUS_ERROR(ec)) {
451 pr_cont("Error during attempted NB data read.\n");
452 } else if (MEM_ERROR(ec)) {
455 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]);
459 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]);
470 pr_emerg(HW_ERR "Corrupted CU MCE info?\n");
/*
 * amd_decode_ls_mce - decode a load/store MCE record.
 * @m: the MCE record; only m->status is read in the visible code.
 *
 * On CPU families >= 0x14 it prints a "shouldn't be seeing" notice. For the
 * rest it prints "Load Store Error" and tests for a bus error whose
 * transaction is R4_DRD or R4_DWR; the " during <R4_MSG>" continuation
 * follows. "Corrupted LS MCE info?" is the fallback message.
 * NOTE(review): the family >= 0x14 notice is the only message in this file
 * printed without the HW_ERR prefix - confirm whether that is intentional.
 * NOTE(review): the early return and the branch to the fallback are not
 * visible here.
 */
473 static void amd_decode_ls_mce(struct mce *m)
475 u16 ec = EC(m->status);
476 u8 xec = XEC(m->status, xec_mask);
478 if (boot_cpu_data.x86 >= 0x14) {
479 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
480 " please report on LKML.\n");
484 pr_emerg(HW_ERR "Load Store Error");
489 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
492 pr_cont(" during %s.\n", R4_MSG(ec));
499 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
/*
 * k8_nb_mce - northbridge decode helper; installed as fam_ops->nb_mce on
 * one mce_amd_init() path and tried first by f10h_nb_mce().
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Continues the "Northbridge Error" line with an HT link CRC, GART PTE,
 * unsupported atomic RMW, DRAM ECC or DRAM addr/ctl parity message.
 * Family 0x11 is special-cased ahead of the DRAM ECC message.
 * NOTE(review): the xec values selecting each message, the effect of the
 * family 0x11 check and the returns are not visible here.
 */
502 static bool k8_nb_mce(u16 ec, u8 xec)
508 pr_cont("CRC error detected on HT link.\n");
512 pr_cont("Invalid GART PTE entry during GART table walk.\n");
516 pr_cont("Unsupported atomic RMW received from an IO link.\n");
521 if (boot_cpu_data.x86 == 0x11)
524 pr_cont("DRAM ECC error detected on the NB.\n");
528 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * f10h_nb_mce - northbridge decode helper; installed as fam_ops->nb_mce on
 * several mce_amd_init() paths.
 * @ec:  error code taken from the MCE status.
 * @xec: extended error code.
 *
 * Gives k8_nb_mce() the first chance to decode, then handles the table-walk
 * errors and the family 0x15 "Compute Unit Data Error", and finally prints
 * f10h_nb_mce_desc[xec - offset].
 * NOTE(review): how "offset" is computed and which xec values reach the
 * table lookup are not visible here - check the index bounds against the
 * table size in the full source.
 */
539 static bool f10h_nb_mce(u16 ec, u8 xec)
544 if (k8_nb_mce(ec, xec))
558 pr_cont("GART Table Walk data error.\n");
559 else if (BUS_ERROR(ec))
560 pr_cont("DMA Exclusion Vector Table Walk error.\n");
568 if (boot_cpu_data.x86 == 0x15)
569 pr_cont("Compute Unit Data Error.\n");
587 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
/*
 * nb_noop_mce - nb_mce callback installed by mce_amd_init() on the paths
 * that use f12h_dc_mce() and f14h_dc_mce().
 * @ec:  error code (use not visible).
 * @xec: extended error code (use not visible).
 *
 * NOTE(review): the body is not visible here; going by the name it
 * presumably decodes nothing - confirm its return value.
 */
593 static bool nb_noop_mce(u16 ec, u8 xec)
/*
 * amd_decode_nb_mce - decode a northbridge MCE record.
 * @m: the MCE record; m->status and m->extcpu are read.
 *
 * Resolves the node id with amd_get_nb_id(m->extcpu) and prints the
 * "Northbridge Error (node N): " header. Unlike the other bank decoders it
 * extracts xec with a fixed 0x1f mask rather than xec_mask. A handful of
 * signatures are decoded inline; the rest go to fam_ops->nb_mce(). On
 * families 0xf, 0x10 and 0x15, xec 0x8 or 0x0 additionally invokes the
 * registered nb_bus_decoder callback, if any. "Corrupted NB MCE info?" is
 * the fallback message. Exported (GPL).
 * NOTE(review): the xec values for the inline messages and the jumps to
 * the fallback are not visible here.
 */
598 void amd_decode_nb_mce(struct mce *m)
600 struct cpuinfo_x86 *c = &boot_cpu_data;
601 int node_id = amd_get_nb_id(m->extcpu);
602 u16 ec = EC(m->status);
603 u8 xec = XEC(m->status, 0x1f);
605 pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id);
609 pr_cont("Sync error (sync packets on HT link detected).\n");
613 pr_cont("HT Master abort.\n");
617 pr_cont("HT Target abort.\n");
621 pr_cont("NB Watchdog timeout.\n");
625 pr_cont("SVM DMA Exclusion Vector error.\n");
632 if (!fam_ops->nb_mce(ec, xec))
635 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x15)
636 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
637 nb_bus_decoder(node_id, m);
642 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
644 EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * amd_decode_fr_mce - decode an MCE record reported as "Execution Unit" on
 * family 0x15 and as "FIROB" on the other families.
 * @m: the MCE record; only m->status is read in the visible code.
 *
 * Families 0xf and 0x11 and, on non-0x15 families, any xec other than 0x0
 * are singled out before decoding. The description comes from
 * fr_ex_mce_desc[xec]; xec 0x0 and 0xc are printed verbatim, the other
 * visible print appends " parity error". "Corrupted FR MCE info?" is the
 * fallback message.
 * NOTE(review): the targets of the early checks and the upper bound on xec
 * before indexing the table are not visible here.
 */
646 static void amd_decode_fr_mce(struct mce *m)
648 struct cpuinfo_x86 *c = &boot_cpu_data;
649 u8 xec = XEC(m->status, xec_mask);
651 if (c->x86 == 0xf || c->x86 == 0x11)
654 if (c->x86 != 0x15 && xec != 0x0)
657 pr_emerg(HW_ERR "%s Error: ",
658 (c->x86 == 0x15 ? "Execution Unit" : "FIROB"));
660 if (xec == 0x0 || xec == 0xc)
661 pr_cont("%s.\n", fr_ex_mce_desc[xec]);
663 pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]);
670 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
/*
 * amd_decode_fp_mce - decode a floating-point-unit MCE record.
 * @m: the MCE record; only m->status is read in the visible code.
 *
 * Prints the "Floating Point Unit Error: " header, then the name of the
 * affected structure and the shared " parity error." suffix.
 * "Corrupted FP MCE info?" is the fallback message.
 * NOTE(review): the xec values selecting each name and the jump to the
 * fallback are not visible here.
 */
673 static void amd_decode_fp_mce(struct mce *m)
675 u8 xec = XEC(m->status, xec_mask);
677 pr_emerg(HW_ERR "Floating Point Unit Error: ");
681 pr_cont("Free List");
685 pr_cont("Physical Register File");
689 pr_cont("Retire Queue");
693 pr_cont("Scheduler table");
697 pr_cont("Status Register File");
705 pr_cont(" parity error.\n");
710 pr_emerg(HW_ERR "Corrupted FP MCE info?\n");
/*
 * amd_decode_err_code - print the generic fields of the 16-bit error code.
 * @ec: error code; amd_decode_mce() passes m->status & 0xffff.
 *
 * Emits one log line: cache level, then - depending on the error class -
 * the mem/io type, the transaction type, and for memory or bus errors the
 * memory transaction type; participating processor and timeout state are
 * printed last.
 * NOTE(review): the conditions guarding the mem/io, tx and part-proc
 * fields are not visible here.
 */
713 static inline void amd_decode_err_code(u16 ec)
716 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
719 pr_cont(", mem/io: %s", II_MSG(ec));
721 pr_cont(", tx: %s", TT_MSG(ec));
723 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
724 pr_cont(", mem-tx: %s", R4_MSG(ec));
727 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
734 * Filter out unwanted MCE signatures here.
/*
 * amd_filter_mce - decide whether an MCE record should be skipped.
 * @m: the MCE record; m->status and m->bank are read.
 *
 * xec is taken from status bits [20:16]. The visible rule matches bank 4
 * records with xec 0x5 while report_gart_errors is false.
 * NOTE(review): the return statements are not visible here; presumably
 * true means "filter out", since amd_decode_mce() tests the result before
 * printing anything - confirm.
 */
736 static bool amd_filter_mce(struct mce *m)
738 u8 xec = (m->status >> 16) & 0x1f;
741 * NB GART TLB error reporting is disabled by default.
743 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * amd_decode_mce - notifier callback that pretty-prints an MCE record.
 * @nb:   notifier block (not used in the visible code).
 * @val:  notifier value (not used in the visible code).
 * @data: pointer to the struct mce to decode.
 *
 * Records matched by amd_filter_mce() are tested for first. For the rest
 * the function prints the MCi_STATUS flag summary (Over, UE/CE, MiscV, PCC,
 * AddrV, plus Deferred/Poison from status bits 44/43 and the ECC kind from
 * bits [46:45]), the raw status, the address when ADDRV is set, then calls
 * one of the per-bank decoders and finally amd_decode_err_code() on the low
 * 16 status bits. Exported (GPL); wired up through amd_mce_dec_nb.
 * NOTE(review): the arguments between the format string and the flag
 * strings, the conditions around the Deferred/Poison and ECC prints, the
 * bank dispatch and the return value are not visible here.
 */
749 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
751 struct mce *m = (struct mce *)data;
752 struct cpuinfo_x86 *c = &boot_cpu_data;
755 if (amd_filter_mce(m))
758 pr_emerg(HW_ERR "CPU:%d\tMC%d_STATUS[%s|%s|%s|%s|%s",
760 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
761 ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
762 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
763 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
764 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
768 ((m->status & BIT_64(44)) ? "Deferred" : "-"),
769 ((m->status & BIT_64(43)) ? "Poison" : "-"));
/*
 * NOTE(review): the shift below selects bits [46:45] of the 64-bit status;
 * the "[14:13]" in the next comment presumably counts within the upper
 * 32-bit half - confirm.
 */
771 /* do the two bits[14:13] together */
772 ecc = (m->status >> 45) & 0x3;
774 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
776 pr_cont("]: 0x%016llx\n", m->status);
778 if (m->status & MCI_STATUS_ADDRV)
779 pr_emerg(HW_ERR "\tMC%d_ADDR: 0x%016llx\n", m->bank, m->addr);
783 amd_decode_dc_mce(m);
787 amd_decode_ic_mce(m);
792 amd_decode_cu_mce(m);
794 amd_decode_bu_mce(m);
798 amd_decode_ls_mce(m);
802 amd_decode_nb_mce(m);
806 amd_decode_fr_mce(m);
810 amd_decode_fp_mce(m);
817 amd_decode_err_code(m->status & 0xffff);
821 EXPORT_SYMBOL_GPL(amd_decode_mce);
/*
 * Notifier block that routes decode-chain events to amd_decode_mce();
 * registered in mce_amd_init() and unregistered in mce_amd_exit().
 */
823 static struct notifier_block amd_mce_dec_nb = {
824 .notifier_call = amd_decode_mce,
/*
 * mce_amd_init - set up in-kernel MCE decoding (run as an early_initcall).
 *
 * The vendor is tested against X86_VENDOR_AMD first. The family test then
 * singles out everything except 0xf..0x12, and 0x14 or 0x15 with model
 * <= 0xf. After that fam_ops is allocated (zeroed) and its dc_mce/ic_mce/
 * nb_mce callbacks are selected per family; one path also narrows
 * nb_err_cpumask to 0x3. Finally amd_mce_dec_nb is registered on the MCE
 * decode chain.
 * NOTE(review): the case labels, the allocation-failure check and the
 * return values are not visible here.
 * NOTE(review): the unknown-family warning prints the family with "%d"
 * although families are written in hex everywhere else in this file;
 * "0x%x" would be easier to match. Also confirm that this path frees
 * fam_ops before bailing out.
 */
827 static int __init mce_amd_init(void)
829 struct cpuinfo_x86 *c = &boot_cpu_data;
831 if (c->x86_vendor != X86_VENDOR_AMD)
834 if ((c->x86 < 0xf || c->x86 > 0x12) &&
835 (c->x86 != 0x14 || c->x86_model > 0xf) &&
836 (c->x86 != 0x15 || c->x86_model > 0xf))
839 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
845 fam_ops->dc_mce = k8_dc_mce;
846 fam_ops->ic_mce = k8_ic_mce;
847 fam_ops->nb_mce = k8_nb_mce;
851 fam_ops->dc_mce = f10h_dc_mce;
852 fam_ops->ic_mce = k8_ic_mce;
853 fam_ops->nb_mce = f10h_nb_mce;
857 fam_ops->dc_mce = k8_dc_mce;
858 fam_ops->ic_mce = k8_ic_mce;
859 fam_ops->nb_mce = f10h_nb_mce;
863 fam_ops->dc_mce = f12h_dc_mce;
864 fam_ops->ic_mce = k8_ic_mce;
865 fam_ops->nb_mce = nb_noop_mce;
869 nb_err_cpumask = 0x3;
870 fam_ops->dc_mce = f14h_dc_mce;
871 fam_ops->ic_mce = f14h_ic_mce;
872 fam_ops->nb_mce = nb_noop_mce;
877 fam_ops->dc_mce = f15h_dc_mce;
878 fam_ops->ic_mce = f15h_ic_mce;
879 fam_ops->nb_mce = f10h_nb_mce;
883 printk(KERN_WARNING "Huh? What family is that: %d?!\n", c->x86);
888 pr_info("MCE: In-kernel MCE decoding enabled.\n");
890 mce_register_decode_chain(&amd_mce_dec_nb);
894 early_initcall(mce_amd_init);
/*
 * mce_amd_exit - module exit hook: removes amd_mce_dec_nb from the MCE
 * decode chain.
 * NOTE(review): fam_ops is allocated in mce_amd_init(); confirm it is
 * released here (no such statement is visible here).
 */
897 static void __exit mce_amd_exit(void)
899 mce_unregister_decode_chain(&amd_mce_dec_nb);
/* Module metadata and exit-hook registration. */
903 MODULE_DESCRIPTION("AMD MCE decoder");
904 MODULE_ALIAS("edac-mce-amd");
905 MODULE_LICENSE("GPL");
906 module_exit(mce_amd_exit);