powerpc/eeh: Dump PHB diag-data early
author Gavin Shan <gwshan@linux.vnet.ibm.com>
Sat, 22 Nov 2014 10:58:09 +0000 (21:58 +1100)
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>
Tue, 2 Dec 2014 00:03:26 +0000 (11:03 +1100)
On the PowerNV platform, PHB diag-data is dumped after stopping device
drivers. In case of recursive EEH errors, the kernel usually crashes
before dumping the PHB diag-data for the second EEH error, and it is
hard to locate the root cause of the second EEH error without that
diag-data.

The patch adds one more EEH option, "eeh=early_log", which dumps the
PHB diag-data immediately once a frozen PE is detected, so that the
PHB diag-data for the second EEH error is captured as well.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
arch/powerpc/include/asm/eeh.h
arch/powerpc/kernel/eeh.c
arch/powerpc/platforms/powernv/eeh-ioda.c

index 2e633b41712ae70a6f0db04f8a49651dd4b39c12..0652ebe117af66b8c0ed1ae300078abde74fb545 100644 (file)
@@ -39,6 +39,7 @@ struct device_node;
 #define EEH_PROBE_MODE_DEV     0x04    /* From PCI device      */
 #define EEH_PROBE_MODE_DEVTREE 0x08    /* From device tree     */
 #define EEH_ENABLE_IO_FOR_LOG  0x10    /* Enable IO for log    */
+#define EEH_EARLY_DUMP_LOG     0x20    /* Dump log immediately */
 
 /*
  * Delay for PE reset, all in ms
index f1c6b115cb37c0d2e0a8c73bd4069eb3180fb87f..05be77d9ea0ea874ba0c20f9854ed4e393b55d82 100644 (file)
@@ -143,6 +143,8 @@ static int __init eeh_setup(char *str)
 {
        if (!strcmp(str, "off"))
                eeh_add_flag(EEH_FORCE_DISABLED);
+       else if (!strcmp(str, "early_log"))
+               eeh_add_flag(EEH_EARLY_DUMP_LOG);
 
        return 1;
 }
index fb38fe4dba89f13c8a3bf2e5b84d1ccfc2d708c7..2809c989528814d8dadf5e79d7aa75e32b6a888d 100644 (file)
@@ -353,6 +353,9 @@ static int ioda_eeh_get_phb_state(struct eeh_pe *pe)
        } else if (!(pe->state & EEH_PE_ISOLATED)) {
                eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
                ioda_eeh_phb_diag(pe);
+
+               if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+                       pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
        }
 
        return result;
@@ -451,6 +454,9 @@ static int ioda_eeh_get_pe_state(struct eeh_pe *pe)
 
                eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
                ioda_eeh_phb_diag(pe);
+
+               if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+                       pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
        }
 
        return result;
@@ -730,7 +736,8 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
                            char *drv_log, unsigned long len)
 {
-       pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+       if (!eeh_has_flag(EEH_EARLY_DUMP_LOG))
+               pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
 
        return 0;
 }
@@ -1086,6 +1093,10 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
                    !((*pe)->state & EEH_PE_ISOLATED)) {
                        eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
                        ioda_eeh_phb_diag(*pe);
+
+                       if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+                               pnv_pci_dump_phb_diag_data((*pe)->phb,
+                                                          (*pe)->data);
                }
 
                /*