Merge tag 'please-pull-aer-trace' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Ingo Molnar <mingo@kernel.org>
Thu, 24 Jan 2013 13:49:10 +0000 (14:49 +0100)
committer Ingo Molnar <mingo@kernel.org>
Thu, 24 Jan 2013 13:49:10 +0000 (14:49 +0100)
Use perf/event tracing to report PCI Express advanced errors, by
Tony Luck.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
drivers/acpi/apei/cper.c
drivers/pci/pcie/aer/aerdrv_errprint.c
include/linux/aer.h
include/trace/events/ras.h [new file with mode: 0644]

index e6defd86b42454e98d2828abf2d99240d9e68b18..1e5d8a40101e274f5d6ce2bfe1a24a486a671af7 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/time.h>
 #include <linux/cper.h>
 #include <linux/acpi.h>
+#include <linux/pci.h>
 #include <linux/aer.h>
 
 /*
@@ -249,6 +250,10 @@ static const char *cper_pcie_port_type_strs[] = {
 static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
                            const struct acpi_hest_generic_data *gdata)
 {
+#ifdef CONFIG_ACPI_APEI_PCIEAER
+       struct pci_dev *dev;
+#endif
+
        if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
                printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
                       pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ?
@@ -281,10 +286,23 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
        "%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
        pfx, pcie->bridge.secondary_status, pcie->bridge.control);
 #ifdef CONFIG_ACPI_APEI_PCIEAER
-       if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) {
-               struct aer_capability_regs *aer_regs = (void *)pcie->aer_info;
-               cper_print_aer(pfx, gdata->error_severity, aer_regs);
+       /* Without AER data there is nothing to report: skip the lookup. */
+       if (!(pcie->validation_bits & CPER_PCIE_VALID_AER_INFO))
+               return;
+       /* devfn packs device and function; function alone is not a devfn. */
+       dev = pci_get_domain_bus_and_slot(pcie->device_id.segment,
+                       pcie->device_id.bus,
+                       PCI_DEVFN(pcie->device_id.device,
+                                 pcie->device_id.function));
+       if (!dev) {
+               pr_err("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n",
+                       pcie->device_id.segment, pcie->device_id.bus,
+                       pcie->device_id.device, pcie->device_id.function);
+               return;
        }
+       cper_print_aer(pfx, dev, gdata->error_severity,
+                       (struct aer_capability_regs *) pcie->aer_info);
+       pci_dev_put(dev);       /* drop the reference taken by the lookup */
 #endif
 }
 
index 3ea51736f18db45be7c40280626eb0d803c157b1..5ab14251839d0f06c395ac0dfc064c166cf3b75a 100644 (file)
@@ -23,6 +23,9 @@
 
 #include "aerdrv.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ras.h>
+
 #define AER_AGENT_RECEIVER             0
 #define AER_AGENT_REQUESTER            1
 #define AER_AGENT_COMPLETER            2
@@ -121,12 +124,11 @@ static const char *aer_agent_string[] = {
        "Transmitter ID"
 };
 
-static void __aer_print_error(const char *prefix,
+static void __aer_print_error(struct pci_dev *dev,
                              struct aer_err_info *info)
 {
        int i, status;
        const char *errmsg = NULL;
-
        status = (info->status & ~info->mask);
 
        for (i = 0; i < 32; i++) {
@@ -141,26 +143,22 @@ static void __aer_print_error(const char *prefix,
                                aer_uncorrectable_error_string[i] : NULL;
 
                if (errmsg)
-                       printk("%s""   [%2d] %-22s%s\n", prefix, i, errmsg,
+                       dev_err(&dev->dev, "   [%2d] %-22s%s\n", i, errmsg,
                                info->first_error == i ? " (First)" : "");
                else
-                       printk("%s""   [%2d] Unknown Error Bit%s\n", prefix, i,
-                               info->first_error == i ? " (First)" : "");
+                       dev_err(&dev->dev, "   [%2d] Unknown Error Bit%s\n",
+                               i, info->first_error == i ? " (First)" : "");
        }
 }
 
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 {
        int id = ((dev->bus->number << 8) | dev->devfn);
-       char prefix[44];
-
-       snprintf(prefix, sizeof(prefix), "%s%s %s: ",
-                (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR,
-                dev_driver_string(&dev->dev), dev_name(&dev->dev));
 
        if (info->status == 0) {
-               printk("%s""PCIe Bus Error: severity=%s, type=Unaccessible, "
-                       "id=%04x(Unregistered Agent ID)\n", prefix,
+               dev_err(&dev->dev,
+                       "PCIe Bus Error: severity=%s, type=Unaccessible, "
+                       "id=%04x(Unregistered Agent ID)\n",
                        aer_error_severity_string[info->severity], id);
        } else {
                int layer, agent;
@@ -168,22 +166,24 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
                layer = AER_GET_LAYER_ERROR(info->severity, info->status);
                agent = AER_GET_AGENT(info->severity, info->status);
 
-               printk("%s""PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n",
-                       prefix, aer_error_severity_string[info->severity],
+               dev_err(&dev->dev,
+                       "PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n",
+                       aer_error_severity_string[info->severity],
                        aer_error_layer[layer], id, aer_agent_string[agent]);
 
-               printk("%s""  device [%04x:%04x] error status/mask=%08x/%08x\n",
-                       prefix, dev->vendor, dev->device,
+               dev_err(&dev->dev,
+                       "  device [%04x:%04x] error status/mask=%08x/%08x\n",
+                       dev->vendor, dev->device,
                        info->status, info->mask);
 
-               __aer_print_error(prefix, info);
+               __aer_print_error(dev, info);
 
                if (info->tlp_header_valid) {
                        unsigned char *tlp = (unsigned char *) &info->tlp;
-                       printk("%s""  TLP Header:"
+                       dev_err(&dev->dev, "  TLP Header:"
                                " %02x%02x%02x%02x %02x%02x%02x%02x"
                                " %02x%02x%02x%02x %02x%02x%02x%02x\n",
-                               prefix, *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
+                               *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
                                *(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4),
                                *(tlp + 11), *(tlp + 10), *(tlp + 9),
                                *(tlp + 8), *(tlp + 15), *(tlp + 14),
@@ -192,8 +192,11 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
        }
 
        if (info->id && info->error_dev_num > 1 && info->id == id)
-               printk("%s""  Error of this Agent(%04x) is reported first\n",
-                       prefix, id);
+               dev_err(&dev->dev,
+                          "  Error of this Agent(%04x) is reported first\n",
+                       id);
+       trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
+                       info->severity);
 }
 
 void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
@@ -217,7 +220,7 @@ int cper_severity_to_aer(int cper_severity)
 }
 EXPORT_SYMBOL_GPL(cper_severity_to_aer);
 
-void cper_print_aer(const char *prefix, int cper_severity,
+void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity,
                    struct aer_capability_regs *aer)
 {
        int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0;
@@ -239,25 +242,27 @@ void cper_print_aer(const char *prefix, int cper_severity,
        }
        layer = AER_GET_LAYER_ERROR(aer_severity, status);
        agent = AER_GET_AGENT(aer_severity, status);
-       printk("%s""aer_status: 0x%08x, aer_mask: 0x%08x\n",
-              prefix, status, mask);
+       dev_err(&dev->dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
+              status, mask);
        cper_print_bits(prefix, status, status_strs, status_strs_size);
-       printk("%s""aer_layer=%s, aer_agent=%s\n", prefix,
+       dev_err(&dev->dev, "aer_layer=%s, aer_agent=%s\n",
               aer_error_layer[layer], aer_agent_string[agent]);
        if (aer_severity != AER_CORRECTABLE)
-               printk("%s""aer_uncor_severity: 0x%08x\n",
-                      prefix, aer->uncor_severity);
+               dev_err(&dev->dev, "aer_uncor_severity: 0x%08x\n",
+                      aer->uncor_severity);
        if (tlp_header_valid) {
                const unsigned char *tlp;
                tlp = (const unsigned char *)&aer->header_log;
-               printk("%s""aer_tlp_header:"
+               dev_err(&dev->dev, "aer_tlp_header:"
                        " %02x%02x%02x%02x %02x%02x%02x%02x"
                        " %02x%02x%02x%02x %02x%02x%02x%02x\n",
-                       prefix, *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
+                       *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
                        *(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4),
                        *(tlp + 11), *(tlp + 10), *(tlp + 9),
                        *(tlp + 8), *(tlp + 15), *(tlp + 14),
                        *(tlp + 13), *(tlp + 12));
        }
+       trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+                       aer_severity);
 }
 #endif
index 544abdb2238ccdffda329816578b41bb954fc617..ec10e1b24c1cce50d50581d58c5000cf7b9e5565 100644 (file)
@@ -49,8 +49,8 @@ static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 }
 #endif
 
-extern void cper_print_aer(const char *prefix, int cper_severity,
-                          struct aer_capability_regs *aer);
+extern void cper_print_aer(const char *prefix, struct pci_dev *dev,
+                          int cper_severity, struct aer_capability_regs *aer);
 extern int cper_severity_to_aer(int cper_severity);
 extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
                              int severity);
diff --git a/include/trace/events/ras.h b/include/trace/events/ras.h
new file mode 100644 (file)
index 0000000..88b8783
--- /dev/null
@@ -0,0 +1,87 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ras
+
+#if !defined(_TRACE_AER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_AER_H
+
+#include <linux/tracepoint.h>
+#include <linux/edac.h>
+
+
+/*
+ * PCIe AER Trace event
+ *
+ * These events are generated when hardware detects a corrected or
+ * uncorrected event on a PCIe device. The event report has
+ * the following structure:
+ *
+ * char * dev_name - The name of the slot where the device resides
+ *                   ([domain:]bus:device.function).
+ * u32 status      - Either the correctable or uncorrectable register
+ *                   indicating what error or errors have been seen
+ * u8 severity     - error severity 0:NONFATAL 1:FATAL 2:CORRECTED
+ *
+ * The severity values are the AER_NONFATAL/AER_FATAL/AER_CORRECTABLE
+ * codes that aer_print_error() and cper_print_aer() pass in; they are
+ * NOT the HW_EVENT_ERR_* values from <linux/edac.h>.
+ * NOTE(review): the AER_* names are expected to be in scope where
+ * CREATE_TRACE_POINTS is defined (aerdrv_errprint.c, after "aerdrv.h")
+ * - confirm before instantiating this tracepoint anywhere else.
+ */
+
+/* Bit -> name table used to decode status for correctable errors. */
+#define aer_correctable_errors         \
+       {BIT(0),        "Receiver Error"},              \
+       {BIT(6),        "Bad TLP"},                     \
+       {BIT(7),        "Bad DLLP"},                    \
+       {BIT(8),        "REPLAY_NUM Rollover"},         \
+       {BIT(12),       "Replay Timer Timeout"},        \
+       {BIT(13),       "Advisory Non-Fatal"}
+
+/* Bit -> name table used to decode status for uncorrectable errors. */
+#define aer_uncorrectable_errors               \
+       {BIT(4),        "Data Link Protocol"},          \
+       {BIT(12),       "Poisoned TLP"},                \
+       {BIT(13),       "Flow Control Protocol"},       \
+       {BIT(14),       "Completion Timeout"},          \
+       {BIT(15),       "Completer Abort"},             \
+       {BIT(16),       "Unexpected Completion"},       \
+       {BIT(17),       "Receiver Overflow"},           \
+       {BIT(18),       "Malformed TLP"},               \
+       {BIT(19),       "ECRC"},                        \
+       {BIT(20),       "Unsupported Request"}
+
+TRACE_EVENT(aer_event,
+       TP_PROTO(const char *dev_name,
+                const u32 status,
+                const u8 severity),
+
+       TP_ARGS(dev_name, status, severity),
+
+       TP_STRUCT__entry(
+               __string(       dev_name,       dev_name        )
+               __field(        u32,            status          )
+               __field(        u8,             severity        )
+       ),
+
+       TP_fast_assign(
+               __assign_str(dev_name, dev_name);
+               __entry->status         = status;
+               __entry->severity       = severity;
+       ),
+
+       /* Decode with the AER_* codes the callers actually record. */
+       TP_printk("%s PCIe Bus Error: severity=%s, %s\n",
+               __get_str(dev_name),
+               __entry->severity == AER_CORRECTABLE ? "Corrected" :
+                       __entry->severity == AER_FATAL ?
+                       "Fatal" : "Uncorrected",
+               __entry->severity == AER_CORRECTABLE ?
+               __print_flags(__entry->status, "|", aer_correctable_errors) :
+               __print_flags(__entry->status, "|", aer_uncorrectable_errors))
+);
+
+#endif /* _TRACE_AER_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>