trace, RAS: Add eMCA trace event interface
author Chen, Gong <gong.chen@linux.intel.com>
Wed, 18 Jun 2014 02:33:07 +0000 (22:33 -0400)
committer Tony Luck <tony.luck@intel.com>
Wed, 25 Jun 2014 20:26:47 +0000 (13:26 -0700)
Add trace interface to elaborate all H/W error related information.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
drivers/acpi/Kconfig
drivers/acpi/acpi_extlog.c
drivers/firmware/efi/cper.c
drivers/ras/ras.c
include/linux/cper.h
include/ras/ras_event.h

index a34a22841002495713a74f960dea482ecee8dc11..206942b8d10572958183de7d4e5c91413678c51d 100644 (file)
@@ -370,6 +370,7 @@ config ACPI_EXTLOG
        tristate "Extended Error Log support"
        depends on X86_MCE && X86_LOCAL_APIC
        select UEFI_CPER
        tristate "Extended Error Log support"
        depends on X86_MCE && X86_LOCAL_APIC
        select UEFI_CPER
+       select RAS
        default n
        help
          Certain usages such as Predictive Failure Analysis (PFA) require
        default n
        help
          Certain usages such as Predictive Failure Analysis (PFA) require
@@ -384,6 +385,7 @@ config ACPI_EXTLOG
 
          Enhanced MCA Logging allows firmware to provide additional error
          information to system software, synchronous with MCE or CMCI. This
 
          Enhanced MCA Logging allows firmware to provide additional error
          information to system software, synchronous with MCE or CMCI. This
-         driver adds support for that functionality.
+         driver adds support for that functionality with corresponding
+         tracepoint which carries that information to userspace.
 
 endif  # ACPI
 
 endif  # ACPI
index 185334114d71005e649f10fd04acfcd4b7bf14ef..e61da957f30f14c1e4917b450179e78afa994a1d 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/mce.h>
 
 #include "apei/apei-internal.h"
 #include <asm/mce.h>
 
 #include "apei/apei-internal.h"
+#include <ras/ras_event.h>
 
 #define EXT_ELOG_ENTRY_MASK    GENMASK_ULL(51, 0) /* elog entry address mask */
 
 
 #define EXT_ELOG_ENTRY_MASK    GENMASK_ULL(51, 0) /* elog entry address mask */
 
@@ -137,8 +138,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
        struct mce *mce = (struct mce *)data;
        int     bank = mce->bank;
        int     cpu = mce->extcpu;
        struct mce *mce = (struct mce *)data;
        int     bank = mce->bank;
        int     cpu = mce->extcpu;
-       struct acpi_generic_status *estatus;
-       int rc;
+       struct acpi_generic_status *estatus, *tmp;
+       struct acpi_generic_data *gdata;
+       const uuid_le *fru_id = &NULL_UUID_LE;
+       char *fru_text = "";
+       uuid_le *sec_type;
+       static u32 err_seq;
 
        estatus = extlog_elog_entry_check(cpu, bank);
        if (estatus == NULL)
 
        estatus = extlog_elog_entry_check(cpu, bank);
        if (estatus == NULL)
@@ -148,7 +153,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
        /* clear record status to enable BIOS to update it again */
        estatus->block_status = 0;
 
        /* clear record status to enable BIOS to update it again */
        estatus->block_status = 0;
 
-       rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
+       tmp = (struct acpi_generic_status *)elog_buf;
+       print_extlog_rcd(NULL, tmp, cpu);
+
+       /* log event via trace */
+       err_seq++;
+       gdata = (struct acpi_generic_data *)(tmp + 1);
+       if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+               fru_id = (uuid_le *)gdata->fru_id;
+       if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+               fru_text = gdata->fru_text;
+       sec_type = (uuid_le *)gdata->section_type;
+       if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
+               struct cper_sec_mem_err *mem = (void *)(gdata + 1);
+               if (gdata->error_data_length >= sizeof(*mem))
+                       trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
+                                              (u8)gdata->error_severity);
+       }
 
        return NOTIFY_STOP;
 }
 
        return NOTIFY_STOP;
 }
index ac33a9fed3414c66dea3e29b562842b434eaced0..437e6fd47311d087174684061964cc8f846bea47 100644 (file)
@@ -207,7 +207,7 @@ const char *cper_mem_err_type_str(unsigned int etype)
 }
 EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
 
 }
 EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
 
-static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
+static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
 {
        u32 len, n;
 
 {
        u32 len, n;
 
@@ -249,7 +249,7 @@ static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
        return n;
 }
 
        return n;
 }
 
-static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
+static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
 {
        u32 len, n;
        const char *bank = NULL, *device = NULL;
 {
        u32 len, n;
        const char *bank = NULL, *device = NULL;
@@ -271,8 +271,44 @@ static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
        return n;
 }
 
        return n;
 }
 
+void cper_mem_err_pack(const struct cper_sec_mem_err *mem, /* in: memory error section to copy from */
+                      struct cper_mem_err_compact *cmem) /* out: compact copy; every member is overwritten */
+{ /* member-by-member copy; error_status, physical_addr, physical_addr_mask and error_type have no compact counterpart */
+       cmem->validation_bits = mem->validation_bits; /* copied verbatim, no bits masked off */
+       cmem->node = mem->node;
+       cmem->card = mem->card;
+       cmem->module = mem->module;
+       cmem->bank = mem->bank;
+       cmem->device = mem->device;
+       cmem->row = mem->row;
+       cmem->column = mem->column;
+       cmem->bit_pos = mem->bit_pos;
+       cmem->requestor_id = mem->requestor_id;
+       cmem->responder_id = mem->responder_id;
+       cmem->target_id = mem->target_id;
+       cmem->rank = mem->rank;
+       cmem->mem_array_handle = mem->mem_array_handle;
+       cmem->mem_dev_handle = mem->mem_dev_handle;
+}
+
+const char *cper_mem_err_unpack(struct trace_seq *p, /* trace sequence the decoded text is appended to */
+                               struct cper_mem_err_compact *cmem) /* packed record to decode */
+{ /* returns a pointer into p's buffer, at the position where this call started writing */
+       const char *ret = p->buffer + p->len; /* remember the write position before anything is appended */
+
+       if (cper_mem_err_location(cmem, rcd_decode_str)) /* append only when the helper returns non-zero */
+               trace_seq_printf(p, "%s", rcd_decode_str);
+       if (cper_dimm_err_location(cmem, rcd_decode_str)) /* NOTE(review): rcd_decode_str is reused for both helpers and is defined outside this hunk - confirm how concurrent callers are serialized */
+               trace_seq_printf(p, "%s", rcd_decode_str);
+       trace_seq_putc(p, '\0'); /* terminate the appended text so ret can be consumed as a C string */
+
+       return ret;
+}
+
 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
 {
 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
 {
+       struct cper_mem_err_compact cmem;
+
        if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
                printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
        if (mem->validation_bits & CPER_MEM_VALID_PA)
        if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
                printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
        if (mem->validation_bits & CPER_MEM_VALID_PA)
@@ -281,14 +317,15 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
        if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
                printk("%s""physical_address_mask: 0x%016llx\n",
                       pfx, mem->physical_addr_mask);
        if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
                printk("%s""physical_address_mask: 0x%016llx\n",
                       pfx, mem->physical_addr_mask);
-       if (cper_mem_err_location(mem, rcd_decode_str))
+       cper_mem_err_pack(mem, &cmem);
+       if (cper_mem_err_location(&cmem, rcd_decode_str))
                printk("%s%s\n", pfx, rcd_decode_str);
        if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
                u8 etype = mem->error_type;
                printk("%s""error_type: %d, %s\n", pfx, etype,
                       cper_mem_err_type_str(etype));
        }
                printk("%s%s\n", pfx, rcd_decode_str);
        if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
                u8 etype = mem->error_type;
                printk("%s""error_type: %d, %s\n", pfx, etype,
                       cper_mem_err_type_str(etype));
        }
-       if (cper_dimm_err_location(mem, rcd_decode_str))
+       if (cper_dimm_err_location(&cmem, rcd_decode_str))
                printk("%s%s\n", pfx, rcd_decode_str);
 }
 
                printk("%s%s\n", pfx, rcd_decode_str);
 }
 
index 4cac43a1e25cdce3c804d88d1f8360f88bf46254..b67dd362b7b6cff2f62d2f431c9d25c4d444df79 100644 (file)
@@ -23,4 +23,7 @@ static int __init ras_init(void)
 }
 subsys_initcall(ras_init);
 
 }
 subsys_initcall(ras_init);
 
+#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
+EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
+#endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
index ed088b9c129800f078524f5a79f99e8a9cd5a028..76abba4b238ece14f8a2bd9bcce5ab53306c61fd 100644 (file)
@@ -22,6 +22,7 @@
 #define LINUX_CPER_H
 
 #include <linux/uuid.h>
 #define LINUX_CPER_H
 
 #include <linux/uuid.h>
+#include <linux/trace_seq.h>
 
 /* CPER record signature and the size */
 #define CPER_SIG_RECORD                                "CPER"
 
 /* CPER record signature and the size */
 #define CPER_SIG_RECORD                                "CPER"
@@ -363,6 +364,24 @@ struct cper_sec_mem_err {
        __u16   mem_dev_handle;         /* module handle in UEFI 2.4 */
 };
 
        __u16   mem_dev_handle;         /* module handle in UEFI 2.4 */
 };
 
+struct cper_mem_err_compact { /* the members cper_mem_err_pack() copies out of struct cper_sec_mem_err */
+       __u64   validation_bits; /* copied unmodified from the source section */
+       __u16   node;
+       __u16   card;
+       __u16   module;
+       __u16   bank;
+       __u16   device;
+       __u16   row;
+       __u16   column;
+       __u16   bit_pos;
+       __u64   requestor_id;
+       __u64   responder_id;
+       __u64   target_id;
+       __u16   rank;
+       __u16   mem_array_handle;
+       __u16   mem_dev_handle; /* source member is annotated "module handle in UEFI 2.4" */
+};
+
 struct cper_sec_pcie {
        __u64           validation_bits;
        __u32           port_type;
 struct cper_sec_pcie {
        __u64           validation_bits;
        __u32           port_type;
@@ -406,5 +425,9 @@ const char *cper_severity_str(unsigned int);
 const char *cper_mem_err_type_str(unsigned int);
 void cper_print_bits(const char *prefix, unsigned int bits,
                     const char * const strs[], unsigned int strs_size);
 const char *cper_mem_err_type_str(unsigned int);
 void cper_print_bits(const char *prefix, unsigned int bits,
                     const char * const strs[], unsigned int strs_size);
+void cper_mem_err_pack(const struct cper_sec_mem_err *,
+                      struct cper_mem_err_compact *);
+const char *cper_mem_err_unpack(struct trace_seq *,
+                               struct cper_mem_err_compact *);
 
 #endif
 
 #endif
index acbcbb88eaaa26b3e0f631e3db6a8130844cca84..47da53c27ffa54fd5602964400713f44c5ffdf82 100644 (file)
@@ -9,6 +9,70 @@
 #include <linux/edac.h>
 #include <linux/ktime.h>
 #include <linux/aer.h>
 #include <linux/edac.h>
 #include <linux/ktime.h>
 #include <linux/aer.h>
+#include <linux/cper.h>
+
+/*
+ * MCE Extended Error Log trace event
+ *
+ * These events are generated when hardware detects a corrected or
+ * uncorrected event.
+ */
+
+/* memory trace event */
+
+#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
+/*
+ * extlog_mem_event - memory error taken from the extended error log
+ * @mem:      CPER memory error section being reported
+ * @err_seq:  sequence number chosen by the caller
+ * @fru_id:   FRU identifier; dereferenced unconditionally, must not be NULL
+ * @fru_text: FRU description; at most 20 characters are printed
+ * @sev:      severity value, rendered with cper_severity_str()
+ *
+ * etype, pa and pa_mask_lsb are recorded as all-ones when the matching
+ * validation bit is clear.  pa_mask_lsb is also all-ones when the mask is
+ * flagged valid but is zero, because it has no lowest set bit to report.
+ */
+TRACE_EVENT(extlog_mem_event,
+       TP_PROTO(struct cper_sec_mem_err *mem,
+                u32 err_seq,
+                const uuid_le *fru_id,
+                const char *fru_text,
+                u8 sev),
+
+       TP_ARGS(mem, err_seq, fru_id, fru_text, sev),
+
+       TP_STRUCT__entry(
+               __field(u32, err_seq)
+               __field(u8, etype)
+               __field(u8, sev)
+               __field(u64, pa)
+               __field(u8, pa_mask_lsb)
+               __field_struct(uuid_le, fru_id)
+               /*
+                * NOTE(review): __string() sizes the copy with strlen(), while
+                * the print format caps the text at 20 characters - confirm
+                * that callers always pass a NUL-terminated fru_text.
+                */
+               __string(fru_text, fru_text)
+               __field_struct(struct cper_mem_err_compact, data)
+       ),
+
+       TP_fast_assign(
+               __entry->err_seq = err_seq;
+               if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
+                       __entry->etype = mem->error_type;
+               else
+                       __entry->etype = ~0;
+               __entry->sev = sev;
+               if (mem->validation_bits & CPER_MEM_VALID_PA)
+                       __entry->pa = mem->physical_addr;
+               else
+                       __entry->pa = ~0ull;
+
+               /* __ffs64() is undefined for 0: treat an empty mask as invalid */
+               if ((mem->validation_bits & CPER_MEM_VALID_PA_MASK) &&
+                   mem->physical_addr_mask)
+                       __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
+               else
+                       __entry->pa_mask_lsb = ~0;
+               __entry->fru_id = *fru_id;
+               __assign_str(fru_text, fru_text);
+               /* location members are stored packed and decoded at print time */
+               cper_mem_err_pack(mem, &__entry->data);
+       ),
+
+       /* err_seq is a u32, so it is printed unsigned */
+       TP_printk("{%u} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s",
+                 __entry->err_seq,
+                 cper_severity_str(__entry->sev),
+                 cper_mem_err_type_str(__entry->etype),
+                 __entry->pa,
+                 __entry->pa_mask_lsb,
+                 cper_mem_err_unpack(p, &__entry->data),
+                 &__entry->fru_id,
+                 __get_str(fru_text))
+);
+#endif
 
 /*
  * Hardware Events Report
 
 /*
  * Hardware Events Report