trace, RAS: Add eMCA trace event interface
author Chen, Gong <gong.chen@linux.intel.com>
Wed, 18 Jun 2014 02:33:07 +0000 (22:33 -0400)
committer Tony Luck <tony.luck@intel.com>
Wed, 25 Jun 2014 20:26:47 +0000 (13:26 -0700)
Add a trace event interface that reports all hardware-error-related information.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
drivers/acpi/Kconfig
drivers/acpi/acpi_extlog.c
drivers/firmware/efi/cper.c
drivers/ras/ras.c
include/linux/cper.h
include/ras/ras_event.h

index a34a22841002495713a74f960dea482ecee8dc11..206942b8d10572958183de7d4e5c91413678c51d 100644 (file)
@@ -370,6 +370,7 @@ config ACPI_EXTLOG
        tristate "Extended Error Log support"
        depends on X86_MCE && X86_LOCAL_APIC
        select UEFI_CPER
+       select RAS
        default n
        help
          Certain usages such as Predictive Failure Analysis (PFA) require
@@ -384,6 +385,7 @@ config ACPI_EXTLOG
 
          Enhanced MCA Logging allows firmware to provide additional error
          information to system software, synchronous with MCE or CMCI. This
-         driver adds support for that functionality.
+         driver adds support for that functionality with a corresponding
+         tracepoint that carries the information to userspace.
 
 endif  # ACPI
index 185334114d71005e649f10fd04acfcd4b7bf14ef..e61da957f30f14c1e4917b450179e78afa994a1d 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/mce.h>
 
 #include "apei/apei-internal.h"
+#include <ras/ras_event.h>
 
 #define EXT_ELOG_ENTRY_MASK    GENMASK_ULL(51, 0) /* elog entry address mask */
 
@@ -137,8 +138,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
        struct mce *mce = (struct mce *)data;
        int     bank = mce->bank;
        int     cpu = mce->extcpu;
-       struct acpi_generic_status *estatus;
-       int rc;
+       struct acpi_generic_status *estatus, *tmp;
+       struct acpi_generic_data *gdata;
+       const uuid_le *fru_id = &NULL_UUID_LE;
+       char *fru_text = "";
+       uuid_le *sec_type;
+       static u32 err_seq;
 
        estatus = extlog_elog_entry_check(cpu, bank);
        if (estatus == NULL)
@@ -148,7 +153,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
        /* clear record status to enable BIOS to update it again */
        estatus->block_status = 0;
 
-       rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
+       tmp = (struct acpi_generic_status *)elog_buf;
+       print_extlog_rcd(NULL, tmp, cpu);
+
+       /* log event via trace */
+       err_seq++;
+       gdata = (struct acpi_generic_data *)(tmp + 1);
+       if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
+               fru_id = (uuid_le *)gdata->fru_id;
+       if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
+               fru_text = gdata->fru_text;
+       sec_type = (uuid_le *)gdata->section_type;
+       if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
+               struct cper_sec_mem_err *mem = (void *)(gdata + 1);
+               if (gdata->error_data_length >= sizeof(*mem))
+                       trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
+                                              (u8)gdata->error_severity);
+       }
 
        return NOTIFY_STOP;
 }
index ac33a9fed3414c66dea3e29b562842b434eaced0..437e6fd47311d087174684061964cc8f846bea47 100644 (file)
@@ -207,7 +207,7 @@ const char *cper_mem_err_type_str(unsigned int etype)
 }
 EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
 
-static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
+static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
 {
        u32 len, n;
 
@@ -249,7 +249,7 @@ static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
        return n;
 }
 
-static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
+static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
 {
        u32 len, n;
        const char *bank = NULL, *device = NULL;
@@ -271,8 +271,44 @@ static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
        return n;
 }
 
+void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
+                      struct cper_mem_err_compact *cmem)
+{ /* Copy the location-related fields of *mem into the compact record *cmem. */
+       cmem->validation_bits = mem->validation_bits; /* whole mask is copied, unfiltered */
+       cmem->node = mem->node;
+       cmem->card = mem->card;
+       cmem->module = mem->module;
+       cmem->bank = mem->bank;
+       cmem->device = mem->device;
+       cmem->row = mem->row;
+       cmem->column = mem->column;
+       cmem->bit_pos = mem->bit_pos;
+       cmem->requestor_id = mem->requestor_id;
+       cmem->responder_id = mem->responder_id;
+       cmem->target_id = mem->target_id;
+       cmem->rank = mem->rank;
+       cmem->mem_array_handle = mem->mem_array_handle;
+       cmem->mem_dev_handle = mem->mem_dev_handle;
+} /* error_status, physical_addr, physical_addr_mask and error_type have no compact counterpart */
+
+const char *cper_mem_err_unpack(struct trace_seq *p,
+                               struct cper_mem_err_compact *cmem)
+{ /* Append the decoded location text of *cmem to trace_seq *p; return where it starts. */
+       const char *ret = p->buffer + p->len; /* current end of p == start of the text added below */
+
+       if (cper_mem_err_location(cmem, rcd_decode_str)) /* NOTE(review): rcd_decode_str is also used by cper_print_mem(); confirm callers cannot race */
+               trace_seq_printf(p, "%s", rcd_decode_str);
+       if (cper_dimm_err_location(cmem, rcd_decode_str)) /* same scratch buffer is reused for the DIMM text */
+               trace_seq_printf(p, "%s", rcd_decode_str);
+       trace_seq_putc(p, '\0'); /* terminate so the returned pointer can be printed with %s */
+
+       return ret;
+}
+
 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
 {
+       struct cper_mem_err_compact cmem;
+
        if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
                printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
        if (mem->validation_bits & CPER_MEM_VALID_PA)
@@ -281,14 +317,15 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
        if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
                printk("%s""physical_address_mask: 0x%016llx\n",
                       pfx, mem->physical_addr_mask);
-       if (cper_mem_err_location(mem, rcd_decode_str))
+       cper_mem_err_pack(mem, &cmem);
+       if (cper_mem_err_location(&cmem, rcd_decode_str))
                printk("%s%s\n", pfx, rcd_decode_str);
        if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
                u8 etype = mem->error_type;
                printk("%s""error_type: %d, %s\n", pfx, etype,
                       cper_mem_err_type_str(etype));
        }
-       if (cper_dimm_err_location(mem, rcd_decode_str))
+       if (cper_dimm_err_location(&cmem, rcd_decode_str))
                printk("%s%s\n", pfx, rcd_decode_str);
 }
 
index 4cac43a1e25cdce3c804d88d1f8360f88bf46254..b67dd362b7b6cff2f62d2f431c9d25c4d444df79 100644 (file)
@@ -23,4 +23,7 @@ static int __init ras_init(void)
 }
 subsys_initcall(ras_init);
 
+#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) /* same guard as the TRACE_EVENT definition in ras_event.h */
+EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); /* presumably so a modular acpi_extlog can reach the tracepoint - confirm */
+#endif /* CONFIG_ACPI_EXTLOG || CONFIG_ACPI_EXTLOG_MODULE */
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
index ed088b9c129800f078524f5a79f99e8a9cd5a028..76abba4b238ece14f8a2bd9bcce5ab53306c61fd 100644 (file)
@@ -22,6 +22,7 @@
 #define LINUX_CPER_H
 
 #include <linux/uuid.h>
+#include <linux/trace_seq.h>
 
 /* CPER record signature and the size */
 #define CPER_SIG_RECORD                                "CPER"
@@ -363,6 +364,24 @@ struct cper_sec_mem_err {
        __u16   mem_dev_handle;         /* module handle in UEFI 2.4 */
 };
 
+struct cper_mem_err_compact { /* subset of struct cper_sec_mem_err filled by cper_mem_err_pack() and stored in the extlog_mem_event trace record */
+       __u64   validation_bits; /* copied verbatim from cper_sec_mem_err.validation_bits */
+       __u16   node;
+       __u16   card;
+       __u16   module;
+       __u16   bank;
+       __u16   device;
+       __u16   row;
+       __u16   column;
+       __u16   bit_pos;
+       __u64   requestor_id;
+       __u64   responder_id;
+       __u64   target_id;
+       __u16   rank;
+       __u16   mem_array_handle;
+       __u16   mem_dev_handle; /* cper_sec_mem_err documents its same-named field as "module handle in UEFI 2.4" */
+}; /* no error_status, physical_addr, physical_addr_mask or error_type members */
+
 struct cper_sec_pcie {
        __u64           validation_bits;
        __u32           port_type;
@@ -406,5 +425,9 @@ const char *cper_severity_str(unsigned int);
 const char *cper_mem_err_type_str(unsigned int);
 void cper_print_bits(const char *prefix, unsigned int bits,
                     const char * const strs[], unsigned int strs_size);
+void cper_mem_err_pack(const struct cper_sec_mem_err *,
+                      struct cper_mem_err_compact *); /* copy the location fields of a memory error section into the compact form */
+const char *cper_mem_err_unpack(struct trace_seq *,
+                               struct cper_mem_err_compact *); /* append decoded location text to the trace_seq; returns the start of that text */
 
 #endif
index acbcbb88eaaa26b3e0f631e3db6a8130844cca84..47da53c27ffa54fd5602964400713f44c5ffdf82 100644 (file)
@@ -9,6 +9,70 @@
 #include <linux/edac.h>
 #include <linux/ktime.h>
 #include <linux/aer.h>
+#include <linux/cper.h>
+
+/*
+ * MCE Extended Error Log trace event
+ *
+ * These events are generated when hardware detects a corrected or
+ * uncorrected event.
+ */
+
+/* memory trace event */
+
+#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) /* event exists only when the extlog driver is built in or modular */
+TRACE_EVENT(extlog_mem_event, /* fired by extlog_print() for platform memory error sections */
+       TP_PROTO(struct cper_sec_mem_err *mem,
+                u32 err_seq,
+                const uuid_le *fru_id,
+                const char *fru_text,
+                u8 sev),
+
+       TP_ARGS(mem, err_seq, fru_id, fru_text, sev),
+
+       TP_STRUCT__entry(
+               __field(u32, err_seq)
+               __field(u8, etype)
+               __field(u8, sev)
+               __field(u64, pa)
+               __field(u8, pa_mask_lsb)
+               __field_struct(uuid_le, fru_id)
+               __string(fru_text, fru_text)
+               __field_struct(struct cper_mem_err_compact, data) /* location fields only, see cper_mem_err_pack() */
+       ),
+
+       TP_fast_assign(
+               __entry->err_seq = err_seq; /* caller-supplied sequence number */
+               if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
+                       __entry->etype = mem->error_type;
+               else
+                       __entry->etype = ~0; /* 0xff in the u8 field: error type not marked valid */
+               __entry->sev = sev;
+               if (mem->validation_bits & CPER_MEM_VALID_PA)
+                       __entry->pa = mem->physical_addr;
+               else
+                       __entry->pa = ~0ull; /* all-ones: physical address not marked valid */
+
+               if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
+                       __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask); /* NOTE(review): __ffs64() is presumably undefined for a zero mask - confirm firmware cannot report one */
+               else
+                       __entry->pa_mask_lsb = ~0; /* 0xff: address mask not marked valid */
+               __entry->fru_id = *fru_id; /* struct copy of the 16-byte UUID */
+               __assign_str(fru_text, fru_text);
+               cper_mem_err_pack(mem, &__entry->data); /* keep only the compact location fields in the record */
+       ),
+
+       TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s",
+                 __entry->err_seq, /* NOTE(review): field is u32 but the format uses %d */
+                 cper_severity_str(__entry->sev),
+                 cper_mem_err_type_str(__entry->etype),
+                 __entry->pa,
+                 __entry->pa_mask_lsb,
+                 cper_mem_err_unpack(p, &__entry->data), /* location text decoded at print time into trace_seq p */
+                 &__entry->fru_id,
+                 __get_str(fru_text)) /* %.20s prints at most 20 characters of the FRU text */
+);
+#endif /* CONFIG_ACPI_EXTLOG || CONFIG_ACPI_EXTLOG_MODULE */
 
 /*
  * Hardware Events Report