x86, mce: Support memory error recovery for both UCNA and Deferred error in machine_check_poll

author Chen Yucong <slaoub@gmail.com>

Tue, 18 Nov 2014 02:09:20 +0000 (10:09 +0800)

committer Tony Luck <tony.luck@intel.com>

Wed, 19 Nov 2014 18:56:51 +0000 (10:56 -0800)
author Chen Yucong <slaoub@gmail.com>
Tue, 18 Nov 2014 02:09:20 +0000 (10:09 +0800)
committer Tony Luck <tony.luck@intel.com>
Wed, 19 Nov 2014 18:56:51 +0000 (10:56 -0800)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c

index 453e9bf90968fdf586d1627c4d2af537d307b612..cfb16f631d524aba129adc6498e0a6feabb4d93e 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i)
         }
  }
  
+static bool memory_error(struct mce *m)
+{
+       struct cpuinfo_x86 *c = &boot_cpu_data;
+
+       if (c->x86_vendor == X86_VENDOR_AMD) {
+               /*
+                * AMD decoding is not implemented yet: never a memory error.
+                */
+               return false;
+       } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+               /*
+                * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+                *
+                * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+                * indicating a memory error. Bit 8 is used for indicating a
+                * cache hierarchy error. The combination of bit 2 and bit 3
+                * is used for indicating a `generic' cache hierarchy error.
+                * But we can't just blindly check the above bits, because if
+                * bit 11 is set, then it is a bus/interconnect error - and
+                * either way the above bits just give more detail on what
+                * bus/interconnect error happened. Note that bit 12 can be
+                * ignored, as it's the "filter" bit.
+                */
+               return (m->status & 0xef80) == BIT(7) || /* memory error */
+                      (m->status & 0xef00) == BIT(8) || /* cache hierarchy */
+                      (m->status & 0xeffc) == 0xc; /* generic cache hierarchy */
+       }
+
+       return false; /* any other vendor: not classified as memory error */
+}
+
  DEFINE_PER_CPU(unsigned, mce_poll_count);
  
  /*
@@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
  void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
  {
         struct mce m;
+       int severity;
         int i;
  
         this_cpu_inc(mce_poll_count);
@@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
  
                 if (!(flags & MCP_TIMESTAMP))
                         m.tsc = 0;
+
+               severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
+
+               /*
+                * In the cases where we don't have a valid address after all,
+                * do not add it into the ring buffer.
+                */
+               if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
+                       if (m.status & MCI_STATUS_ADDRV) {
+                               mce_ring_add(m.addr >> PAGE_SHIFT);
+                               mce_schedule_work();
+                       }
+               }
+
                 /*
                  * Don't get the IP here because it's unlikely to
                  * have anything to do with the actual error location.
author	Chen Yucong <slaoub@gmail.com>
	Tue, 18 Nov 2014 02:09:20 +0000 (10:09 +0800)
committer	Tony Luck <tony.luck@intel.com>
	Wed, 19 Nov 2014 18:56:51 +0000 (10:56 -0800)