#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
+#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
+#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
+#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
+#include <linux/nmi.h>
#include <linux/cpu.h>
+#include <linux/smp.h>
#include <linux/fs.h>
+#include <linux/mm.h>
#include <asm/processor.h>
-#include <asm/uaccess.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
#include <asm/idle.h>
+#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/smp.h>
+#include "mce-internal.h"
#include "mce.h"
/* Handle unconfigured int18 (should never happen) */
#define MISC_MCELOG_MINOR 227
+#define SPINUNIT 100 /* 100ns */
+
atomic_t mce_entry;
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
/*
* Tolerant levels:
* 0: always panic on uncorrected errors, log corrected errors
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
+static int monarch_timeout = -1;
+static int mce_panic_timeout;
+static int mce_dont_log_ce;
+int mce_cmci_disabled;
+int mce_ignore_ce;
+int mce_ser;
static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };
static unsigned long dont_init_banks;
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int cpu_missing;
+
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
}
+static DEFINE_PER_CPU(struct work_struct, mce_work);
+
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
- m->cpu = smp_processor_id();
+ m->cpu = m->extcpu = smp_processor_id();
rdtscll(m->tsc);
+ /* We hope get_seconds stays lockless */
+ m->time = get_seconds();
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->cpuid = cpuid_eax(1);
+#ifdef CONFIG_SMP
+ m->socketid = cpu_data(m->extcpu).phys_proc_id;
+#endif
+ m->apicid = cpu_data(m->extcpu).initial_apicid;
+ rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}
DEFINE_PER_CPU(struct mce, injectm);
*/
static struct mce_log mcelog = {
- MCE_LOG_SIGNATURE,
- MCE_LOG_LEN,
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
};
void mce_log(struct mce *mce)
* interesting ones:
*/
if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+ set_bit(MCE_OVERFLOW,
+ (unsigned long *)&mcelog.flags);
return;
}
/* Old left over entry. Skip: */
mcelog.entry[entry].finished = 1;
wmb();
+ mce->finished = 1;
set_bit(0, ¬ify_user);
}
static void print_mce(struct mce *m)
{
- printk(KERN_EMERG "\n"
- KERN_EMERG "HARDWARE ERROR\n"
- KERN_EMERG
+ printk(KERN_EMERG
"CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
- m->cpu, m->mcgstatus, m->bank, m->status);
+ m->extcpu, m->mcgstatus, m->bank, m->status);
if (m->ip) {
printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
if (m->misc)
printk("MISC %llx ", m->misc);
printk("\n");
- printk(KERN_EMERG "This is not a software problem!\n");
- printk(KERN_EMERG "Run through mcelog --ascii to decode "
- "and contact your hardware vendor\n");
+ printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid,
+ m->apicid);
+}
+
+static void print_mce_head(void)
+{
+ printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
+}
+
+static void print_mce_tail(void)
+{
+ printk(KERN_EMERG "This is not a software problem!\n"
+ KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
-static void mce_panic(char *msg, struct mce *backup, u64 start)
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_paniced;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+ long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+ preempt_disable();
+ local_irq_enable();
+ while (timeout-- > 0)
+ udelay(1);
+ if (panic_timeout == 0)
+ panic_timeout = mce_panic_timeout;
+ panic("Panicing machine check CPU died");
+}
+
+static void mce_panic(char *msg, struct mce *final, char *exp)
{
int i;
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_add_return(1, &mce_paniced) > 1)
+ wait_for_panic();
+ barrier();
+
bust_spinlocks(1);
console_verbose();
+ print_mce_head();
+ /* First print corrected ones that are still unlogged */
for (i = 0; i < MCE_LOG_LEN; i++) {
- u64 tsc = mcelog.entry[i].tsc;
-
- if ((s64)(tsc - start) < 0)
+ struct mce *m = &mcelog.entry[i];
+ if (!(m->status & MCI_STATUS_VAL))
continue;
- print_mce(&mcelog.entry[i]);
- if (backup && mcelog.entry[i].tsc == backup->tsc)
- backup = NULL;
+ if (!(m->status & MCI_STATUS_UC))
+ print_mce(m);
}
- if (backup)
- print_mce(backup);
+ /* Now print uncorrected but with the final one last */
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ if (!final || memcmp(m, final, sizeof(struct mce)))
+ print_mce(m);
+ }
+ if (final)
+ print_mce(final);
+ if (cpu_missing)
+ printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
+ print_mce_tail();
+ if (exp)
+ printk(KERN_EMERG "Machine check: %s\n", exp);
+ if (panic_timeout == 0)
+ panic_timeout = mce_panic_timeout;
panic(msg);
}
wrmsrl(msr, v);
}
+/*
+ * Simple lockless ring to communicate PFNs from the exception handler with the
+ * process context work function. This is vastly simplified because there's
+ * only a single reader and a single writer.
+ */
+#define MCE_RING_SIZE 16 /* we use one entry less */
+
+struct mce_ring {
+ unsigned short start;
+ unsigned short end;
+ unsigned long ring[MCE_RING_SIZE];
+};
+static DEFINE_PER_CPU(struct mce_ring, mce_ring);
+
+/* Runs with CPU affinity in workqueue */
+static int mce_ring_empty(void)
+{
+ struct mce_ring *r = &__get_cpu_var(mce_ring);
+
+ return r->start == r->end;
+}
+
+static int mce_ring_get(unsigned long *pfn)
+{
+ struct mce_ring *r;
+ int ret = 0;
+
+ *pfn = 0;
+ get_cpu();
+ r = &__get_cpu_var(mce_ring);
+ if (r->start == r->end)
+ goto out;
+ *pfn = r->ring[r->start];
+ r->start = (r->start + 1) % MCE_RING_SIZE;
+ ret = 1;
+out:
+ put_cpu();
+ return ret;
+}
+
+/* Always runs in MCE context with preempt off */
+static int mce_ring_add(unsigned long pfn)
+{
+ struct mce_ring *r = &__get_cpu_var(mce_ring);
+ unsigned next;
+
+ next = (r->end + 1) % MCE_RING_SIZE;
+ if (next == r->start)
+ return -1;
+ r->ring[r->end] = pfn;
+ wmb();
+ r->end = next;
+ return 0;
+}
+
int mce_available(struct cpuinfo_x86 *c)
{
if (mce_disabled)
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
+static void mce_schedule_work(void)
+{
+ if (!mce_ring_empty()) {
+ struct work_struct *work = &__get_cpu_var(mce_work);
+ if (!work_pending(work))
+ schedule_work(work);
+ }
+}
+
+/*
+ * Get the address of the instruction at the time of the machine check
+ * error.
+ */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
- if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
+
+ if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
m->ip = regs->ip;
m->cs = regs->cs;
} else {
m->ip = 0;
m->cs = 0;
}
- if (rip_msr) {
- /* Assume the RIP in the MSR is exact. Is this true? */
- m->mcgstatus |= MCG_STATUS_EIPV;
+ if (rip_msr)
m->ip = mce_rdmsrl(rip_msr);
- m->cs = 0;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Called after interrupts have been reenabled again
+ * when a MCE happened during an interrupts off region
+ * in the kernel.
+ */
+asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ exit_idle();
+ irq_enter();
+ mce_notify_irq();
+ mce_schedule_work();
+ irq_exit();
+}
+#endif
+
+static void mce_report_event(struct pt_regs *regs)
+{
+ if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
+ mce_notify_irq();
+ /*
+ * Triggering the work queue here is just an insurance
+ * policy in case the syscall exit notify handler
+ * doesn't run soon enough or ends up running on the
+ * wrong CPU (can happen when audit sleeps)
+ */
+ mce_schedule_work();
+ return;
}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Without APIC do not notify. The event will be picked
+ * up eventually.
+ */
+ if (!cpu_has_apic)
+ return;
+
+ /*
+ * When interrupts are disabled we cannot use
+ * kernel services safely. Trigger an self interrupt
+ * through the APIC to instead do the notification
+ * after interrupts are reenabled again.
+ */
+ apic->send_IPI_self(MCE_SELF_VECTOR);
+
+ /*
+ * Wait for idle afterwards again so that we don't leave the
+ * APIC in a non idle state because the normal APIC writes
+ * cannot exclude us.
+ */
+ apic_wait_icr_idle();
+#endif
}
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
/*
* Poll for corrected events or events that happened before reset.
* Those are just logged through /dev/mcelog.
*
* This is executed in standard interrupt context.
+ *
+ * Note: spec recommends to panic for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between exception handler
+ * and poll hander -- * so we skip this for now.
+ * These cases should not happen anyways, or only when the CPU
+ * is already totally * confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
*/
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce m;
int i;
+ __get_cpu_var(mce_poll_count)++;
+
mce_setup(&m);
m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
continue;
/*
- * Uncorrected events are handled by the exception handler
- * when it is enabled. But when the exception is disabled log
- * everything.
+ * Uncorrected or signalled events are handled by the exception
+ * handler when it is enabled, so don't process those here.
*
* TBD do the same check for MCI_STATUS_EN here?
*/
- if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+ if (!(flags & MCP_UC) &&
+ (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
if (m.status & MCI_STATUS_MISCV)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG)) {
+ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
mce_log(&m);
add_taint(TAINT_MACHINE_CHECK);
}
}
EXPORT_SYMBOL_GPL(machine_check_poll);
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static int mce_no_way_out(struct mce *m, char **msg)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
+ if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ */
+ rmb();
+ if (atomic_read(&mce_paniced))
+ wait_for_panic();
+ if (!monarch_timeout)
+ goto out;
+ if ((s64)*t < SPINUNIT) {
+ /* CHECKME: Make panic default for 1 too? */
+ if (tolerant < 1)
+ mce_panic("Timeout synchronizing machine check over CPUs",
+ NULL, NULL);
+ cpu_missing = 1;
+ return 1;
+ }
+ *t -= SPINUNIT;
+out:
+ touch_nmi_watchdog();
+ return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of an machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+ int cpu;
+ struct mce *m = NULL;
+ int global_worst = 0;
+ char *msg = NULL;
+ char *nmsg = NULL;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+ &nmsg);
+ if (severity > global_worst) {
+ msg = nmsg;
+ global_worst = severity;
+ m = &per_cpu(mces_seen, cpu);
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+ mce_panic("Fatal Machine check", m, msg);
+
+ /*
+ * For UC somewhere we let the CPU who detects it handle it.
+ * Also must let continue the others, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (!m && tolerant < 3)
+ mce_panic("Machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the mces_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int no_way_out, int *order)
+{
+ int nwo;
+ int cpus = num_online_cpus();
+ u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout) {
+ *order = -1;
+ return no_way_out;
+ }
+
+ atomic_add(no_way_out, &global_nwo);
+
+ /*
+ * Wait for everyone.
+ */
+ while (atomic_read(&mce_callin) != cpus) {
+ if (mce_timed_out(&timeout)) {
+ atomic_set(&global_nwo, 0);
+ *order = -1;
+ return no_way_out;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ nwo = atomic_read(&global_nwo);
+
+ /*
+ * Monarch starts executing now, the others wait.
+ */
+ if (*order == 1) {
+ atomic_set(&mce_executing, 1);
+ return nwo;
+ }
+
+ /*
+ * Now start the scanning loop one by one
+ * in the original callin order.
+ * This way when there are any shared banks it will
+ * be only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (atomic_read(&mce_executing) < *order) {
+ if (mce_timed_out(&timeout)) {
+ atomic_set(&global_nwo, 0);
+ *order = -1;
+ return no_way_out;
+ }
+ ndelay(SPINUNIT);
+ }
+ return nwo;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+ int ret = -1;
+ u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /* CHECKME: Can this race with a parallel hotplug? */
+ int cpus = num_online_cpus();
+
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= cpus) {
+ if (mce_timed_out(&timeout))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ return 0;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+ return ret;
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses upto page granuality for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+ return 0;
+ if ((m->misc & 0x3f) > PAGE_SHIFT)
+ return 0;
+ if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+ return 0;
+ return 1;
+}
+
+static void mce_clear_state(unsigned long *toclear)
+{
+ int i;
+
+ for (i = 0; i < banks; i++) {
+ if (test_bit(i, toclear))
+ mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
* This is executed in NMI context not subject to normal locking rules. This
* implies that most kernel services cannot be safely used. Don't even
* think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
*/
void do_machine_check(struct pt_regs *regs, long error_code)
{
- struct mce m, panicm;
- int panicm_found = 0;
- u64 mcestart = 0;
+ struct mce m, *final;
int i;
+ int worst = 0;
+ int severity;
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ int order;
+
/*
* If no_way_out gets set, there is no safe way to recover from this
* MCE. If tolerant is cranked up, we'll try anyway.
*/
int kill_it = 0;
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ char *msg = "Unknown";
atomic_inc(&mce_entry);
+ __get_cpu_var(mce_exception_count)++;
+
if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
goto out;
if (!banks)
goto out;
+ order = atomic_add_return(1, &mce_callin);
mce_setup(&m);
m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ no_way_out = mce_no_way_out(&m, &msg);
- /* if the restart IP is not valid, we're done for */
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- no_way_out = 1;
+ final = &__get_cpu_var(mces_seen);
+ *final = m;
- rdtscll(mcestart);
barrier();
+ /*
+ * When no restart IP must always kill or panic.
+ */
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ kill_it = 1;
+
+ /*
+ * Go through all the banks in exclusion of the other CPUs.
+ * This way we don't report duplicated events on shared banks
+ * because the first one to see it will clear it.
+ */
+ no_way_out = mce_start(no_way_out, &order);
for (i = 0; i < banks; i++) {
__clear_bit(i, toclear);
if (!bank[i])
continue;
/*
- * Non uncorrected errors are handled by machine_check_poll
- * Leave them alone.
+ * Non uncorrected or non signaled errors are handled by
+ * machine_check_poll. Leave them alone, unless this panics.
*/
- if ((m.status & MCI_STATUS_UC) == 0)
+ if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
continue;
/*
*/
add_taint(TAINT_MACHINE_CHECK);
- __set_bit(i, toclear);
+ severity = mce_severity(&m, tolerant, NULL);
- if (m.status & MCI_STATUS_EN) {
- /* if PCC was set, there's no way out */
- no_way_out |= !!(m.status & MCI_STATUS_PCC);
- /*
- * If this error was uncorrectable and there was
- * an overflow, we're in trouble. If no overflow,
- * we might get away with just killing a task.
- */
- if (m.status & MCI_STATUS_UC) {
- if (tolerant < 1 || m.status & MCI_STATUS_OVER)
- no_way_out = 1;
- kill_it = 1;
- }
- } else {
+ /*
+ * When machine check was for corrected handler don't touch,
+ * unless we're panicing.
+ */
+ if (severity == MCE_KEEP_SEVERITY && !no_way_out)
+ continue;
+ __set_bit(i, toclear);
+ if (severity == MCE_NO_SEVERITY) {
/*
* Machine check event was not enabled. Clear, but
* ignore.
continue;
}
+ /*
+ * Kill on action required.
+ */
+ if (severity == MCE_AR_SEVERITY)
+ kill_it = 1;
+
if (m.status & MCI_STATUS_MISCV)
m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
if (m.status & MCI_STATUS_ADDRV)
m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
+ /*
+ * Action optional error. Queue address for later processing.
+ * When the ring overflows we just ignore the AO error.
+ * RED-PEN add some logging mechanism when
+ * usable_address or mce_add_ring fails.
+ * RED-PEN don't ignore overflow for tolerant == 0
+ */
+ if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
+ mce_ring_add(m.addr >> PAGE_SHIFT);
+
mce_get_rip(&m, regs);
mce_log(&m);
- /*
- * Did this bank cause the exception?
- *
- * Assume that the bank with uncorrectable errors did it,
- * and that there is only a single one:
- */
- if ((m.status & MCI_STATUS_UC) &&
- (m.status & MCI_STATUS_EN)) {
- panicm = m;
- panicm_found = 1;
+ if (severity > worst) {
+ *final = m;
+ worst = severity;
}
}
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
/*
- * If we didn't find an uncorrectable error, pick
- * the last one (shouldn't happen, just being safe).
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
*/
- if (!panicm_found)
- panicm = m;
+ if (mce_end(order) < 0)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
/*
* If we have decided that we just CAN'T continue, and the user
* has not set tolerant to an insane level, give up and die.
+ *
+ * This is mainly used in the case when the system doesn't
+ * support MCE broadcasting or it has been disabled.
*/
if (no_way_out && tolerant < 3)
- mce_panic("Machine check", &panicm, mcestart);
+ mce_panic("Fatal machine check on current CPU", final, msg);
/*
* If the error seems to be unrecoverable, something should be
* one task, do that. If the user has set the tolerance very
* high, don't try to do anything at all.
*/
- if (kill_it && tolerant < 3) {
- int user_space = 0;
- /*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
- */
- if (m.mcgstatus & MCG_STATUS_EIPV)
- user_space = panicm.ip && (panicm.cs & 3);
-
- /*
- * If we know that the error was in user space, send a
- * SIGBUS. Otherwise, panic if tolerance is low.
- *
- * force_sig() takes an awful lot of locks and has a slight
- * risk of deadlocking.
- */
- if (user_space) {
- force_sig(SIGBUS, current);
- } else if (panic_on_oops || tolerant < 2) {
- mce_panic("Uncorrected machine check",
- &panicm, mcestart);
- }
- }
+ if (kill_it && tolerant < 3)
+ force_sig(SIGBUS, current);
/* notify userspace ASAP */
set_thread_flag(TIF_MCE_NOTIFY);
- /* the last thing we do is clear state */
- for (i = 0; i < banks; i++) {
- if (test_bit(i, toclear))
- mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
- }
+ if (worst > 0)
+ mce_report_event(regs);
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
atomic_dec(&mce_entry);
}
EXPORT_SYMBOL_GPL(do_machine_check);
+/* dummy to break dependency. actual code is in mm/memory-failure.c */
+void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+{
+ printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+}
+
+/*
+ * Called after mce notification in process context. This code
+ * is allowed to sleep. Call the high level VM handler to process
+ * any corrupted pages.
+ * Assume that the work queue code only calls this one at a time
+ * per CPU.
+ * Note we don't disable preemption, so this code might run on the wrong
+ * CPU. In this case the event is picked up by the scheduled work queue.
+ * This is merely a fast path to expedite processing in some common
+ * cases.
+ */
+void mce_notify_process(void)
+{
+ unsigned long pfn;
+ mce_notify_irq();
+ while (mce_ring_get(&pfn))
+ memory_failure(pfn, MCE_VECTOR);
+}
+
+static void mce_process_work(struct work_struct *dummy)
+{
+ mce_notify_process();
+}
+
#ifdef CONFIG_X86_MCE_INTEL
/***
* mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
* polling interval, otherwise increase the polling interval.
*/
n = &__get_cpu_var(next_interval);
- if (mce_notify_user()) {
+ if (mce_notify_irq())
*n = max(*n/2, HZ/100);
- } else {
+ else
*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
- }
t->expires = jiffies + *n;
add_timer(t);
* Can be called from interrupt context, but not from machine check/NMI
* context.
*/
-int mce_notify_user(void)
+int mce_notify_irq(void)
{
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
}
return 0;
}
-EXPORT_SYMBOL_GPL(mce_notify_user);
+EXPORT_SYMBOL_GPL(mce_notify_irq);
/*
* Initialize Machine Checks for a CPU.
if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
+ if (cap & MCG_SER_P)
+ mce_ser = 1;
+
return 0;
}
if (c->x86 == 6 && c->x86_model < 0x1A)
__set_bit(0, &dont_init_banks);
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+ monarch_timeout < 0)
+ monarch_timeout = USEC_PER_SEC;
}
+ if (monarch_timeout < 0)
+ monarch_timeout = 0;
+ if (mce_bootlog != 0)
+ mce_panic_timeout = 30;
}
static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
struct timer_list *t = &__get_cpu_var(mce_timer);
int *n = &__get_cpu_var(next_interval);
+ if (mce_ignore_ce)
+ return;
+
*n = check_interval * HZ;
if (!*n)
return;
mce_init();
mce_cpu_features(c);
mce_init_timer();
+ INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}
/*
};
/*
- * mce=off disables machine check
- * mce=TOLERANCELEVEL (number, see above)
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ * monarchtimeout is how long to wait for other CPUs on machine
+ * check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
*/
str++;
if (!strcmp(str, "off"))
mce_disabled = 1;
+ else if (!strcmp(str, "no_cmci"))
+ mce_cmci_disabled = 1;
+ else if (!strcmp(str, "dont_log_ce"))
+ mce_dont_log_ce = 1;
+ else if (!strcmp(str, "ignore_ce"))
+ mce_ignore_ce = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b');
- else if (isdigit(str[0]))
+ else if (isdigit(str[0])) {
get_option(&str, &tolerant);
- else {
+ if (*str == ',') {
+ ++str;
+ get_option(&str, &monarch_timeout);
+ }
+ } else {
printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
str);
return 0;
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static struct sysdev_ext_attribute attr_check_interval = {
_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
static struct sysdev_attribute *mce_attrs[] = {
&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
+ &attr_monarch_timeout.attr,
NULL
};