From: Ingo Molnar Date: Wed, 22 Dec 2010 10:53:20 +0000 (+0100) Subject: Merge commit 'v2.6.37-rc7' into perf/core X-Git-Tag: firefly_0821_release~7613^2~2648^2~59 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=6c529a266bdc590a870ee2d2092ff6527eff427b;hp=-c;p=firefly-linux-kernel-4.4.55.git Merge commit 'v2.6.37-rc7' into perf/core Merge reason: Pick up the latest -rc. Signed-off-by: Ingo Molnar --- 6c529a266bdc590a870ee2d2092ff6527eff427b diff --combined Documentation/kernel-parameters.txt index 5e55e4623ab5,8b61c9360999..316c723a950c --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@@ -1579,12 -1579,20 +1579,12 @@@ and is between 256 and 4096 characters nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels Format: [panic,][num] - Valid num: 0,1,2 + Valid num: 0 0 - turn nmi_watchdog off - 1 - use the IO-APIC timer for the NMI watchdog - 2 - use the local APIC for the NMI watchdog using - a performance counter. Note: This will use one - performance counter and the local APIC's performance - vector. When panic is specified, panic when an NMI watchdog timeout occurs. This is useful when you use a panic=... timeout and need the box quickly up again. - Instead of 1 and 2 it is possible to use the following - symbolic names: lapic and ioapic - Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic netpoll.carrier_timeout= [NET] Specifies amount of time (in seconds) that @@@ -2167,11 -2175,6 +2167,6 @@@ reset_devices [KNL] Force drivers to reset the underlying device during initialization. - resource_alloc_from_bottom - Allocate new resources from the beginning of available - space, not the end. If you need to use this, please - report a bug. - resume= [SWSUSP] Specify the partition device for software suspend diff --combined MAINTAINERS index ed192f11ad23,6a588873cf8d..f1f803c6674a --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -559,14 -559,14 +559,14 @@@ W: http://maxim.org.za/at91_26.htm S: Maintained ARM/BCMRING ARM ARCHITECTURE - M: Leo Chen + M: Jiandong Zheng M: Scott Branden L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-bcmring ARM/BCMRING MTD NAND DRIVER - M: Leo Chen + M: Jiandong Zheng M: Scott Branden L: linux-mtd@lists.infradead.org S: Maintained @@@ -815,7 -815,7 +815,7 @@@ F: drivers/mmc/host/msm_sdcc. F: drivers/mmc/host/msm_sdcc.h F: drivers/serial/msm_serial.h F: drivers/serial/msm_serial.c - T: git git://codeaurora.org/quic/kernel/dwalker/linux-msm.git + T: git git://codeaurora.org/quic/kernel/davidb/linux-msm.git S: Maintained ARM/TOSA MACHINE SUPPORT @@@ -4612,7 -4612,7 +4612,7 @@@ PERFORMANCE EVENTS SUBSYSTE M: Peter Zijlstra M: Paul Mackerras M: Ingo Molnar -M: Arnaldo Carvalho de Melo +M: Arnaldo Carvalho de Melo S: Supported F: kernel/perf_event*.c F: include/linux/perf_event.h @@@ -5932,7 -5932,6 +5932,6 @@@ F: include/linux/tty. 
TULIP NETWORK DRIVERS M: Grant Grundler - M: Kyle McMartin L: netdev@vger.kernel.org S: Maintained F: drivers/net/tulip/ @@@ -6584,6 -6583,15 +6583,15 @@@ F: include/linux/mfd/wm8400 F: include/sound/wm????.h F: sound/soc/codecs/wm* + WORKQUEUE + M: Tejun Heo + L: linux-kernel@vger.kernel.org + T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git + S: Maintained + F: include/linux/workqueue.h + F: kernel/workqueue.c + F: Documentation/workqueue.txt + X.25 NETWORK LAYER M: Andrew Hendry L: linux-x25@vger.kernel.org diff --combined arch/x86/kernel/apic/apic.c index e9e2a93783f9,78218135b48e..fb7657822aad --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@@ -31,6 -31,7 +31,6 @@@ #include #include #include -#include #include #include @@@ -798,7 -799,11 +798,7 @@@ void __init setup_boot_APIC_clock(void * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - pr_warning("APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; /* Setup the lapic or request the broadcast */ setup_APIC_timer(); @@@ -1382,7 -1387,16 +1382,15 @@@ void __cpuinit end_local_APIC_setup(voi } #endif - setup_apic_nmi_watchdog(NULL); apic_pm_activate(); + + /* + * Now that local APIC setup is completed for BP, configure the fault + * handling for interrupt remapping. + */ + if (!smp_processor_id() && intr_remapping_enabled) + enable_drhd_fault_handling(); + } #ifdef CONFIG_X86_X2APIC @@@ -1744,10 -1758,17 +1752,10 @@@ int __init APIC_init_uniprocessor(void setup_IO_APIC(); else { nr_ioapics = 0; - localise_nmi_watchdog(); } -#else - localise_nmi_watchdog(); #endif x86_init.timers.setup_percpu_clockev(); -#ifdef CONFIG_X86_64 - check_nmi_watchdog(); -#endif - return 0; } diff --combined arch/x86/kernel/apic/io_apic.c index e4a040c28de1,fadcd743a74f..16c2db8750a2 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@@ -54,6 -54,7 +54,6 @@@ #include #include #include -#include #include #include #include @@@ -2429,13 -2430,12 +2429,12 @@@ static void ack_apic_level(struct irq_d { struct irq_cfg *cfg = data->chip_data; int i, do_unmask_irq = 0, irq = data->irq; - struct irq_desc *desc = irq_to_desc(irq); unsigned long v; irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_ioapic(cfg); } @@@ -2642,6 -2642,24 +2641,6 @@@ static void lapic_register_intr(int irq "edge"); } -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - /* * This looks a bit hackish but it's about the only one way of sending * a few INTA cycles to 8259As and any associated glue logic. 
ICR does @@@ -2747,6 -2765,15 +2746,6 @@@ static inline void __init check_timer(v */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); legacy_pic->init(1); -#ifdef CONFIG_X86_32 - { - unsigned int ver; - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - } -#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@@ -2794,6 -2821,10 +2793,6 @@@ unmask_ioapic(cfg); } if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - legacy_pic->unmask(0); - } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); goto out; @@@ -2819,6 -2850,11 +2818,6 @@@ if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - setup_nmi(); - legacy_pic->unmask(0); - } goto out; } /* @@@ -2830,6 -2866,15 +2829,6 @@@ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@@ -3367,6 -3412,7 +3366,7 @@@ dmar_msi_set_affinity(struct irq_data * msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); dmar_msi_write(irq, &msg); diff --combined include/linux/sched.h index d2e63d1e725c,223874538b33..a99d735db3df --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -143,7 -143,7 +143,7 @@@ extern unsigned long nr_iowait_cpu(int extern unsigned long this_cpu_load(void); - extern void calc_global_load(void); + extern void calc_global_load(unsigned long ticks); extern unsigned long get_parent_ip(unsigned long addr); @@@ -316,7 -316,6 +316,7 @@@ extern int proc_dowatchdog_thresh(struc size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; +void lockup_detector_init(void); #else static inline void touch_softlockup_watchdog(void) { @@@ -327,9 -326,6 +327,9 @@@ static inline void touch_softlockup_wat static inline void touch_all_softlockup_watchdogs(void) { } +static inline void lockup_detector_init(void) +{ +} #endif #ifdef CONFIG_DETECT_HUNG_TASK diff --combined kernel/sched.c index 605ab1b24d81,297d1a0eedb0..c68cead94dd7 --- a/kernel/sched.c +++ b/kernel/sched.c @@@ -636,22 -636,18 +636,18 @@@ static inline struct task_group *task_g #endif /* CONFIG_CGROUP_SCHED */ - static u64 irq_time_cpu(int cpu); - static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); + static void update_rq_clock_task(struct rq *rq, s64 delta); - inline void update_rq_clock(struct rq *rq) + static void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) { - int cpu = cpu_of(rq); - u64 irq_time; + s64 delta; - rq->clock = sched_clock_cpu(cpu); - irq_time = irq_time_cpu(cpu); - if (rq->clock - irq_time > rq->clock_task) - rq->clock_task = rq->clock - irq_time; + if (rq->skip_clock_update) + return; - sched_irq_time_avg_update(rq, irq_time); - } + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + rq->clock += delta; + update_rq_clock_task(rq, delta); } /* @@@ -1924,10 -1920,9 +1920,9 @@@ static void deactivate_task(struct rq * * They are read and saved off onto struct rq in update_rq_clock(). 
* This may result in other CPU reading this CPU's irq time and can * race with irq/account_system_vtime on this CPU. We would either get old - * or new value (or semi updated value on 32 bit) with a side effect of - * accounting a slice of irq time to wrong task when irq is in progress - * while we read rq->clock. That is a worthy compromise in place of having - * locks on each irq in account_system_time. + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. */ static DEFINE_PER_CPU(u64, cpu_hardirq_time); static DEFINE_PER_CPU(u64, cpu_softirq_time); @@@ -1945,19 -1940,58 +1940,58 @@@ void disable_sched_clock_irqtime(void sched_clock_irqtime = 0; } - static u64 irq_time_cpu(int cpu) + #ifndef CONFIG_64BIT + static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + + static inline void irq_time_write_begin(void) { - if (!sched_clock_irqtime) - return 0; + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); + } + + static inline void irq_time_write_end(void) + { + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); + } + + static inline u64 irq_time_read(int cpu) + { + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + return irq_time; + } + #else /* CONFIG_64BIT */ + static inline void irq_time_write_begin(void) + { + } + + static inline void irq_time_write_end(void) + { + } + + static inline u64 irq_time_read(int cpu) + { return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); } + #endif /* CONFIG_64BIT */ + /* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ void account_system_vtime(struct task_struct *curr) { unsigned long flags; + s64 delta; int cpu; - u64 now, delta; if (!sched_clock_irqtime) return; @@@ -1965,9 -1999,10 +1999,10 @@@ local_irq_save(flags); cpu = smp_processor_id(); - now = sched_clock_cpu(cpu); - delta = now - per_cpu(irq_start_time, cpu); - per_cpu(irq_start_time, cpu) = now; + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread @@@ -1975,33 -2010,55 +2010,55 @@@ * that do not consume any time, but still wants to run. 
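/*
 * Illustrative sketch (not part of this patch): the generic seqcount
 * pattern that irq_time_read() above relies on.  On 32-bit a u64 cannot
 * be loaded in a single instruction, so the writer brackets its update
 * with write_seqcount_begin()/write_seqcount_end() and the reader retries
 * until it sees an even, unchanged sequence number.  The patch open-codes
 * the write side because the writer always runs on the local CPU with
 * interrupts off; the struct and function names below are made up for
 * the example.
 */
#include <linux/seqlock.h>
#include <linux/types.h>

struct example_stamp {
	seqcount_t	seq;
	u64		value;		/* 64-bit datum written by one CPU */
};

static void example_stamp_set(struct example_stamp *s, u64 v)
{
	write_seqcount_begin(&s->seq);	/* sequence becomes odd */
	s->value = v;
	write_seqcount_end(&s->seq);	/* sequence becomes even again */
}

static u64 example_stamp_get(struct example_stamp *s)
{
	unsigned int start;
	u64 v;

	do {
		start = read_seqcount_begin(&s->seq);
		v = s->value;			/* may tear on 32-bit ... */
	} while (read_seqcount_retry(&s->seq, start));	/* ... retry if it did */

	return v;
}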
*/ if (hardirq_count()) - per_cpu(cpu_hardirq_time, cpu) += delta; + __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) - per_cpu(cpu_softirq_time, cpu) += delta; + __this_cpu_add(cpu_softirq_time, delta); + irq_time_write_end(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(account_system_vtime); - static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) + static void update_rq_clock_task(struct rq *rq, s64 delta) { - if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { - u64 delta_irq = curr_irq_time - rq->prev_irq_time; - rq->prev_irq_time = curr_irq_time; - sched_rt_avg_update(rq, delta_irq); - } + s64 irq_delta; + + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + rq->clock_task += delta; + + if (irq_delta && sched_feat(NONIRQ_POWER)) + sched_rt_avg_update(rq, irq_delta); } - #else + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ - static u64 irq_time_cpu(int cpu) + static void update_rq_clock_task(struct rq *rq, s64 delta) { - return 0; + rq->clock_task += delta; } - static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } - - #endif + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #include "sched_idletask.c" #include "sched_fair.c" @@@ -2129,7 -2186,7 +2186,7 @@@ static void check_preempt_curr(struct r * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (test_tsk_need_resched(rq->curr)) + if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@@ -3119,6 -3176,15 +3176,15 @@@ static long calc_load_fold_active(struc return delta; } + static unsigned long + calc_load(unsigned long load, unsigned long exp, unsigned long active) + { + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; + } + #ifdef CONFIG_NO_HZ /* * For NO_HZ we delay the active fold to the next LOAD_FREQ update. @@@ -3148,6 -3214,128 +3214,128 @@@ static long calc_load_fold_idle(void return delta; } + + /** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. 
+ */ + static unsigned long + fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) + { + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; + } + + /* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ + static unsigned long + calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) + { + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); + } + + /* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ + static void calc_global_nohz(unsigned long ticks) + { + long delta, active, n; + + if (time_before(jiffies, calc_load_update)) + return; + + /* + * If we crossed a calc_load_update boundary, make sure to fold + * any pending idle changes, the respective CPUs might have + * missed the tick driven calc_load_account_active() update + * due to NO_HZ. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + /* + * If we were idle for multiple load cycles, apply them. + */ + if (ticks >= LOAD_FREQ) { + n = ticks / LOAD_FREQ; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Its possible the remainder of the above division also crosses + * a LOAD_FREQ period, the regular check in calc_global_load() + * which comes after this will take care of that. + * + * Consider us being 11 ticks before a cycle completion, and us + * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will + * age us 4 cycles, and the test in calc_global_load() will + * pick up the final one. + */ + } #else static void calc_load_account_idle(struct rq *this_rq) { @@@ -3157,6 -3345,10 +3345,10 @@@ static inline long calc_load_fold_idle( { return 0; } + + static void calc_global_nohz(unsigned long ticks) + { + } #endif /** @@@ -3174,24 -3366,17 +3366,17 @@@ void get_avenrun(unsigned long *loads, loads[2] = (avenrun[2] + offset) << shift; } - static unsigned long - calc_load(unsigned long load, unsigned long exp, unsigned long active) - { - load *= exp; - load += active * (FIXED_1 - exp); - return load >> FSHIFT; - } - /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. 
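/*
 * Illustrative sketch (not part of this patch): a userspace restatement of
 * calc_load(), fixed_power_int() and calc_load_n() from the hunks above,
 * checking that folding n missed LOAD_FREQ periods one at a time and
 * applying the closed form a_n = a0*e^n + a*(1 - e^n) give the same answer
 * up to fixed-point rounding.  FSHIFT, FIXED_1 and EXP_1 carry the values
 * the kernel uses for the 1-minute average; the scenario in main() is
 * invented for the demonstration.
 */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed-point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
				     unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {		/* exponentiation by squaring */
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

static unsigned long calc_load_n(unsigned long load, unsigned long exp,
				 unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long avg = 3 * FIXED_1;	/* 1-min load of 3.00 ... */
	unsigned long active = 0;		/* ... then the CPU goes idle */
	unsigned long iter = avg, closed;
	unsigned int i, n = 12;			/* 12 missed 5-second periods */

	for (i = 0; i < n; i++)
		iter = calc_load(iter, EXP_1, active);
	closed = calc_load_n(avg, EXP_1, active, n);

	printf("iterated:    %lu.%02lu\n", iter >> FSHIFT,
	       (iter & (FIXED_1 - 1)) * 100 / FIXED_1);
	printf("closed form: %lu.%02lu\n", closed >> FSHIFT,
	       (closed & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}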
*/ - void calc_global_load(void) + void calc_global_load(unsigned long ticks) { - unsigned long upd = calc_load_update + 10; long active; - if (time_before(jiffies, upd)) + calc_global_nohz(ticks); + + if (time_before(jiffies, calc_load_update + 10)) return; active = atomic_long_read(&calc_load_tasks); @@@ -3845,7 -4030,6 +4030,6 @@@ static void put_prev_task(struct rq *rq { if (prev->se.on_rq) update_rq_clock(rq); - rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@@ -3903,7 -4087,6 +4087,6 @@@ need_resched_nonpreemptible hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@@ -3935,6 -4118,8 +4118,8 @@@ put_prev_task(rq, prev); next = pick_next_task(rq); + clear_tsk_need_resched(prev); + rq->skip_clock_update = 0; if (likely(prev != next)) { sched_info_switch(prev, next); @@@ -8108,6 -8293,8 +8293,6 @@@ void __init sched_init(void zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_event_init(); - scheduler_running = 1; }
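The avenrun[] averages maintained by calc_global_load() and calc_global_nohz() above are fixed-point values with FSHIFT fractional bits. As an illustrative sketch (not part of this patch), the small userspace program below converts such raw values into the familiar two-decimal form shown in /proc/loadavg; the LOAD_INT/LOAD_FRAC macros mirror the ones the kernel uses for that file, and the sample numbers are invented.

#include <stdio.h>

#define FSHIFT		11
#define FIXED_1		(1UL << FSHIFT)
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* Pretend get_avenrun() handed back these raw fixed-point values. */
	unsigned long avenrun[3] = { 1126, 2048, 6144 };

	printf("loadavg: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
	       LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
	       LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
	return 0;
}

Built with a plain cc, this prints "loadavg: 0.54 1.00 3.00" for the sample values, which is the kind of figure the exponential decay in calc_load() above ultimately produces.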