Merge commit 'v2.6.37-rc7' into perf/core

author Ingo Molnar <mingo@elte.hu>

Wed, 22 Dec 2010 10:53:20 +0000 (11:53 +0100)

committer Ingo Molnar <mingo@elte.hu>

Wed, 22 Dec 2010 10:53:23 +0000 (11:53 +0100)
author Ingo Molnar <mingo@elte.hu>
Wed, 22 Dec 2010 10:53:20 +0000 (11:53 +0100)
committer Ingo Molnar <mingo@elte.hu>
Wed, 22 Dec 2010 10:53:23 +0000 (11:53 +0100)
diff --combined Documentation/kernel-parameters.txt

index 5e55e4623ab502697cbd4f1733490feff6dbe548,8b61c93609994dd91e36c25e1b29647ad084eaff..316c723a950c52d9bad3537ea8b15c1cabff2e5b
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -1579,12 -1579,20 +1579,12 @@@ and is between 256 and 4096 characters
   
         nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
                         Format: [panic,][num]
- -                      Valid num: 0,1,2
+ +                      Valid num: 0
                         0 - turn nmi_watchdog off
- -                      1 - use the IO-APIC timer for the NMI watchdog
- -                      2 - use the local APIC for the NMI watchdog using
- -                      a performance counter. Note: This will use one
- -                      performance counter and the local APIC's performance
- -                      vector.
                         When panic is specified, panic when an NMI watchdog
                         timeout occurs.
                         This is useful when you use a panic=... timeout and
                         need the box quickly up again.
- -                      Instead of 1 and 2 it is possible to use the following
- -                      symbolic names: lapic and ioapic
- -                      Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
   
         netpoll.carrier_timeout=
                         [NET] Specifies amount of time (in seconds) that
@@@ -2167,11 -2175,6 +2167,6 @@@
         reset_devices   [KNL] Force drivers to reset the underlying device
                         during initialization.
   
-       resource_alloc_from_bottom
-                       Allocate new resources from the beginning of available
-                       space, not the end.  If you need to use this, please
-                       report a bug.
- 
         resume=         [SWSUSP]
                         Specify the partition device for software suspend
   
diff --combined MAINTAINERS

index ed192f11ad2309915057c1e16799449befa5b9f6,6a588873cf8d2da8b00f803a1dcd8d96841292bd..f1f803c6674a8ca42728647e608efb7f67cded22
--- 1/MAINTAINERS
--- 2/MAINTAINERS
+++ b/MAINTAINERS
@@@ -559,14 -559,14 +559,14 @@@ W:      http://maxim.org.za/at91_26.htm
   S:    Maintained
   
   ARM/BCMRING ARM ARCHITECTURE
- M:    Leo Chen <leochen@broadcom.com>
+ M:    Jiandong Zheng <jdzheng@broadcom.com>
   M:    Scott Branden <sbranden@broadcom.com>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   S:    Maintained
   F:    arch/arm/mach-bcmring
   
   ARM/BCMRING MTD NAND DRIVER
- M:    Leo Chen <leochen@broadcom.com>
+ M:    Jiandong Zheng <jdzheng@broadcom.com>
   M:    Scott Branden <sbranden@broadcom.com>
   L:    linux-mtd@lists.infradead.org
   S:    Maintained
@@@ -815,7 -815,7 +815,7 @@@ F: drivers/mmc/host/msm_sdcc.
   F:    drivers/mmc/host/msm_sdcc.h
   F:    drivers/serial/msm_serial.h
   F:    drivers/serial/msm_serial.c
- T:    git git://codeaurora.org/quic/kernel/dwalker/linux-msm.git
+ T:    git git://codeaurora.org/quic/kernel/davidb/linux-msm.git
   S:    Maintained
   
   ARM/TOSA MACHINE SUPPORT
@@@ -4612,7 -4612,7 +4612,7 @@@ PERFORMANCE EVENTS SUBSYSTE
   M:    Peter Zijlstra <a.p.zijlstra@chello.nl>
   M:    Paul Mackerras <paulus@samba.org>
   M:    Ingo Molnar <mingo@elte.hu>
- -M:    Arnaldo Carvalho de Melo <acme@redhat.com>
+ +M:    Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
   S:    Supported
   F:    kernel/perf_event*.c
   F:    include/linux/perf_event.h
@@@ -5932,7 -5932,6 +5932,6 @@@ F:      include/linux/tty.
   
   TULIP NETWORK DRIVERS
   M:    Grant Grundler <grundler@parisc-linux.org>
- M:    Kyle McMartin <kyle@mcmartin.ca>
   L:    netdev@vger.kernel.org
   S:    Maintained
   F:    drivers/net/tulip/
@@@ -6584,6 -6583,15 +6583,15 @@@ F:    include/linux/mfd/wm8400
   F:    include/sound/wm????.h
   F:    sound/soc/codecs/wm*
   
+ WORKQUEUE
+ M:    Tejun Heo <tj@kernel.org>
+ L:    linux-kernel@vger.kernel.org
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git
+ S:    Maintained
+ F:    include/linux/workqueue.h
+ F:    kernel/workqueue.c
+ F:    Documentation/workqueue.txt
+ 
   X.25 NETWORK LAYER
   M:    Andrew Hendry <andrew.hendry@gmail.com>
   L:    linux-x25@vger.kernel.org
diff --combined arch/x86/kernel/apic/apic.c

index e9e2a93783f9d6e98bbf9c627eb0e8429afc2175,78218135b48e6169d155fb4a097e5b6c8e30e53a..fb7657822aadd7cb0954f6483490b5e9a0279f60
--- 1/arch/x86/kernel/apic/apic.c
--- 2/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@@ -31,6 -31,7 +31,6 @@@
   #include <linux/init.h>
   #include <linux/cpu.h>
   #include <linux/dmi.h>
- -#include <linux/nmi.h>
   #include <linux/smp.h>
   #include <linux/mm.h>
   
@@@ -798,7 -799,11 +798,7 @@@ void __init setup_boot_APIC_clock(void
          * PIT/HPET going.  Otherwise register lapic as a dummy
          * device.
          */
- -      if (nmi_watchdog != NMI_IO_APIC)
- -              lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- -      else
- -              pr_warning("APIC timer registered as dummy,"
- -                      " due to nmi_watchdog=%d!\n", nmi_watchdog);
+ +      lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
   
         /* Setup the lapic or request the broadcast */
         setup_APIC_timer();
@@@ -1382,7 -1387,16 +1382,15 @@@ void __cpuinit end_local_APIC_setup(voi
         }
   #endif
   
- -      setup_apic_nmi_watchdog(NULL);
         apic_pm_activate();
+ 
+       /*
+        * Now that local APIC setup is completed for BP, configure the fault
+        * handling for interrupt remapping.
+        */
+       if (!smp_processor_id() && intr_remapping_enabled)
+               enable_drhd_fault_handling();
+ 
   }
   
   #ifdef CONFIG_X86_X2APIC
@@@ -1744,10 -1758,17 +1752,10 @@@ int __init APIC_init_uniprocessor(void
                 setup_IO_APIC();
         else {
                 nr_ioapics = 0;
- -              localise_nmi_watchdog();
         }
- -#else
- -      localise_nmi_watchdog();
   #endif
   
         x86_init.timers.setup_percpu_clockev();
- -#ifdef CONFIG_X86_64
- -      check_nmi_watchdog();
- -#endif
- -
         return 0;
   }
   
diff --combined arch/x86/kernel/apic/io_apic.c

index e4a040c28de125c9a0a9286d4f5f1d9e338748da,fadcd743a74f8bdcd5effbaf7e28b01ea3003532..16c2db8750a24d84e339d47b1b9c2341cd144b8c
--- 1/arch/x86/kernel/apic/io_apic.c
--- 2/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@@ -54,6 -54,7 +54,6 @@@
   #include <asm/dma.h>
   #include <asm/timer.h>
   #include <asm/i8259.h>
- -#include <asm/nmi.h>
   #include <asm/msidef.h>
   #include <asm/hypertransport.h>
   #include <asm/setup.h>
@@@ -2429,13 -2430,12 +2429,12 @@@ static void ack_apic_level(struct irq_d
   {
         struct irq_cfg *cfg = data->chip_data;
         int i, do_unmask_irq = 0, irq = data->irq;
-       struct irq_desc *desc = irq_to_desc(irq);
         unsigned long v;
   
         irq_complete_move(cfg);
   #ifdef CONFIG_GENERIC_PENDING_IRQ
         /* If we are moving the irq we need to mask it */
-       if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
+       if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
                 do_unmask_irq = 1;
                 mask_ioapic(cfg);
         }
@@@ -2642,6 -2642,24 +2641,6 @@@ static void lapic_register_intr(int irq
                                       "edge");
   }
   
- -static void __init setup_nmi(void)
- -{
- -      /*
- -       * Dirty trick to enable the NMI watchdog ...
- -       * We put the 8259A master into AEOI mode and
- -       * unmask on all local APICs LVT0 as NMI.
- -       *
- -       * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- -       * is from Maciej W. Rozycki - so we do not have to EOI from
- -       * the NMI handler or the timer interrupt.
- -       */
- -      apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
- -
- -      enable_NMI_through_LVT0();
- -
- -      apic_printk(APIC_VERBOSE, " done.\n");
- -}
- -
   /*
    * This looks a bit hackish but it's about the only one way of sending
    * a few INTA cycles to 8259As and any associated glue logic.  ICR does
@@@ -2747,6 -2765,15 +2746,6 @@@ static inline void __init check_timer(v
          */
         apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
         legacy_pic->init(1);
- -#ifdef CONFIG_X86_32
- -      {
- -              unsigned int ver;
- -
- -              ver = apic_read(APIC_LVR);
- -              ver = GET_APIC_VERSION(ver);
- -              timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
- -      }
- -#endif
   
         pin1  = find_isa_irq_pin(0, mp_INT);
         apic1 = find_isa_irq_apic(0, mp_INT);
@@@ -2794,6 -2821,10 +2793,6 @@@
                                 unmask_ioapic(cfg);
                 }
                 if (timer_irq_works()) {
- -                      if (nmi_watchdog == NMI_IO_APIC) {
- -                              setup_nmi();
- -                              legacy_pic->unmask(0);
- -                      }
                         if (disable_timer_pin_1 > 0)
                                 clear_IO_APIC_pin(0, pin1);
                         goto out;
@@@ -2819,6 -2850,11 +2818,6 @@@
                 if (timer_irq_works()) {
                         apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
                         timer_through_8259 = 1;
- -                      if (nmi_watchdog == NMI_IO_APIC) {
- -                              legacy_pic->mask(0);
- -                              setup_nmi();
- -                              legacy_pic->unmask(0);
- -                      }
                         goto out;
                 }
                 /*
@@@ -2830,6 -2866,15 +2829,6 @@@
                 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
         }
   
- -      if (nmi_watchdog == NMI_IO_APIC) {
- -              apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- -                          "through the IO-APIC - disabling NMI Watchdog!\n");
- -              nmi_watchdog = NMI_NONE;
- -      }
- -#ifdef CONFIG_X86_32
- -      timer_ack = 0;
- -#endif
- -
         apic_printk(APIC_QUIET, KERN_INFO
                     "...trying to set up timer as Virtual Wire IRQ...\n");
   
@@@ -3367,6 -3412,7 +3366,7 @@@ dmar_msi_set_affinity(struct irq_data *
         msg.data |= MSI_DATA_VECTOR(cfg->vector);
         msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+       msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
   
         dmar_msi_write(irq, &msg);
   
diff --combined include/linux/sched.h

index d2e63d1e725c37cfdaf3673160f327a048a31e7a,223874538b33208e3c5ff11710f3161d58b4aef2..a99d735db3dfe5ee26fba4aebc5261dedfa74af9
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -143,7 -143,7 +143,7 @@@ extern unsigned long nr_iowait_cpu(int 
   extern unsigned long this_cpu_load(void);
   
   
- extern void calc_global_load(void);
+ extern void calc_global_load(unsigned long ticks);
   
   extern unsigned long get_parent_ip(unsigned long addr);
   
@@@ -316,7 -316,6 +316,7 @@@ extern int proc_dowatchdog_thresh(struc
                                   size_t *lenp, loff_t *ppos);
   extern unsigned int  softlockup_panic;
   extern int softlockup_thresh;
+ +void lockup_detector_init(void);
   #else
   static inline void touch_softlockup_watchdog(void)
   {
@@@ -327,9 -326,6 +327,9 @@@ static inline void touch_softlockup_wat
   static inline void touch_all_softlockup_watchdogs(void)
   {
   }
+ +static inline void lockup_detector_init(void)
+ +{
+ +}
   #endif
   
   #ifdef CONFIG_DETECT_HUNG_TASK
diff --combined kernel/sched.c

index 605ab1b24d8175563d87f1b22471591ae9a5ea2a,297d1a0eedb0e68d8b9327f530ba477c93b1222e..c68cead94dd76942beeffa932498d0ae5a2cfe41
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -636,22 -636,18 +636,18 @@@ static inline struct task_group *task_g
   
   #endif /* CONFIG_CGROUP_SCHED */
   
- static u64 irq_time_cpu(int cpu);
- static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+ static void update_rq_clock_task(struct rq *rq, s64 delta);
   
- inline void update_rq_clock(struct rq *rq)
+ static void update_rq_clock(struct rq *rq)
   {
-       if (!rq->skip_clock_update) {
-               int cpu = cpu_of(rq);
-               u64 irq_time;
+       s64 delta;
   
-               rq->clock = sched_clock_cpu(cpu);
-               irq_time = irq_time_cpu(cpu);
-               if (rq->clock - irq_time > rq->clock_task)
-                       rq->clock_task = rq->clock - irq_time;
+       if (rq->skip_clock_update)
+               return;
   
-               sched_irq_time_avg_update(rq, irq_time);
-       }
+       delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+       rq->clock += delta;
+       update_rq_clock_task(rq, delta);
   }
   
   /*
@@@ -1924,10 -1920,9 +1920,9 @@@ static void deactivate_task(struct rq *
    * They are read and saved off onto struct rq in update_rq_clock().
    * This may result in other CPU reading this CPU's irq time and can
    * race with irq/account_system_vtime on this CPU. We would either get old
-  * or new value (or semi updated value on 32 bit) with a side effect of
-  * accounting a slice of irq time to wrong task when irq is in progress
-  * while we read rq->clock. That is a worthy compromise in place of having
-  * locks on each irq in account_system_time.
+  * or new value with a side effect of accounting a slice of irq time to wrong
+  * task when irq is in progress while we read rq->clock. That is a worthy
+  * compromise in place of having locks on each irq in account_system_time.
    */
   static DEFINE_PER_CPU(u64, cpu_hardirq_time);
   static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@@ -1945,19 -1940,58 +1940,58 @@@ void disable_sched_clock_irqtime(void
         sched_clock_irqtime = 0;
   }
   
- static u64 irq_time_cpu(int cpu)
+ #ifndef CONFIG_64BIT
+ static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+ 
+ static inline void irq_time_write_begin(void)
   {
-       if (!sched_clock_irqtime)
-               return 0;
+       __this_cpu_inc(irq_time_seq.sequence);
+       smp_wmb();
+ }
+ 
+ static inline void irq_time_write_end(void)
+ {
+       smp_wmb();
+       __this_cpu_inc(irq_time_seq.sequence);
+ }
+ 
+ static inline u64 irq_time_read(int cpu)
+ {
+       u64 irq_time;
+       unsigned seq;
+ 
+       do {
+               seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+               irq_time = per_cpu(cpu_softirq_time, cpu) +
+                          per_cpu(cpu_hardirq_time, cpu);
+       } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
   
+       return irq_time;
+ }
+ #else /* CONFIG_64BIT */
+ static inline void irq_time_write_begin(void)
+ {
+ }
+ 
+ static inline void irq_time_write_end(void)
+ {
+ }
+ 
+ static inline u64 irq_time_read(int cpu)
+ {
         return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
   }
+ #endif /* CONFIG_64BIT */
   
+ /*
+  * Called before incrementing preempt_count on {soft,}irq_enter
+  * and before decrementing preempt_count on {soft,}irq_exit.
+  */
   void account_system_vtime(struct task_struct *curr)
   {
         unsigned long flags;
+       s64 delta;
         int cpu;
-       u64 now, delta;
   
         if (!sched_clock_irqtime)
                 return;
@@@ -1965,9 -1999,10 +1999,10 @@@
         local_irq_save(flags);
   
         cpu = smp_processor_id();
-       now = sched_clock_cpu(cpu);
-       delta = now - per_cpu(irq_start_time, cpu);
-       per_cpu(irq_start_time, cpu) = now;
+       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+       __this_cpu_add(irq_start_time, delta);
+ 
+       irq_time_write_begin();
         /*
          * We do not account for softirq time from ksoftirqd here.
          * We want to continue accounting softirq time to ksoftirqd thread
@@@ -1975,33 -2010,55 +2010,55 @@@
          * that do not consume any time, but still wants to run.
          */
         if (hardirq_count())
-               per_cpu(cpu_hardirq_time, cpu) += delta;
+               __this_cpu_add(cpu_hardirq_time, delta);
         else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
-               per_cpu(cpu_softirq_time, cpu) += delta;
+               __this_cpu_add(cpu_softirq_time, delta);
   
+       irq_time_write_end();
         local_irq_restore(flags);
   }
   EXPORT_SYMBOL_GPL(account_system_vtime);
   
- static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+ static void update_rq_clock_task(struct rq *rq, s64 delta)
   {
-       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
-               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
-               rq->prev_irq_time = curr_irq_time;
-               sched_rt_avg_update(rq, delta_irq);
-       }
+       s64 irq_delta;
+ 
+       irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ 
+       /*
+        * Since irq_time is only updated on {soft,}irq_exit, we might run into
+        * this case when a previous update_rq_clock() happened inside a
+        * {soft,}irq region.
+        *
+        * When this happens, we stop ->clock_task and only update the
+        * prev_irq_time stamp to account for the part that fit, so that a next
+        * update will consume the rest. This ensures ->clock_task is
+        * monotonic.
+        *
+        * It does however cause some slight misattribution of {soft,}irq
+        * time, a more accurate solution would be to update the irq_time using
+        * the current rq->clock timestamp, except that would require using
+        * atomic ops.
+        */
+       if (irq_delta > delta)
+               irq_delta = delta;
+ 
+       rq->prev_irq_time += irq_delta;
+       delta -= irq_delta;
+       rq->clock_task += delta;
+ 
+       if (irq_delta && sched_feat(NONIRQ_POWER))
+               sched_rt_avg_update(rq, irq_delta);
   }
   
- #else
+ #else /* CONFIG_IRQ_TIME_ACCOUNTING */
   
- static u64 irq_time_cpu(int cpu)
+ static void update_rq_clock_task(struct rq *rq, s64 delta)
   {
-       return 0;
+       rq->clock_task += delta;
   }
   
- static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
- 
- #endif
+ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
   
   #include "sched_idletask.c"
   #include "sched_fair.c"
@@@ -2129,7 -2186,7 +2186,7 @@@ static void check_preempt_curr(struct r
          * A queue event has occurred, and we're going to schedule.  In
          * this case, we can save a useless back to back clock update.
          */
-       if (test_tsk_need_resched(rq->curr))
+       if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
                 rq->skip_clock_update = 1;
   }
   
@@@ -3119,6 -3176,15 +3176,15 @@@ static long calc_load_fold_active(struc
         return delta;
   }
   
+ static unsigned long
+ calc_load(unsigned long load, unsigned long exp, unsigned long active)
+ {
+       load *= exp;
+       load += active * (FIXED_1 - exp);
+       load += 1UL << (FSHIFT - 1);
+       return load >> FSHIFT;
+ }
+ 
   #ifdef CONFIG_NO_HZ
   /*
    * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@@ -3148,6 -3214,128 +3214,128 @@@ static long calc_load_fold_idle(void
   
         return delta;
   }
+ 
+ /**
+  * fixed_power_int - compute: x^n, in O(log n) time
+  *
+  * @x:         base of the power
+  * @frac_bits: fractional bits of @x
+  * @n:         power to raise @x to.
+  *
+  * By exploiting the relation between the definition of the natural power
+  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+  * (where: n_i \elem {0, 1}, the binary vector representing n),
+  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+  * of course trivially computable in O(log_2 n), the length of our binary
+  * vector.
+  */
+ static unsigned long
+ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+ {
+       unsigned long result = 1UL << frac_bits;
+ 
+       if (n) for (;;) {
+               if (n & 1) {
+                       result *= x;
+                       result += 1UL << (frac_bits - 1);
+                       result >>= frac_bits;
+               }
+               n >>= 1;
+               if (!n)
+                       break;
+               x *= x;
+               x += 1UL << (frac_bits - 1);
+               x >>= frac_bits;
+       }
+ 
+       return result;
+ }
+ 
+ /*
+  * a1 = a0 * e + a * (1 - e)
+  *
+  * a2 = a1 * e + a * (1 - e)
+  *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+  *    = a0 * e^2 + a * (1 - e) * (1 + e)
+  *
+  * a3 = a2 * e + a * (1 - e)
+  *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+  *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+  *
+  *  ...
+  *
+  * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+  *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+  *    = a0 * e^n + a * (1 - e^n)
+  *
+  * [1] application of the geometric series:
+  *
+  *              n         1 - x^(n+1)
+  *     S_n := \Sum x^i = -------------
+  *             i=0          1 - x
+  */
+ static unsigned long
+ calc_load_n(unsigned long load, unsigned long exp,
+           unsigned long active, unsigned int n)
+ {
+ 
+       return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+ }
+ 
+ /*
+  * NO_HZ can leave us missing all per-cpu ticks calling
+  * calc_load_account_active(), but since an idle CPU folds its delta into
+  * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+  * in the pending idle delta if our idle period crossed a load cycle boundary.
+  *
+  * Once we've updated the global active value, we need to apply the exponential
+  * weights adjusted to the number of cycles missed.
+  */
+ static void calc_global_nohz(unsigned long ticks)
+ {
+       long delta, active, n;
+ 
+       if (time_before(jiffies, calc_load_update))
+               return;
+ 
+       /*
+        * If we crossed a calc_load_update boundary, make sure to fold
+        * any pending idle changes, the respective CPUs might have
+        * missed the tick driven calc_load_account_active() update
+        * due to NO_HZ.
+        */
+       delta = calc_load_fold_idle();
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks);
+ 
+       /*
+        * If we were idle for multiple load cycles, apply them.
+        */
+       if (ticks >= LOAD_FREQ) {
+               n = ticks / LOAD_FREQ;
+ 
+               active = atomic_long_read(&calc_load_tasks);
+               active = active > 0 ? active * FIXED_1 : 0;
+ 
+               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ 
+               calc_load_update += n * LOAD_FREQ;
+       }
+ 
+       /*
+        * It's possible the remainder of the above division also crosses
+        * a LOAD_FREQ period, the regular check in calc_global_load()
+        * which comes after this will take care of that.
+        *
+        * Consider us being 11 ticks before a cycle completion, and us
+        * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+        * age us 4 cycles, and the test in calc_global_load() will
+        * pick up the final one.
+        */
+ }
   #else
   static void calc_load_account_idle(struct rq *this_rq)
   {
@@@ -3157,6 -3345,10 +3345,10 @@@ static inline long calc_load_fold_idle(
   {
         return 0;
   }
+ 
+ static void calc_global_nohz(unsigned long ticks)
+ {
+ }
   #endif
   
   /**
@@@ -3174,24 -3366,17 +3366,17 @@@ void get_avenrun(unsigned long *loads, 
         loads[2] = (avenrun[2] + offset) << shift;
   }
   
- static unsigned long
- calc_load(unsigned long load, unsigned long exp, unsigned long active)
- {
-       load *= exp;
-       load += active * (FIXED_1 - exp);
-       return load >> FSHIFT;
- }
- 
   /*
    * calc_load - update the avenrun load estimates 10 ticks after the
    * CPUs have updated calc_load_tasks.
    */
- void calc_global_load(void)
+ void calc_global_load(unsigned long ticks)
   {
-       unsigned long upd = calc_load_update + 10;
         long active;
   
-       if (time_before(jiffies, upd))
+       calc_global_nohz(ticks);
+ 
+       if (time_before(jiffies, calc_load_update + 10))
                 return;
   
         active = atomic_long_read(&calc_load_tasks);
@@@ -3845,7 -4030,6 +4030,6 @@@ static void put_prev_task(struct rq *rq
   {
         if (prev->se.on_rq)
                 update_rq_clock(rq);
-       rq->skip_clock_update = 0;
         prev->sched_class->put_prev_task(rq, prev);
   }
   
@@@ -3903,7 -4087,6 +4087,6 @@@ need_resched_nonpreemptible
                 hrtick_clear(rq);
   
         raw_spin_lock_irq(&rq->lock);
-       clear_tsk_need_resched(prev);
   
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@@ -3935,6 -4118,8 +4118,8 @@@
   
         put_prev_task(rq, prev);
         next = pick_next_task(rq);
+       clear_tsk_need_resched(prev);
+       rq->skip_clock_update = 0;
   
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
@@@ -8108,6 -8293,8 +8293,6 @@@ void __init sched_init(void
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
   #endif /* SMP */
   
- -      perf_event_init();
- -
         scheduler_running = 1;
   }
author	Ingo Molnar <mingo@elte.hu>
	Wed, 22 Dec 2010 10:53:20 +0000 (11:53 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Wed, 22 Dec 2010 10:53:23 +0000 (11:53 +0100)
		1	2
Documentation/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
MAINTAINERS	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/apic/apic.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/apic/io_apic.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history