From: Ingo Molnar Date: Wed, 22 Dec 2010 10:53:20 +0000 (+0100) Subject: Merge commit 'v2.6.37-rc7' into perf/core X-Git-Tag: firefly_0821_release~7613^2~2648^2~59 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=6c529a266bdc590a870ee2d2092ff6527eff427b;hp=-c;p=firefly-linux-kernel-4.4.55.git Merge commit 'v2.6.37-rc7' into perf/core Merge reason: Pick up the latest -rc. Signed-off-by: Ingo Molnar --- 6c529a266bdc590a870ee2d2092ff6527eff427b diff --combined Documentation/kernel-parameters.txt index 5e55e4623ab5,8b61c9360999..316c723a950c --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@@ -1579,12 -1579,20 +1579,12 @@@ and is between 256 and 4096 characters nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels Format: [panic,][num] - Valid num: 0,1,2 + Valid num: 0 0 - turn nmi_watchdog off - 1 - use the IO-APIC timer for the NMI watchdog - 2 - use the local APIC for the NMI watchdog using - a performance counter. Note: This will use one - performance counter and the local APIC's performance - vector. When panic is specified, panic when an NMI watchdog timeout occurs. This is useful when you use a panic=... timeout and need the box quickly up again. - Instead of 1 and 2 it is possible to use the following - symbolic names: lapic and ioapic - Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic netpoll.carrier_timeout= [NET] Specifies amount of time (in seconds) that @@@ -2167,11 -2175,6 +2167,6 @@@ reset_devices [KNL] Force drivers to reset the underlying device during initialization. - resource_alloc_from_bottom - Allocate new resources from the beginning of available - space, not the end. If you need to use this, please - report a bug. - resume= [SWSUSP] Specify the partition device for software suspend diff --combined MAINTAINERS index ed192f11ad23,6a588873cf8d..f1f803c6674a --- a/MAINTAINERS +++ b/MAINTAINERS @@@ -559,14 -559,14 +559,14 @@@ W: http://maxim.org.za/at91_26.htm S: Maintained ARM/BCMRING ARM ARCHITECTURE - M: Leo Chen + M: Jiandong Zheng M: Scott Branden L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-bcmring ARM/BCMRING MTD NAND DRIVER - M: Leo Chen + M: Jiandong Zheng M: Scott Branden L: linux-mtd@lists.infradead.org S: Maintained @@@ -815,7 -815,7 +815,7 @@@ F: drivers/mmc/host/msm_sdcc. F: drivers/mmc/host/msm_sdcc.h F: drivers/serial/msm_serial.h F: drivers/serial/msm_serial.c - T: git git://codeaurora.org/quic/kernel/dwalker/linux-msm.git + T: git git://codeaurora.org/quic/kernel/davidb/linux-msm.git S: Maintained ARM/TOSA MACHINE SUPPORT @@@ -4612,7 -4612,7 +4612,7 @@@ PERFORMANCE EVENTS SUBSYSTE M: Peter Zijlstra M: Paul Mackerras M: Ingo Molnar -M: Arnaldo Carvalho de Melo +M: Arnaldo Carvalho de Melo S: Supported F: kernel/perf_event*.c F: include/linux/perf_event.h @@@ -5932,7 -5932,6 +5932,6 @@@ F: include/linux/tty. 
TULIP NETWORK DRIVERS M: Grant Grundler - M: Kyle McMartin L: netdev@vger.kernel.org S: Maintained F: drivers/net/tulip/ @@@ -6584,6 -6583,15 +6583,15 @@@ F: include/linux/mfd/wm8400 F: include/sound/wm????.h F: sound/soc/codecs/wm* + WORKQUEUE + M: Tejun Heo + L: linux-kernel@vger.kernel.org + T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git + S: Maintained + F: include/linux/workqueue.h + F: kernel/workqueue.c + F: Documentation/workqueue.txt + X.25 NETWORK LAYER M: Andrew Hendry L: linux-x25@vger.kernel.org diff --combined arch/x86/kernel/apic/apic.c index e9e2a93783f9,78218135b48e..fb7657822aad --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@@ -31,6 -31,7 +31,6 @@@ #include #include #include -#include #include #include @@@ -798,7 -799,11 +798,7 @@@ void __init setup_boot_APIC_clock(void * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - pr_warning("APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; /* Setup the lapic or request the broadcast */ setup_APIC_timer(); @@@ -1382,7 -1387,16 +1382,15 @@@ void __cpuinit end_local_APIC_setup(voi } #endif - setup_apic_nmi_watchdog(NULL); apic_pm_activate(); + + /* + * Now that local APIC setup is completed for BP, configure the fault + * handling for interrupt remapping. + */ + if (!smp_processor_id() && intr_remapping_enabled) + enable_drhd_fault_handling(); + } #ifdef CONFIG_X86_X2APIC @@@ -1744,10 -1758,17 +1752,10 @@@ int __init APIC_init_uniprocessor(void setup_IO_APIC(); else { nr_ioapics = 0; - localise_nmi_watchdog(); } -#else - localise_nmi_watchdog(); #endif x86_init.timers.setup_percpu_clockev(); -#ifdef CONFIG_X86_64 - check_nmi_watchdog(); -#endif - return 0; } diff --combined arch/x86/kernel/apic/io_apic.c index e4a040c28de1,fadcd743a74f..16c2db8750a2 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@@ -54,6 -54,7 +54,6 @@@ #include #include #include -#include #include #include #include @@@ -2429,13 -2430,12 +2429,12 @@@ static void ack_apic_level(struct irq_d { struct irq_cfg *cfg = data->chip_data; int i, do_unmask_irq = 0, irq = data->irq; - struct irq_desc *desc = irq_to_desc(irq); unsigned long v; irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_ioapic(cfg); } @@@ -2642,6 -2642,24 +2641,6 @@@ static void lapic_register_intr(int irq "edge"); } -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - /* * This looks a bit hackish but it's about the only one way of sending * a few INTA cycles to 8259As and any associated glue logic. 
ICR does @@@ -2747,6 -2765,15 +2746,6 @@@ static inline void __init check_timer(v */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); legacy_pic->init(1); -#ifdef CONFIG_X86_32 - { - unsigned int ver; - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - } -#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@@ -2794,6 -2821,10 +2793,6 @@@ unmask_ioapic(cfg); } if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - legacy_pic->unmask(0); - } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); goto out; @@@ -2819,6 -2850,11 +2818,6 @@@ if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - setup_nmi(); - legacy_pic->unmask(0); - } goto out; } /* @@@ -2830,6 -2866,15 +2829,6 @@@ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@@ -3367,6 -3412,7 +3366,7 @@@ dmar_msi_set_affinity(struct irq_data * msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); dmar_msi_write(irq, &msg); diff --combined include/linux/sched.h index d2e63d1e725c,223874538b33..a99d735db3df --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -143,7 -143,7 +143,7 @@@ extern unsigned long nr_iowait_cpu(int extern unsigned long this_cpu_load(void); - extern void calc_global_load(void); + extern void calc_global_load(unsigned long ticks); extern unsigned long get_parent_ip(unsigned long addr); @@@ -316,7 -316,6 +316,7 @@@ extern int proc_dowatchdog_thresh(struc size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; +void lockup_detector_init(void); #else static inline void touch_softlockup_watchdog(void) { @@@ -327,9 -326,6 +327,9 @@@ static inline void touch_softlockup_wat static inline void touch_all_softlockup_watchdogs(void) { } +static inline void lockup_detector_init(void) +{ +} #endif #ifdef CONFIG_DETECT_HUNG_TASK diff --combined kernel/sched.c index 605ab1b24d81,297d1a0eedb0..c68cead94dd7 --- a/kernel/sched.c +++ b/kernel/sched.c @@@ -636,22 -636,18 +636,18 @@@ static inline struct task_group *task_g #endif /* CONFIG_CGROUP_SCHED */ - static u64 irq_time_cpu(int cpu); - static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); + static void update_rq_clock_task(struct rq *rq, s64 delta); - inline void update_rq_clock(struct rq *rq) + static void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) { - int cpu = cpu_of(rq); - u64 irq_time; + s64 delta; - rq->clock = sched_clock_cpu(cpu); - irq_time = irq_time_cpu(cpu); - if (rq->clock - irq_time > rq->clock_task) - rq->clock_task = rq->clock - irq_time; + if (rq->skip_clock_update) + return; - sched_irq_time_avg_update(rq, irq_time); - } + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + rq->clock += delta; + update_rq_clock_task(rq, delta); } /* @@@ -1924,10 -1920,9 +1920,9 @@@ static void deactivate_task(struct rq * * They are read and saved off onto struct rq in update_rq_clock(). 
* This may result in other CPU reading this CPU's irq time and can * race with irq/account_system_vtime on this CPU. We would either get old - * or new value (or semi updated value on 32 bit) with a side effect of - * accounting a slice of irq time to wrong task when irq is in progress - * while we read rq->clock. That is a worthy compromise in place of having - * locks on each irq in account_system_time. + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. */ static DEFINE_PER_CPU(u64, cpu_hardirq_time); static DEFINE_PER_CPU(u64, cpu_softirq_time); @@@ -1945,19 -1940,58 +1940,58 @@@ void disable_sched_clock_irqtime(void sched_clock_irqtime = 0; } - static u64 irq_time_cpu(int cpu) + #ifndef CONFIG_64BIT + static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + + static inline void irq_time_write_begin(void) { - if (!sched_clock_irqtime) - return 0; + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); + } + + static inline void irq_time_write_end(void) + { + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); + } + + static inline u64 irq_time_read(int cpu) + { + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + return irq_time; + } + #else /* CONFIG_64BIT */ + static inline void irq_time_write_begin(void) + { + } + + static inline void irq_time_write_end(void) + { + } + + static inline u64 irq_time_read(int cpu) + { return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); } + #endif /* CONFIG_64BIT */ + /* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ void account_system_vtime(struct task_struct *curr) { unsigned long flags; + s64 delta; int cpu; - u64 now, delta; if (!sched_clock_irqtime) return; @@@ -1965,9 -1999,10 +1999,10 @@@ local_irq_save(flags); cpu = smp_processor_id(); - now = sched_clock_cpu(cpu); - delta = now - per_cpu(irq_start_time, cpu); - per_cpu(irq_start_time, cpu) = now; + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread @@@ -1975,33 -2010,55 +2010,55 @@@ * that do not consume any time, but still wants to run. 
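/*
 * Illustrative sketch (not part of this patch): the generic seqcount
 * pattern that irq_time_read() above relies on.  On 32-bit a u64 cannot
 * be loaded in a single instruction, so the writer brackets its update
 * with write_seqcount_begin()/write_seqcount_end() and the reader retries
 * until it sees an even, unchanged sequence number.  The patch open-codes
 * the write side because the writer always runs on the local CPU with
 * interrupts off; the struct and function names below are made up for
 * the example.
 */
#include <linux/seqlock.h>
#include <linux/types.h>

struct example_stamp {
	seqcount_t	seq;
	u64		value;		/* 64-bit datum written by one CPU */
};

static void example_stamp_set(struct example_stamp *s, u64 v)
{
	write_seqcount_begin(&s->seq);	/* sequence becomes odd */
	s->value = v;
	write_seqcount_end(&s->seq);	/* sequence becomes even again */
}

static u64 example_stamp_get(struct example_stamp *s)
{
	unsigned int start;
	u64 v;

	do {
		start = read_seqcount_begin(&s->seq);
		v = s->value;			/* may tear on 32-bit ... */
	} while (read_seqcount_retry(&s->seq, start));	/* ... retry if it did */

	return v;
}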
*/ if (hardirq_count()) - per_cpu(cpu_hardirq_time, cpu) += delta; + __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) - per_cpu(cpu_softirq_time, cpu) += delta; + __this_cpu_add(cpu_softirq_time, delta); + irq_time_write_end(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(account_system_vtime); - static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) + static void update_rq_clock_task(struct rq *rq, s64 delta) { - if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { - u64 delta_irq = curr_irq_time - rq->prev_irq_time; - rq->prev_irq_time = curr_irq_time; - sched_rt_avg_update(rq, delta_irq); - } + s64 irq_delta; + + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + rq->clock_task += delta; + + if (irq_delta && sched_feat(NONIRQ_POWER)) + sched_rt_avg_update(rq, irq_delta); } - #else + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ - static u64 irq_time_cpu(int cpu) + static void update_rq_clock_task(struct rq *rq, s64 delta) { - return 0; + rq->clock_task += delta; } - static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } - - #endif + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #include "sched_idletask.c" #include "sched_fair.c" @@@ -2129,7 -2186,7 +2186,7 @@@ static void check_preempt_curr(struct r * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (test_tsk_need_resched(rq->curr)) + if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@@ -3119,6 -3176,15 +3176,15 @@@ static long calc_load_fold_active(struc return delta; } + static unsigned long + calc_load(unsigned long load, unsigned long exp, unsigned long active) + { + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; + } + #ifdef CONFIG_NO_HZ /* * For NO_HZ we delay the active fold to the next LOAD_FREQ update. @@@ -3148,6 -3214,128 +3214,128 @@@ static long calc_load_fold_idle(void return delta; } + + /** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. 
+ */ + static unsigned long + fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) + { + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; + } + + /* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ + static unsigned long + calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) + { + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); + } + + /* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ + static void calc_global_nohz(unsigned long ticks) + { + long delta, active, n; + + if (time_before(jiffies, calc_load_update)) + return; + + /* + * If we crossed a calc_load_update boundary, make sure to fold + * any pending idle changes, the respective CPUs might have + * missed the tick driven calc_load_account_active() update + * due to NO_HZ. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + /* + * If we were idle for multiple load cycles, apply them. + */ + if (ticks >= LOAD_FREQ) { + n = ticks / LOAD_FREQ; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Its possible the remainder of the above division also crosses + * a LOAD_FREQ period, the regular check in calc_global_load() + * which comes after this will take care of that. + * + * Consider us being 11 ticks before a cycle completion, and us + * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will + * age us 4 cycles, and the test in calc_global_load() will + * pick up the final one. + */ + } #else static void calc_load_account_idle(struct rq *this_rq) { @@@ -3157,6 -3345,10 +3345,10 @@@ static inline long calc_load_fold_idle( { return 0; } + + static void calc_global_nohz(unsigned long ticks) + { + } #endif /** @@@ -3174,24 -3366,17 +3366,17 @@@ void get_avenrun(unsigned long *loads, loads[2] = (avenrun[2] + offset) << shift; } - static unsigned long - calc_load(unsigned long load, unsigned long exp, unsigned long active) - { - load *= exp; - load += active * (FIXED_1 - exp); - return load >> FSHIFT; - } - /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. 
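/*
 * Illustrative sketch (not part of this patch): a userspace restatement of
 * calc_load(), fixed_power_int() and calc_load_n() from the hunks above,
 * checking that folding n missed LOAD_FREQ periods one at a time and
 * applying the closed form a_n = a0*e^n + a*(1 - e^n) give the same answer
 * up to fixed-point rounding.  FSHIFT, FIXED_1 and EXP_1 carry the values
 * the kernel uses for the 1-minute average; the scenario in main() is
 * invented for the demonstration.
 */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed-point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
				     unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {		/* exponentiation by squaring */
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

static unsigned long calc_load_n(unsigned long load, unsigned long exp,
				 unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long avg = 3 * FIXED_1;	/* 1-min load of 3.00 ... */
	unsigned long active = 0;		/* ... then the CPU goes idle */
	unsigned long iter = avg, closed;
	unsigned int i, n = 12;			/* 12 missed 5-second periods */

	for (i = 0; i < n; i++)
		iter = calc_load(iter, EXP_1, active);
	closed = calc_load_n(avg, EXP_1, active, n);

	printf("iterated:    %lu.%02lu\n", iter >> FSHIFT,
	       (iter & (FIXED_1 - 1)) * 100 / FIXED_1);
	printf("closed form: %lu.%02lu\n", closed >> FSHIFT,
	       (closed & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}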
*/ - void calc_global_load(void) + void calc_global_load(unsigned long ticks) { - unsigned long upd = calc_load_update + 10; long active; - if (time_before(jiffies, upd)) + calc_global_nohz(ticks); + + if (time_before(jiffies, calc_load_update + 10)) return; active = atomic_long_read(&calc_load_tasks); @@@ -3845,7 -4030,6 +4030,6 @@@ static void put_prev_task(struct rq *rq { if (prev->se.on_rq) update_rq_clock(rq); - rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@@ -3903,7 -4087,6 +4087,6 @@@ need_resched_nonpreemptible hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@@ -3935,6 -4118,8 +4118,8 @@@ put_prev_task(rq, prev); next = pick_next_task(rq); + clear_tsk_need_resched(prev); + rq->skip_clock_update = 0; if (likely(prev != next)) { sched_info_switch(prev, next); @@@ -8108,6 -8293,8 +8293,6 @@@ void __init sched_init(void zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_event_init(); - scheduler_running = 1; }
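The avenrun[] averages maintained by calc_global_load() and calc_global_nohz() above are fixed-point values with FSHIFT fractional bits. As an illustrative sketch (not part of this patch), the small userspace program below converts such raw values into the familiar two-decimal form shown in /proc/loadavg; the LOAD_INT/LOAD_FRAC macros mirror the ones the kernel uses for that file, and the sample numbers are invented.

#include <stdio.h>

#define FSHIFT		11
#define FIXED_1		(1UL << FSHIFT)
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* Pretend get_avenrun() handed back these raw fixed-point values. */
	unsigned long avenrun[3] = { 1126, 2048, 6144 };

	printf("loadavg: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
	       LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
	       LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
	return 0;
}

Built with a plain cc, this prints "loadavg: 0.54 1.00 3.00" for the sample values, which is the kind of figure the exponential decay in calc_load() above ultimately produces.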