Merge branch 'linus' into sched/core

author Ingo Molnar <mingo@elte.hu>

Wed, 21 Jul 2010 19:45:02 +0000 (21:45 +0200)

committer Ingo Molnar <mingo@elte.hu>

Wed, 21 Jul 2010 19:45:08 +0000 (21:45 +0200)
author Ingo Molnar <mingo@elte.hu>
Wed, 21 Jul 2010 19:45:02 +0000 (21:45 +0200)
committer Ingo Molnar <mingo@elte.hu>
Wed, 21 Jul 2010 19:45:08 +0000 (21:45 +0200)
diff --combined arch/powerpc/kernel/process.c

index 22f08cb7e7d1db6cc828f87556b0477bc712c533,773424df828a393d9a2860ded0b380978cfdd9cc..43855c9f84de366dbb530cf16056dc4869cf7bad
--- 1/arch/powerpc/kernel/process.c
--- 2/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@@ -1005,7 -1005,6 +1005,6 @@@ out
         return error;
   }
   
- #ifdef CONFIG_IRQSTACKS
   static inline int valid_irq_stack(unsigned long sp, struct task_struct *p,
                                   unsigned long nbytes)
   {
@@@ -1030,10 -1029,6 +1029,6 @@@
         return 0;
   }
   
- #else
- #define valid_irq_stack(sp, p, nb)    0
- #endif /* CONFIG_IRQSTACKS */
- 
   int validate_sp(unsigned long sp, struct task_struct *p,
                        unsigned long nbytes)
   {
@@@ -1268,14 -1263,3 +1263,14 @@@ unsigned long randomize_et_dyn(unsigne
   
         return ret;
   }
+ +
+ +#ifdef CONFIG_SMP
+ +int arch_sd_sibling_asym_packing(void)
+ +{
+ +      if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+ +              printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+ +              return SD_ASYM_PACKING;
+ +      }
+ +      return 0;
+ +}
+ +#endif
diff --combined include/linux/sched.h

index ff154e10752bb43bb80af77b59b6d01f9de69927,747fcaedddb70d6cb0a49b43c0d2f2a6f23a693f..9a7bc5ba7e7e501a6850e2cef17afe3a11ddb2c3
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -139,7 -139,7 +139,7 @@@ extern int nr_processes(void)
   extern unsigned long nr_running(void);
   extern unsigned long nr_uninterruptible(void);
   extern unsigned long nr_iowait(void);
- extern unsigned long nr_iowait_cpu(void);
+ extern unsigned long nr_iowait_cpu(int cpu);
   extern unsigned long this_cpu_load(void);
   
   
@@@ -271,11 -271,14 +271,11 @@@ extern int runqueue_is_locked(int cpu)
   
   extern cpumask_var_t nohz_cpu_mask;
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
- -extern int select_nohz_load_balancer(int cpu);
- -extern int get_nohz_load_balancer(void);
+ +extern void select_nohz_load_balancer(int stop_tick);
+ +extern int get_nohz_timer_target(void);
   extern int nohz_ratelimit(int cpu);
   #else
- -static inline int select_nohz_load_balancer(int cpu)
- -{
- -      return 0;
- -}
+ +static inline void select_nohz_load_balancer(int stop_tick) { }
   
   static inline int nohz_ratelimit(int cpu)
   {
@@@ -801,7 -804,7 +801,7 @@@ enum cpu_idle_type 
   #define SD_POWERSAVINGS_BALANCE       0x0100  /* Balance for power savings */
   #define SD_SHARE_PKG_RESOURCES        0x0200  /* Domain members share cpu pkg resources */
   #define SD_SERIALIZE          0x0400  /* Only a single load balancing instance */
- -
+ +#define SD_ASYM_PACKING               0x0800  /* Place busy groups earlier in the domain */
   #define SD_PREFER_SIBLING     0x1000  /* Prefer to place tasks in a sibling domain */
   
   enum powersavings_balance_level {
@@@ -836,8 -839,6 +836,8 @@@ static inline int sd_balance_for_packag
         return SD_PREFER_SIBLING;
   }
   
+ +extern int __weak arch_sd_sibiling_asym_packing(void);
+ +
   /*
    * Optimise SD flags for power savings:
    * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
@@@ -859,7 -860,7 +859,7 @@@ struct sched_group 
          * CPU power of this group, SCHED_LOAD_SCALE being max power for a
          * single CPU.
          */
- -      unsigned int cpu_power;
+ +      unsigned int cpu_power, cpu_power_orig;
   
         /*
          * The CPUs this group covers.
@@@ -1695,7 -1696,6 +1695,7 @@@ extern void thread_group_times(struct t
   #define PF_EXITING    0x00000004      /* getting shut down */
   #define PF_EXITPIDONE 0x00000008      /* pi exit done on shut down */
   #define PF_VCPU               0x00000010      /* I'm a virtual CPU */
+ +#define PF_WQ_WORKER  0x00000020      /* I'm a workqueue worker */
   #define PF_FORKNOEXEC 0x00000040      /* forked but didn't exec */
   #define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
   #define PF_SUPERPRIV  0x00000100      /* used super-user privileges */
@@@ -1790,23 -1790,20 +1790,23 @@@ static inline int set_cpus_allowed(stru
   #endif
   
   /*
- - * Architectures can set this to 1 if they have specified
- - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- - * but then during bootup it turns out that sched_clock()
- - * is reliable after all:
+ + * Do not use outside of architecture code which knows its limitations.
+ + *
+ + * sched_clock() has no promise of monotonicity or bounded drift between
+ + * CPUs, use (which you should not) requires disabling IRQs.
+ + *
+ + * Please use one of the three interfaces below.
    */
- -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- -extern int sched_clock_stable;
- -#endif
- -
- -/* ftrace calls sched_clock() directly */
   extern unsigned long long notrace sched_clock(void);
+ +/*
+ + * See the comment in kernel/sched_clock.c
+ + */
+ +extern u64 cpu_clock(int cpu);
+ +extern u64 local_clock(void);
+ +extern u64 sched_clock_cpu(int cpu);
+ +
   
   extern void sched_clock_init(void);
- -extern u64 sched_clock_cpu(int cpu);
   
   #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
   static inline void sched_clock_tick(void)
@@@ -1821,19 -1818,17 +1821,19 @@@ static inline void sched_clock_idle_wak
   {
   }
   #else
+ +/*
+ + * Architectures can set this to 1 if they have specified
+ + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ + * but then during bootup it turns out that sched_clock()
+ + * is reliable after all:
+ + */
+ +extern int sched_clock_stable;
+ +
   extern void sched_clock_tick(void);
   extern void sched_clock_idle_sleep_event(void);
   extern void sched_clock_idle_wakeup_event(u64 delta_ns);
   #endif
   
- -/*
- - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- - * clock constructed from sched_clock():
- - */
- -extern unsigned long long cpu_clock(int cpu);
- -
   extern unsigned long long
   task_sched_runtime(struct task_struct *task);
   extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
diff --combined kernel/sched.c

index 7b443ee27be4aba135c66c4b3a7071f0b8b269c5,f52a8801b7a285fb252ecc6935b55f525813881b..16f3f77f71beccb828d47e3870a3650a9db255c5
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -77,7 -77,6 +77,7 @@@
   #include <asm/irq_regs.h>
   
   #include "sched_cpupri.h"
+ +#include "workqueue_sched.h"
   
   #define CREATE_TRACE_POINTS
   #include <trace/events/sched.h>
@@@ -457,10 -456,9 +457,10 @@@ struct rq 
         unsigned long nr_running;
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ +      unsigned long last_load_update_tick;
   #ifdef CONFIG_NO_HZ
         u64 nohz_stamp;
- -      unsigned char in_nohz_recently;
+ +      unsigned char nohz_balance_kick;
   #endif
         unsigned int skip_clock_update;
   
@@@ -1194,27 -1192,6 +1194,27 @@@ static void resched_cpu(int cpu
   }
   
   #ifdef CONFIG_NO_HZ
+ +/*
+ + * In the semi idle case, use the nearest busy cpu for migrating timers
+ + * from an idle cpu.  This is good for power-savings.
+ + *
+ + * We don't do similar optimization for completely idle system, as
+ + * selecting an idle cpu will add more delays to the timers than intended
+ + * (as that cpu's timer base may not be up to date with respect to jiffies, etc.).
+ + */
+ +int get_nohz_timer_target(void)
+ +{
+ +      int cpu = smp_processor_id();
+ +      int i;
+ +      struct sched_domain *sd;
+ +
+ +      for_each_domain(cpu, sd) {
+ +              for_each_cpu(i, sched_domain_span(sd))
+ +                      if (!idle_cpu(i))
+ +                              return i;
+ +      }
+ +      return cpu;
+ +}
   /*
    * When add_timer_on() enqueues a timer into the timer wheel of an
    * idle CPU then this timer might expire before the next timer event
@@@ -1277,6 -1254,12 +1277,12 @@@ static void sched_avg_update(struct rq 
         s64 period = sched_avg_period();
   
         while ((s64)(rq->clock - rq->age_stamp) > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (rq->age_stamp));
                 rq->age_stamp += period;
                 rq->rt_avg /= 2;
         }
@@@ -1669,7 -1652,7 +1675,7 @@@ static void update_shares(struct sched_
         if (root_task_group_empty())
                 return;
   
- -      now = cpu_clock(raw_smp_processor_id());
+ +      now = local_clock();
         elapsed = now - sd->last_update;
   
         if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@@ -1680,9 -1663,6 +1686,6 @@@
   
   static void update_h_load(long cpu)
   {
-       if (root_task_group_empty())
-               return;
- 
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
   }
   
@@@ -1825,7 -1805,6 +1828,7 @@@ static void cfs_rq_set_shares(struct cf
   static void calc_load_account_idle(struct rq *this_rq);
   static void update_sysctl(void);
   static int get_update_sysctl_factor(void);
+ +static void update_cpu_load(struct rq *this_rq);
   
   static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
   {
@@@ -2288,55 -2267,11 +2291,55 @@@ static void update_avg(u64 *avg, u64 sa
   }
   #endif
   
- -/***
+ +static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+ +                               bool is_sync, bool is_migrate, bool is_local,
+ +                               unsigned long en_flags)
+ +{
+ +      schedstat_inc(p, se.statistics.nr_wakeups);
+ +      if (is_sync)
+ +              schedstat_inc(p, se.statistics.nr_wakeups_sync);
+ +      if (is_migrate)
+ +              schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+ +      if (is_local)
+ +              schedstat_inc(p, se.statistics.nr_wakeups_local);
+ +      else
+ +              schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ +
+ +      activate_task(rq, p, en_flags);
+ +}
+ +
+ +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+ +                                      int wake_flags, bool success)
+ +{
+ +      trace_sched_wakeup(p, success);
+ +      check_preempt_curr(rq, p, wake_flags);
+ +
+ +      p->state = TASK_RUNNING;
+ +#ifdef CONFIG_SMP
+ +      if (p->sched_class->task_woken)
+ +              p->sched_class->task_woken(rq, p);
+ +
+ +      if (unlikely(rq->idle_stamp)) {
+ +              u64 delta = rq->clock - rq->idle_stamp;
+ +              u64 max = 2*sysctl_sched_migration_cost;
+ +
+ +              if (delta > max)
+ +                      rq->avg_idle = max;
+ +              else
+ +                      update_avg(&rq->avg_idle, delta);
+ +              rq->idle_stamp = 0;
+ +      }
+ +#endif
+ +      /* if a worker is waking up, notify workqueue */
+ +      if ((p->flags & PF_WQ_WORKER) && success)
+ +              wq_worker_waking_up(p, cpu_of(rq));
+ +}
+ +
+ +/**
    * try_to_wake_up - wake up a thread
- - * @p: the to-be-woken-up thread
+ + * @p: the thread to be awakened
    * @state: the mask of task states that can be woken
- - * @sync: do a synchronous wakeup?
+ + * @wake_flags: wake modifier flags (WF_*)
    *
    * Put it on the run-queue if it's not already there. The "current"
    * thread is always on the run-queue (except when the actual
@@@ -2344,8 -2279,7 +2347,8 @@@
    * the simpler "current->state = TASK_RUNNING" to mark yourself
    * runnable without the overhead of this.
    *
- - * returns failure only if the task is already active.
+ + * Returns %true if @p was woken up, %false if it was already running
+ + * or @state didn't match @p's state.
    */
   static int try_to_wake_up(struct task_struct *p, unsigned int state,
                           int wake_flags)
@@@ -2425,11 -2359,38 +2428,11 @@@
   
   out_activate:
   #endif /* CONFIG_SMP */
- -      schedstat_inc(p, se.statistics.nr_wakeups);
- -      if (wake_flags & WF_SYNC)
- -              schedstat_inc(p, se.statistics.nr_wakeups_sync);
- -      if (orig_cpu != cpu)
- -              schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- -      if (cpu == this_cpu)
- -              schedstat_inc(p, se.statistics.nr_wakeups_local);
- -      else
- -              schedstat_inc(p, se.statistics.nr_wakeups_remote);
- -      activate_task(rq, p, en_flags);
+ +      ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
+ +                    cpu == this_cpu, en_flags);
         success = 1;
- -
   out_running:
- -      trace_sched_wakeup(p, success);
- -      check_preempt_curr(rq, p, wake_flags);
- -
- -      p->state = TASK_RUNNING;
- -#ifdef CONFIG_SMP
- -      if (p->sched_class->task_woken)
- -              p->sched_class->task_woken(rq, p);
- -
- -      if (unlikely(rq->idle_stamp)) {
- -              u64 delta = rq->clock - rq->idle_stamp;
- -              u64 max = 2*sysctl_sched_migration_cost;
- -
- -              if (delta > max)
- -                      rq->avg_idle = max;
- -              else
- -                      update_avg(&rq->avg_idle, delta);
- -              rq->idle_stamp = 0;
- -      }
- -#endif
+ +      ttwu_post_activation(p, rq, wake_flags, success);
   out:
         task_rq_unlock(rq, &flags);
         put_cpu();
@@@ -2437,37 -2398,6 +2440,37 @@@
         return success;
   }
   
+ +/**
+ + * try_to_wake_up_local - try to wake up a local task with rq lock held
+ + * @p: the thread to be awakened
+ + *
+ + * Put @p on the run-queue if it's not already there.  The caller must
+ + * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ + * the current task.  this_rq() stays locked over invocation.
+ + */
+ +static void try_to_wake_up_local(struct task_struct *p)
+ +{
+ +      struct rq *rq = task_rq(p);
+ +      bool success = false;
+ +
+ +      BUG_ON(rq != this_rq());
+ +      BUG_ON(p == current);
+ +      lockdep_assert_held(&rq->lock);
+ +
+ +      if (!(p->state & TASK_NORMAL))
+ +              return;
+ +
+ +      if (!p->se.on_rq) {
+ +              if (likely(!task_running(rq, p))) {
+ +                      schedstat_inc(rq, ttwu_count);
+ +                      schedstat_inc(rq, ttwu_local);
+ +              }
+ +              ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+ +              success = true;
+ +      }
+ +      ttwu_post_activation(p, rq, 0, success);
+ +}
+ +
   /**
    * wake_up_process - Wake up a specific process
    * @p: The process to be woken up.
@@@ -2564,7 -2494,16 +2567,16 @@@ void sched_fork(struct task_struct *p, 
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
   
+       /*
+        * The child is not yet in the pid-hash so no cgroup attach races,
+        * and the cgroup is pinned to this child because cgroup_fork()
+        * is run before sched_fork().
+        *
+        * Silence PROVE_RCU.
+        */
+       rcu_read_lock();
         set_task_cpu(p, cpu);
+       rcu_read_unlock();
   
   #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         if (likely(sched_info_on()))
@@@ -2934,9 -2873,9 +2946,9 @@@ unsigned long nr_iowait(void
         return sum;
   }
   
- unsigned long nr_iowait_cpu(void)
+ unsigned long nr_iowait_cpu(int cpu)
   {
-       struct rq *this = this_rq();
+       struct rq *this = cpu_rq(cpu);
         return atomic_read(&this->nr_iowait);
   }
   
@@@ -3072,103 -3011,24 +3084,103 @@@ static void calc_load_account_active(st
         this_rq->calc_load_update += LOAD_FREQ;
   }
   
+ +/*
+ + * The exact cpuload at various idx values, calculated at every tick would be
+ + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ + *
+ + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ + * on nth tick when cpu may be busy, then we have:
+ + * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ + * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ + *
+ + * decay_load_missed() below does efficient calculation of
+ + * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ + *
+ + * The calculation is approximated on a 128 point scale.
+ + * degrade_zero_ticks is the number of ticks after which load at any
+ + * particular idx is approximated to be zero.
+ + * degrade_factor is a precomputed table, a row for each load idx.
+ + * Each column corresponds to degradation factor for a power of two ticks,
+ + * based on 128 point scale.
+ + * Example:
+ + * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ + *
+ + * With this power of 2 load factors, we can degrade the load n times
+ + * by looking at 1 bits in n and doing as many mult/shift instead of
+ + * n mult/shifts needed by the exact degradation.
+ + */
+ +#define DEGRADE_SHIFT         7
+ +static const unsigned char
+ +              degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+ +static const unsigned char
+ +              degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ +                                      {0, 0, 0, 0, 0, 0, 0, 0},
+ +                                      {64, 32, 8, 0, 0, 0, 0, 0},
+ +                                      {96, 72, 40, 12, 1, 0, 0},
+ +                                      {112, 98, 75, 43, 15, 1, 0},
+ +                                      {120, 112, 98, 76, 45, 16, 2} };
+ +
+ +/*
+ + * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ + * would be when CPU is idle and so we just decay the old load without
+ + * adding any new load.
+ + */
+ +static unsigned long
+ +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+ +{
+ +      int j = 0;
+ +
+ +      if (!missed_updates)
+ +              return load;
+ +
+ +      if (missed_updates >= degrade_zero_ticks[idx])
+ +              return 0;
+ +
+ +      if (idx == 1)
+ +              return load >> missed_updates;
+ +
+ +      while (missed_updates) {
+ +              if (missed_updates % 2)
+ +                      load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+ +
+ +              missed_updates >>= 1;
+ +              j++;
+ +      }
+ +      return load;
+ +}
+ +
   /*
    * Update rq->cpu_load[] statistics. This function is usually called every
- - * scheduler tick (TICK_NSEC).
+ + * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ + * every tick. We fix it up based on jiffies.
    */
   static void update_cpu_load(struct rq *this_rq)
   {
         unsigned long this_load = this_rq->load.weight;
+ +      unsigned long curr_jiffies = jiffies;
+ +      unsigned long pending_updates;
         int i, scale;
   
         this_rq->nr_load_updates++;
   
+ +      /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+ +      if (curr_jiffies == this_rq->last_load_update_tick)
+ +              return;
+ +
+ +      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ +      this_rq->last_load_update_tick = curr_jiffies;
+ +
         /* Update our load: */
- -      for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ +      this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ +      for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                 unsigned long old_load, new_load;
   
                 /* scale is effectively 1 << i now, and >> i divides by scale */
   
                 old_load = this_rq->cpu_load[i];
+ +              old_load = decay_load_missed(old_load, pending_updates - 1, i);
                 new_load = this_load;
                 /*
                  * Round up the averaging division if load is increasing. This
@@@ -3176,15 -3036,9 +3188,15 @@@
                  * example.
                  */
                 if (new_load > old_load)
- -                      new_load += scale-1;
- -              this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ +                      new_load += scale - 1;
+ +
+ +              this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
         }
+ +}
+ +
+ +static void update_cpu_load_active(struct rq *this_rq)
+ +{
+ +      update_cpu_load(this_rq);
   
         calc_load_account_active(this_rq);
   }
@@@ -3572,7 -3426,7 +3584,7 @@@ void scheduler_tick(void
   
         raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
- -      update_cpu_load(rq);
+ +      update_cpu_load_active(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
   
@@@ -3744,6 -3598,7 +3756,6 @@@ need_resched
         rq = cpu_rq(cpu);
         rcu_note_context_switch(cpu);
         prev = rq->curr;
- -      switch_count = &prev->nivcsw;
   
         release_kernel_lock(prev);
   need_resched_nonpreemptible:
@@@ -3756,26 -3611,11 +3768,26 @@@
         raw_spin_lock_irq(&rq->lock);
         clear_tsk_need_resched(prev);
   
+ +      switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- -              if (unlikely(signal_pending_state(prev->state, prev)))
+ +              if (unlikely(signal_pending_state(prev->state, prev))) {
                         prev->state = TASK_RUNNING;
- -              else
+ +              } else {
+ +                      /*
+ +                       * If a worker is going to sleep, notify and
+ +                       * ask workqueue whether it wants to wake up a
+ +                       * task to maintain concurrency.  If so, wake
+ +                       * up the task.
+ +                       */
+ +                      if (prev->flags & PF_WQ_WORKER) {
+ +                              struct task_struct *to_wakeup;
+ +
+ +                              to_wakeup = wq_worker_sleeping(prev, cpu);
+ +                              if (to_wakeup)
+ +                                      try_to_wake_up_local(to_wakeup);
+ +                      }
                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ +              }
                 switch_count = &prev->nvcsw;
         }
   
@@@ -3797,10 -3637,8 +3809,10 @@@
   
                 context_switch(rq, prev, next); /* unlocks the rq */
                 /*
- -               * the context switch might have flipped the stack from under
- -               * us, hence refresh the local variables.
+ +               * The context switch has flipped the stack from under us
+ +               * and restored the local variables which were saved when
+ +               * this task called schedule() in the past. prev == current
+ +               * is still correct, but it can be moved to another cpu/rq.
                  */
                 cpu = smp_processor_id();
                 rq = cpu_rq(cpu);
@@@ -3809,8 -3647,11 +3821,8 @@@
   
         post_schedule(rq);
   
- -      if (unlikely(reacquire_kernel_lock(current) < 0)) {
- -              prev = rq->curr;
- -              switch_count = &prev->nivcsw;
+ +      if (unlikely(reacquire_kernel_lock(prev)))
                 goto need_resched_nonpreemptible;
- -      }
   
         preempt_enable_no_resched();
         if (need_resched())
@@@ -4600,8 -4441,12 +4612,8 @@@ recheck
          */
         if (user && !capable(CAP_SYS_NICE)) {
                 if (rt_policy(policy)) {
- -                      unsigned long rlim_rtprio;
- -
- -                      if (!lock_task_sighand(p, &flags))
- -                              return -ESRCH;
- -                      rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
- -                      unlock_task_sighand(p, &flags);
+ +                      unsigned long rlim_rtprio =
+ +                                      task_rlimit(p, RLIMIT_RTPRIO);
   
                         /* can't set/change the rt policy */
                         if (policy != p->policy && !rlim_rtprio)
@@@ -5971,49 -5816,20 +5983,49 @@@ migration_call(struct notifier_block *n
    */
   static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
- -      .priority = 10
+ +      .priority = CPU_PRI_MIGRATION,
   };
   
+ +static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+ +                                    unsigned long action, void *hcpu)
+ +{
+ +      switch (action & ~CPU_TASKS_FROZEN) {
+ +      case CPU_ONLINE:
+ +      case CPU_DOWN_FAILED:
+ +              set_cpu_active((long)hcpu, true);
+ +              return NOTIFY_OK;
+ +      default:
+ +              return NOTIFY_DONE;
+ +      }
+ +}
+ +
+ +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+ +                                      unsigned long action, void *hcpu)
+ +{
+ +      switch (action & ~CPU_TASKS_FROZEN) {
+ +      case CPU_DOWN_PREPARE:
+ +              set_cpu_active((long)hcpu, false);
+ +              return NOTIFY_OK;
+ +      default:
+ +              return NOTIFY_DONE;
+ +      }
+ +}
+ +
   static int __init migration_init(void)
   {
         void *cpu = (void *)(long)smp_processor_id();
         int err;
   
- -      /* Start one for the boot CPU: */
+ +      /* Initialize migration for the boot CPU */
         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
         BUG_ON(err == NOTIFY_BAD);
         migration_call(&migration_notifier, CPU_ONLINE, cpu);
         register_cpu_notifier(&migration_notifier);
   
+ +      /* Register cpu active notifiers */
+ +      cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+ +      cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+ +
         return 0;
   }
   early_initcall(migration_init);
@@@ -6248,18 -6064,23 +6260,18 @@@ static void rq_attach_root(struct rq *r
                 free_rootdomain(old_rd);
   }
   
- -static int init_rootdomain(struct root_domain *rd, bool bootmem)
+ +static int init_rootdomain(struct root_domain *rd)
   {
- -      gfp_t gfp = GFP_KERNEL;
- -
         memset(rd, 0, sizeof(*rd));
   
- -      if (bootmem)
- -              gfp = GFP_NOWAIT;
- -
- -      if (!alloc_cpumask_var(&rd->span, gfp))
+ +      if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
                 goto out;
- -      if (!alloc_cpumask_var(&rd->online, gfp))
+ +      if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                 goto free_span;
- -      if (!alloc_cpumask_var(&rd->rto_mask, gfp))
+ +      if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                 goto free_online;
   
- -      if (cpupri_init(&rd->cpupri, bootmem) != 0)
+ +      if (cpupri_init(&rd->cpupri) != 0)
                 goto free_rto_mask;
         return 0;
   
@@@ -6275,7 -6096,7 +6287,7 @@@ out
   
   static void init_defrootdomain(void)
   {
- -      init_rootdomain(&def_root_domain, true);
+ +      init_rootdomain(&def_root_domain);
   
         atomic_set(&def_root_domain.refcount, 1);
   }
@@@ -6288,7 -6109,7 +6300,7 @@@ static struct root_domain *alloc_rootdo
         if (!rd)
                 return NULL;
   
- -      if (init_rootdomain(rd, false) != 0) {
+ +      if (init_rootdomain(rd) != 0) {
                 kfree(rd);
                 return NULL;
         }
@@@ -7467,35 -7288,29 +7479,35 @@@ int __init sched_create_sysfs_power_sav
   }
   #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
   
- -#ifndef CONFIG_CPUSETS
   /*
- - * Add online and remove offline CPUs from the scheduler domains.
- - * When cpusets are enabled they take over this function.
+ + * Update cpusets according to cpu_active mask.  If cpusets are
+ + * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ + * around partition_sched_domains().
    */
- -static int update_sched_domains(struct notifier_block *nfb,
- -                              unsigned long action, void *hcpu)
+ +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ +                           void *hcpu)
   {
- -      switch (action) {
+ +      switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_ONLINE:
- -      case CPU_ONLINE_FROZEN:
- -      case CPU_DOWN_PREPARE:
- -      case CPU_DOWN_PREPARE_FROZEN:
         case CPU_DOWN_FAILED:
- -      case CPU_DOWN_FAILED_FROZEN:
- -              partition_sched_domains(1, NULL, NULL);
+ +              cpuset_update_active_cpus();
                 return NOTIFY_OK;
+ +      default:
+ +              return NOTIFY_DONE;
+ +      }
+ +}
   
+ +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ +                             void *hcpu)
+ +{
+ +      switch (action & ~CPU_TASKS_FROZEN) {
+ +      case CPU_DOWN_PREPARE:
+ +              cpuset_update_active_cpus();
+ +              return NOTIFY_OK;
         default:
                 return NOTIFY_DONE;
         }
   }
- -#endif
   
   static int update_runtime(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
@@@ -7541,8 -7356,10 +7553,8 @@@ void __init sched_init_smp(void
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
   
- -#ifndef CONFIG_CPUSETS
- -      /* XXX: Theoretical race here - CPU may be hotplugged now */
- -      hotcpu_notifier(update_sched_domains, 0);
- -#endif
+ +      hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+ +      hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
   
         /* RT runtime code needs to handle some hotplug events */
         hotcpu_notifier(update_runtime, 0);
@@@ -7787,9 -7604,6 +7799,9 @@@ void __init sched_init(void
   
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                         rq->cpu_load[j] = 0;
+ +
+ +              rq->last_load_update_tick = jiffies;
+ +
   #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
@@@ -7803,10 -7617,6 +7815,10 @@@
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
                 rq_attach_root(rq, &def_root_domain);
+ +#ifdef CONFIG_NO_HZ
+ +              rq->nohz_balance_kick = 0;
+ +              init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+ +#endif
   #endif
                 init_rq_hrtick(rq);
                 atomic_set(&rq->nr_iowait, 0);
@@@ -7851,11 -7661,8 +7863,11 @@@
         zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
   #ifdef CONFIG_SMP
   #ifdef CONFIG_NO_HZ
- -      zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
- -      alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+ +      zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ +      alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+ +      atomic_set(&nohz.load_balancer, nr_cpu_ids);
+ +      atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+ +      atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
   #endif
         /* May be allocated at isolcpus cmdline parse time */
         if (cpu_isolated_map == NULL)
diff --combined kernel/sched_fair.c

index c9ac097609537e90031d8ba9cd8f491f2fcc3535,a878b5332daad5d7db16625f298a4e963edac909..806d1b227a21060aac100994a8992266c00b59b5
--- 1/kernel/sched_fair.c
--- 2/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@ -1240,6 -1240,7 +1240,7 @@@ static int wake_affine(struct sched_dom
          * effect of the currently running task from the load
          * of the current CPU:
          */
+       rcu_read_lock();
         if (sync) {
                 tg = task_group(current);
                 weight = current->se.load.weight;
@@@ -1275,6 -1276,7 +1276,7 @@@
                 balanced = this_eff_load <= prev_eff_load;
         } else
                 balanced = true;
+       rcu_read_unlock();
   
         /*
          * If the currently running task will sleep within
@@@ -2285,6 -2287,13 +2287,6 @@@ static void update_cpu_power(struct sch
         unsigned long power = SCHED_LOAD_SCALE;
         struct sched_group *sdg = sd->groups;
   
- -      if (sched_feat(ARCH_POWER))
- -              power *= arch_scale_freq_power(sd, cpu);
- -      else
- -              power *= default_scale_freq_power(sd, cpu);
- -
- -      power >>= SCHED_LOAD_SHIFT;
- -
         if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
                 if (sched_feat(ARCH_POWER))
                         power *= arch_scale_smt_power(sd, cpu);
@@@ -2294,15 -2303,6 +2296,15 @@@
                 power >>= SCHED_LOAD_SHIFT;
         }
   
+ +      sdg->cpu_power_orig = power;
+ +
+ +      if (sched_feat(ARCH_POWER))
+ +              power *= arch_scale_freq_power(sd, cpu);
+ +      else
+ +              power *= default_scale_freq_power(sd, cpu);
+ +
+ +      power >>= SCHED_LOAD_SHIFT;
+ +
         power *= scale_rt_power(cpu);
         power >>= SCHED_LOAD_SHIFT;
   
@@@ -2335,31 -2335,6 +2337,31 @@@ static void update_group_power(struct s
         sdg->cpu_power = power;
   }
   
+ +/*
+ + * Try and fix up capacity for tiny siblings, this is needed when
+ + * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ + * which on its own isn't powerful enough.
+ + *
+ + * See update_sd_pick_busiest() and check_asym_packing().
+ + */
+ +static inline int
+ +fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+ +{
+ +      /*
+ +       * Only siblings can have significantly less than SCHED_LOAD_SCALE
+ +       */
+ +      if (sd->level != SD_LV_SIBLING)
+ +              return 0;
+ +
+ +      /*
+ +       * If ~90% of the cpu_power is still there, we're good.
+ +       */
+ +      if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ +              return 1;
+ +
+ +      return 0;
+ +}
+ +
   /**
    * update_sg_lb_stats - Update sched_group's statistics for load balancing.
    * @sd: The sched_domain whose statistics are to be updated.
@@@ -2425,14 -2400,14 +2427,14 @@@ static inline void update_sg_lb_stats(s
          * domains. In the newly idle case, we will allow all the cpu's
          * to do the newly idle load balance.
          */
- -      if (idle != CPU_NEWLY_IDLE && local_group &&
- -          balance_cpu != this_cpu) {
- -              *balance = 0;
- -              return;
+ +      if (idle != CPU_NEWLY_IDLE && local_group) {
+ +              if (balance_cpu != this_cpu) {
+ +                      *balance = 0;
+ +                      return;
+ +              }
+ +              update_group_power(sd, this_cpu);
         }
   
- -      update_group_power(sd, this_cpu);
- -
         /* Adjust by relative CPU power of the group */
         sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
   
@@@ -2453,51 -2428,6 +2455,51 @@@
   
         sgs->group_capacity =
                 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ +      if (!sgs->group_capacity)
+ +              sgs->group_capacity = fix_small_capacity(sd, group);
+ +}
+ +
+ +/**
+ + * update_sd_pick_busiest - return true if @sg is a busier group
+ + * @sd: sched_domain whose statistics are to be checked
+ + * @sds: sched_domain statistics
+ + * @sg: sched_group candidate to be checked for being the busiest
+ + * @sgs: sched_group statistics
+ + * @this_cpu: the current cpu
+ + *
+ + * Determine if @sg is a busier group than the previously selected
+ + * busiest group.
+ + */
+ +static bool update_sd_pick_busiest(struct sched_domain *sd,
+ +                                 struct sd_lb_stats *sds,
+ +                                 struct sched_group *sg,
+ +                                 struct sg_lb_stats *sgs,
+ +                                 int this_cpu)
+ +{
+ +      if (sgs->avg_load <= sds->max_load)
+ +              return false;
+ +
+ +      if (sgs->sum_nr_running > sgs->group_capacity)
+ +              return true;
+ +
+ +      if (sgs->group_imb)
+ +              return true;
+ +
+ +      /*
+ +       * ASYM_PACKING needs to move all the work to the lowest
+ +       * numbered CPUs in the group, therefore mark all groups
+ +       * higher than ourself as busy.
+ +       */
+ +      if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ +          this_cpu < group_first_cpu(sg)) {
+ +              if (!sds->busiest)
+ +                      return true;
+ +
+ +              if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+ +                      return true;
+ +      }
+ +
+ +      return false;
   }
   
   /**
@@@ -2505,7 -2435,7 +2507,7 @@@
    * @sd: sched_domain whose statistics are to be updated.
    * @this_cpu: Cpu for which load balance is currently performed.
    * @idle: Idle status of this_cpu
- - * @sd_idle: Idle status of the sched_domain containing group.
+ + * @sd_idle: Idle status of the sched_domain containing sg.
    * @cpus: Set of cpus considered for load balancing.
    * @balance: Should we balance.
    * @sds: variable to hold the statistics for this sched_domain.
@@@ -2516,7 -2446,7 +2518,7 @@@ static inline void update_sd_lb_stats(s
                         struct sd_lb_stats *sds)
   {
         struct sched_domain *child = sd->child;
- -      struct sched_group *group = sd->groups;
+ +      struct sched_group *sg = sd->groups;
         struct sg_lb_stats sgs;
         int load_idx, prefer_sibling = 0;
   
@@@ -2529,20 -2459,21 +2531,20 @@@
         do {
                 int local_group;
   
- -              local_group = cpumask_test_cpu(this_cpu,
- -                                             sched_group_cpus(group));
+ +              local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
                 memset(&sgs, 0, sizeof(sgs));
- -              update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+ +              update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
                                 local_group, cpus, balance, &sgs);
   
                 if (local_group && !(*balance))
                         return;
   
                 sds->total_load += sgs.group_load;
- -              sds->total_pwr += group->cpu_power;
+ +              sds->total_pwr += sg->cpu_power;
   
                 /*
                  * In case the child domain prefers tasks go to siblings
- -               * first, lower the group capacity to one so that we'll try
+ +               * first, lower the sg capacity to one so that we'll try
                  * and move all the excess tasks away.
                  */
                 if (prefer_sibling)
@@@ -2550,72 -2481,23 +2552,72 @@@
   
                 if (local_group) {
                         sds->this_load = sgs.avg_load;
- -                      sds->this = group;
+ +                      sds->this = sg;
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
- -              } else if (sgs.avg_load > sds->max_load &&
- -                         (sgs.sum_nr_running > sgs.group_capacity ||
- -                              sgs.group_imb)) {
+ +              } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
- -                      sds->busiest = group;
+ +                      sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
                         sds->group_imb = sgs.group_imb;
                 }
   
- -              update_sd_power_savings_stats(group, sds, local_group, &sgs);
- -              group = group->next;
- -      } while (group != sd->groups);
+ +              update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+ +              sg = sg->next;
+ +      } while (sg != sd->groups);
+ +}
+ +
+ +int __weak arch_sd_sibling_asym_packing(void)
+ +{
+ +       return 0*SD_ASYM_PACKING;
+ +}
+ +
+ +/**
+ + * check_asym_packing - Check to see if the group is packed into the
+ + *                    sched domain.
+ + *
+ + * This is primarily intended to be used at the sibling level.  Some
+ + * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ + * case of POWER7, it can move to lower SMT modes only when higher
+ + * threads are idle.  When in lower SMT modes, the threads will
+ + * perform better since they share less core resources.  Hence when we
+ + * have idle threads, we want them to be the higher ones.
+ + *
+ + * This packing function is run on idle threads.  It checks to see if
+ + * the busiest CPU in this domain (core in the P7 case) has a higher
+ + * CPU number than the one the packing function is run on.  Here we are
+ + * assuming a lower CPU number will be equivalent to a lower SMT thread
+ + * number.
+ + *
+ + * Returns 1 when packing is required and a task should be moved to
+ + * this CPU.  The amount of the imbalance is returned in *imbalance.
+ + *
+ + * @sd: The sched_domain whose packing is to be checked.
+ + * @sds: Statistics of the sched_domain which is to be packed
+ + * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ + * @imbalance: returns amount of imbalance due to packing.
+ + */
+ +static int check_asym_packing(struct sched_domain *sd,
+ +                            struct sd_lb_stats *sds,
+ +                            int this_cpu, unsigned long *imbalance)
+ +{
+ +      int busiest_cpu;
+ +
+ +      if (!(sd->flags & SD_ASYM_PACKING))
+ +              return 0;
+ +
+ +      if (!sds->busiest)
+ +              return 0;
+ +
+ +      busiest_cpu = group_first_cpu(sds->busiest);
+ +      if (this_cpu > busiest_cpu)
+ +              return 0;
+ +
+ +      *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+ +                                     SCHED_LOAD_SCALE);
+ +      return 1;
   }
   
   /**
@@@ -2810,10 -2692,6 +2812,10 @@@ find_busiest_group(struct sched_domain 
         if (!(*balance))
                 goto ret;
   
+ +      if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+ +          check_asym_packing(sd, &sds, this_cpu, imbalance))
+ +              return sds.busiest;
+ +
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
   
@@@ -2848,9 -2726,8 +2850,9 @@@ ret
    * find_busiest_queue - find the busiest runqueue among the cpus in group.
    */
   static struct rq *
- -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- -                 unsigned long imbalance, const struct cpumask *cpus)
+ +find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+ +                 enum cpu_idle_type idle, unsigned long imbalance,
+ +                 const struct cpumask *cpus)
   {
         struct rq *busiest = NULL, *rq;
         unsigned long max_load = 0;
@@@ -2861,9 -2738,6 +2863,9 @@@
                 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
                 unsigned long wl;
   
+ +              if (!capacity)
+ +                      capacity = fix_small_capacity(sd, group);
+ +
                 if (!cpumask_test_cpu(i, cpus))
                         continue;
   
@@@ -2903,19 -2777,9 +2905,19 @@@
   /* Working cpumask for load_balance and load_balance_newidle. */
   static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
   
- -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+ +static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+ +                             int busiest_cpu, int this_cpu)
   {
         if (idle == CPU_NEWLY_IDLE) {
+ +
+ +              /*
+ +               * ASYM_PACKING needs to force migrate tasks from busy but
+ +               * higher numbered CPUs in order to pack all tasks in the
+ +               * lowest numbered CPUs.
+ +               */
+ +              if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ +                      return 1;
+ +
                 /*
                  * The only task running in a non-idle cpu can be moved to this
                  * cpu in an attempt to completely freeup the other CPU
@@@ -2990,7 -2854,7 +2992,7 @@@ redo
                 goto out_balanced;
         }
   
- -      busiest = find_busiest_queue(group, idle, imbalance, cpus);
+ +      busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
         if (!busiest) {
                 schedstat_inc(sd, lb_nobusyq[idle]);
                 goto out_balanced;
@@@ -3034,8 -2898,7 +3036,8 @@@
                 schedstat_inc(sd, lb_failed[idle]);
                 sd->nr_balance_failed++;
   
- -              if (need_active_balance(sd, sd_idle, idle)) {
+ +              if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+ +                                      this_cpu)) {
                         raw_spin_lock_irqsave(&busiest->lock, flags);
   
                         /* don't kick the active_load_balance_cpu_stop,
@@@ -3230,40 -3093,13 +3232,40 @@@ out_unlock
   }
   
   #ifdef CONFIG_NO_HZ
+ +
+ +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+ +
+ +static void trigger_sched_softirq(void *data)
+ +{
+ +      raise_softirq_irqoff(SCHED_SOFTIRQ);
+ +}
+ +
+ +static inline void init_sched_softirq_csd(struct call_single_data *csd)
+ +{
+ +      csd->func = trigger_sched_softirq;
+ +      csd->info = NULL;
+ +      csd->flags = 0;
+ +      csd->priv = 0;
+ +}
+ +
+ +/*
+ + * idle load balancing details
+ + * - One of the idle CPUs nominates itself as idle load_balancer, while
+ + *   entering idle.
+ + * - This idle load balancer CPU will also go into tickless mode when
+ + *   it is idle, just like all other idle CPUs
+ + * - When one of the busy CPUs notices that idle rebalancing may be
+ + *   needed, it will kick the idle load balancer, which then does idle
+ + *   load balancing for all the idle CPUs.
+ + */
   static struct {
         atomic_t load_balancer;
- -      cpumask_var_t cpu_mask;
- -      cpumask_var_t ilb_grp_nohz_mask;
- -} nohz ____cacheline_aligned = {
- -      .load_balancer = ATOMIC_INIT(-1),
- -};
+ +      atomic_t first_pick_cpu;
+ +      atomic_t second_pick_cpu;
+ +      cpumask_var_t idle_cpus_mask;
+ +      cpumask_var_t grp_idle_mask;
+ +      unsigned long next_balance;     /* in jiffy units */
+ +} nohz ____cacheline_aligned;
   
   int get_nohz_load_balancer(void)
   {
@@@ -3317,17 -3153,17 +3319,17 @@@ static inline struct sched_domain *lowe
    */
   static inline int is_semi_idle_group(struct sched_group *ilb_group)
   {
- -      cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ +      cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
                                         sched_group_cpus(ilb_group));
   
         /*
          * A sched_group is semi-idle when it has atleast one busy cpu
          * and atleast one idle cpu.
          */
- -      if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ +      if (cpumask_empty(nohz.grp_idle_mask))
                 return 0;
   
- -      if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ +      if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
                 return 0;
   
         return 1;
@@@ -3360,7 -3196,7 +3362,7 @@@ static int find_new_ilb(int cpu
          * Optimize for the case when we have no idle CPUs or only one
          * idle CPU. Don't walk the sched_domain hierarchy in such cases
          */
- -      if (cpumask_weight(nohz.cpu_mask) < 2)
+ +      if (cpumask_weight(nohz.idle_cpus_mask) < 2)
                 goto out_done;
   
         for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@@ -3368,7 -3204,7 +3370,7 @@@
   
                 do {
                         if (is_semi_idle_group(ilb_group))
- -                              return cpumask_first(nohz.ilb_grp_nohz_mask);
+ +                              return cpumask_first(nohz.grp_idle_mask);
   
                         ilb_group = ilb_group->next;
   
@@@ -3376,116 -3212,98 +3378,116 @@@
         }
   
   out_done:
- -      return cpumask_first(nohz.cpu_mask);
+ +      return nr_cpu_ids;
   }
   #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
   static inline int find_new_ilb(int call_cpu)
   {
- -      return cpumask_first(nohz.cpu_mask);
+ +      return nr_cpu_ids;
   }
   #endif
   
+ +/*
+ + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ + * CPU (if there is one).
+ + */
+ +static void nohz_balancer_kick(int cpu)
+ +{
+ +      int ilb_cpu;
+ +
+ +      nohz.next_balance++;
+ +
+ +      ilb_cpu = get_nohz_load_balancer();
+ +
+ +      if (ilb_cpu >= nr_cpu_ids) {
+ +              ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ +              if (ilb_cpu >= nr_cpu_ids)
+ +                      return;
+ +      }
+ +
+ +      if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+ +              struct call_single_data *cp;
+ +
+ +              cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ +              cp = &per_cpu(remote_sched_softirq_cb, cpu);
+ +              __smp_call_function_single(ilb_cpu, cp, 0);
+ +      }
+ +      return;
+ +}
+ +
   /*
    * This routine will try to nominate the ilb (idle load balancing)
    * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- - * load balancing on behalf of all those cpus. If all the cpus in the system
- - * go into this tickless mode, then there will be no ilb owner (as there is
- - * no need for one) and all the cpus will sleep till the next wakeup event
- - * arrives...
- - *
- - * For the ilb owner, tick is not stopped. And this tick will be used
- - * for idle load balancing. ilb owner will still be part of
- - * nohz.cpu_mask..
+ + * load balancing on behalf of all those cpus.
    *
- - * While stopping the tick, this cpu will become the ilb owner if there
- - * is no other owner. And will be the owner till that cpu becomes busy
- - * or if all cpus in the system stop their ticks at which point
- - * there is no need for ilb owner.
+ + * When the ilb owner becomes busy, we will not have new ilb owner until some
+ + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ + * idle load balancing by kicking one of the idle CPUs.
    *
- - * When the ilb owner becomes busy, it nominates another owner, during the
- - * next busy scheduler_tick()
+ + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ + * ilb owner CPU in future (when there is a need for idle load balancing on
+ + * behalf of all idle CPUs).
    */
- -int select_nohz_load_balancer(int stop_tick)
+ +void select_nohz_load_balancer(int stop_tick)
   {
         int cpu = smp_processor_id();
   
         if (stop_tick) {
- -              cpu_rq(cpu)->in_nohz_recently = 1;
- -
                 if (!cpu_active(cpu)) {
                         if (atomic_read(&nohz.load_balancer) != cpu)
- -                              return 0;
+ +                              return;
   
                         /*
                          * If we are going offline and still the leader,
                          * give up!
                          */
- -                      if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ +                      if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ +                                         nr_cpu_ids) != cpu)
                                 BUG();
   
- -                      return 0;
+ +                      return;
                 }
   
- -              cpumask_set_cpu(cpu, nohz.cpu_mask);
+ +              cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
   
- -              /* time for ilb owner also to sleep */
- -              if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
- -                      if (atomic_read(&nohz.load_balancer) == cpu)
- -                              atomic_set(&nohz.load_balancer, -1);
- -                      return 0;
- -              }
+ +              if (atomic_read(&nohz.first_pick_cpu) == cpu)
+ +                      atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+ +              if (atomic_read(&nohz.second_pick_cpu) == cpu)
+ +                      atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
   
- -              if (atomic_read(&nohz.load_balancer) == -1) {
- -                      /* make me the ilb owner */
- -                      if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
- -                              return 1;
- -              } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ +              if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
                         int new_ilb;
   
- -                      if (!(sched_smt_power_savings ||
- -                                              sched_mc_power_savings))
- -                              return 1;
+ +                      /* make me the ilb owner */
+ +                      if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+ +                                         cpu) != nr_cpu_ids)
+ +                              return;
+ +
                         /*
                          * Check to see if there is a more power-efficient
                          * ilb.
                          */
                         new_ilb = find_new_ilb(cpu);
                         if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- -                              atomic_set(&nohz.load_balancer, -1);
+ +                              atomic_set(&nohz.load_balancer, nr_cpu_ids);
                                 resched_cpu(new_ilb);
- -                              return 0;
+ +                              return;
                         }
- -                      return 1;
+ +                      return;
                 }
         } else {
- -              if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
- -                      return 0;
+ +              if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ +                      return;
   
- -              cpumask_clear_cpu(cpu, nohz.cpu_mask);
+ +              cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
   
                 if (atomic_read(&nohz.load_balancer) == cpu)
- -                      if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ +                      if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ +                                         nr_cpu_ids) != cpu)
                                 BUG();
         }
- -      return 0;
+ +      return;
   }
   #endif
   
@@@ -3567,102 -3385,11 +3569,102 @@@ out
                 rq->next_balance = next_balance;
   }
   
+ +#ifdef CONFIG_NO_HZ
   /*
- - * run_rebalance_domains is triggered when needed from the scheduler tick.
- - * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ + * In CONFIG_NO_HZ case, the idle balance kickee will do the
    * rebalancing for all the cpus for whom scheduler ticks are stopped.
    */
+ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+ +{
+ +      struct rq *this_rq = cpu_rq(this_cpu);
+ +      struct rq *rq;
+ +      int balance_cpu;
+ +
+ +      if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+ +              return;
+ +
+ +      for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ +              if (balance_cpu == this_cpu)
+ +                      continue;
+ +
+ +              /*
+ +               * If this cpu gets work to do, stop the load balancing
+ +               * work being done for other cpus. Next load
+ +               * balancing owner will pick it up.
+ +               */
+ +              if (need_resched()) {
+ +                      this_rq->nohz_balance_kick = 0;
+ +                      break;
+ +              }
+ +
+ +              raw_spin_lock_irq(&this_rq->lock);
+ +              update_rq_clock(this_rq);
+ +              update_cpu_load(this_rq);
+ +              raw_spin_unlock_irq(&this_rq->lock);
+ +
+ +              rebalance_domains(balance_cpu, CPU_IDLE);
+ +
+ +              rq = cpu_rq(balance_cpu);
+ +              if (time_after(this_rq->next_balance, rq->next_balance))
+ +                      this_rq->next_balance = rq->next_balance;
+ +      }
+ +      nohz.next_balance = this_rq->next_balance;
+ +      this_rq->nohz_balance_kick = 0;
+ +}
+ +
+ +/*
+ + * Current heuristic for kicking the idle load balancer
+ + * - first_pick_cpu is one of the busy CPUs. It will kick
+ + *   idle load balancer when it has more than one process active. This
+ + *   eliminates the need for idle load balancing altogether when we have
+ + *   only one running process in the system (common case).
+ + * - If there is more than one busy CPU, idle load balancer may have
+ + *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ + *   SMT or core siblings and can run better if they move to different
+ + *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ + *   which will kick idle load balancer as soon as it has any load.
+ + */
+ +static inline int nohz_kick_needed(struct rq *rq, int cpu)
+ +{
+ +      unsigned long now = jiffies;
+ +      int ret;
+ +      int first_pick_cpu, second_pick_cpu;
+ +
+ +      if (time_before(now, nohz.next_balance))
+ +              return 0;
+ +
+ +      if (!rq->nr_running)
+ +              return 0;
+ +
+ +      first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+ +      second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+ +
+ +      if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+ +          second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ +              return 0;
+ +
+ +      ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+ +      if (ret == nr_cpu_ids || ret == cpu) {
+ +              atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ +              if (rq->nr_running > 1)
+ +                      return 1;
+ +      } else {
+ +              ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+ +              if (ret == nr_cpu_ids || ret == cpu) {
+ +                      if (rq->nr_running)
+ +                              return 1;
+ +              }
+ +      }
+ +      return 0;
+ +}
+ +#else
+ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+ +#endif
+ +
+ +/*
+ + * run_rebalance_domains is triggered when needed from the scheduler tick.
+ + * Also triggered for nohz idle balancing (with nohz_balance_kick set).
+ + */
   static void run_rebalance_domains(struct softirq_action *h)
   {
         int this_cpu = smp_processor_id();
@@@ -3672,12 -3399,37 +3674,12 @@@
   
         rebalance_domains(this_cpu, idle);
   
- -#ifdef CONFIG_NO_HZ
         /*
- -       * If this cpu is the owner for idle load balancing, then do the
+ +       * If this cpu has a pending nohz_balance_kick, then do the
          * balancing on behalf of the other idle cpus whose ticks are
          * stopped.
          */
- -      if (this_rq->idle_at_tick &&
- -          atomic_read(&nohz.load_balancer) == this_cpu) {
- -              struct rq *rq;
- -              int balance_cpu;
- -
- -              for_each_cpu(balance_cpu, nohz.cpu_mask) {
- -                      if (balance_cpu == this_cpu)
- -                              continue;
- -
- -                      /*
- -                       * If this cpu gets work to do, stop the load balancing
- -                       * work being done for other cpus. Next load
- -                       * balancing owner will pick it up.
- -                       */
- -                      if (need_resched())
- -                              break;
- -
- -                      rebalance_domains(balance_cpu, CPU_IDLE);
- -
- -                      rq = cpu_rq(balance_cpu);
- -                      if (time_after(this_rq->next_balance, rq->next_balance))
- -                              this_rq->next_balance = rq->next_balance;
- -              }
- -      }
- -#endif
+ +      nohz_idle_balance(this_cpu, idle);
   }
   
   static inline int on_null_domain(int cpu)
@@@ -3687,17 -3439,57 +3689,17 @@@
   
   /*
    * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- - *
- - * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- - * idle load balancing owner or decide to stop the periodic load balancing,
- - * if the whole system is idle.
    */
   static inline void trigger_load_balance(struct rq *rq, int cpu)
   {
- -#ifdef CONFIG_NO_HZ
- -      /*
- -       * If we were in the nohz mode recently and busy at the current
- -       * scheduler tick, then check if we need to nominate new idle
- -       * load balancer.
- -       */
- -      if (rq->in_nohz_recently && !rq->idle_at_tick) {
- -              rq->in_nohz_recently = 0;
- -
- -              if (atomic_read(&nohz.load_balancer) == cpu) {
- -                      cpumask_clear_cpu(cpu, nohz.cpu_mask);
- -                      atomic_set(&nohz.load_balancer, -1);
- -              }
- -
- -              if (atomic_read(&nohz.load_balancer) == -1) {
- -                      int ilb = find_new_ilb(cpu);
- -
- -                      if (ilb < nr_cpu_ids)
- -                              resched_cpu(ilb);
- -              }
- -      }
- -
- -      /*
- -       * If this cpu is idle and doing idle load balancing for all the
- -       * cpus with ticks stopped, is it time for that to stop?
- -       */
- -      if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- -          cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
- -              resched_cpu(cpu);
- -              return;
- -      }
- -
- -      /*
- -       * If this cpu is idle and the idle load balancing is done by
- -       * someone else, then no need raise the SCHED_SOFTIRQ
- -       */
- -      if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- -          cpumask_test_cpu(cpu, nohz.cpu_mask))
- -              return;
- -#endif
         /* Don't need to rebalance while attached to NULL domain */
         if (time_after_eq(jiffies, rq->next_balance) &&
             likely(!on_null_domain(cpu)))
                 raise_softirq(SCHED_SOFTIRQ);
+ +#ifdef CONFIG_NO_HZ
+ +      else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ +              nohz_balancer_kick(cpu);
+ +#endif
   }
   
   static void rq_online_fair(struct rq *rq)
diff --combined kernel/time/tick-sched.c

index 5f171f04ab009756879abed192029190483ffa3e,813993b5fb61048f24f876a7247e4f8ac2c861a2..17525cac6cfefd136d9a4f68e942dfe0346bffa1
--- 1/kernel/time/tick-sched.c
--- 2/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@@ -154,14 -154,14 +154,14 @@@ static void tick_nohz_update_jiffies(kt
    * Updates the per cpu time idle statistics counters
    */
   static void
- update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
   {
         ktime_t delta;
   
         if (ts->idle_active) {
                 delta = ktime_sub(now, ts->idle_entrytime);
                 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-               if (nr_iowait_cpu() > 0)
+               if (nr_iowait_cpu(cpu) > 0)
                         ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
                 ts->idle_entrytime = now;
         }
@@@ -175,19 -175,19 +175,19 @@@ static void tick_nohz_stop_idle(int cpu
   {
         struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
   
-       update_ts_time_stats(ts, now, NULL);
+       update_ts_time_stats(cpu, ts, now, NULL);
         ts->idle_active = 0;
   
         sched_clock_idle_wakeup_event(0);
   }
   
- static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
   {
         ktime_t now;
   
         now = ktime_get();
   
-       update_ts_time_stats(ts, now, NULL);
+       update_ts_time_stats(cpu, ts, now, NULL);
   
         ts->idle_entrytime = now;
         ts->idle_active = 1;
@@@ -216,7 -216,7 +216,7 @@@ u64 get_cpu_idle_time_us(int cpu, u64 *
         if (!tick_nohz_enabled)
                 return -1;
   
-       update_ts_time_stats(ts, ktime_get(), last_update_time);
+       update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
   
         return ktime_to_us(ts->idle_sleeptime);
   }
@@@ -242,7 -242,7 +242,7 @@@ u64 get_cpu_iowait_time_us(int cpu, u6
         if (!tick_nohz_enabled)
                 return -1;
   
-       update_ts_time_stats(ts, ktime_get(), last_update_time);
+       update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
   
         return ktime_to_us(ts->iowait_sleeptime);
   }
@@@ -284,7 -284,7 +284,7 @@@ void tick_nohz_stop_sched_tick(int inid
          */
         ts->inidle = 1;
   
-       now = tick_nohz_start_idle(ts);
+       now = tick_nohz_start_idle(cpu, ts);
   
         /*
          * If this cpu is offline and it is the one which updates
@@@ -315,9 -315,6 +315,6 @@@
                 goto end;
         }
   
-       if (nohz_ratelimit(cpu))
-               goto end;
- 
         ts->idle_calls++;
         /* Read jiffies and the time when jiffies were updated last */
         do {
@@@ -328,7 -325,7 +325,7 @@@
         } while (read_seqretry(&xtime_lock, seq));
   
         if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-           arch_needs_cpu(cpu)) {
+           arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
                 next_jiffies = last_jiffies + 1;
                 delta_jiffies = 1;
         } else {
@@@ -408,7 -405,13 +405,7 @@@
                  * the scheduler tick in nohz_restart_sched_tick.
                  */
                 if (!ts->tick_stopped) {
- -                      if (select_nohz_load_balancer(1)) {
- -                              /*
- -                               * sched tick not stopped!
- -                               */
- -                              cpumask_clear_cpu(cpu, nohz_cpu_mask);
- -                              goto out;
- -                      }
+ +                      select_nohz_load_balancer(1);
   
                         ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
                         ts->tick_stopped = 1;
author	Ingo Molnar <mingo@elte.hu>
	Wed, 21 Jul 2010 19:45:02 +0000 (21:45 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Wed, 21 Jul 2010 19:45:08 +0000 (21:45 +0200)
		1	2
arch/powerpc/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/time/tick-sched.c	patch \|	diff1 \|	diff2 \|	blob \| history