sched: Merge select_task_rq_fair() and sched_balance_self()
authorPeter Zijlstra <a.p.zijlstra@chello.nl>
Thu, 10 Sep 2009 11:50:02 +0000 (13:50 +0200)
committerIngo Molnar <mingo@elte.hu>
Tue, 15 Sep 2009 14:01:05 +0000 (16:01 +0200)
The problem with wake_idle() is that it doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent
RT interaction.

To cure this, it needs to do what sched_balance_self() does, which
leads to the possibility of merging select_task_rq_fair() and
sched_balance_self().

Modify sched_balance_self() to:

  - update_shares() when walking up the domain tree,
    (it only called it for the top domain, but it should
     have done this anyway), which allows us to remove
    this ugly bit from try_to_wake_up().

  - do wake_affine() on the smallest domain that contains
    both this (the waking) and the prev (the wakee) cpu for
    WAKE invocations.

Then use the top-down balance steps it had to replace wake_idle().

This leads to the disappearance of SD_WAKE_BALANCE and
SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE.

SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.

Touch all topology bits to replace the old with new SD flags --
platforms might need re-tuning, enabling SD_BALANCE_WAKE
conditionally on a NUMA distance seems like a good additional
feature, Magny-Cours and small Nehalem systems would want this
enabled, systems with slow interconnects would not.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/ia64/include/asm/topology.h
arch/mips/include/asm/mach-ip27/topology.h
arch/powerpc/include/asm/topology.h
arch/sh/include/asm/topology.h
arch/sparc/include/asm/topology_64.h
arch/x86/include/asm/topology.h
include/linux/sched.h
include/linux/topology.h
kernel/sched.c
kernel/sched_fair.c

index 7b4c8c70b2d18f7949372899ab056f41d913ed6b..cf6053b226c38383de57774346fdb1ed4b4d8e39 100644 (file)
@@ -67,6 +67,7 @@ void build_cpu_to_node_map(void);
        .flags                  = SD_LOAD_BALANCE       \
                                | SD_BALANCE_NEWIDLE    \
                                | SD_BALANCE_EXEC       \
+                               | SD_BALANCE_WAKE       \
                                | SD_WAKE_AFFINE,       \
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
@@ -91,8 +92,8 @@ void build_cpu_to_node_map(void);
        .flags                  = SD_LOAD_BALANCE       \
                                | SD_BALANCE_EXEC       \
                                | SD_BALANCE_FORK       \
-                               | SD_SERIALIZE          \
-                               | SD_WAKE_BALANCE,      \
+                               | SD_BALANCE_WAKE       \
+                               | SD_SERIALIZE,         \
        .last_balance           = jiffies,              \
        .balance_interval       = 64,                   \
        .nr_balance_failed      = 0,                    \
index 07547231e078dd3ecd35a988b3cd73bfc30c3683..d8332398f5be1eccd46936f00f685bc9403d90ca 100644 (file)
@@ -48,7 +48,7 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
        .cache_nice_tries       = 1,                    \
        .flags                  = SD_LOAD_BALANCE       \
                                | SD_BALANCE_EXEC       \
-                               | SD_WAKE_BALANCE,      \
+                               | SD_BALANCE_WAKE,      \
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
        .nr_balance_failed      = 0,                    \
index 054a16d68082a852c460d96436e9f3b7bb3ce41c..c6343313ff592f7d8ccd9ffd8913f3a2e0dcc771 100644 (file)
@@ -62,9 +62,8 @@ static inline int pcibus_to_node(struct pci_bus *bus)
        .flags                  = SD_LOAD_BALANCE       \
                                | SD_BALANCE_EXEC       \
                                | SD_BALANCE_NEWIDLE    \
-                               | SD_WAKE_IDLE          \
-                               | SD_SERIALIZE          \
-                               | SD_WAKE_BALANCE,      \
+                               | SD_BALANCE_WAKE       \
+                               | SD_SERIALIZE,         \
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
        .nr_balance_failed      = 0,                    \
index b69ee850906d037e5bf24be4c19766729ede6825..dc1531e2f25f52cfe235038a0579b094d80c858d 100644 (file)
@@ -21,8 +21,8 @@
        .flags                  = SD_LOAD_BALANCE       \
                                | SD_BALANCE_FORK       \
                                | SD_BALANCE_EXEC       \
-                               | SD_SERIALIZE          \
-                               | SD_WAKE_BALANCE,      \
+                               | SD_BALANCE_WAKE       \
+                               | SD_SERIALIZE,         \
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
        .nr_balance_failed      = 0,                    \
index e5ea8d332421d1ad1441541c8e013d62d099e444..1d091abd2d132d4ddf1208d81f52c9742f046cf9 100644 (file)
@@ -57,8 +57,8 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
        .flags                  = SD_LOAD_BALANCE       \
                                | SD_BALANCE_FORK       \
                                | SD_BALANCE_EXEC       \
-                               | SD_SERIALIZE          \
-                               | SD_WAKE_BALANCE,      \
+                               | SD_BALANCE_WAKE       \
+                               | SD_SERIALIZE,         \
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
 }
index 26d06e052a181073851197c2110ce8d61916d5be..966d58dc62742ca1802c5f6b582ff9dcec65a214 100644 (file)
@@ -145,14 +145,12 @@ extern unsigned long node_remap_size[];
                                | 1*SD_BALANCE_NEWIDLE                  \
                                | 1*SD_BALANCE_EXEC                     \
                                | 1*SD_BALANCE_FORK                     \
-                               | 0*SD_WAKE_IDLE                        \
+                               | 1*SD_BALANCE_WAKE                     \
                                | 1*SD_WAKE_AFFINE                      \
-                               | 1*SD_WAKE_BALANCE                     \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 0*SD_POWERSAVINGS_BALANCE             \
                                | 0*SD_SHARE_PKG_RESOURCES              \
                                | 1*SD_SERIALIZE                        \
-                               | 1*SD_WAKE_IDLE_FAR                    \
                                | 0*SD_PREFER_SIBLING                   \
                                ,                                       \
        .last_balance           = jiffies,                              \
index 3b0ca66bd6cee1c577d0ee9751797ac47517afdc..c30bf3d516d11fa180579215402b392331e3fbf0 100644 (file)
@@ -803,16 +803,15 @@ enum cpu_idle_type {
 #define SD_BALANCE_NEWIDLE     0x0002  /* Balance when about to become idle */
 #define SD_BALANCE_EXEC                0x0004  /* Balance on exec */
 #define SD_BALANCE_FORK                0x0008  /* Balance on fork, clone */
-#define SD_WAKE_IDLE           0x0010  /* Wake to idle CPU on task wakeup */
+#define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
-#define SD_WAKE_BALANCE                0x0040  /* Perform balancing at task wakeup */
+
 #define SD_SHARE_CPUPOWER      0x0080  /* Domain members share cpu power */
 #define SD_POWERSAVINGS_BALANCE        0x0100  /* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share cpu pkg resources */
 #define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
-#define SD_WAKE_IDLE_FAR       0x0800  /* Gain latency sacrificing cache hit */
+
 #define SD_PREFER_SIBLING      0x1000  /* Prefer to place tasks in a sibling domain */
-#define SD_BALANCE_WAKE                0x2000  /* Balance on wakeup */
 
 enum powersavings_balance_level {
        POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
index 85e8cf7d393c6e11bf509d73ca9b24bbfeef23bf..6a8cd15555bb2ef3154dcab175d743e4ad08ad1b 100644 (file)
@@ -95,14 +95,12 @@ int arch_update_cpu_topology(void);
                                | 1*SD_BALANCE_NEWIDLE                  \
                                | 1*SD_BALANCE_EXEC                     \
                                | 1*SD_BALANCE_FORK                     \
-                               | 0*SD_WAKE_IDLE                        \
+                               | 1*SD_BALANCE_WAKE                     \
                                | 1*SD_WAKE_AFFINE                      \
-                               | 1*SD_WAKE_BALANCE                     \
                                | 1*SD_SHARE_CPUPOWER                   \
                                | 0*SD_POWERSAVINGS_BALANCE             \
                                | 0*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
-                               | 0*SD_WAKE_IDLE_FAR                    \
                                | 0*SD_PREFER_SIBLING                   \
                                ,                                       \
        .last_balance           = jiffies,                              \
@@ -129,13 +127,11 @@ int arch_update_cpu_topology(void);
                                | 1*SD_BALANCE_NEWIDLE                  \
                                | 1*SD_BALANCE_EXEC                     \
                                | 1*SD_BALANCE_FORK                     \
-                               | 1*SD_WAKE_IDLE                        \
+                               | 1*SD_BALANCE_WAKE                     \
                                | 1*SD_WAKE_AFFINE                      \
-                               | 1*SD_WAKE_BALANCE                     \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 1*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
-                               | 0*SD_WAKE_IDLE_FAR                    \
                                | sd_balance_for_mc_power()             \
                                | sd_power_saving_flags()               \
                                ,                                       \
@@ -163,13 +159,11 @@ int arch_update_cpu_topology(void);
                                | 1*SD_BALANCE_NEWIDLE                  \
                                | 1*SD_BALANCE_EXEC                     \
                                | 1*SD_BALANCE_FORK                     \
-                               | 1*SD_WAKE_IDLE                        \
+                               | 1*SD_BALANCE_WAKE                     \
                                | 0*SD_WAKE_AFFINE                      \
-                               | 1*SD_WAKE_BALANCE                     \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 0*SD_SHARE_PKG_RESOURCES              \
                                | 0*SD_SERIALIZE                        \
-                               | 0*SD_WAKE_IDLE_FAR                    \
                                | sd_balance_for_package_power()        \
                                | sd_power_saving_flags()               \
                                ,                                       \
@@ -191,14 +185,12 @@ int arch_update_cpu_topology(void);
                                | 1*SD_BALANCE_NEWIDLE                  \
                                | 0*SD_BALANCE_EXEC                     \
                                | 0*SD_BALANCE_FORK                     \
-                               | 0*SD_WAKE_IDLE                        \
+                               | 0*SD_BALANCE_WAKE                     \
                                | 1*SD_WAKE_AFFINE                      \
-                               | 0*SD_WAKE_BALANCE                     \
                                | 0*SD_SHARE_CPUPOWER                   \
                                | 0*SD_POWERSAVINGS_BALANCE             \
                                | 0*SD_SHARE_PKG_RESOURCES              \
                                | 1*SD_SERIALIZE                        \
-                               | 1*SD_WAKE_IDLE_FAR                    \
                                | 0*SD_PREFER_SIBLING                   \
                                ,                                       \
        .last_balance           = jiffies,                              \
index fc6fda881d2e9ec9039415d331df109f2d70f014..6c819f338b11a6aeadf1a67e5edaf40dc05c08cf 100644 (file)
@@ -512,14 +512,6 @@ struct root_domain {
 #ifdef CONFIG_SMP
        struct cpupri cpupri;
 #endif
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       /*
-        * Preferred wake up cpu nominated by sched_mc balance that will be
-        * used when most cpus are idle in the system indicating overall very
-        * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
-        */
-       unsigned int sched_mc_preferred_wakeup_cpu;
-#endif
 };
 
 /*
@@ -2315,22 +2307,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        if (!sched_feat(SYNC_WAKEUPS))
                sync = 0;
 
-#ifdef CONFIG_SMP
-       if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
-               struct sched_domain *sd;
-
-               this_cpu = raw_smp_processor_id();
-               cpu = task_cpu(p);
-
-               for_each_domain(this_cpu, sd) {
-                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               update_shares(sd);
-                               break;
-                       }
-               }
-       }
-#endif
-
        this_cpu = get_cpu();
 
        smp_wmb();
@@ -3533,11 +3509,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
        *imbalance = sds->min_load_per_task;
        sds->busiest = sds->group_min;
 
-       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-                       group_first_cpu(sds->group_leader);
-       }
-
        return 1;
 
 }
@@ -7850,9 +7821,7 @@ static int sd_degenerate(struct sched_domain *sd)
        }
 
        /* Following flags don't use groups */
-       if (sd->flags & (SD_WAKE_IDLE |
-                        SD_WAKE_AFFINE |
-                        SD_WAKE_BALANCE))
+       if (sd->flags & (SD_WAKE_AFFINE))
                return 0;
 
        return 1;
@@ -7869,10 +7838,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
        if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;
 
-       /* Does parent contain flags not in child? */
-       /* WAKE_BALANCE is a subset of WAKE_AFFINE */
-       if (cflags & SD_WAKE_AFFINE)
-               pflags &= ~SD_WAKE_BALANCE;
        /* Flags needing groups don't count if only 1 group in parent */
        if (parent->groups == parent->groups->next) {
                pflags &= ~(SD_LOAD_BALANCE |
@@ -8558,10 +8523,10 @@ static void set_domain_attribute(struct sched_domain *sd,
                request = attr->relax_domain_level;
        if (request < sd->level) {
                /* turn off idle balance on this domain */
-               sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+               sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
        } else {
                /* turn on idle balance on this domain */
-               sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+               sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
        }
 }
 
index f2eb5b9347152c76224c112b801b61c33d225bf1..09d19f77eb3a849f421a05127421a7778424b53d 100644 (file)
@@ -1062,83 +1062,6 @@ static void yield_task_fair(struct rq *rq)
        se->vruntime = rightmost->vruntime + 1;
 }
 
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available.  The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (rq->rd->online)
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-
-#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
-
-static int wake_idle(int cpu, struct task_struct *p)
-{
-       struct sched_domain *sd;
-       int i;
-       unsigned int chosen_wakeup_cpu;
-       int this_cpu;
-       struct rq *task_rq = task_rq(p);
-
-       /*
-        * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-        * are idle and this is not a kernel thread and this task's affinity
-        * allows it to be moved to preferred cpu, then just move!
-        */
-
-       this_cpu = smp_processor_id();
-       chosen_wakeup_cpu =
-               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-
-       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-               idle_cpu(cpu) && idle_cpu(this_cpu) &&
-               p->mm && !(p->flags & PF_KTHREAD) &&
-               cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-               return chosen_wakeup_cpu;
-
-       /*
-        * If it is idle, then it is the best cpu to run this task.
-        *
-        * This cpu is also the best, if it has more than one task already.
-        * Siblings must be also busy(in most cases) as they didn't already
-        * pickup the extra load from this cpu and hence we need not check
-        * sibling runqueue info. This will avoid the checks and cache miss
-        * penalities associated with that.
-        */
-       if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-               return cpu;
-
-       for_each_domain(cpu, sd) {
-               if ((sd->flags & SD_WAKE_IDLE)
-                   || ((sd->flags & SD_WAKE_IDLE_FAR)
-                       && !task_hot(p, task_rq->clock, sd))) {
-                       for_each_cpu_and(i, sched_domain_span(sd),
-                                        &p->cpus_allowed) {
-                               if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-                                       if (i != task_cpu(p)) {
-                                               schedstat_inc(p,
-                                                      se.nr_wakeups_idle);
-                                       }
-                                       return i;
-                               }
-                       }
-               } else {
-                       break;
-               }
-       }
-       return cpu;
-}
-#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-       return cpu;
-}
-#endif
-
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,21 +1148,22 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
-static int
-wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-           struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-           int idx, unsigned long load, unsigned long this_load,
-           unsigned int imbalance)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-       struct task_struct *curr = this_rq->curr;
-       struct task_group *tg;
-       unsigned long tl = this_load;
+       struct task_struct *curr = current;
+       unsigned long this_load, load;
+       int idx, this_cpu, prev_cpu;
        unsigned long tl_per_task;
+       unsigned int imbalance;
+       struct task_group *tg;
        unsigned long weight;
        int balanced;
 
-       if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
-               return 0;
+       idx       = sd->wake_idx;
+       this_cpu  = smp_processor_id();
+       prev_cpu  = task_cpu(p);
+       load      = source_load(prev_cpu, idx);
+       this_load = target_load(this_cpu, idx);
 
        if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
                        p->se.avg_overlap > sysctl_sched_migration_cost))
@@ -1254,24 +1178,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
                tg = task_group(current);
                weight = current->se.load.weight;
 
-               tl += effective_load(tg, this_cpu, -weight, -weight);
+               this_load += effective_load(tg, this_cpu, -weight, -weight);
                load += effective_load(tg, prev_cpu, 0, -weight);
        }
 
        tg = task_group(p);
        weight = p->se.load.weight;
 
+       imbalance = 100 + (sd->imbalance_pct - 100) / 2;
+
        /*
         * In low-load situations, where prev_cpu is idle and this_cpu is idle
-        * due to the sync cause above having dropped tl to 0, we'll always have
-        * an imbalance, but there's really nothing you can do about that, so
-        * that's good too.
+        * due to the sync cause above having dropped this_load to 0, we'll
+        * always have an imbalance, but there's really nothing you can do
+        * about that, so that's good too.
         *
         * Otherwise check if either cpus are near enough in load to allow this
         * task to be woken on this_cpu.
         */
-       balanced = !tl ||
-               100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+       balanced = !this_load ||
+               100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
                imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
        /*
@@ -1285,14 +1211,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
        schedstat_inc(p, se.nr_wakeups_affine_attempts);
        tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-       if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-                       tl_per_task)) {
+       if (balanced ||
+           (this_load <= load &&
+            this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
                /*
                 * This domain has SD_WAKE_AFFINE and
                 * p is cache cold in this domain, and
                 * there is no bad imbalance.
                 */
-               schedstat_inc(this_sd, ttwu_move_affine);
+               schedstat_inc(sd, ttwu_move_affine);
                schedstat_inc(p, se.nr_wakeups_affine);
 
                return 1;
@@ -1300,72 +1227,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
        return 0;
 }
 
-static int sched_balance_self(int cpu, int flag);
-
-static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
-{
-       struct sched_domain *sd, *this_sd = NULL;
-       int prev_cpu, this_cpu, new_cpu;
-       unsigned long load, this_load;
-       struct rq *this_rq;
-       unsigned int imbalance;
-       int idx;
-
-       prev_cpu        = task_cpu(p);
-       this_cpu        = smp_processor_id();
-       this_rq         = cpu_rq(this_cpu);
-       new_cpu         = prev_cpu;
-
-       if (flag != SD_BALANCE_WAKE)
-               return sched_balance_self(this_cpu, flag);
-
-       /*
-        * 'this_sd' is the first domain that both
-        * this_cpu and prev_cpu are present in:
-        */
-       for_each_domain(this_cpu, sd) {
-               if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
-                       this_sd = sd;
-                       break;
-               }
-       }
-
-       if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
-               goto out;
-
-       /*
-        * Check for affine wakeup and passive balancing possibilities.
-        */
-       if (!this_sd)
-               goto out;
-
-       idx = this_sd->wake_idx;
-
-       imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-       load = source_load(prev_cpu, idx);
-       this_load = target_load(this_cpu, idx);
-
-       if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
-                                    load, this_load, imbalance))
-               return this_cpu;
-
-       /*
-        * Start passive balancing when half the imbalance_pct
-        * limit is reached.
-        */
-       if (this_sd->flags & SD_WAKE_BALANCE) {
-               if (imbalance*this_load <= 100*load) {
-                       schedstat_inc(this_sd, ttwu_move_balance);
-                       schedstat_inc(p, se.nr_wakeups_passive);
-                       return this_cpu;
-               }
-       }
-
-out:
-       return wake_idle(new_cpu, p);
-}
-
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -1455,10 +1316,20 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int sched_balance_self(int cpu, int flag)
+static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 {
        struct task_struct *t = current;
        struct sched_domain *tmp, *sd = NULL;
+       int cpu = smp_processor_id();
+       int prev_cpu = task_cpu(p);
+       int new_cpu = cpu;
+       int want_affine = 0;
+
+       if (flag & SD_BALANCE_WAKE) {
+               if (sched_feat(AFFINE_WAKEUPS))
+                       want_affine = 1;
+               new_cpu = prev_cpu;
+       }
 
        for_each_domain(cpu, tmp) {
                /*
@@ -1466,16 +1337,38 @@ static int sched_balance_self(int cpu, int flag)
                 */
                if (tmp->flags & SD_POWERSAVINGS_BALANCE)
                        break;
-               if (tmp->flags & flag)
-                       sd = tmp;
-       }
 
-       if (sd)
-               update_shares(sd);
+               switch (flag) {
+               case SD_BALANCE_WAKE:
+                       if (!sched_feat(LB_WAKEUP_UPDATE))
+                               break;
+               case SD_BALANCE_FORK:
+               case SD_BALANCE_EXEC:
+                       if (root_task_group_empty())
+                               break;
+                       update_shares(tmp);
+               default:
+                       break;
+               }
+
+               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+
+                       if (wake_affine(tmp, p, sync))
+                               return cpu;
+
+                       want_affine = 0;
+               }
+
+               if (!(tmp->flags & flag))
+                       continue;
+
+               sd = tmp;
+       }
 
        while (sd) {
                struct sched_group *group;
-               int new_cpu, weight;
+               int weight;
 
                if (!(sd->flags & flag)) {
                        sd = sd->child;
@@ -1508,7 +1401,7 @@ static int sched_balance_self(int cpu, int flag)
                /* while loop will break here if sd == NULL */
        }
 
-       return cpu;
+       return new_cpu;
 }
 #endif /* CONFIG_SMP */