UPSTREAM: sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list
[firefly-linux-kernel-4.4.55.git] kernel/sched/fair.c
index efa516dfd6bc585e7b5bf2a6846ef29ad7e9bde1..15ccfbff1bde46473a9eb3116db2e97fc12edc78 100644
@@ -53,7 +53,6 @@
 unsigned int sysctl_sched_latency = 6000000ULL;
 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 
-unsigned int sysctl_sched_is_big_little = 0;
 unsigned int sysctl_sched_sync_hint_enable = 1;
 unsigned int sysctl_sched_initial_task_util = 0;
 unsigned int sysctl_sched_cstate_aware = 1;
@@ -128,6 +127,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
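The margin is consumed further down in checks such as cpu_overutilized() and wake_cap(), which keep the comparison in integer arithmetic. As a self-contained illustration with hypothetical capacity numbers, where fits_capacity() is just a local helper mirroring that comparison (not a function in this tree):

#include <stdio.h>

static unsigned int capacity_margin = 1280;	/* ~20% headroom */

/* Mirrors "util * margin < capacity * 1024" as used by the scheduler. */
static int fits_capacity(unsigned long util, unsigned long capacity)
{
	return util * capacity_margin < capacity * 1024;
}

int main(void)
{
	printf("%d\n", fits_capacity(400, 430));   /* 0: 512000 >= 440320, too big for a 430-capacity CPU */
	printf("%d\n", fits_capacity(400, 1024));  /* 1: 512000 <  1048576, fits a 1024-capacity CPU */
	return 0;
}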
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
        lw->weight += inc;
@@ -300,19 +305,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
        if (!cfs_rq->on_list) {
+               struct rq *rq = rq_of(cfs_rq);
+               int cpu = cpu_of(rq);
                /*
                 * Ensure we either appear before our parent (if already
                 * enqueued) or force our parent to appear after us when it is
-                * enqueued.  The fact that we always enqueue bottom-up
-                * reduces this to two cases.
+                * enqueued. The fact that we always enqueue bottom-up
+                * reduces this to two cases and a special case for the root
+                * cfs_rq. Furthermore, it means that we will always reset
+                * tmp_alone_branch either when the branch is connected
+                * to a tree or when we reach the top of the tree.
                 */
                if (cfs_rq->tg->parent &&
-                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
-                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
-               } else {
+                   cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+                       /*
+                        * If parent is already on the list, we add the child
+                        * just before. Thanks to the circular linked property
+                        * of the list, this means putting the child at the tail
+                        * of the list that starts with the parent.
+                        */
                        list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+                               &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+                       /*
+                        * The branch is now connected to its tree so we can
+                        * reset tmp_alone_branch to the beginning of the
+                        * list.
+                        */
+                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               } else if (!cfs_rq->tg->parent) {
+                       /*
+                        * A cfs_rq without a parent should be put
+                        * at the tail of the list.
+                        */
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq->leaf_cfs_rq_list);
+                       /*
+                        * We have reached the top of a tree so we can reset
+                        * tmp_alone_branch to the beginning of the list.
+                        */
+                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               } else {
+                       /*
+                        * The parent has not been added yet, so we want to
+                        * make sure that it will be put after us.
+                        * tmp_alone_branch points to the beginning of the
+                        * branch where we will add the parent.
+                        */
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               rq->tmp_alone_branch);
+                       /*
+                        * Update tmp_alone_branch to point to the new
+                        * beginning of the branch.
+                        */
+                       rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
                }
 
                cfs_rq->on_list = 1;
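The three cases above exist to keep rq->leaf_cfs_rq_list ordered bottom-up, with every child cfs_rq ahead of its parent, so that a single list walk can fold freshly updated child load into its parent. A rough sketch of a consumer relying on that invariant, assuming the for_each_leaf_cfs_rq() iterator fair.c defines over rq->leaf_cfs_rq_list (the update helper named here is purely illustrative):

static void walk_leaf_cfs_rqs(struct rq *rq)
{
	struct cfs_rq *cfs_rq;

	/*
	 * Children are visited before their parent, so by the time a group
	 * entity's cfs_rq is reached, its children are already up to date.
	 */
	for_each_leaf_cfs_rq(rq, cfs_rq)
		update_one_cfs_rq(cfs_rq);	/* illustrative helper */
}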
@@ -670,7 +715,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 /*
@@ -696,18 +741,108 @@ void init_entity_runnable_average(struct sched_entity *se)
        sa->period_contrib = 1023;
        sa->load_avg = scale_load_down(se->load.weight);
        sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-       sa->util_avg =  sched_freq() ?
-               sysctl_sched_initial_task_util :
-               scale_load_down(SCHED_LOAD_SCALE);
-       sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+       /*
+        * In previous Android versions, we used to have:
+        *      sa->util_avg =  sched_freq() ?
+        *              sysctl_sched_initial_task_util :
+        *              scale_load_down(SCHED_LOAD_SCALE);
+        *      sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+        * However, that functionality has been moved to the enqueue path.
+        * It is unclear whether the old behaviour should be restored there.
+        */
+       /*
+        * At this point, util_avg won't be used in select_task_rq_fair anyway
+        */
+       sa->util_avg = 0;
+       sa->util_sum = 0;
        /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static void attach_entity_cfs_rq(struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, the simplest series from the beginning would be:
+ *
+ *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct sched_avg *sa = &se->avg;
+       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+       if (cap > 0) {
+               if (cfs_rq->avg.util_avg != 0) {
+                       sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
+                       sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+                       if (sa->util_avg > cap)
+                               sa->util_avg = cap;
+               } else {
+                       sa->util_avg = cap;
+               }
+               /*
+                * If we wish to restore tuning via setting initial util,
+                * this is where we should do it.
+                */
+               sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+       }
+
+       if (entity_is_task(se)) {
+               struct task_struct *p = task_of(se);
+               if (p->sched_class != &fair_sched_class) {
+                       /*
+                        * For !fair tasks do:
+                        *
+                       update_cfs_rq_load_avg(now, cfs_rq, false);
+                       attach_entity_load_avg(cfs_rq, se);
+                       switched_from_fair(rq, p);
+                        *
+                        * such that the next switched_to_fair() has the
+                        * expected state.
+                        */
+                       se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+                       return;
+               }
+       }
+
+       attach_entity_cfs_rq(se);
+}
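A quick worked example of the clamping above, as a self-contained sketch with hypothetical cfs_rq numbers (SCHED_CAPACITY_SCALE is 1024 and LOAD_AVG_MAX is 47742 in this kernel):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define LOAD_AVG_MAX		47742

int main(void)
{
	/* Hypothetical parent cfs_rq state and new entity weight */
	long cfs_util_avg = 600, cfs_load_avg = 2047, se_weight = 1024;

	long cap  = (SCHED_CAPACITY_SCALE - cfs_util_avg) / 2;		/* 212 */
	long util = cfs_util_avg * se_weight / (cfs_load_avg + 1);	/* 300 */

	if (util > cap)			/* extrapolation exceeds the cap */
		util = cap;

	printf("util_avg=%ld util_sum=%ld\n", util, util * LOAD_AVG_MAX);
	return 0;
}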
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
 {
 }
-#endif
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
 
 /*
  * Update the current task's runtime statistics.
@@ -1207,8 +1342,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
        if (env->best_task)
                put_task_struct(env->best_task);
-       if (p)
-               get_task_struct(p);
 
        env->best_task = p;
        env->best_imp = imp;
@@ -1276,20 +1409,30 @@ static void task_numa_compare(struct task_numa_env *env,
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;
+       bool assigned = false;
 
        rcu_read_lock();
 
        raw_spin_lock_irq(&dst_rq->lock);
        cur = dst_rq->curr;
        /*
-        * No need to move the exiting task, and this ensures that ->curr
-        * wasn't reaped and thus get_task_struct() in task_numa_assign()
-        * is safe under RCU read lock.
-        * Note that rcu_read_lock() itself can't protect from the final
-        * put_task_struct() after the last schedule().
+        * No need to move the exiting task or idle task.
         */
        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                cur = NULL;
+       else {
+               /*
+                * The task_struct must be protected here to protect the
+                * p->numa_faults access in the task_weight since the
+                * numa_faults could already be freed in the following path:
+                * finish_task_switch()
+                *     --> put_task_struct()
+                *         --> __put_task_struct()
+                *             --> task_numa_free()
+                */
+               get_task_struct(cur);
+       }
+
        raw_spin_unlock_irq(&dst_rq->lock);
 
        /*
@@ -1373,6 +1516,7 @@ balance:
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
+                       put_task_struct(cur);
                        cur = NULL;
                        goto assign;
                }
@@ -1395,12 +1539,20 @@ balance:
         * Call select_idle_sibling to maybe find a better one.
         */
        if (!cur)
-               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+               env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+                                                  env->dst_cpu);
 
 assign:
+       assigned = true;
        task_numa_assign(env, cur, imp);
 unlock:
        rcu_read_unlock();
+       /*
+        * dst_rq->curr was not assigned as the best task, so the extra
+        * reference taken to protect its task_struct can be dropped.
+        */
+       if (cur && !assigned)
+               put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2677,9 +2829,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 {
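The rest of the function body is outside this hunk; in this kernel tree it boils down to roughly the following sketch, where tg_load_avg_contrib caches the last value propagated and the /64 divisor is the "small" threshold (worth verifying against the full file):

static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

	/* Only touch the shared tg->load_avg when the change exceeds ~1.5% */
	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
		atomic_long_add(delta, &cfs_rq->tg->load_avg);
		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
	}
}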
@@ -2695,6 +2859,29 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+        if (&this_rq()->cfs == cfs_rq) {
+                /*
+                 * There are a few boundary cases this might miss but it should
+                 * get called often enough that that should (hopefully) not be
+                 * a real problem -- added to that it only calls on the local
+                 * CPU, so if we enqueue remotely we'll miss an update, but
+                 * the next tick/schedule should update.
+                 *
+                 * It will not get called when we go idle, because the idle
+                 * thread is a different class (!fair), nor will the utilization
+                 * number include things like RT tasks.
+                 *
+                 * As is, the util number is not freq-invariant (we'd have to
+                 * implement arch_scale_freq_capacity() for that).
+                 *
+                 * See cpu_util().
+                 */
+                cpufreq_update_util(rq_of(cfs_rq), 0);
+        }
+}
+
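cpufreq_update_util() is the bridge to the schedutil/sched-freq side: it invokes whatever hook the governor registered for this CPU. In the upstream ~4.9 code this backport follows, the helper expands to roughly the sketch below; the per-CPU pointer name and callback signature are taken from that upstream code and may differ slightly in this tree:

static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
	struct update_util_data *data;

	/* Hook registered by the cpufreq governor for this CPU, if any */
	data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
	if (data)
		data->func(data, rq_clock(rq), flags);
}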
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 
 /*
@@ -2714,11 +2901,28 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
        WRITE_ONCE(*ptr, res);                                  \
 } while (0)
 
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 {
        struct sched_avg *sa = &cfs_rq->avg;
-       int decayed, removed = 0;
+       int decayed, removed = 0, removed_util = 0;
 
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2731,6 +2935,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
                sub_positive(&sa->util_avg, r);
                sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+               removed_util = 1;
        }
 
        decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2741,11 +2946,24 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
+       /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+       if (cfs_rq == &rq_of(cfs_rq)->cfs)
+               trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
+       if (update_freq && (decayed || removed_util))
+               cfs_rq_util_change(cfs_rq);
+
        return decayed || removed;
 }
 
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG      0x1
+#define SKIP_AGE_LOAD  0x2
+
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 now = cfs_rq_clock_task(cfs_rq);
@@ -2755,55 +2973,55 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
         * Track task load average for carrying it to new CPU after migrated, and
         * track group sched_entity load average for task_h_load calc in migration
         */
-       __update_load_avg(now, cpu, &se->avg,
+       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+               __update_load_avg(now, cpu, &se->avg,
                          se->on_rq * scale_load_down(se->load.weight),
                          cfs_rq->curr == se, NULL);
+       }
 
-       if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+       if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG))
                update_tg_load_avg(cfs_rq, 0);
 
        if (entity_is_task(se))
                trace_sched_load_avg_task(task_of(se), &se->avg);
-       trace_sched_load_avg_cpu(cpu, cfs_rq);
 }
 
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       if (!sched_feat(ATTACH_AGE_LOAD))
-               goto skip_aging;
-
-       /*
-        * If we got migrated (either between CPUs or between cgroups) we'll
-        * have aged the average right before clearing @last_update_time.
-        */
-       if (se->avg.last_update_time) {
-               __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                                 &se->avg, 0, 0, NULL);
-
-               /*
-                * XXX: we could have just aged the entire load away if we've been
-                * absent from the fair class for too long.
-                */
-       }
-
-skip_aging:
        se->avg.last_update_time = cfs_rq->avg.last_update_time;
        cfs_rq->avg.load_avg += se->avg.load_avg;
        cfs_rq->avg.load_sum += se->avg.load_sum;
        cfs_rq->avg.util_avg += se->avg.util_avg;
        cfs_rq->avg.util_sum += se->avg.util_sum;
+
+       cfs_rq_util_change(cfs_rq);
 }
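The ordering requirement in the kerneldoc above is what attach_entity_cfs_rq(), declared near init_entity_runnable_average() and called from post_init_entity_util_avg(), is built around. A hedged sketch of that helper, under the assumption that this tree lacks the group-propagation machinery of later kernels:

static void attach_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 now = cfs_rq_clock_task(cfs_rq);

	/* Bring cfs_rq->avg.last_update_time up to date first ... */
	update_cfs_rq_load_avg(now, cfs_rq, false);
	/* ... then fold the entity's averages into the cfs_rq. */
	attach_entity_load_avg(cfs_rq, se);
	update_tg_load_avg(cfs_rq, false);
}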
 
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                         &se->avg, se->on_rq * scale_load_down(se->load.weight),
-                         cfs_rq->curr == se, NULL);
 
        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
        sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+
+       cfs_rq_util_change(cfs_rq);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -2811,34 +3029,20 @@ static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct sched_avg *sa = &se->avg;
-       u64 now = cfs_rq_clock_task(cfs_rq);
-       int migrated, decayed;
-
-       migrated = !sa->last_update_time;
-       if (!migrated) {
-               __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-                       se->on_rq * scale_load_down(se->load.weight),
-                       cfs_rq->curr == se, NULL);
-       }
-
-       decayed = update_cfs_rq_load_avg(now, cfs_rq);
 
        cfs_rq->runnable_load_avg += sa->load_avg;
        cfs_rq->runnable_load_sum += sa->load_sum;
 
-       if (migrated)
+       if (!sa->last_update_time) {
                attach_entity_load_avg(cfs_rq, se);
-
-       if (decayed || migrated)
                update_tg_load_avg(cfs_rq, 0);
+       }
 }
 
 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
 static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       update_load_avg(se, 1);
-
        cfs_rq->runnable_load_avg =
                max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
        cfs_rq->runnable_load_sum =
@@ -2866,6 +3070,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 }
 #endif
 
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 last_update_time;
+
+       last_update_time = cfs_rq_last_update_time(cfs_rq);
+       __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
 /*
  * Task first catches up with cfs_rq, and then subtract
  * itself from the cfs_rq (task must be off the queue now).
@@ -2873,7 +3090,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 void remove_entity_load_avg(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 last_update_time;
 
        /*
         * Newly created task or never used group entity should not be removed
@@ -2882,9 +3098,7 @@ void remove_entity_load_avg(struct sched_entity *se)
        if (se->avg.last_update_time == 0)
                return;
 
-       last_update_time = cfs_rq_last_update_time(cfs_rq);
-
-       __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+       sync_entity_load_avg(se);
        atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
        atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
@@ -2921,7 +3135,16 @@ static int idle_balance(struct rq *this_rq);
 
 #else /* CONFIG_SMP */
 
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+       return 0;
+}
+
+#define UPDATE_TG      0x0
+#define SKIP_AGE_LOAD  0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1) {}
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
@@ -3064,6 +3287,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
+       update_load_avg(se, UPDATE_TG);
        enqueue_entity_load_avg(cfs_rq, se);
        account_entity_enqueue(cfs_rq, se);
        update_cfs_shares(cfs_rq);
@@ -3139,6 +3363,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
+       update_load_avg(se, UPDATE_TG);
        dequeue_entity_load_avg(cfs_rq, se);
 
        update_stats_dequeue(cfs_rq, se);
@@ -3229,7 +3454,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 */
                update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
        }
 
        update_stats_curr_start(cfs_rq, se);
@@ -3345,7 +3570,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
        /*
         * Ensure that runnable average is periodically updated.
         */
-       update_load_avg(curr, 1);
+       update_load_avg(curr, UPDATE_TG);
        update_cfs_shares(cfs_rq);
 
 #ifdef CONFIG_SCHED_HRTICK
@@ -3942,6 +4167,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
        if (!cfs_bandwidth_used())
                return;
 
+       /* Synchronize hierarchical throttle counter: */
+       if (unlikely(!cfs_rq->throttle_uptodate)) {
+               struct rq *rq = rq_of(cfs_rq);
+               struct cfs_rq *pcfs_rq;
+               struct task_group *tg;
+
+               cfs_rq->throttle_uptodate = 1;
+
+               /* Get closest up-to-date node, because leaves go first: */
+               for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+                       pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+                       if (pcfs_rq->throttle_uptodate)
+                               break;
+               }
+               if (tg) {
+                       cfs_rq->throttle_count = pcfs_rq->throttle_count;
+                       cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               }
+       }
+
        /* an active group must be handled by the update_curr()->put() path */
        if (!cfs_rq->runtime_enabled || cfs_rq->curr)
                return;
@@ -4183,7 +4428,7 @@ static inline void hrtick_update(struct rq *rq)
 
 #ifdef CONFIG_SMP
 static bool cpu_overutilized(int cpu);
-static inline unsigned long boosted_cpu_util(int cpu);
+unsigned long boosted_cpu_util(int cpu);
 #else
 #define boosted_cpu_util(cpu) cpu_util(cpu)
 #endif
@@ -4218,6 +4463,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        int task_wakeup = flags & ENQUEUE_WAKEUP;
 #endif
 
+       /*
+        * If in_iowait is set, the code below may not trigger any cpufreq
+        * utilization updates, so do it here explicitly with the IOWAIT flag
+        * passed.
+        */
+       if (p->in_iowait)
+               cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
        for_each_sched_entity(se) {
                if (se->on_rq)
                        break;
@@ -4246,7 +4499,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
                update_cfs_shares(cfs_rq);
        }
 
@@ -4255,6 +4508,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+       /*
+        * Update SchedTune accounting.
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        *
+        * We also do it when we enqueue a throttled task;
+        * we could argue that a throttled task should not boost a CPU,
+        * however:
+        * a) properly implementing CPU boosting while considering throttled
+        *    tasks would greatly increase the complexity of the solution
+        * b) it's not easy to quantify the benefits introduced by
+        *    such a more complex solution.
+        * Thus, for the time being we go with the simple solution and boost
+        * throttled RQs as well.
+        */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
        if (!se) {
                walt_inc_cumulative_runnable_avg(rq, p);
                if (!task_new && !rq->rd->overutilized &&
@@ -4274,9 +4546,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                        update_capacity_of(cpu_of(rq));
        }
 
-       /* Update SchedTune accouting */
-       schedtune_enqueue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
        hrtick_update(rq);
 }
@@ -4311,15 +4580,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
+                       /* Avoid re-evaluating load for this entity: */
+                       se = parent_entity(se);
                        /*
                         * Bias pick_next to pick a task from this cfs_rq, as
                         * p is sleeping when it is within its sched_slice.
                         */
-                       if (task_sleep && parent_entity(se))
-                               set_next_buddy(parent_entity(se));
-
-                       /* avoid re-evaluating load for this entity */
-                       se = parent_entity(se);
+                       if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+                               set_next_buddy(se);
                        break;
                }
                flags |= DEQUEUE_SLEEP;
@@ -4333,7 +4601,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
                update_cfs_shares(cfs_rq);
        }
 
@@ -4342,6 +4610,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+       /*
+        * Update SchedTune accounting
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
        if (!se) {
                walt_dec_cumulative_runnable_avg(rq, p);
 
@@ -4361,9 +4638,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                }
        }
 
-       /* Update SchedTune accouting */
-       schedtune_dequeue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 
        hrtick_update(rq);
@@ -4878,7 +5152,7 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
 }
 
 static int find_new_capacity(struct energy_env *eenv,
-       const struct sched_group_energy const *sge)
+       const struct sched_group_energy * const sge)
 {
        int idx;
        unsigned long util = group_max_util(eenv);
@@ -5028,6 +5302,7 @@ static inline int __energy_diff(struct energy_env *eenv)
        struct sched_domain *sd;
        struct sched_group *sg;
        int sd_cpu = -1, energy_before = 0, energy_after = 0;
+       int diff, margin;
 
        struct energy_env eenv_before = {
                .util_delta     = 0,
@@ -5077,6 +5352,16 @@ static inline int __energy_diff(struct energy_env *eenv)
                        eenv->cap.before, eenv->cap.after, eenv->cap.delta,
                        eenv->nrg.delta, eenv->payoff);
 
+       /*
+        * Dead-zone margin preventing too many migrations.
+        */
+
+       margin = eenv->nrg.before >> 6; /* ~1.56% */
+
+       diff = eenv->nrg.after - eenv->nrg.before;
+
+       eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
+
        return eenv->nrg.diff;
 }
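A worked example of the dead zone with hypothetical energy numbers: if eenv->nrg.before = 640 then margin = 640 >> 6 = 10, so any estimated saving or cost of fewer than 10 units (|after - before| < 10, i.e. under roughly 1.56% of the before-energy) is treated as zero and will not by itself trigger a migration.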
 
@@ -5176,18 +5461,18 @@ static int wake_wide(struct task_struct *p)
        return 1;
 }
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+                      int prev_cpu, int sync)
 {
        s64 this_load, load;
        s64 this_eff_load, prev_eff_load;
-       int idx, this_cpu, prev_cpu;
+       int idx, this_cpu;
        struct task_group *tg;
        unsigned long weight;
        int balanced;
 
        idx       = sd->wake_idx;
        this_cpu  = smp_processor_id();
-       prev_cpu  = task_cpu(p);
        load      = source_load(prev_cpu, idx);
        this_load = target_load(this_cpu, idx);
 
@@ -5253,8 +5538,6 @@ static inline unsigned long task_util(struct task_struct *p)
        return p->se.avg.util_avg;
 }
 
-unsigned int capacity_margin = 1280; /* ~20% margin */
-
 static inline unsigned long boosted_task_util(struct task_struct *task);
 
 static inline bool __task_fits(struct task_struct *p, int cpu, int util)
@@ -5280,11 +5563,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu)
        return __task_fits(p, cpu, 0);
 }
 
-static inline bool task_fits_spare(struct task_struct *p, int cpu)
-{
-       return __task_fits(p, cpu, cpu_util(cpu));
-}
-
 static bool cpu_overutilized(int cpu)
 {
        return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
@@ -5374,7 +5652,7 @@ schedtune_task_margin(struct task_struct *task)
 
 #endif /* CONFIG_SCHED_TUNE */
 
-static inline unsigned long
+unsigned long
 boosted_cpu_util(int cpu)
 {
        unsigned long util = cpu_util(cpu);
@@ -5396,6 +5674,13 @@ boosted_task_util(struct task_struct *task)
        return util + margin;
 }
 
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+       return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -5405,10 +5690,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                  int this_cpu, int sd_flag)
 {
        struct sched_group *idlest = NULL, *group = sd->groups;
-       struct sched_group *fit_group = NULL, *spare_group = NULL;
+       struct sched_group *most_spare_sg = NULL;
        unsigned long min_load = ULONG_MAX, this_load = 0;
-       unsigned long fit_capacity = ULONG_MAX;
-       unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
+       unsigned long most_spare = 0, this_spare = 0;
        int load_idx = sd->forkexec_idx;
        int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -5416,7 +5700,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                load_idx = sd->wake_idx;
 
        do {
-               unsigned long load, avg_load, spare_capacity;
+               unsigned long load, avg_load, spare_cap, max_spare_cap;
                int local_group;
                int i;
 
@@ -5428,8 +5712,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_cpus(group));
 
-               /* Tally up the load of all CPUs in the group */
+               /*
+                * Tally up the load of all CPUs in the group and find
+                * the group containing the CPU with most spare capacity.
+                */
                avg_load = 0;
+               max_spare_cap = 0;
 
                for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
@@ -5440,24 +5728,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                        avg_load += load;
 
-                       /*
-                        * Look for most energy-efficient group that can fit
-                        * that can fit the task.
-                        */
-                       if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
-                               fit_capacity = capacity_of(i);
-                               fit_group = group;
-                       }
+                       spare_cap = capacity_spare_wake(i, p);
 
-                       /*
-                        * Look for group which has most spare capacity on a
-                        * single cpu.
-                        */
-                       spare_capacity = capacity_of(i) - cpu_util(i);
-                       if (spare_capacity > max_spare_capacity) {
-                               max_spare_capacity = spare_capacity;
-                               spare_group = group;
-                       }
+                       if (spare_cap > max_spare_cap)
+                               max_spare_cap = spare_cap;
                }
 
                /* Adjust by relative CPU capacity of the group */
@@ -5465,17 +5739,32 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                if (local_group) {
                        this_load = avg_load;
-               } else if (avg_load < min_load) {
-                       min_load = avg_load;
-                       idlest = group;
+                       this_spare = max_spare_cap;
+               } else {
+                       if (avg_load < min_load) {
+                               min_load = avg_load;
+                               idlest = group;
+                       }
+
+                       if (most_spare < max_spare_cap) {
+                               most_spare = max_spare_cap;
+                               most_spare_sg = group;
+                       }
                }
        } while (group = group->next, group != sd->groups);
 
-       if (fit_group)
-               return fit_group;
-
-       if (spare_group)
-               return spare_group;
+       /*
+        * The cross-over point between using spare capacity or least load
+        * is too conservative for high utilization tasks on partially
+        * utilized systems if we require spare_capacity > task_util(p),
+        * so we allow for some task stuffing by using
+        * spare_capacity > task_util(p)/2.
+        */
+       if (this_spare > task_util(p) / 2 &&
+           imbalance*this_spare > 100*most_spare)
+               return NULL;
+       else if (most_spare > task_util(p) / 2)
+               return most_spare_sg;
 
        if (!idlest || 100*this_load < imbalance*min_load)
                return NULL;
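A worked example of the crossover above, with hypothetical values: task_util(p) = 200, this_spare = 150, most_spare = 160 and imbalance = 112 (which corresponds to a typical imbalance_pct of 125). Since 150 > 200/2 and 112 * 150 = 16800 > 100 * 160 = 16000, the local group is kept (NULL is returned) even though another group has slightly more spare capacity. Had this_spare been, say, 90, the first test would fail and the second one (most_spare = 160 > 100) would return most_spare_sg instead.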
@@ -5495,9 +5784,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
        int shallowest_idle_cpu = -1;
        int i;
 
+       /* Check if we have any choice: */
+       if (group->group_weight == 1)
+               return cpumask_first(sched_group_cpus(group));
+
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
-               if (task_fits_spare(p, i)) {
+               if (idle_cpu(i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
                        if (idle && idle->exit_latency < min_exit_latency) {
@@ -5509,8 +5802,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                min_exit_latency = idle->exit_latency;
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
-                       } else if (idle_cpu(i) &&
-                                  (!idle || idle->exit_latency == min_exit_latency) &&
+                       } else if ((!idle || idle->exit_latency == min_exit_latency) &&
                                   rq->idle_stamp > latest_idle_timestamp) {
                                /*
                                 * If equal or no active idle state, then
@@ -5519,13 +5811,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                 */
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
-                       } else if (shallowest_idle_cpu == -1) {
-                               /*
-                                * If we haven't found an idle CPU yet
-                                * pick a non-idle one that can fit the task as
-                                * fallback.
-                                */
-                               shallowest_idle_cpu = i;
                        }
                } else if (shallowest_idle_cpu == -1) {
                        load = weighted_cpuload(i);
@@ -5542,14 +5827,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
        struct sched_domain *sd;
        struct sched_group *sg;
-       int i = task_cpu(p);
-       int best_idle = -1;
-       int best_idle_cstate = -1;
-       int best_idle_capacity = INT_MAX;
+       int best_idle_cpu = -1;
+       int best_idle_cstate = INT_MAX;
+       unsigned long best_idle_capacity = ULONG_MAX;
 
        if (!sysctl_sched_cstate_aware) {
                if (idle_cpu(target))
@@ -5558,8 +5842,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
                /*
                 * If the prevous cpu is cache affine and idle, don't be stupid.
                 */
-               if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-                       return i;
+               if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+                       return prev;
        }
 
        /*
@@ -5569,24 +5853,26 @@ static int select_idle_sibling(struct task_struct *p, int target)
        for_each_lower_domain(sd) {
                sg = sd->groups;
                do {
+                       int i;
                        if (!cpumask_intersects(sched_group_cpus(sg),
                                                tsk_cpus_allowed(p)))
                                goto next;
 
                        if (sysctl_sched_cstate_aware) {
                                for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
-                                       struct rq *rq = cpu_rq(i);
-                                       int idle_idx = idle_get_state_idx(rq);
+                                       int idle_idx = idle_get_state_idx(cpu_rq(i));
                                        unsigned long new_usage = boosted_task_util(p);
                                        unsigned long capacity_orig = capacity_orig_of(i);
+
                                        if (new_usage > capacity_orig || !idle_cpu(i))
                                                goto next;
 
                                        if (i == target && new_usage <= capacity_curr_of(target))
                                                return target;
 
-                                       if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
-                                               best_idle = i;
+                                       if (idle_idx < best_idle_cstate &&
+                                           capacity_orig <= best_idle_capacity) {
+                                               best_idle_cpu = i;
                                                best_idle_cstate = idle_idx;
                                                best_idle_capacity = capacity_orig;
                                        }
@@ -5605,227 +5891,216 @@ next:
                        sg = sg->next;
                } while (sg != sd->groups);
        }
-       if (best_idle > 0)
-               target = best_idle;
+
+       if (best_idle_cpu >= 0)
+               target = best_idle_cpu;
 
 done:
        return target;
 }
 
+static int start_cpu(bool boosted)
+{
+       struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+
+       RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(),
+                          "sched RCU must be held");
+
+       return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+}
+
 static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
 {
-       int iter_cpu;
        int target_cpu = -1;
-       int target_util = 0;
-       int backup_capacity = 0;
+       unsigned long target_util = prefer_idle ? ULONG_MAX : 0;
+       unsigned long backup_capacity = ULONG_MAX;
        int best_idle_cpu = -1;
        int best_idle_cstate = INT_MAX;
        int backup_cpu = -1;
-       unsigned long task_util_boosted, new_util;
+       unsigned long min_util = boosted_task_util(p);
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       int cpu = start_cpu(boosted);
 
-       task_util_boosted = boosted_task_util(p);
-       for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
-               int cur_capacity;
-               struct rq *rq;
-               int idle_idx;
+       if (cpu < 0)
+               return target_cpu;
 
-               /*
-                * Iterate from higher cpus for boosted tasks.
-                */
-               int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+       sd = rcu_dereference(per_cpu(sd_ea, cpu));
 
-               if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
-                       continue;
+       if (!sd)
+               return target_cpu;
 
-               /*
-                * p's blocked utilization is still accounted for on prev_cpu
-                * so prev_cpu will receive a negative bias due to the double
-                * accounting. However, the blocked utilization may be zero.
-                */
-               new_util = cpu_util(i) + task_util_boosted;
+       sg = sd->groups;
 
-               /*
-                * Ensure minimum capacity to grant the required boost.
-                * The target CPU can be already at a capacity level higher
-                * than the one required to boost the task.
-                */
-               if (new_util > capacity_orig_of(i))
-                       continue;
+       do {
+               int i;
+
+               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+                       unsigned long cur_capacity, new_util;
+
+                       if (!cpu_online(i))
+                               continue;
+
+                       /*
+                        * p's blocked utilization is still accounted for on prev_cpu
+                        * so prev_cpu will receive a negative bias due to the double
+                        * accounting. However, the blocked utilization may be zero.
+                        */
+                       new_util = cpu_util(i) + task_util(p);
+
+                       /*
+                        * Ensure minimum capacity to grant the required boost.
+                        * The target CPU can be already at a capacity level higher
+                        * than the one required to boost the task.
+                        */
+                       new_util = max(min_util, new_util);
+
+                       if (new_util > capacity_orig_of(i))
+                               continue;
 
 #ifdef CONFIG_SCHED_WALT
-               if (walt_cpu_high_irqload(i))
-                       continue;
+                       if (walt_cpu_high_irqload(i))
+                               continue;
 #endif
-               /*
-                * Unconditionally favoring tasks that prefer idle cpus to
-                * improve latency.
-                */
-               if (idle_cpu(i) && prefer_idle) {
-                       if (best_idle_cpu < 0)
-                               best_idle_cpu = i;
-                       continue;
-               }
 
-               cur_capacity = capacity_curr_of(i);
-               rq = cpu_rq(i);
-               idle_idx = idle_get_state_idx(rq);
-
-               if (new_util < cur_capacity) {
-                       if (cpu_rq(i)->nr_running) {
-                               if(prefer_idle) {
-                                       // Find a target cpu with lowest
-                                       // utilization.
-                                       if (target_util == 0 ||
-                                               target_util < new_util) {
-                                               target_cpu = i;
+                       /*
+                        * Unconditionally favoring tasks that prefer idle cpus to
+                        * improve latency.
+                        */
+                       if (idle_cpu(i) && prefer_idle)
+                               return i;
+
+                       cur_capacity = capacity_curr_of(i);
+
+                       if (new_util < cur_capacity) {
+                               if (cpu_rq(i)->nr_running) {
+                                       /*
+                                        * Find a target cpu with the lowest/highest
+                                        * utilization if prefer_idle/!prefer_idle.
+                                        */
+                                       if ((prefer_idle && target_util > new_util) ||
+                                           (!prefer_idle && target_util < new_util)) {
                                                target_util = new_util;
-                                       }
-                               } else {
-                                       // Find a target cpu with highest
-                                       // utilization.
-                                       if (target_util == 0 ||
-                                               target_util > new_util) {
                                                target_cpu = i;
-                                               target_util = new_util;
+                                       }
+                               } else if (!prefer_idle) {
+                                       int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+                                       if (best_idle_cpu < 0 ||
+                                               (sysctl_sched_cstate_aware &&
+                                                       best_idle_cstate > idle_idx)) {
+                                               best_idle_cstate = idle_idx;
+                                               best_idle_cpu = i;
                                        }
                                }
-                       } else if (!prefer_idle) {
-                               if (best_idle_cpu < 0 ||
-                                       (sysctl_sched_cstate_aware &&
-                                               best_idle_cstate > idle_idx)) {
-                                       best_idle_cstate = idle_idx;
-                                       best_idle_cpu = i;
-                               }
+                       } else if (backup_capacity > cur_capacity) {
+                               /* Find a backup cpu with least capacity. */
+                               backup_capacity = cur_capacity;
+                               backup_cpu = i;
                        }
-               } else if (backup_capacity == 0 ||
-                               backup_capacity > cur_capacity) {
-                       // Find a backup cpu with least capacity.
-                       backup_capacity = cur_capacity;
-                       backup_cpu = i;
                }
-       }
+       } while (sg = sg->next, sg != sd->groups);
 
-       if (prefer_idle && best_idle_cpu >= 0)
-               target_cpu = best_idle_cpu;
-       else if (target_cpu < 0)
+       if (target_cpu < 0)
                target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
 
        return target_cpu;
 }
 
-static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
 {
-       struct sched_domain *sd;
-       struct sched_group *sg, *sg_target;
-       int target_max_cap = INT_MAX;
-       int target_cpu = task_cpu(p);
-       unsigned long task_util_boosted, new_util;
-       int i;
+       unsigned long util, capacity;
 
-       if (sysctl_sched_sync_hint_enable && sync) {
-               int cpu = smp_processor_id();
-               cpumask_t search_cpus;
-               cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
-               if (cpumask_test_cpu(cpu, &search_cpus))
-                       return cpu;
-       }
+       /* Task has no contribution or is new */
+       if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+               return cpu_util(cpu);
 
-       sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+       capacity = capacity_orig_of(cpu);
+       util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
 
-       if (!sd)
-               return target;
+       return (util >= capacity) ? capacity : util;
+}
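For example, with hypothetical numbers: if p last ran on cpu (so its blocked contribution is still in that CPU's average), cpu_rq(cpu)->cfs.avg.util_avg = 700, task_util(p) = 250 and capacity_orig_of(cpu) = 1024, the function reports 700 - 250 = 450. If the subtraction would go negative because the blocked contribution has already decayed, the max_t() clamps it to 0, and results above the CPU's original capacity are clamped to 1024.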
 
-       sg = sd->groups;
-       sg_target = sg;
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+       long min_cap, max_cap;
 
-       if (sysctl_sched_is_big_little) {
+       min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
 
-               /*
-                * Find group with sufficient capacity. We only get here if no cpu is
-                * overutilized. We may end up overutilizing a cpu by adding the task,
-                * but that should not be any worse than select_idle_sibling().
-                * load_balance() should sort it out later as we get above the tipping
-                * point.
-                */
-               do {
-                       /* Assuming all cpus are the same in group */
-                       int max_cap_cpu = group_first_cpu(sg);
+       /* Minimum capacity is close to max, no need to abort wake_affine */
+       if (max_cap - min_cap < max_cap >> 3)
+               return 0;
 
-                       /*
-                        * Assume smaller max capacity means more energy-efficient.
-                        * Ideally we should query the energy model for the right
-                        * answer but it easily ends up in an exhaustive search.
-                        */
-                       if (capacity_of(max_cap_cpu) < target_max_cap &&
-                           task_fits_max(p, max_cap_cpu)) {
-                               sg_target = sg;
-                               target_max_cap = capacity_of(max_cap_cpu);
-                       }
-               } while (sg = sg->next, sg != sd->groups);
+       /* Bring task utilization in sync with prev_cpu */
+       sync_entity_load_avg(&p->se);
 
-               task_util_boosted = boosted_task_util(p);
-               /* Find cpu with sufficient capacity */
-               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
-                       /*
-                        * p's blocked utilization is still accounted for on prev_cpu
-                        * so prev_cpu will receive a negative bias due to the double
-                        * accounting. However, the blocked utilization may be zero.
-                        */
-                       new_util = cpu_util(i) + task_util_boosted;
+       return min_cap * 1024 < task_util(p) * capacity_margin;
+}
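A worked example with hypothetical big.LITTLE capacities: prev_cpu is a LITTLE CPU with capacity_orig 430 and the waking CPU is a big one with 1024, so min_cap = 430 and max_cap = 1024. The early-exit test fails (1024 - 430 = 594 >= 1024 >> 3 = 128), the task utilization is synced, and the final check with capacity_margin = 1280 decides: for task_util(p) = 400, 430 * 1024 = 440320 < 400 * 1280 = 512000, so wake_cap() returns 1 and the affine fast path is skipped; a 100-utilization task would return 0 and stay eligible for wake_affine.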
 
-                       /*
-                        * Ensure minimum capacity to grant the required boost.
-                        * The target CPU can be already at a capacity level higher
-                        * than the one required to boost the task.
-                        */
-                       if (new_util > capacity_orig_of(i))
-                               continue;
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
+{
+       struct sched_domain *sd;
+       int target_cpu = prev_cpu, tmp_target;
+       bool boosted, prefer_idle;
 
-                       if (new_util < capacity_curr_of(i)) {
-                               target_cpu = i;
-                               if (cpu_rq(i)->nr_running)
-                                       break;
-                       }
+       if (sysctl_sched_sync_hint_enable && sync) {
+               int cpu = smp_processor_id();
 
-                       /* cpu has capacity at higher OPP, keep it as fallback */
-                       if (target_cpu == task_cpu(p))
-                               target_cpu = i;
-               }
-       } else {
-               /*
-                * Find a cpu with sufficient capacity
-                */
+               if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                       return cpu;
+       }
+
+       rcu_read_lock();
 #ifdef CONFIG_CGROUP_SCHEDTUNE
-               bool boosted = schedtune_task_boost(p) > 0;
-               bool prefer_idle = schedtune_prefer_idle(p) > 0;
+       boosted = schedtune_task_boost(p) > 0;
+       prefer_idle = schedtune_prefer_idle(p) > 0;
 #else
-               bool boosted = 0;
-               bool prefer_idle = 0;
+       boosted = get_sysctl_sched_cfs_boost() > 0;
+       prefer_idle = 0;
 #endif
-               int tmp_target = find_best_target(p, boosted, prefer_idle);
-               if (tmp_target >= 0) {
-                       target_cpu = tmp_target;
-                       if ((boosted || prefer_idle) && idle_cpu(target_cpu))
-                               return target_cpu;
-               }
+
+       sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+       /* Find a cpu with sufficient capacity */
+       tmp_target = find_best_target(p, boosted, prefer_idle);
+
+       if (!sd)
+               goto unlock;
+       if (tmp_target >= 0) {
+               target_cpu = tmp_target;
+               if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+                       goto unlock;
        }
 
-       if (target_cpu != task_cpu(p)) {
+       if (target_cpu != prev_cpu) {
                struct energy_env eenv = {
-                       .util_delta     = task_util(p),
-                       .src_cpu        = task_cpu(p),
-                       .dst_cpu        = target_cpu,
-                       .task           = p,
+                       .util_delta     = task_util(p),
+                       .src_cpu        = prev_cpu,
+                       .dst_cpu        = target_cpu,
+                       .task           = p,
                };
 
                /* Not enough spare capacity on previous cpu */
-               if (cpu_overutilized(task_cpu(p)))
-                       return target_cpu;
+               if (cpu_overutilized(prev_cpu))
+                       goto unlock;
 
                if (energy_diff(&eenv) >= 0)
-                       return task_cpu(p);
+                       target_cpu = prev_cpu;
        }
 
+unlock:
+       rcu_read_unlock();
        return target_cpu;
 }
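
Stripped of the RCU and schedtune plumbing, select_energy_cpu_brute() is a short decision ladder. The sketch below compresses that ordering into ordinary C; every parameter is a stand-in for kernel state (the sync hint, find_best_target()'s pick, the boost/prefer_idle flags, cpu_overutilized(prev_cpu) and the sign of energy_diff()), so treat it as an illustration rather than the function's real interface.

#include <stdbool.h>

int energy_wakeup_sketch(int prev_cpu, int this_cpu, int best_cpu,
                         bool sync_hint_usable,
                         bool boosted_or_prefer_idle, bool best_is_idle,
                         bool prev_overutilized, long energy_delta)
{
        int target = prev_cpu;

        /* 1) Sync hint: stay on the waking CPU when it is allowed */
        if (sync_hint_usable)
                return this_cpu;

        /* 2) Take find_best_target()'s pick; boosted/prefer_idle tasks keep an idle pick as-is */
        if (best_cpu >= 0) {
                target = best_cpu;
                if (boosted_or_prefer_idle && best_is_idle)
                        return target;
        }

        if (target == prev_cpu)
                return prev_cpu;

        /* 3) prev_cpu has no spare capacity: migrate without an energy check */
        if (prev_overutilized)
                return target;

        /* 4) Otherwise only migrate when the energy model predicts a saving */
        return energy_delta < 0 ? target : prev_cpu;
}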
 
@@ -5851,9 +6126,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
        int sync = wake_flags & WF_SYNC;
 
        if (sd_flag & SD_BALANCE_WAKE)
-               want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
-                             cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
-                             energy_aware();
+               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+                             && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+
+       if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
+               return select_energy_cpu_brute(p, prev_cpu, sync);
 
        rcu_read_lock();
        for_each_domain(cpu, tmp) {
@@ -5878,15 +6155,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        if (affine_sd) {
                sd = NULL; /* Prefer wake_affine over balance flags */
-               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+               if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
                        new_cpu = cpu;
        }
 
        if (!sd) {
-               if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
-                       new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
-               else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-                       new_cpu = select_idle_sibling(p, new_cpu);
+               if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+                       new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
        } else while (sd) {
                struct sched_group *group;
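
Two small gates drive the reworked wakeup flow: wake_affine is only considered when the waking CPU can actually hold the task (the wake_cap() test), and energy-aware systems bypass the domain walk entirely while the root domain is not overutilized. The helper below restates those gates with stubbed boolean inputs; the names are illustrative, not kernel symbols.

#include <stdbool.h>

enum wake_path { PATH_ENERGY_AWARE, PATH_WAKE_AFFINE, PATH_DOMAIN_WALK };

/* Stubbed restatement of the gates above; inputs replace the kernel predicates */
enum wake_path pick_wake_path(bool energy_aware_enabled, bool rd_overutilized,
                              bool balance_wake, bool wake_wide_task,
                              bool capacity_mismatch, bool allowed_on_waking_cpu)
{
        bool want_affine = balance_wake &&
                           !wake_wide_task &&
                           !capacity_mismatch &&     /* the wake_cap() test */
                           allowed_on_waking_cpu;

        /* EAS fast path runs before the regular domain walk while not overutilized */
        if (energy_aware_enabled && !rd_overutilized)
                return PATH_ENERGY_AWARE;

        return want_affine ? PATH_WAKE_AFFINE : PATH_DOMAIN_WALK;
}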
@@ -6861,7 +7136,8 @@ static void update_blocked_averages(int cpu)
                if (throttled_hierarchy(cfs_rq))
                        continue;
 
-               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
+                                          true))
                        update_tg_load_avg(cfs_rq, 0);
        }
        raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6922,7 +7198,7 @@ static inline void update_blocked_averages(int cpu)
 
        raw_spin_lock_irqsave(&rq->lock, flags);
        update_rq_clock(rq);
-       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -7085,7 +7361,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
                mcc->cpu = cpu;
 #ifdef CONFIG_SCHED_DEBUG
                raw_spin_unlock_irqrestore(&mcc->lock, flags);
-               pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+               printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+                               cpu, capacity);
                goto skip_unlock;
 #endif
        }
@@ -7101,13 +7378,14 @@ skip_unlock: __attribute__ ((unused));
        cpu_rq(cpu)->cpu_capacity = capacity;
        sdg->sgc->capacity = capacity;
        sdg->sgc->max_capacity = capacity;
+       sdg->sgc->min_capacity = capacity;
 }
 
 void update_group_capacity(struct sched_domain *sd, int cpu)
 {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
-       unsigned long capacity, max_capacity;
+       unsigned long capacity, max_capacity, min_capacity;
        unsigned long interval;
 
        interval = msecs_to_jiffies(sd->balance_interval);
@@ -7121,6 +7399,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
        capacity = 0;
        max_capacity = 0;
+       min_capacity = ULONG_MAX;
 
        if (child->flags & SD_OVERLAP) {
                /*
@@ -7151,6 +7430,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                        }
 
                        max_capacity = max(capacity, max_capacity);
+                       min_capacity = min(capacity, min_capacity);
                }
        } else  {
                /*
@@ -7164,12 +7444,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
                        capacity += sgc->capacity;
                        max_capacity = max(sgc->max_capacity, max_capacity);
+                       min_capacity = min(sgc->min_capacity, min_capacity);
                        group = group->next;
                } while (group != child->groups);
        }
 
        sdg->sgc->capacity = capacity;
        sdg->sgc->max_capacity = max_capacity;
+       sdg->sgc->min_capacity = min_capacity;
 }
 
 /*
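
update_group_capacity() now folds three values out of the child groups: the summed capacity, the largest per-CPU capacity and, new here, the smallest. The short standalone loop below performs the same three-way fold over hypothetical capacity values; the array is a stand-in for the per-CPU capacities, not a scheduler structure.

#include <limits.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical per-CPU capacities for a 2+2 big.LITTLE group */
        unsigned long cpu_capacity[] = { 430, 430, 1024, 1024 };
        unsigned long capacity = 0, max_capacity = 0, min_capacity = ULONG_MAX;

        for (unsigned int i = 0; i < sizeof(cpu_capacity) / sizeof(cpu_capacity[0]); i++) {
                capacity += cpu_capacity[i];
                if (cpu_capacity[i] > max_capacity)
                        max_capacity = cpu_capacity[i];
                if (cpu_capacity[i] < min_capacity)
                        min_capacity = cpu_capacity[i];
        }

        printf("sum=%lu max=%lu min=%lu\n", capacity, max_capacity, min_capacity);
        return 0;
}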
@@ -7397,14 +7679,20 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        if (sgs->avg_load <= busiest->avg_load)
                return false;
 
+       if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+               goto asym_packing;
+
        /*
-        * Candiate sg has no more than one task per cpu and has higher
-        * per-cpu capacity. No reason to pull tasks to less capable cpus.
+        * Candidate sg has no more than one task per CPU and
+        * has higher per-CPU capacity. Migrating tasks to less
+        * capable CPUs may harm throughput. Maximize throughput,
+        * power/energy consequences are not considered.
         */
        if (sgs->sum_nr_running <= sgs->group_weight &&
            group_smaller_cpu_capacity(sds->local, sg))
                return false;
 
+asym_packing:
        /* This is the busiest node in its class. */
        if (!(env->sd->flags & SD_ASYM_PACKING))
                return true;
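
The capacity-based bail-out in update_sd_pick_busiest() is now reached only on domains flagged SD_ASYM_CPUCAPACITY, so symmetric topologies fall straight through to the asym-packing check. The helper below restates that ordering with the kernel predicates reduced to plain parameters; the flag value and names are illustrative only.

#include <stdbool.h>

#define ASYM_CPUCAPACITY 0x1    /* stand-in for SD_ASYM_CPUCAPACITY */

/* Should the candidate group still be considered busiest? (sketch) */
bool keep_candidate_as_busiest(unsigned int sd_flags,
                               unsigned int sum_nr_running,
                               unsigned int group_weight,
                               bool local_group_has_less_capacity)
{
        /* Symmetric domains: skip the capacity comparison entirely */
        if (!(sd_flags & ASYM_CPUCAPACITY))
                return true;

        /* At most one task per CPU on a higher-capacity group: do not pull it down */
        if (sum_nr_running <= group_weight && local_group_has_less_capacity)
                return false;

        return true;
}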
@@ -9120,6 +9408,34 @@ static inline bool vruntime_normalized(struct task_struct *p)
        return false;
 }
 
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       /* Catch up with the cfs_rq and remove our load when we leave */
+       update_load_avg(se, 0);
+       detach_entity_load_avg(cfs_rq, se);
+       update_tg_load_avg(cfs_rq, false);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       /*
+        * Since the real-depth could have been changed (only FAIR
+	 * class maintains depth value), reset depth properly.
+        */
+       se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+       /* Synchronize entity with its cfs_rq */
+       update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+       attach_entity_load_avg(cfs_rq, se);
+       update_tg_load_avg(cfs_rq, false);
+}
+
 static void detach_task_cfs_rq(struct task_struct *p)
 {
        struct sched_entity *se = &p->se;
@@ -9134,8 +9450,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
                se->vruntime -= cfs_rq->min_vruntime;
        }
 
-       /* Catch up with the cfs_rq and remove our load when we leave */
-       detach_entity_load_avg(cfs_rq, se);
+       detach_entity_cfs_rq(se);
 }
 
 static void attach_task_cfs_rq(struct task_struct *p)
@@ -9143,16 +9458,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
        struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       /*
-        * Since the real-depth could have been changed (only FAIR
-        * class maintain depth value), reset depth properly.
-        */
-       se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
-       /* Synchronize task with its cfs_rq */
-       attach_entity_load_avg(cfs_rq, se);
+       attach_entity_cfs_rq(se);
 
        if (!vruntime_normalized(p))
                se->vruntime += cfs_rq->min_vruntime;
@@ -9246,8 +9552,9 @@ void free_fair_sched_group(struct task_group *tg)
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-       struct cfs_rq *cfs_rq;
        struct sched_entity *se;
+       struct cfs_rq *cfs_rq;
+       struct rq *rq;
        int i;
 
        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -9262,6 +9569,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
        for_each_possible_cpu(i) {
+               rq = cpu_rq(i);
+
                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                      GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
@@ -9275,6 +9584,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                init_cfs_rq(cfs_rq);
                init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
                init_entity_runnable_average(se);
+
+               raw_spin_lock_irq(&rq->lock);
+               post_init_entity_util_avg(se);
+               raw_spin_unlock_irq(&rq->lock);
        }
 
        return 1;
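
The per-CPU loop above now seeds each entity's utilization via post_init_entity_util_avg() while holding that CPU's rq->lock, so the freshly initialised average cannot race with a concurrent PELT update. The userspace analogue below shows the same pattern with a pthread mutex; the structure and function names are invented for illustration.

#include <pthread.h>

/* Hypothetical per-CPU state guarded by its own lock (illustrative only) */
struct percpu_state {
        pthread_mutex_t lock;
        unsigned long   util_avg;
};

/* Seed the average under the lock, as the kernel does under rq->lock */
void seed_util_avg(struct percpu_state *st, unsigned long initial)
{
        pthread_mutex_lock(&st->lock);
        st->util_avg = initial;         /* analogous to post_init_entity_util_avg() */
        pthread_mutex_unlock(&st->lock);
}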