UPSTREAM: sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list

[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 8f258f437ac2c12d566e87aaed751d7d7387fb4e..15ccfbff1bde46473a9eb3116db2e97fc12edc78 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,10 +30,13 @@
  #include <linux/mempolicy.h>
  #include <linux/migrate.h>
  #include <linux/task_work.h>
+#include <linux/module.h>
  
  #include <trace/events/sched.h>
  
  #include "sched.h"
+#include "tune.h"
+#include "walt.h"
  
  /*
   * Targeted preemption latency for CPU-bound tasks:
@@ -50,6 +53,16 @@
  unsigned int sysctl_sched_latency = 6000000ULL;
  unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+    (10 * NSEC_PER_MSEC);
+#endif
  /*
   * The initial- and re-scaling of tunables is configurable
   * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -114,6 +127,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
  #endif
  
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  {
         lw->weight += inc;
@@ -286,19 +305,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
         if (!cfs_rq->on_list) {
+               struct rq *rq = rq_of(cfs_rq);
+               int cpu = cpu_of(rq);
                 /*
                  * Ensure we either appear before our parent (if already
                  * enqueued) or force our parent to appear after us when it is
-                * enqueued.  The fact that we always enqueue bottom-up
-                * reduces this to two cases.
+                * enqueued. The fact that we always enqueue bottom-up
+                * reduces this to two cases and a special case for the root
+                * cfs_rq. Furthermore, it also means that we will always reset
+                * tmp_alone_branch either when the branch is connected
+                * to a tree or when we reach the beginning of the tree.
                  */
                 if (cfs_rq->tg->parent &&
-                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
-                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
-               } else {
+                   cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+                       /*
+                        * If parent is already on the list, we add the child
+                        * just before. Thanks to circular linked property of
+                        * the list, this means to put the child at the tail
+                        * of the list that starts by parent.
+                        */
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+                       /*
+                        * The branch is now connected to its tree so we can
+                        * reset tmp_alone_branch to the beginning of the
+                        * list.
+                        */
+                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               } else if (!cfs_rq->tg->parent) {
+                       /*
+                        * cfs rq without parent should be put
+                        * at the tail of the list.
+                        */
                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+                               &rq->leaf_cfs_rq_list);
+                       /*
+                        * We have reached the beginning of a tree, so we can
+                        * reset tmp_alone_branch to the beginning of the list.
+                        */
+                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               } else {
+                       /*
+                        * The parent has not already been added so we want to
+                        * make sure that it will be put after us.
+                        * tmp_alone_branch points to the beginning of the
+                        * branch where we will add parent.
+                        */
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               rq->tmp_alone_branch);
+                       /*
+                        * Update tmp_alone_branch to point to the new
+                        * beginning of the branch.
+                        */
+                       rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
                 }
  
                 cfs_rq->on_list = 1;
@@ -656,7 +715,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
  static unsigned long task_h_load(struct task_struct *p);
  
  /*
@@ -682,16 +741,108 @@ void init_entity_runnable_average(struct sched_entity *se)
         sa->period_contrib = 1023;
         sa->load_avg = scale_load_down(se->load.weight);
         sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-       sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
-       sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+       /*
+        * In previous Android versions, we used to have:
+        *      sa->util_avg =  sched_freq() ?
+        *              sysctl_sched_initial_task_util :
+        *              scale_load_down(SCHED_LOAD_SCALE);
+        *      sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+        * However, that functionality has been moved to enqueue.
+        * It is unclear if we should restore this in enqueue.
+        */
+       /*
+        * At this point, util_avg won't be used in select_task_rq_fair anyway
+        */
+       sa->util_avg = 0;
+       sa->util_sum = 0;
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  }
  
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static void attach_entity_cfs_rq(struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give the desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the remaining utilization budget:
+ *
+ *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, the simplest series from the beginning would look like:
+ *
+ *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct sched_avg *sa = &se->avg;
+       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+       if (cap > 0) {
+               if (cfs_rq->avg.util_avg != 0) {
+                       sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
+                       sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+                       if (sa->util_avg > cap)
+                               sa->util_avg = cap;
+               } else {
+                       sa->util_avg = cap;
+               }
+               /*
+                * If we wish to restore tuning via setting initial util,
+                * this is where we should do it.
+                */
+               sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+       }
+
+       if (entity_is_task(se)) {
+               struct task_struct *p = task_of(se);
+               if (p->sched_class != &fair_sched_class) {
+                       /*
+                        * For !fair tasks do:
+                        *
+                        *   update_cfs_rq_load_avg(now, cfs_rq, false);
+                        *   attach_entity_load_avg(cfs_rq, se);
+                        *   switched_from_fair(rq, p);
+                        *
+                        * such that the next switched_to_fair() has the
+                        * expected state.
+                        */
+                       se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+                       return;
+               }
+       }
+
+       attach_entity_cfs_rq(se);
+}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
  #else
  void init_entity_runnable_average(struct sched_entity *se)
  {
  }
-#endif
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
  
  /*
   * Update the current task's runtime statistics.
@@ -1388,7 +1539,8 @@ balance:
          * Call select_idle_sibling to maybe find a better one.
          */
         if (!cur)
-               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+               env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+                                                  env->dst_cpu);
  
  assign:
         assigned = true;
@@ -2600,6 +2752,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
  
         scale_freq = arch_scale_freq_capacity(NULL, cpu);
         scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+       trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
  
         /* delta_w is the amount already accumulated against our next period */
         delta_w = sa->period_contrib;
@@ -2676,9 +2829,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
   */
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  {
@@ -2694,6 +2859,29 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+        if (&this_rq()->cfs == cfs_rq) {
+                /*
+                 * Only the current CPU's top-level cfs_rq triggers a cpufreq update.
+                 *
+                 * There are a few boundary cases this might miss, but it should get
+                 * called often enough that this is (hopefully) not a real problem.
+                 * Because it only fires on the local CPU, a remote enqueue misses an
+                 * update, but the next tick/schedule should update.
+                 *
+                 * It will not get called when we go idle, because the idle
+                 * thread is a different class (!fair), nor will the utilization
+                 * number include things like RT tasks.
+                 *
+                 * As is, the util number is not freq-invariant (we'd have to
+                 * implement arch_scale_freq_capacity() for that).
+                 * See cpu_util().
+                 */
+                cpufreq_update_util(rq_of(cfs_rq), 0);
+        }
+}
+
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  
  /*
@@ -2713,11 +2901,28 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
         WRITE_ONCE(*ptr, res);                                  \
  } while (0)
  
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the caller do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  {
         struct sched_avg *sa = &cfs_rq->avg;
-       int decayed, removed = 0;
+       int decayed, removed = 0, removed_util = 0;
  
         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2730,6 +2935,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
                 sub_positive(&sa->util_avg, r);
                 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+               removed_util = 1;
         }
  
         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2740,11 +2946,24 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
         cfs_rq->load_last_update_time_copy = sa->last_update_time;
  #endif
  
+       /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+       if (cfs_rq == &rq_of(cfs_rq)->cfs)
+               trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
+       if (update_freq && (decayed || removed_util))
+               cfs_rq_util_change(cfs_rq);
+
         return decayed || removed;
  }
  
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG      0x1
+#define SKIP_AGE_LOAD  0x2
+
  /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
  {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 now = cfs_rq_clock_task(cfs_rq);
@@ -2754,51 +2973,55 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
          * Track task load average for carrying it to new CPU after migrated, and
          * track group sched_entity load average for task_h_load calc in migration
          */
-       __update_load_avg(now, cpu, &se->avg,
+       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+               __update_load_avg(now, cpu, &se->avg,
                           se->on_rq * scale_load_down(se->load.weight),
                           cfs_rq->curr == se, NULL);
+       }
  
-       if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+       if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG))
                 update_tg_load_avg(cfs_rq, 0);
+
+       if (entity_is_task(se))
+               trace_sched_load_avg_task(task_of(se), &se->avg);
  }
  
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       if (!sched_feat(ATTACH_AGE_LOAD))
-               goto skip_aging;
-
-       /*
-        * If we got migrated (either between CPUs or between cgroups) we'll
-        * have aged the average right before clearing @last_update_time.
-        */
-       if (se->avg.last_update_time) {
-               __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                                 &se->avg, 0, 0, NULL);
-
-               /*
-                * XXX: we could have just aged the entire load away if we've been
-                * absent from the fair class for too long.
-                */
-       }
-
-skip_aging:
         se->avg.last_update_time = cfs_rq->avg.last_update_time;
         cfs_rq->avg.load_avg += se->avg.load_avg;
         cfs_rq->avg.load_sum += se->avg.load_sum;
         cfs_rq->avg.util_avg += se->avg.util_avg;
         cfs_rq->avg.util_sum += se->avg.util_sum;
+
+       cfs_rq_util_change(cfs_rq);
  }
  
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                         &se->avg, se->on_rq * scale_load_down(se->load.weight),
-                         cfs_rq->curr == se, NULL);
  
         sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
         sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+
+       cfs_rq_util_change(cfs_rq);
  }
  
  /* Add the load generated by se into cfs_rq's load average */
@@ -2806,62 +3029,76 @@ static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         struct sched_avg *sa = &se->avg;
-       u64 now = cfs_rq_clock_task(cfs_rq);
-       int migrated, decayed;
-
-       migrated = !sa->last_update_time;
-       if (!migrated) {
-               __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-                       se->on_rq * scale_load_down(se->load.weight),
-                       cfs_rq->curr == se, NULL);
-       }
-
-       decayed = update_cfs_rq_load_avg(now, cfs_rq);
  
         cfs_rq->runnable_load_avg += sa->load_avg;
         cfs_rq->runnable_load_sum += sa->load_sum;
  
-       if (migrated)
+       if (!sa->last_update_time) {
                 attach_entity_load_avg(cfs_rq, se);
-
-       if (decayed || migrated)
                 update_tg_load_avg(cfs_rq, 0);
+       }
  }
  
  /* Remove the runnable load generated by se from cfs_rq's runnable load average */
  static inline void
  dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       update_load_avg(se, 1);
-
         cfs_rq->runnable_load_avg =
                 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
         cfs_rq->runnable_load_sum =
                 max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
  }
  
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 last_update_time;
-
  #ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
         u64 last_update_time_copy;
+       u64 last_update_time;
  
         do {
                 last_update_time_copy = cfs_rq->load_last_update_time_copy;
                 smp_rmb();
                 last_update_time = cfs_rq->avg.last_update_time;
         } while (last_update_time != last_update_time_copy);
+
+       return last_update_time;
+}
  #else
-       last_update_time = cfs_rq->avg.last_update_time;
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->avg.last_update_time;
+}
  #endif
  
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 last_update_time;
+
+       last_update_time = cfs_rq_last_update_time(cfs_rq);
         __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
+/*
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+void remove_entity_load_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       /*
+        * Newly created task or never used group entity should not be removed
+        * from its (source) cfs_rq
+        */
+       if (se->avg.last_update_time == 0)
+               return;
+
+       sync_entity_load_avg(se);
         atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
         atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
  }
@@ -2898,7 +3135,16 @@ static int idle_balance(struct rq *this_rq);
  
  #else /* CONFIG_SMP */
  
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+       return 0;
+}
+
+#define UPDATE_TG      0x0
+#define SKIP_AGE_LOAD  0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1){}
  static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  static inline void
@@ -2962,6 +3208,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         }
  
                         trace_sched_stat_blocked(tsk, delta);
+                       trace_sched_blocked_reason(tsk);
  
                         /*
                          * Blocking time is in units of nanosecs, so shift by
@@ -3040,6 +3287,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
+       update_load_avg(se, UPDATE_TG);
         enqueue_entity_load_avg(cfs_rq, se);
         account_entity_enqueue(cfs_rq, se);
         update_cfs_shares(cfs_rq);
@@ -3115,6 +3363,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
+       update_load_avg(se, UPDATE_TG);
         dequeue_entity_load_avg(cfs_rq, se);
  
         update_stats_dequeue(cfs_rq, se);
@@ -3205,7 +3454,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                  */
                 update_stats_wait_end(cfs_rq, se);
                 __dequeue_entity(cfs_rq, se);
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
         }
  
         update_stats_curr_start(cfs_rq, se);
@@ -3321,7 +3570,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
         /*
          * Ensure that runnable average is periodically updated.
          */
-       update_load_avg(curr, 1);
+       update_load_avg(curr, UPDATE_TG);
         update_cfs_shares(cfs_rq);
  
  #ifdef CONFIG_SCHED_HRTICK
@@ -3918,6 +4167,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
         if (!cfs_bandwidth_used())
                 return;
  
+       /* Synchronize hierarchical throttle counter: */
+       if (unlikely(!cfs_rq->throttle_uptodate)) {
+               struct rq *rq = rq_of(cfs_rq);
+               struct cfs_rq *pcfs_rq;
+               struct task_group *tg;
+
+               cfs_rq->throttle_uptodate = 1;
+
+               /* Get closest up-to-date node, because leaves go first: */
+               for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+                       pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+                       if (pcfs_rq->throttle_uptodate)
+                               break;
+               }
+               if (tg) {
+                       cfs_rq->throttle_count = pcfs_rq->throttle_count;
+                       cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               }
+       }
+
         /* an active group must be handled by the update_curr()->put() path */
         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
                 return;
@@ -4157,6 +4426,28 @@ static inline void hrtick_update(struct rq *rq)
  }
  #endif
  
+#ifdef CONFIG_SMP
+static bool cpu_overutilized(int cpu);
+unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
+
+#ifdef CONFIG_SMP
+static void update_capacity_of(int cpu)
+{
+       unsigned long req_cap;
+
+       if (!sched_freq())
+               return;
+
+       /* Express boosted utilization relative to this CPU's capacity_orig. */
+       req_cap = boosted_cpu_util(cpu);
+       req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
+       set_cfs_cpu_capacity(cpu, true, req_cap);
+}
+#endif
+
  /*
   * The enqueue_task method is called before nr_running is
   * increased. Here we update the fair scheduling stats and
@@ -4167,6 +4458,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  {
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
+       int task_new = flags & ENQUEUE_WAKEUP_NEW;
+       int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
+
+       /*
+        * If in_iowait is set, the code below may not trigger any cpufreq
+        * utilization updates, so do it here explicitly with the IOWAIT flag
+        * passed.
+        */
+       if (p->in_iowait)
+               cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
  
         for_each_sched_entity(se) {
                 if (se->on_rq)
@@ -4183,6 +4486,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
  
                 flags = ENQUEUE_WAKEUP;
         }
@@ -4190,17 +4494,59 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
  
                 if (cfs_rq_throttled(cfs_rq))
                         break;
  
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
                 update_cfs_shares(cfs_rq);
         }
  
         if (!se)
                 add_nr_running(rq, 1);
  
+#ifdef CONFIG_SMP
+
+       /*
+        * Update SchedTune accounting.
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        *
+        * We do it also in the case where we enqueue a throttled task;
+        * we could argue that a throttled task should not boost a CPU,
+        * however:
+        * a) properly implementing CPU boosting considering throttled
+        *    tasks would greatly increase the complexity of the solution
+        * b) it's not easy to quantify the benefits introduced by
+        *    such a more complex solution.
+        * Thus, for the time being we go for the simple solution and boost
+        * also for throttled RQs.
+        */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
+       if (!se) {
+               walt_inc_cumulative_runnable_avg(rq, p);
+               if (!task_new && !rq->rd->overutilized &&
+                   cpu_overutilized(rq->cpu)) {
+                       rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
+
+               /*
+                * We want to potentially trigger a freq switch
+                * request only for tasks that are waking up; this is
+                * because we get here also during load balancing, but
+                * in these cases it seems wise to trigger a single
+                * request after load balancing is done.
+                */
+               if (task_new || task_wakeup)
+                       update_capacity_of(cpu_of(rq));
+       }
+
+#endif /* CONFIG_SMP */
         hrtick_update(rq);
  }
  
@@ -4230,18 +4576,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (cfs_rq_throttled(cfs_rq))
                         break;
                 cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
  
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight) {
+                       /* Avoid re-evaluating load for this entity: */
+                       se = parent_entity(se);
                         /*
                          * Bias pick_next to pick a task from this cfs_rq, as
                          * p is sleeping when it is within its sched_slice.
                          */
-                       if (task_sleep && parent_entity(se))
-                               set_next_buddy(parent_entity(se));
-
-                       /* avoid re-evaluating load for this entity */
-                       se = parent_entity(se);
+                       if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+                               set_next_buddy(se);
                         break;
                 }
                 flags |= DEQUEUE_SLEEP;
@@ -4250,17 +4596,50 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
  
                 if (cfs_rq_throttled(cfs_rq))
                         break;
  
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
                 update_cfs_shares(cfs_rq);
         }
  
         if (!se)
                 sub_nr_running(rq, 1);
  
+#ifdef CONFIG_SMP
+
+       /*
+        * Update SchedTune accounting
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
+       if (!se) {
+               walt_dec_cumulative_runnable_avg(rq, p);
+
+               /*
+                * We want to potentially trigger a freq switch
+                * request only for tasks that are going to sleep;
+                * this is because we get here also during load
+                * balancing, but in these cases it seems wise to
+                * trigger a single request after load balancing is
+                * done.
+                */
+               if (task_sleep) {
+                       if (rq->cfs.nr_running)
+                               update_capacity_of(cpu_of(rq));
+                       else if (sched_freq())
+                               set_cfs_cpu_capacity(cpu_of(rq), false, 0);
+               }
+       }
+
+#endif /* CONFIG_SMP */
+
         hrtick_update(rq);
  }
  
@@ -4487,15 +4866,6 @@ static unsigned long target_load(int cpu, int type)
         return max(rq->cpu_load[type-1], total);
  }
  
-static unsigned long capacity_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity_orig;
-}
  
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
@@ -4670,51 +5040,448 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  #endif
  
  /*
- * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
- * A waker of many should wake a different task than the one last awakened
- * at a frequency roughly N times higher than one of its wakees.  In order
- * to determine whether we should let the load spread vs consolodating to
- * shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.  With
- * both conditions met, we can be relatively sure that the relationship is
- * non-monogamous, with partner count exceeding socket size.  Waker/wakee
- * being client/server, worker/dispatcher, interrupt source or whatever is
- * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
   */
-static int wake_wide(struct task_struct *p)
+unsigned long capacity_curr_of(int cpu)
  {
-       unsigned int master = current->wakee_flips;
-       unsigned int slave = p->wakee_flips;
-       int factor = this_cpu_read(sd_llc_size);
-
-       if (master < slave)
-               swap(master, slave);
-       if (slave < factor || master < slave * factor)
-               return 0;
-       return 1;
+       return cpu_rq(cpu)->cpu_capacity_orig *
+              arch_scale_freq_capacity(NULL, cpu)
+              >> SCHED_CAPACITY_SHIFT;
  }
  
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static inline bool energy_aware(void)
  {
-       s64 this_load, load;
-       s64 this_eff_load, prev_eff_load;
-       int idx, this_cpu, prev_cpu;
-       struct task_group *tg;
-       unsigned long weight;
-       int balanced;
+       return sched_feat(ENERGY_AWARE);
+}
  
-       idx       = sd->wake_idx;
-       this_cpu  = smp_processor_id();
-       prev_cpu  = task_cpu(p);
-       load      = source_load(prev_cpu, idx);
-       this_load = target_load(this_cpu, idx);
+struct energy_env {
+       struct sched_group      *sg_top;
+       struct sched_group      *sg_cap;
+       int                     cap_idx;
+       int                     util_delta;
+       int                     src_cpu;
+       int                     dst_cpu;
+       int                     energy;
+       int                     payoff;
+       struct task_struct      *task;
+       struct {
+               int before;
+               int after;
+               int delta;
+               int diff;
+       } nrg;
+       struct {
+               int before;
+               int after;
+               int delta;
+       } cap;
+};
  
-       /*
-        * If sync wakeup then subtract the (maximum possible)
-        * effect of the currently running task from the load
-        * of the current CPU:
-        */
-       if (sync) {
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
+ * energy calculations. Using the scale-invariant util returned by
+ * cpu_util() and approximating scale-invariant util by:
+ *
+ *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ *   capacity = capacity_orig * curr_freq/max_freq
+ *
+ *   norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+{
+       int util = __cpu_util(cpu, delta);
+
+       if (util >= capacity)
+               return SCHED_CAPACITY_SCALE;
+
+       return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
+
+static int calc_util_delta(struct energy_env *eenv, int cpu)
+{
+       if (cpu == eenv->src_cpu)
+               return -eenv->util_delta;
+       if (cpu == eenv->dst_cpu)
+               return eenv->util_delta;
+       return 0;
+}
+
+static
+unsigned long group_max_util(struct energy_env *eenv)
+{
+       int i, delta;
+       unsigned long max_util = 0;
+
+       for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
+               delta = calc_util_delta(eenv, i);
+               max_util = max(max_util, __cpu_util(i, delta));
+       }
+
+       return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to its
+ * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
+ * energy calculations. Since task executions may or may not overlap in time in
+ * the group the true normalized util is between max(cpu_norm_util(i)) and
+ * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
+ * latter is used as the estimate as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned
+long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+{
+       int i, delta;
+       unsigned long util_sum = 0;
+       unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+
+       for_each_cpu(i, sched_group_cpus(sg)) {
+               delta = calc_util_delta(eenv, i);
+               util_sum += __cpu_norm_util(i, capacity, delta);
+       }
+
+       if (util_sum > SCHED_CAPACITY_SCALE)
+               return SCHED_CAPACITY_SCALE;
+       return util_sum;
+}
+
+/* Pick the lowest cap state covering group_max_util(); clamp to the last one */
+static int find_new_capacity(struct energy_env *eenv,
+       const struct sched_group_energy * const sge)
+{
+       int idx;
+       unsigned long util = group_max_util(eenv);
+
+       /* idx + 1 bound: never return an index past the end of cap_states[] */
+       for (idx = 0; idx + 1 < sge->nr_cap_states; idx++) {
+               if (sge->cap_states[idx].cap >= util)
+                       break;
+       }
+       eenv->cap_idx = idx;
+       return idx;
+}
+
+static int group_idle_state(struct sched_group *sg)
+{
+       int i, state = INT_MAX;
+
+       /* Find the shallowest idle state in the sched group. */
+       for_each_cpu(i, sched_group_cpus(sg))
+               state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+       /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+       state++;
+
+       return state;
+}
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working its way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+       struct sched_domain *sd;
+       int cpu, total_energy = 0;
+       struct cpumask visit_cpus;
+       struct sched_group *sg;
+
+       WARN_ON(!eenv->sg_top->sge);
+
+       cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+
+       while (!cpumask_empty(&visit_cpus)) {
+               struct sched_group *sg_shared_cap = NULL;
+
+               cpu = cpumask_first(&visit_cpus);
+
+               /*
+                * Is the group utilization affected by cpus outside this
+                * sched_group?
+                */
+               sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+               if (!sd)
+                       /*
+                        * We most probably raced with hotplug; returning a
+                        * wrong energy estimation is better than entering an
+                        * infinite loop.
+                        */
+                       return -EINVAL;
+
+               if (sd->parent)
+                       sg_shared_cap = sd->parent->groups;
+
+               for_each_domain(cpu, sd) {
+                       sg = sd->groups;
+
+                       /* Has this sched_domain already been visited? */
+                       if (sd->child && group_first_cpu(sg) != cpu)
+                               break;
+
+                       do {
+                               unsigned long group_util;
+                               int sg_busy_energy, sg_idle_energy;
+                               int cap_idx, idle_idx;
+
+                               if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+                                       eenv->sg_cap = sg_shared_cap;
+                               else
+                                       eenv->sg_cap = sg;
+
+                               cap_idx = find_new_capacity(eenv, sg->sge);
+
+                               if (sg->group_weight == 1) {
+                                       /* Remove capacity of src CPU (before task move) */
+                                       if (eenv->util_delta == 0 &&
+                                           cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
+                                               eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
+                                               eenv->cap.delta -= eenv->cap.before;
+                                       }
+                                       /* Add capacity of dst CPU  (after task move) */
+                                       if (eenv->util_delta != 0 &&
+                                           cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
+                                               eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
+                                               eenv->cap.delta += eenv->cap.after;
+                                       }
+                               }
+
+                               idle_idx = group_idle_state(sg);
+                               group_util = group_norm_util(eenv, sg);
+                               sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+                                                               >> SCHED_CAPACITY_SHIFT;
+                               sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
+                                                               * sg->sge->idle_states[idle_idx].power)
+                                                               >> SCHED_CAPACITY_SHIFT;
+
+                               total_energy += sg_busy_energy + sg_idle_energy;
+
+                               if (!sd->child)
+                                       cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+
+                               if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+                                       goto next_cpu;
+
+                       } while (sg = sg->next, sg != sd->groups);
+               }
+next_cpu:
+               cpumask_clear_cpu(cpu, &visit_cpus);
+               continue;
+       }
+
+       eenv->energy = total_energy;
+       return 0;
+}
+
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
+{
+       return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
+}
+
+/*
+ * energy_diff(): Estimate the energy impact of changing the utilization
+ * distribution. eenv specifies the change: utilisation amount, source, and
+ * destination cpu. Source or destination cpu may be -1 in which case the
+ * utilization is removed from or added to the system (e.g. task wake-up). If
+ * both are specified, the utilization is migrated.
+ */
+static inline int __energy_diff(struct energy_env *eenv)
+{
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       int sd_cpu = -1, energy_before = 0, energy_after = 0;
+       int diff, margin;
+
+       struct energy_env eenv_before = {
+               .util_delta     = 0,
+               .src_cpu        = eenv->src_cpu,
+               .dst_cpu        = eenv->dst_cpu,
+               .nrg            = { 0, 0, 0, 0},
+               .cap            = { 0, 0, 0 },
+       };
+
+       if (eenv->src_cpu == eenv->dst_cpu)
+               return 0;
+
+       sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+       sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+       if (!sd)
+               return 0; /* Error */
+
+       sg = sd->groups;
+
+       do {
+               if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+                       eenv_before.sg_top = eenv->sg_top = sg;
+
+                       if (sched_group_energy(&eenv_before))
+                               return 0; /* Invalid result abort */
+                       energy_before += eenv_before.energy;
+
+                       /* Keep track of SRC cpu (before) capacity */
+                       eenv->cap.before = eenv_before.cap.before;
+                       eenv->cap.delta = eenv_before.cap.delta;
+
+                       if (sched_group_energy(eenv))
+                               return 0; /* Invalid result abort */
+                       energy_after += eenv->energy;
+               }
+       } while (sg = sg->next, sg != sd->groups);
+
+       eenv->nrg.before = energy_before;
+       eenv->nrg.after = energy_after;
+       eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
+       eenv->payoff = 0;
+
+       trace_sched_energy_diff(eenv->task,
+                       eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+                       eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+                       eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+                       eenv->nrg.delta, eenv->payoff);
+
+       /*
+        * Dead-zone margin preventing too many migrations.
+        */
+
+       margin = eenv->nrg.before >> 6; /* ~1.56% */
+
+       diff = eenv->nrg.after - eenv->nrg.before;
+
+       eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
+
+       return eenv->nrg.diff;
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+       u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+       int max_delta;
+
+       /* Check for boundaries */
+       max_delta  = schedtune_target_nrg.max_power;
+       max_delta -= schedtune_target_nrg.min_power;
+       WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+       /* Do scaling using positive numbers to increase the range */
+       normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+       /* Scale by energy magnitude */
+       normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+       /* Normalize on max energy for target platform */
+       normalized_nrg = reciprocal_divide(
+                       normalized_nrg, schedtune_target_nrg.rdiv);
+
+       return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+       int boost = schedtune_task_boost(eenv->task);
+       int nrg_delta;
+
+       /* Compute "absolute" energy diff */
+       __energy_diff(eenv);
+
+       /* Return energy diff when boost margin is 0 */
+       if (boost == 0)
+               return eenv->nrg.diff;
+
+       /* Compute normalized energy diff */
+       nrg_delta = normalize_energy(eenv->nrg.diff);
+       eenv->nrg.delta = nrg_delta;
+
+       eenv->payoff = schedtune_accept_deltas(
+                       eenv->nrg.delta,
+                       eenv->cap.delta,
+                       eenv->task);
+
+       /*
+        * When SchedTune is enabled, the energy_diff() function will return
+        * the computed energy payoff value. Since the energy_diff() return
+        * value is expected to be negative by its callers, this evaluation
+        * function returns a negative value each time the evaluation returns
+        * a positive payoff, which is the condition for the acceptance of
+        * a scheduling decision.
+        */
+       return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.  In order
+ * to determine whether we should let the load spread vs consolidating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other.  With
+ * both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.  Waker/wakee
+ * being client/server, worker/dispatcher, interrupt source or whatever is
+ * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ */
+static int wake_wide(struct task_struct *p)
+{
+       unsigned int master = current->wakee_flips;
+       unsigned int slave = p->wakee_flips;
+       int factor = this_cpu_read(sd_llc_size);
+
+       if (master < slave)
+               swap(master, slave);
+       if (slave < factor || master < slave * factor)
+               return 0;
+       return 1;
+}
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+                      int prev_cpu, int sync)
+{
+       s64 this_load, load;
+       s64 this_eff_load, prev_eff_load;
+       int idx, this_cpu;
+       struct task_group *tg;
+       unsigned long weight;
+       int balanced;
+
+       idx       = sd->wake_idx;
+       this_cpu  = smp_processor_id();
+       load      = source_load(prev_cpu, idx);
+       this_load = target_load(this_cpu, idx);
+
+       /*
+        * If sync wakeup then subtract the (maximum possible)
+        * effect of the currently running task from the load
+        * of the current CPU:
+        */
+       if (sync) {
                 tg = task_group(current);
                 weight = current->se.avg.load_avg;
  
@@ -4760,6 +5527,160 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
         return 1;
  }
  
+static inline unsigned long task_util(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+               unsigned long demand = p->ravg.demand;
+               return (demand << 10) / walt_ravg_window;
+       }
+#endif
+       return p->se.avg.util_avg;
+}
+
+static inline unsigned long boosted_task_util(struct task_struct *task);
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+       unsigned long capacity = capacity_of(cpu);
+
+       util += boosted_task_util(p);
+
+       return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+       unsigned long capacity = capacity_of(cpu);
+       unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+       if (capacity == max_capacity)
+               return true;
+
+       if (capacity * capacity_margin > max_capacity * 1024)
+               return true;
+
+       return __task_fits(p, cpu, 0);
+}
+
+static bool cpu_overutilized(int cpu)
+{
+       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+       long long margin = 0;
+
+       /*
+        * Signal proportional compensation (SPC)
+        *
+        * The Boost (B) value is used to compute a Margin (M) which is
+        * proportional to the complement of the original Signal (S):
+        *   M = B * (SCHED_LOAD_SCALE - S), if B is positive
+        *   M = B * S, if B is negative
+        * The obtained M could be used by the caller to "boost" S.
+        */
+       if (boost >= 0) {
+               margin  = SCHED_LOAD_SCALE - signal;
+               margin *= boost;
+       } else
+               margin = -signal * boost;
+       /*
+        * Fast integer division by constant:
+        *  Constant   :                 (C) = 100
+        *  Precision  : 0.1%            (P) = 0.1
+        *  Reference  : C * 100 / P     (R) = 100000
+        *
+        * Thus:
+        *  Shift bits : ceil(log(R,2))  (S) = 17
+        *  Mult const : round(2^S/C)    (M) = 1311
+        *
+        *
+        */
+       margin  *= 1311;
+       margin >>= 17;
+
+       if (boost < 0)
+               margin *= -1;
+       return margin;
+}
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+       int boost = schedtune_cpu_boost(cpu);
+
+       if (boost == 0)
+               return 0;
+
+       return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+       int boost = schedtune_task_boost(task);
+       unsigned long util;
+       long margin;
+
+       if (boost == 0)
+               return 0;
+
+       util = task_util(task);
+       margin = schedtune_margin(util, boost);
+
+       return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+       return 0;
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+       return 0; /* !CONFIG_SCHED_TUNE: no margin; type matches the real variant */
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+unsigned long
+boosted_cpu_util(int cpu)
+{
+       unsigned long util = cpu_util(cpu);
+       long margin = schedtune_cpu_margin(util, cpu);
+
+       trace_sched_boost_cpu(cpu, util, margin);
+
+       return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+       unsigned long util = task_util(task);
+       long margin = schedtune_task_margin(task);
+
+       trace_sched_boost_task(task, util, margin);
+
+       return util + margin;
+}
+
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+       return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
  /*
   * find_idlest_group finds and returns the least busy CPU group within the
   * domain.
@@ -4769,7 +5690,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                   int this_cpu, int sd_flag)
  {
         struct sched_group *idlest = NULL, *group = sd->groups;
+       struct sched_group *most_spare_sg = NULL;
         unsigned long min_load = ULONG_MAX, this_load = 0;
+       unsigned long most_spare = 0, this_spare = 0;
         int load_idx = sd->forkexec_idx;
         int imbalance = 100 + (sd->imbalance_pct-100)/2;
  
@@ -4777,7 +5700,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                 load_idx = sd->wake_idx;
  
         do {
-               unsigned long load, avg_load;
+               unsigned long load, avg_load, spare_cap, max_spare_cap;
                 int local_group;
                 int i;
  
@@ -4789,8 +5712,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                 local_group = cpumask_test_cpu(this_cpu,
                                                sched_group_cpus(group));
  
-               /* Tally up the load of all CPUs in the group */
+               /*
+                * Tally up the load of all CPUs in the group and find
+                * the group containing the CPU with most spare capacity.
+                */
                 avg_load = 0;
+               max_spare_cap = 0;
  
                 for_each_cpu(i, sched_group_cpus(group)) {
                         /* Bias balancing toward cpus of our domain */
@@ -4800,6 +5727,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                                 load = target_load(i, load_idx);
  
                         avg_load += load;
+
+                       spare_cap = capacity_spare_wake(i, p);
+
+                       if (spare_cap > max_spare_cap)
+                               max_spare_cap = spare_cap;
                 }
  
                 /* Adjust by relative CPU capacity of the group */
@@ -4807,12 +5739,33 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
  
                 if (local_group) {
                         this_load = avg_load;
-               } else if (avg_load < min_load) {
-                       min_load = avg_load;
-                       idlest = group;
+                       this_spare = max_spare_cap;
+               } else {
+                       if (avg_load < min_load) {
+                               min_load = avg_load;
+                               idlest = group;
+                       }
+
+                       if (most_spare < max_spare_cap) {
+                               most_spare = max_spare_cap;
+                               most_spare_sg = group;
+                       }
                 }
         } while (group = group->next, group != sd->groups);
  
+       /*
+        * The cross-over point between using spare capacity or least load
+        * is too conservative for high utilization tasks on partially
+        * utilized systems if we require spare_capacity > task_util(p),
+        * so we allow for some task stuffing by using
+        * spare_capacity > task_util(p)/2.
+        */
+       if (this_spare > task_util(p) / 2 &&
+           imbalance*this_spare > 100*most_spare)
+               return NULL;
+       else if (most_spare > task_util(p) / 2)
+               return most_spare_sg;
+
         if (!idlest || 100*this_load < imbalance*min_load)
                 return NULL;
         return idlest;
@@ -4831,6 +5784,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
         int shallowest_idle_cpu = -1;
         int i;
  
+       /* Check if we have any choice: */
+       if (group->group_weight == 1)
+               return cpumask_first(sched_group_cpus(group));
+
         /* Traverse only the allowed CPUs */
         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
                 if (idle_cpu(i)) {
@@ -4870,20 +5827,24 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  /*
   * Try and locate an idle CPU in the sched_domain.
   */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
  {
         struct sched_domain *sd;
         struct sched_group *sg;
-       int i = task_cpu(p);
+       int best_idle_cpu = -1;
+       int best_idle_cstate = INT_MAX;
+       unsigned long best_idle_capacity = ULONG_MAX;
  
-       if (idle_cpu(target))
-               return target;
+       if (!sysctl_sched_cstate_aware) {
+               if (idle_cpu(target))
+                       return target;
  
-       /*
-        * If the prevous cpu is cache affine and idle, don't be stupid.
-        */
-       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-               return i;
+               /*
+                * If the prevous cpu is cache affine and idle, don't be stupid.
+                */
+               if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+                       return prev;
+       }
  
         /*
          * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -4892,60 +5853,257 @@ static int select_idle_sibling(struct task_struct *p, int target)
         for_each_lower_domain(sd) {
                 sg = sd->groups;
                 do {
+                       int i;
                         if (!cpumask_intersects(sched_group_cpus(sg),
                                                 tsk_cpus_allowed(p)))
                                 goto next;
  
-                       for_each_cpu(i, sched_group_cpus(sg)) {
-                               if (i == target || !idle_cpu(i))
-                                       goto next;
-                       }
+                       if (sysctl_sched_cstate_aware) {
+                               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+                                       int idle_idx = idle_get_state_idx(cpu_rq(i));
+                                       unsigned long new_usage = boosted_task_util(p);
+                                       unsigned long capacity_orig = capacity_orig_of(i);
+
+                                       if (new_usage > capacity_orig || !idle_cpu(i))
+                                               goto next;
+
+                                       if (i == target && new_usage <= capacity_curr_of(target))
+                                               return target;
  
-                       target = cpumask_first_and(sched_group_cpus(sg),
+                                       if (idle_idx < best_idle_cstate &&
+                                           capacity_orig <= best_idle_capacity) {
+                                               best_idle_cpu = i;
+                                               best_idle_cstate = idle_idx;
+                                               best_idle_capacity = capacity_orig;
+                                       }
+                               }
+                       } else {
+                               for_each_cpu(i, sched_group_cpus(sg)) {
+                                       if (i == target || !idle_cpu(i))
+                                               goto next;
+                               }
+
+                               target = cpumask_first_and(sched_group_cpus(sg),
                                         tsk_cpus_allowed(p));
-                       goto done;
+                               goto done;
+                       }
  next:
                         sg = sg->next;
                 } while (sg != sd->groups);
         }
+
+       if (best_idle_cpu >= 0)
+               target = best_idle_cpu;
+
  done:
         return target;
  }
  
+/*
+ * start_cpu: pick the CPU whose energy-aware sched domain seeds the search
+ * done by find_best_target().
+ *
+ * Returns the root domain's max_cap_orig_cpu for boosted tasks and its
+ * min_cap_orig_cpu otherwise. The stored value is returned unmodified; the
+ * caller treats a negative result as "no starting CPU available".
+ */
+static int start_cpu(bool boosted)
+{
+       struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+
+       /*
+        * RCU_LOCKDEP_WARN() complains when its condition is true, so the
+        * condition must describe the *bad* case: sched RCU not being held.
+        */
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+                          "sched RCU must be held");
+
+       return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+}
+
+/*
+ * find_best_target: scan the energy-aware domain for a CPU to wake @p on.
+ *
+ * @p:           task being placed
+ * @boosted:     selects the starting CPU via start_cpu()
+ * @prefer_idle: favour idle CPUs / lowest utilization instead of packing
+ *
+ * The search walks every sched_group of the sd_ea domain of the starting
+ * CPU and considers only CPUs that @p is allowed on. A CPU is skipped when
+ * it is offline, when max(boosted_task_util(p), cpu_util + task_util)
+ * exceeds its capacity_orig, or (CONFIG_SCHED_WALT) when it has a high IRQ
+ * load.
+ *
+ * For the remaining CPUs:
+ *  - with @prefer_idle, the first idle CPU found is returned immediately;
+ *  - a CPU with running tasks whose new utilization is below its current
+ *    capacity becomes target_cpu (lowest new utilization for @prefer_idle,
+ *    highest otherwise);
+ *  - a CPU without running tasks (only when !@prefer_idle) becomes
+ *    best_idle_cpu; with sysctl_sched_cstate_aware the one with the lowest
+ *    idle-state index wins, otherwise the first one found is kept;
+ *  - a CPU whose new utilization does not fit below its current capacity is
+ *    remembered as backup_cpu, preferring the smallest current capacity.
+ *
+ * Returns target_cpu if one was found, else best_idle_cpu, else backup_cpu;
+ * -1 when there is no starting CPU, no sd_ea domain, or no candidate.
+ */
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+{
+       int target_cpu = -1;
+       unsigned long target_util = prefer_idle ? ULONG_MAX : 0;
+       unsigned long backup_capacity = ULONG_MAX;
+       int best_idle_cpu = -1;
+       int best_idle_cstate = INT_MAX;
+       int backup_cpu = -1;
+       unsigned long min_util = boosted_task_util(p);
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       int cpu = start_cpu(boosted);
+
+       if (cpu < 0)
+               return target_cpu;
+
+       sd = rcu_dereference(per_cpu(sd_ea, cpu));
+
+       if (!sd)
+               return target_cpu;
+
+       sg = sd->groups;
+
+       do {
+               int i;
+
+               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+                       unsigned long cur_capacity, new_util;
+
+                       if (!cpu_online(i))
+                               continue;
+
+                       /*
+                        * p's blocked utilization is still accounted for on prev_cpu
+                        * so prev_cpu will receive a negative bias due to the double
+                        * accounting. However, the blocked utilization may be zero.
+                        */
+                       new_util = cpu_util(i) + task_util(p);
+
+                       /*
+                        * Ensure minimum capacity to grant the required boost.
+                        * The target CPU can be already at a capacity level higher
+                        * than the one required to boost the task.
+                        */
+                       new_util = max(min_util, new_util);
+
+                       if (new_util > capacity_orig_of(i))
+                               continue;
+
+#ifdef CONFIG_SCHED_WALT
+                       if (walt_cpu_high_irqload(i))
+                               continue;
+#endif
+
+                       /*
+                        * Unconditionally favoring tasks that prefer idle cpus to
+                        * improve latency.
+                        */
+                       if (idle_cpu(i) && prefer_idle)
+                               return i;
+
+                       cur_capacity = capacity_curr_of(i);
+
+                       if (new_util < cur_capacity) {
+                               if (cpu_rq(i)->nr_running) {
+                                       /*
+                                        * Find a target cpu with the lowest/highest
+                                        * utilization if prefer_idle/!prefer_idle.
+                                        */
+                                       if ((prefer_idle && target_util > new_util) ||
+                                           (!prefer_idle && target_util < new_util)) {
+                                               target_util = new_util;
+                                               target_cpu = i;
+                                       }
+                               } else if (!prefer_idle) {
+                                       int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+                                       if (best_idle_cpu < 0 ||
+                                               (sysctl_sched_cstate_aware &&
+                                                       best_idle_cstate > idle_idx)) {
+                                               best_idle_cstate = idle_idx;
+                                               best_idle_cpu = i;
+                                       }
+                               }
+                       } else if (backup_capacity > cur_capacity) {
+                               /* Find a backup cpu with least capacity. */
+                               backup_capacity = cur_capacity;
+                               backup_cpu = i;
+                       }
+               }
+       } while (sg = sg->next, sg != sd->groups);
+
+       if (target_cpu < 0)
+               target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+       return target_cpu;
+}
+
  /*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static int cpu_util(int cpu)
-{
-       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
-       unsigned long capacity = capacity_orig_of(cpu);
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+       unsigned long util, capacity;
+
+       /* Task has no contribution or is new */
+       if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+               return cpu_util(cpu);
+
+       capacity = capacity_orig_of(cpu);
+       util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
  
         return (util >= capacity) ? capacity : util;
  }
  
+/*
+ * wake_cap: tell the wakeup path to give up on WAKE_AFFINE when task @p is
+ * too big for the smaller of the waking CPU @cpu and the previous CPU
+ * @prev_cpu.
+ *
+ * Returns non-zero when the affine wakeup should be skipped (BALANCE_WAKE
+ * is then left to find a suitable CPU), 0 otherwise.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+       long rd_max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+       long smallest_cap = min(capacity_orig_of(prev_cpu),
+                               capacity_orig_of(cpu));
+
+       /*
+        * Both candidates are within 1/8 of the biggest CPU in the root
+        * domain: nothing to gain from aborting wake_affine.
+        */
+       if (rd_max_cap - smallest_cap < rd_max_cap >> 3)
+               return 0;
+
+       /* Refresh p's utilization so it is in sync with prev_cpu */
+       sync_entity_load_avg(&p->se);
+
+       /* Does the task, inflated by capacity_margin, overflow the smaller CPU? */
+       return task_util(p) * capacity_margin > smallest_cap * 1024;
+}
+
+/*
+ * select_energy_cpu_brute: energy-aware CPU selection for a waking task.
+ *
+ * @p:        task being woken
+ * @prev_cpu: CPU the task last ran on; also the default result
+ * @sync:     non-zero for a synchronous wakeup
+ *
+ * Selection order:
+ *  1. With the sync hint enabled and a sync wakeup, the current CPU is
+ *     returned if @p is allowed to run there.
+ *  2. If @prev_cpu has no energy-aware domain (sd_ea), @prev_cpu is kept.
+ *  3. Otherwise find_best_target() proposes a candidate. A boosted or
+ *     prefer_idle task takes an idle candidate straight away.
+ *  4. A candidate different from @prev_cpu is taken when @prev_cpu is
+ *     overutilized, or when energy_diff() reports a negative result;
+ *     otherwise the task stays on @prev_cpu.
+ *
+ * Returns the selected CPU.
+ */
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
+{
+       struct sched_domain *sd;
+       int target_cpu = prev_cpu, tmp_target;
+       bool boosted, prefer_idle;
+
+       if (sysctl_sched_sync_hint_enable && sync) {
+               int cpu = smp_processor_id();
+
+               if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                       return cpu;
+       }
+
+       rcu_read_lock();
+
+       /*
+        * Without an energy-aware domain the result of the target search
+        * would be thrown away, so bail out before scanning any CPU.
+        */
+       sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+       if (!sd)
+               goto unlock;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+       boosted = schedtune_task_boost(p) > 0;
+       prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+       boosted = get_sysctl_sched_cfs_boost() > 0;
+       prefer_idle = 0;
+#endif
+
+       /* Find a cpu with sufficient capacity */
+       tmp_target = find_best_target(p, boosted, prefer_idle);
+       if (tmp_target >= 0) {
+               target_cpu = tmp_target;
+               if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+                       goto unlock;
+       }
+
+       if (target_cpu != prev_cpu) {
+               struct energy_env eenv = {
+                       .util_delta     = task_util(p),
+                       .src_cpu        = prev_cpu,
+                       .dst_cpu        = target_cpu,
+                       .task           = p,
+               };
+
+               /* Not enough spare capacity on previous cpu */
+               if (cpu_overutilized(prev_cpu))
+                       goto unlock;
+
+               /* Moving would not save energy: stay where we were */
+               if (energy_diff(&eenv) >= 0)
+                       target_cpu = prev_cpu;
+       }
+
+unlock:
+       rcu_read_unlock();
+       return target_cpu;
+}
+
  /*
   * select_task_rq_fair: Select target runqueue for the waking task in domains
   * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -4968,7 +6126,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         int sync = wake_flags & WF_SYNC;
  
         if (sd_flag & SD_BALANCE_WAKE)
-               want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+                             && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+
+       if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
+               return select_energy_cpu_brute(p, prev_cpu, sync);
  
         rcu_read_lock();
         for_each_domain(cpu, tmp) {
@@ -4993,13 +6155,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  
         if (affine_sd) {
                 sd = NULL; /* Prefer wake_affine over balance flags */
-               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+               if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
                         new_cpu = cpu;
         }
  
         if (!sd) {
                 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-                       new_cpu = select_idle_sibling(p, new_cpu);
+                       new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
  
         } else while (sd) {
                 struct sched_group *group;
@@ -5068,6 +6230,8 @@ static void task_dead_fair(struct task_struct *p)
  {
         remove_entity_load_avg(&p->se);
  }
+#else
+#define task_fits_max(p, cpu) true
  #endif /* CONFIG_SMP */
  
  static unsigned long
@@ -5314,6 +6478,8 @@ again:
         if (hrtick_enabled(rq))
                 hrtick_start_fair(rq, p);
  
+       rq->misfit_task = !task_fits_max(p, rq->cpu);
+
         return p;
  simple:
         cfs_rq = &rq->cfs;
@@ -5335,9 +6501,12 @@ simple:
         if (hrtick_enabled(rq))
                 hrtick_start_fair(rq, p);
  
+       rq->misfit_task = !task_fits_max(p, rq->cpu);
+
         return p;
  
  idle:
+       rq->misfit_task = 0;
         /*
          * This is OK, because current is on_cpu, which avoids it being picked
          * for load-balance and preemption/IRQs are still disabled avoiding
@@ -5550,6 +6719,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
  
  enum fbq_type { regular, remote, all };
  
+/*
+ * Classification of a sched_group for load balancing. The values are
+ * compared numerically when picking the busiest group, so the order of
+ * the enumerators matters.
+ */
+enum group_type {
+       group_other             = 0,
+       group_misfit_task       = 1,    /* a cpu has a task too big for its capacity */
+       group_imbalanced        = 2,
+       group_overloaded        = 3,
+};
+
  #define LBF_ALL_PINNED 0x01
  #define LBF_NEED_BREAK 0x02
  #define LBF_DST_PINNED  0x04
@@ -5568,6 +6744,7 @@ struct lb_env {
         int                     new_dst_cpu;
         enum cpu_idle_type      idle;
         long                    imbalance;
+       unsigned int            src_grp_nr_running;
         /* The set of CPUs under consideration for load-balancing */
         struct cpumask          *cpus;
  
@@ -5578,6 +6755,7 @@ struct lb_env {
         unsigned int            loop_max;
  
         enum fbq_type           fbq_type;
+       enum group_type         busiest_group_type;
         struct list_head        tasks;
  };
  
@@ -5759,7 +6937,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
  
         deactivate_task(env->src_rq, p, 0);
         p->on_rq = TASK_ON_RQ_MIGRATING;
+       double_lock_balance(env->src_rq, env->dst_rq);
         set_task_cpu(p, env->dst_cpu);
+       double_unlock_balance(env->src_rq, env->dst_rq);
  }
  
  /*
@@ -5904,6 +7084,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
  {
         raw_spin_lock(&rq->lock);
         attach_task(rq, p);
+       /*
+        * We want to potentially raise target_cpu's OPP.
+        */
+       update_capacity_of(cpu_of(rq));
         raw_spin_unlock(&rq->lock);
  }
  
@@ -5925,6 +7109,11 @@ static void attach_tasks(struct lb_env *env)
                 attach_task(env->dst_rq, p);
         }
  
+       /*
+        * We want to potentially raise env.dst_cpu's OPP.
+        */
+       update_capacity_of(env->dst_cpu);
+
         raw_spin_unlock(&env->dst_rq->lock);
  }
  
@@ -5947,7 +7136,8 @@ static void update_blocked_averages(int cpu)
                 if (throttled_hierarchy(cfs_rq))
                         continue;
  
-               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
+                                          true))
                         update_tg_load_avg(cfs_rq, 0);
         }
         raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6008,7 +7198,7 @@ static inline void update_blocked_averages(int cpu)
  
         raw_spin_lock_irqsave(&rq->lock, flags);
         update_rq_clock(rq);
-       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@ -6020,12 +7210,6 @@ static unsigned long task_h_load(struct task_struct *p)
  
  /********** Helpers for find_busiest_group ************************/
  
-enum group_type {
-       group_other = 0,
-       group_imbalanced,
-       group_overloaded,
-};
-
  /*
   * sg_lb_stats - stats of a sched_group required for load_balancing
   */
@@ -6041,6 +7225,7 @@ struct sg_lb_stats {
         unsigned int group_weight;
         enum group_type group_type;
         int group_no_capacity;
+       int group_misfit_task; /* A cpu has a task too big for its capacity */
  #ifdef CONFIG_NUMA_BALANCING
         unsigned int nr_numa_running;
         unsigned int nr_preferred_running;
@@ -6132,19 +7317,58 @@ static unsigned long scale_rt_capacity(int cpu)
  
         used = div_u64(avg, total);
  
+       /*
+        * deadline bandwidth is defined at system level so we must
+        * weight this bandwidth with the max capacity of the system.
+        * As a reminder, avg_bw is 20bits width and
+        * scale_cpu_capacity is 10 bits width
+        */
+       used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
         if (likely(used < SCHED_CAPACITY_SCALE))
                 return SCHED_CAPACITY_SCALE - used;
  
         return 1;
  }
  
+/*
+ * init_max_cpu_capacity: reset a max_cpu_capacity tracker to its empty
+ * state: lock initialised, no owning CPU (-1) and a recorded capacity of 0.
+ */
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+       raw_spin_lock_init(&mcc->lock);
+       mcc->cpu = -1;
+       mcc->val = 0;
+}
+
  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
  {
         unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
         struct sched_group *sdg = sd->groups;
+       struct max_cpu_capacity *mcc;
+       unsigned long max_capacity;
+       int max_cap_cpu;
+       unsigned long flags;
  
         cpu_rq(cpu)->cpu_capacity_orig = capacity;
  
+       mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+       raw_spin_lock_irqsave(&mcc->lock, flags);
+       max_capacity = mcc->val;
+       max_cap_cpu = mcc->cpu;
+
+       if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+           (max_capacity < capacity)) {
+               mcc->val = capacity;
+               mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+               raw_spin_unlock_irqrestore(&mcc->lock, flags);
+               printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+                               cpu, capacity);
+               goto skip_unlock;
+#endif
+       }
+       raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
         capacity *= scale_rt_capacity(cpu);
         capacity >>= SCHED_CAPACITY_SHIFT;
  
@@ -6153,13 +7377,15 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
  
         cpu_rq(cpu)->cpu_capacity = capacity;
         sdg->sgc->capacity = capacity;
+       sdg->sgc->max_capacity = capacity;
+       sdg->sgc->min_capacity = capacity;
  }
  
  void update_group_capacity(struct sched_domain *sd, int cpu)
  {
         struct sched_domain *child = sd->child;
         struct sched_group *group, *sdg = sd->groups;
-       unsigned long capacity;
+       unsigned long capacity, max_capacity, min_capacity;
         unsigned long interval;
  
         interval = msecs_to_jiffies(sd->balance_interval);
@@ -6172,6 +7398,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
         }
  
         capacity = 0;
+       max_capacity = 0;
+       min_capacity = ULONG_MAX;
  
         if (child->flags & SD_OVERLAP) {
                 /*
@@ -6196,11 +7424,13 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                          */
                         if (unlikely(!rq->sd)) {
                                 capacity += capacity_of(cpu);
-                               continue;
+                       } else {
+                               sgc = rq->sd->groups->sgc;
+                               capacity += sgc->capacity;
                         }
  
-                       sgc = rq->sd->groups->sgc;
-                       capacity += sgc->capacity;
+                       max_capacity = max(capacity, max_capacity);
+                       min_capacity = min(capacity, min_capacity);
                 }
         } else  {
                 /*
@@ -6210,12 +7440,18 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
  
                 group = child->groups;
                 do {
-                       capacity += group->sgc->capacity;
+                       struct sched_group_capacity *sgc = group->sgc;
+
+                       capacity += sgc->capacity;
+                       max_capacity = max(sgc->max_capacity, max_capacity);
+                       min_capacity = min(sgc->min_capacity, min_capacity);
                         group = group->next;
                 } while (group != child->groups);
         }
  
         sdg->sgc->capacity = capacity;
+       sdg->sgc->max_capacity = max_capacity;
+       sdg->sgc->min_capacity = min_capacity;
  }
  
  /*
@@ -6310,6 +7546,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
         return false;
  }
  
+
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ *
+ * The groups are compared through their sgc->max_capacity. sg is credited
+ * with (capacity_margin - SCHED_CAPACITY_SCALE) first, so it only counts as
+ * smaller when ref exceeds it by more than that margin. The margin is
+ * expressed in capacity units, hence SCHED_CAPACITY_SCALE rather than the
+ * load-weight scale.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+       return sg->sgc->max_capacity + capacity_margin - SCHED_CAPACITY_SCALE <
+                                                       ref->sgc->max_capacity;
+}
+
  static inline enum
  group_type group_classify(struct sched_group *group,
                           struct sg_lb_stats *sgs)
@@ -6320,6 +7568,9 @@ group_type group_classify(struct sched_group *group,
         if (sg_imbalanced(group))
                 return group_imbalanced;
  
+       if (sgs->group_misfit_task)
+               return group_misfit_task;
+
         return group_other;
  }
  
@@ -6331,14 +7582,15 @@ group_type group_classify(struct sched_group *group,
   * @local_group: Does group contain this_cpu.
   * @sgs: variable to hold the statistics for this group.
   * @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
   */
  static inline void update_sg_lb_stats(struct lb_env *env,
                         struct sched_group *group, int load_idx,
                         int local_group, struct sg_lb_stats *sgs,
-                       bool *overload)
+                       bool *overload, bool *overutilized)
  {
         unsigned long load;
-       int i;
+       int i, nr_running;
  
         memset(sgs, 0, sizeof(*sgs));
  
@@ -6355,7 +7607,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 sgs->group_util += cpu_util(i);
                 sgs->sum_nr_running += rq->cfs.h_nr_running;
  
-               if (rq->nr_running > 1)
+               nr_running = rq->nr_running;
+               if (nr_running > 1)
                         *overload = true;
  
  #ifdef CONFIG_NUMA_BALANCING
@@ -6363,8 +7616,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 sgs->nr_preferred_running += rq->nr_preferred_running;
  #endif
                 sgs->sum_weighted_load += weighted_cpuload(i);
-               if (idle_cpu(i))
+               /*
+                * No need to call idle_cpu() if nr_running is not 0
+                */
+               if (!nr_running && idle_cpu(i))
                         sgs->idle_cpus++;
+
+               if (cpu_overutilized(i)) {
+                       *overutilized = true;
+                       if (!sgs->group_misfit_task && rq->misfit_task)
+                               sgs->group_misfit_task = capacity_of(i);
+               }
         }
  
         /* Adjust by relative CPU capacity of the group */
@@ -6406,9 +7668,31 @@ static bool update_sd_pick_busiest(struct lb_env *env,
         if (sgs->group_type < busiest->group_type)
                 return false;
  
+       /*
+        * Candidate sg doesn't face any serious load-balance problems
+        * so don't pick it if the local sg is already filled up.
+        */
+       if (sgs->group_type == group_other &&
+           !group_has_capacity(env, &sds->local_stat))
+               return false;
+
         if (sgs->avg_load <= busiest->avg_load)
                 return false;
  
+       if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+               goto asym_packing;
+
+       /*
+        * Candidate sg has no more than one task per CPU and
+        * has higher per-CPU capacity. Migrating tasks to less
+        * capable CPUs may harm throughput. Maximize throughput,
+        * power/energy consequences are not considered.
+        */
+       if (sgs->sum_nr_running <= sgs->group_weight &&
+           group_smaller_cpu_capacity(sds->local, sg))
+               return false;
+
+asym_packing:
         /* This is the busiest node in its class. */
         if (!(env->sd->flags & SD_ASYM_PACKING))
                 return true;
@@ -6470,7 +7754,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats tmp_sgs;
         int load_idx, prefer_sibling = 0;
-       bool overload = false;
+       bool overload = false, overutilized = false;
  
         if (child && child->flags & SD_PREFER_SIBLING)
                 prefer_sibling = 1;
@@ -6492,7 +7776,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                 }
  
                 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-                                               &overload);
+                                               &overload, &overutilized);
  
                 if (local_group)
                         goto next_group;
@@ -6514,6 +7798,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                         sgs->group_type = group_classify(sg, sgs);
                 }
  
+               /*
+                * Ignore task groups with misfit tasks if local group has no
+                * capacity or if per-cpu capacity isn't higher.
+                */
+               if (sgs->group_type == group_misfit_task &&
+                   (!group_has_capacity(env, &sds->local_stat) ||
+                    !group_smaller_cpu_capacity(sg, sds->local)))
+                       sgs->group_type = group_other;
+
                 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
                         sds->busiest = sg;
                         sds->busiest_stat = *sgs;
@@ -6530,10 +7823,23 @@ next_group:
         if (env->sd->flags & SD_NUMA)
                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
  
+       env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
         if (!env->sd->parent) {
                 /* update overload indicator if we are at root domain */
                 if (env->dst_rq->rd->overload != overload)
                         env->dst_rq->rd->overload = overload;
+
+               /* Update over-utilization (tipping point, U >= 0) indicator */
+               if (env->dst_rq->rd->overutilized != overutilized) {
+                       env->dst_rq->rd->overutilized = overutilized;
+                       trace_sched_overutilized(overutilized);
+               }
+       } else {
+               if (!env->dst_rq->rd->overutilized && overutilized) {
+                       env->dst_rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
         }
  
  }
@@ -6682,6 +7988,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
          */
         if (busiest->avg_load <= sds->avg_load ||
             local->avg_load >= sds->avg_load) {
+               /* Misfitting tasks should be migrated in any case */
+               if (busiest->group_type == group_misfit_task) {
+                       env->imbalance = busiest->group_misfit_task;
+                       return;
+               }
+
+               /*
+                * Busiest group is overloaded, local is not, use the spare
+                * cycles to maximize throughput
+                */
+               if (busiest->group_type == group_overloaded &&
+                   local->group_type <= group_misfit_task) {
+                       env->imbalance = busiest->load_per_task;
+                       return;
+               }
+
                 env->imbalance = 0;
                 return fix_small_imbalance(env, sds);
         }
@@ -6715,6 +8037,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                 (sds->avg_load - local->avg_load) * local->group_capacity
         ) / SCHED_CAPACITY_SCALE;
  
+       /* Boost imbalance to allow misfit task to be balanced. */
+       if (busiest->group_type == group_misfit_task)
+               env->imbalance = max_t(long, env->imbalance,
+                                    busiest->group_misfit_task);
+
         /*
          * if *imbalance is less than the average load per runnable task
          * there is no guarantee that any tasks will be moved so we'll have
@@ -6756,6 +8083,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
          * this level.
          */
         update_sd_lb_stats(env, &sds);
+
+       if (energy_aware() && !env->dst_rq->rd->overutilized)
+               goto out_balanced;
+
         local = &sds.local_stat;
         busiest = &sds.busiest_stat;
  
@@ -6784,6 +8115,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
             busiest->group_no_capacity)
                 goto force_balance;
  
+       /* Misfitting tasks should be dealt with regardless of the avg load */
+       if (busiest->group_type == group_misfit_task) {
+               goto force_balance;
+       }
+
         /*
          * If the local group is busier than the selected busiest group
          * don't try and pull any tasks.
@@ -6807,7 +8143,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                  * might end up to just move the imbalance on another group
                  */
                 if ((busiest->group_type != group_overloaded) &&
-                               (local->idle_cpus <= (busiest->idle_cpus + 1)))
+                   (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
+                   !group_smaller_cpu_capacity(sds.busiest, sds.local))
                         goto out_balanced;
         } else {
                 /*
@@ -6820,6 +8157,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
         }
  
  force_balance:
+       env->busiest_group_type = busiest->group_type;
         /* Looks like there is an imbalance. Compute it */
         calculate_imbalance(env, &sds);
         return sds.busiest;
@@ -6878,7 +8216,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                  */
  
                 if (rq->nr_running == 1 && wl > env->imbalance &&
-                   !check_cpu_capacity(rq, env->sd))
+                   !check_cpu_capacity(rq, env->sd) &&
+                   env->busiest_group_type != group_misfit_task)
                         continue;
  
                 /*
@@ -6939,6 +8278,13 @@ static int need_active_balance(struct lb_env *env)
                         return 1;
         }
  
+       if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+                               env->src_rq->cfs.h_nr_running == 1 &&
+                               cpu_overutilized(env->src_cpu) &&
+                               !cpu_overutilized(env->dst_cpu)) {
+                       return 1;
+       }
+
         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
  }
  
@@ -7060,6 +8406,11 @@ more_balance:
                  * ld_moved     - cumulative load moved across iterations
                  */
                 cur_ld_moved = detach_tasks(&env);
+               /*
+                * We want to potentially lower env.src_cpu's OPP.
+                */
+               if (cur_ld_moved)
+                       update_capacity_of(env.src_cpu);
  
                 /*
                  * We've detached some tasks from busiest_rq. Every
@@ -7151,7 +8502,8 @@ more_balance:
                  * excessive cache_hot migrations and active balances.
                  */
                 if (idle != CPU_NEWLY_IDLE)
-                       sd->nr_balance_failed++;
+                       if (env.src_grp_nr_running > 1)
+                               sd->nr_balance_failed++;
  
                 if (need_active_balance(&env)) {
                         raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -7283,6 +8635,7 @@ static int idle_balance(struct rq *this_rq)
         struct sched_domain *sd;
         int pulled_task = 0;
         u64 curr_cost = 0;
+       long removed_util=0;
  
         idle_enter_fair(this_rq);
  
@@ -7292,8 +8645,9 @@ static int idle_balance(struct rq *this_rq)
          */
         this_rq->idle_stamp = rq_clock(this_rq);
  
-       if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-           !this_rq->rd->overload) {
+       if (!energy_aware() &&
+           (this_rq->avg_idle < sysctl_sched_migration_cost ||
+            !this_rq->rd->overload)) {
                 rcu_read_lock();
                 sd = rcu_dereference_check_sched_domain(this_rq->sd);
                 if (sd)
@@ -7305,6 +8659,17 @@ static int idle_balance(struct rq *this_rq)
  
         raw_spin_unlock(&this_rq->lock);
  
+       /*
+        * If removed_util_avg is !0 we most probably migrated some task away
+        * from this_cpu. In this case we might be willing to trigger an OPP
+        * update, but only if we don't find anybody else to pull here
+        * (we will trigger an OPP update with the pulled task's enqueue
+        * anyway).
+        *
+        * Record removed_util before calling update_blocked_averages, and use
+        * it below (before returning) to see if an OPP update is required.
+        */
+       removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
         update_blocked_averages(this_cpu);
         rcu_read_lock();
         for_each_domain(this_cpu, sd) {
@@ -7369,6 +8734,12 @@ out:
         if (pulled_task) {
                 idle_exit_fair(this_rq);
                 this_rq->idle_stamp = 0;
+       } else if (removed_util) {
+               /*
+                * No task pulled and someone has been migrated away.
+                * Good case to trigger an OPP update.
+                */
+               update_capacity_of(this_cpu);
         }
  
         return pulled_task;
@@ -7428,8 +8799,13 @@ static int active_load_balance_cpu_stop(void *data)
                 schedstat_inc(sd, alb_count);
  
                 p = detach_one_task(&env);
-               if (p)
+               if (p) {
                         schedstat_inc(sd, alb_pushed);
+                       /*
+                        * We want to potentially lower env.src_cpu's OPP.
+                        */
+                       update_capacity_of(env.src_cpu);
+               }
                 else
                         schedstat_inc(sd, alb_failed);
         }
@@ -7809,12 +9185,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
         if (time_before(now, nohz.next_balance))
                 return false;
  
-       if (rq->nr_running >= 2)
+       if (rq->nr_running >= 2 &&
+           (!energy_aware() || cpu_overutilized(cpu)))
                 return true;
  
         rcu_read_lock();
         sd = rcu_dereference(per_cpu(sd_busy, cpu));
-       if (sd) {
+       if (sd && !energy_aware()) {
                 sgc = sd->groups->sgc;
                 nr_busy = atomic_read(&sgc->nr_busy_cpus);
  
@@ -7920,6 +9297,16 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
  
         if (static_branch_unlikely(&sched_numa_balancing))
                 task_tick_numa(rq, curr);
+
+#ifdef CONFIG_SMP
+       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+               rq->rd->overutilized = true;
+               trace_sched_overutilized(true);
+       }
+
+       rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
  }
  
  /*
@@ -8021,6 +9408,34 @@ static inline bool vruntime_normalized(struct task_struct *p)
         return false;
  }
  
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       /* Catch up with the cfs_rq first, then remove our load as we leave */
+       update_load_avg(se, 0);
+       detach_entity_load_avg(cfs_rq, se);
+       update_tg_load_avg(cfs_rq, false);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       /*
+        * The real depth may have changed (only the FAIR class maintains
+        * it), so recompute it: parent's depth + 1, or 0 with no parent.
+        */
+       se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+       /* Sync entity with its cfs_rq; SKIP_AGE_LOAD unless ATTACH_AGE_LOAD */
+       update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+       attach_entity_load_avg(cfs_rq, se);
+       update_tg_load_avg(cfs_rq, false);
+}
+
  static void detach_task_cfs_rq(struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
@@ -8035,8 +9450,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
                 se->vruntime -= cfs_rq->min_vruntime;
         }
  
-       /* Catch up with the cfs_rq and remove our load when we leave */
-       detach_entity_load_avg(cfs_rq, se);
+       detach_entity_cfs_rq(se);
  }
  
  static void attach_task_cfs_rq(struct task_struct *p)
@@ -8044,16 +9458,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       /*
-        * Since the real-depth could have been changed (only FAIR
-        * class maintain depth value), reset depth properly.
-        */
-       se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
-       /* Synchronize task with its cfs_rq */
-       attach_entity_load_avg(cfs_rq, se);
+       attach_entity_cfs_rq(se);
  
         if (!vruntime_normalized(p))
                 se->vruntime += cfs_rq->min_vruntime;
@@ -8147,8 +9552,9 @@ void free_fair_sched_group(struct task_group *tg)
  
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
-       struct cfs_rq *cfs_rq;
         struct sched_entity *se;
+       struct cfs_rq *cfs_rq;
+       struct rq *rq;
         int i;
  
         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8163,6 +9569,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         init_cfs_bandwidth(tg_cfs_bandwidth(tg));
  
         for_each_possible_cpu(i) {
+               rq = cpu_rq(i);
+
                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                       GFP_KERNEL, cpu_to_node(i));
                 if (!cfs_rq)
@@ -8176,6 +9584,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 init_cfs_rq(cfs_rq);
                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
                 init_entity_runnable_average(se);
+
+               raw_spin_lock_irq(&rq->lock);
+               post_init_entity_util_avg(se);
+               raw_spin_unlock_irq(&rq->lock);
         }
  
         return 1;