UPSTREAM: sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list

[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 387950f2649d264f4ff73696ba9d80fc5f7e0489..15ccfbff1bde46473a9eb3116db2e97fc12edc78 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -53,7 +53,6 @@
  unsigned int sysctl_sched_latency = 6000000ULL;
  unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  
-unsigned int sysctl_sched_is_big_little = 0;
  unsigned int sysctl_sched_sync_hint_enable = 1;
  unsigned int sysctl_sched_initial_task_util = 0;
  unsigned int sysctl_sched_cstate_aware = 1;
@@ -306,19 +305,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
         if (!cfs_rq->on_list) {
+               struct rq *rq = rq_of(cfs_rq);
+               int cpu = cpu_of(rq);
                 /*
                  * Ensure we either appear before our parent (if already
                  * enqueued) or force our parent to appear after us when it is
-                * enqueued.  The fact that we always enqueue bottom-up
-                * reduces this to two cases.
+                * enqueued. The fact that we always enqueue bottom-up
+                * reduces this to two cases and a special case for the root
+                * cfs_rq. Furthermore, it also means that we will always reset
+                * tmp_alone_branch either when the branch is connected
+                * to a tree or when we reach the beginning of the tree.
                  */
                 if (cfs_rq->tg->parent &&
-                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
-                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
-               } else {
+                   cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+                       /*
+                        * If the parent is already on the list, we add the child
+                        * just before it. Thanks to the circular linked property
+                        * of the list, this means putting the child at the tail
+                        * of the list that starts at the parent.
+                        */
                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+                               &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+                       /*
+                        * The branch is now connected to its tree so we can
+                        * reset tmp_alone_branch to the beginning of the
+                        * list.
+                        */
+                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               } else if (!cfs_rq->tg->parent) {
+                       /*
+                        * cfs rq without parent should be put
+                        * at the tail of the list.
+                        */
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq->leaf_cfs_rq_list);
+                       /*
+                        * We have reached the beginning of a tree so we can reset
+                        * tmp_alone_branch to the beginning of the list.
+                        */
+                       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+               } else {
+                       /*
+                        * The parent has not been added yet so we want to
+                        * make sure that it will be put after us.
+                        * tmp_alone_branch points to the beginning of the branch
+                        * where we will add the parent.
+                        */
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               rq->tmp_alone_branch);
+                       /*
+                        * Update tmp_alone_branch to point to the new beginning
+                        * of the branch.
+                        */
+                       rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
                 }
  
                 cfs_rq->on_list = 1;
@@ -702,18 +741,108 @@ void init_entity_runnable_average(struct sched_entity *se)
         sa->period_contrib = 1023;
         sa->load_avg = scale_load_down(se->load.weight);
         sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-       sa->util_avg =  sched_freq() ?
-               sysctl_sched_initial_task_util :
-               scale_load_down(SCHED_LOAD_SCALE);
-       sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+       /*
+        * In previous Android versions, we used to have:
+        *      sa->util_avg =  sched_freq() ?
+        *              sysctl_sched_initial_task_util :
+        *              scale_load_down(SCHED_LOAD_SCALE);
+        *      sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+        * However, that functionality has been moved to enqueue.
+        * It is unclear if we should restore this in enqueue.
+        */
+       /*
+        * At this point, util_avg won't be used in select_task_rq_fair anyway
+        */
+       sa->util_avg = 0;
+       sa->util_sum = 0;
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  }
  
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static void attach_entity_cfs_rq(struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the remaining utilization budget:
+ *
+ *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, the simplest series from the beginning would look like:
+ *
+ *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct sched_avg *sa = &se->avg;
+       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+       if (cap > 0) {
+               if (cfs_rq->avg.util_avg != 0) {
+                       sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
+                       sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+                       if (sa->util_avg > cap)
+                               sa->util_avg = cap;
+               } else {
+                       sa->util_avg = cap;
+               }
+               /*
+                * If we wish to restore tuning via setting initial util,
+                * this is where we should do it.
+                */
+               sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+       }
+
+       if (entity_is_task(se)) {
+               struct task_struct *p = task_of(se);
+               if (p->sched_class != &fair_sched_class) {
+                       /*
+                        * For !fair tasks do:
+                        *
+                       update_cfs_rq_load_avg(now, cfs_rq, false);
+                       attach_entity_load_avg(cfs_rq, se);
+                       switched_from_fair(rq, p);
+                        *
+                        * such that the next switched_to_fair() has the
+                        * expected state.
+                        */
+                       se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+                       return;
+               }
+       }
+
+       attach_entity_cfs_rq(se);
+}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
  #else
  void init_entity_runnable_average(struct sched_entity *se)
  {
  }
-#endif
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
  
  /*
   * Update the current task's runtime statistics.
@@ -2700,9 +2829,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
   */
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  {
@@ -2760,9 +2901,25 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
         WRITE_ONCE(*ptr, res);                                  \
  } while (0)
  
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq,
-                                        bool update_freq)
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the caller do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  {
         struct sched_avg *sa = &cfs_rq->avg;
         int decayed, removed = 0, removed_util = 0;
@@ -2799,8 +2956,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq,
         return decayed || removed;
  }
  
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG      0x1
+#define SKIP_AGE_LOAD  0x2
+
  /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
  {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 now = cfs_rq_clock_task(cfs_rq);
@@ -2810,37 +2973,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
          * Track task load average for carrying it to new CPU after migrated, and
          * track group sched_entity load average for task_h_load calc in migration
          */
-       __update_load_avg(now, cpu, &se->avg,
+       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+               __update_load_avg(now, cpu, &se->avg,
                           se->on_rq * scale_load_down(se->load.weight),
                           cfs_rq->curr == se, NULL);
+       }
  
-       if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+       if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG))
                 update_tg_load_avg(cfs_rq, 0);
  
         if (entity_is_task(se))
                 trace_sched_load_avg_task(task_of(se), &se->avg);
  }
  
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       if (!sched_feat(ATTACH_AGE_LOAD))
-               goto skip_aging;
-
-       /*
-        * If we got migrated (either between CPUs or between cgroups) we'll
-        * have aged the average right before clearing @last_update_time.
-        */
-       if (se->avg.last_update_time) {
-               __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                                 &se->avg, 0, 0, NULL);
-
-               /*
-                * XXX: we could have just aged the entire load away if we've been
-                * absent from the fair class for too long.
-                */
-       }
-
-skip_aging:
         se->avg.last_update_time = cfs_rq->avg.last_update_time;
         cfs_rq->avg.load_avg += se->avg.load_avg;
         cfs_rq->avg.load_sum += se->avg.load_sum;
@@ -2850,11 +3005,16 @@ skip_aging:
         cfs_rq_util_change(cfs_rq);
  }
  
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-                         &se->avg, se->on_rq * scale_load_down(se->load.weight),
-                         cfs_rq->curr == se, NULL);
  
         sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
         sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
@@ -2869,34 +3029,20 @@ static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         struct sched_avg *sa = &se->avg;
-       u64 now = cfs_rq_clock_task(cfs_rq);
-       int migrated, decayed;
-
-       migrated = !sa->last_update_time;
-       if (!migrated) {
-               __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-                       se->on_rq * scale_load_down(se->load.weight),
-                       cfs_rq->curr == se, NULL);
-       }
-
-       decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
  
         cfs_rq->runnable_load_avg += sa->load_avg;
         cfs_rq->runnable_load_sum += sa->load_sum;
  
-       if (migrated)
+       if (!sa->last_update_time) {
                 attach_entity_load_avg(cfs_rq, se);
-
-       if (decayed || migrated)
                 update_tg_load_avg(cfs_rq, 0);
+       }
  }
  
  /* Remove the runnable load generated by se from cfs_rq's runnable load average */
  static inline void
  dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       update_load_avg(se, 1);
-
         cfs_rq->runnable_load_avg =
                 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
         cfs_rq->runnable_load_sum =
@@ -2989,11 +3135,16 @@ static int idle_balance(struct rq *this_rq);
  
  #else /* CONFIG_SMP */
  
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  {
-       cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+       return 0;
  }
  
+#define UPDATE_TG      0x0
+#define SKIP_AGE_LOAD  0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1){}
  static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  static inline void
@@ -3136,6 +3287,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
+       update_load_avg(se, UPDATE_TG);
         enqueue_entity_load_avg(cfs_rq, se);
         account_entity_enqueue(cfs_rq, se);
         update_cfs_shares(cfs_rq);
@@ -3211,6 +3363,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
+       update_load_avg(se, UPDATE_TG);
         dequeue_entity_load_avg(cfs_rq, se);
  
         update_stats_dequeue(cfs_rq, se);
@@ -3301,7 +3454,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                  */
                 update_stats_wait_end(cfs_rq, se);
                 __dequeue_entity(cfs_rq, se);
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
         }
  
         update_stats_curr_start(cfs_rq, se);
@@ -3417,7 +3570,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
         /*
          * Ensure that runnable average is periodically updated.
          */
-       update_load_avg(curr, 1);
+       update_load_avg(curr, UPDATE_TG);
         update_cfs_shares(cfs_rq);
  
  #ifdef CONFIG_SCHED_HRTICK
@@ -4346,7 +4499,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (cfs_rq_throttled(cfs_rq))
                         break;
  
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
                 update_cfs_shares(cfs_rq);
         }
  
@@ -4448,7 +4601,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (cfs_rq_throttled(cfs_rq))
                         break;
  
-               update_load_avg(se, 1);
+               update_load_avg(se, UPDATE_TG);
                 update_cfs_shares(cfs_rq);
         }
  
@@ -5678,9 +5831,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
  {
         struct sched_domain *sd;
         struct sched_group *sg;
-       int best_idle = -1;
-       int best_idle_cstate = -1;
-       int best_idle_capacity = INT_MAX;
+       int best_idle_cpu = -1;
+       int best_idle_cstate = INT_MAX;
+       unsigned long best_idle_capacity = ULONG_MAX;
  
         if (!sysctl_sched_cstate_aware) {
                 if (idle_cpu(target))
@@ -5707,18 +5860,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
  
                         if (sysctl_sched_cstate_aware) {
                                 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
-                                       struct rq *rq = cpu_rq(i);
-                                       int idle_idx = idle_get_state_idx(rq);
+                                       int idle_idx = idle_get_state_idx(cpu_rq(i));
                                         unsigned long new_usage = boosted_task_util(p);
                                         unsigned long capacity_orig = capacity_orig_of(i);
+
                                         if (new_usage > capacity_orig || !idle_cpu(i))
                                                 goto next;
  
                                         if (i == target && new_usage <= capacity_curr_of(target))
                                                 return target;
  
-                                       if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
-                                               best_idle = i;
+                                       if (idle_idx < best_idle_cstate &&
+                                           capacity_orig <= best_idle_capacity) {
+                                               best_idle_cpu = i;
                                                 best_idle_cstate = idle_idx;
                                                 best_idle_capacity = capacity_orig;
                                         }
@@ -5737,111 +5891,117 @@ next:
                         sg = sg->next;
                 } while (sg != sd->groups);
         }
-       if (best_idle > 0)
-               target = best_idle;
+
+       if (best_idle_cpu >= 0)
+               target = best_idle_cpu;
  
  done:
         return target;
  }
  
+static int start_cpu(bool boosted)
+{
+       struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+
+       RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(),
+                          "sched RCU must be held");
+
+       return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+}
+
  static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
  {
-       int iter_cpu;
         int target_cpu = -1;
-       int target_util = 0;
-       int backup_capacity = 0;
+       unsigned long target_util = prefer_idle ? ULONG_MAX : 0;
+       unsigned long backup_capacity = ULONG_MAX;
         int best_idle_cpu = -1;
         int best_idle_cstate = INT_MAX;
         int backup_cpu = -1;
-       unsigned long task_util_boosted, new_util;
+       unsigned long min_util = boosted_task_util(p);
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       int cpu = start_cpu(boosted);
  
-       task_util_boosted = boosted_task_util(p);
-       for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
-               int cur_capacity;
-               struct rq *rq;
-               int idle_idx;
+       if (cpu < 0)
+               return target_cpu;
  
-               /*
-                * Iterate from higher cpus for boosted tasks.
-                */
-               int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+       sd = rcu_dereference(per_cpu(sd_ea, cpu));
  
-               if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
-                       continue;
+       if (!sd)
+               return target_cpu;
  
-               /*
-                * p's blocked utilization is still accounted for on prev_cpu
-                * so prev_cpu will receive a negative bias due to the double
-                * accounting. However, the blocked utilization may be zero.
-                */
-               new_util = cpu_util(i) + task_util_boosted;
+       sg = sd->groups;
  
-               /*
-                * Ensure minimum capacity to grant the required boost.
-                * The target CPU can be already at a capacity level higher
-                * than the one required to boost the task.
-                */
-               if (new_util > capacity_orig_of(i))
-                       continue;
+       do {
+               int i;
+
+               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+                       unsigned long cur_capacity, new_util;
+
+                       if (!cpu_online(i))
+                               continue;
+
+                       /*
+                        * p's blocked utilization is still accounted for on prev_cpu
+                        * so prev_cpu will receive a negative bias due to the double
+                        * accounting. However, the blocked utilization may be zero.
+                        */
+                       new_util = cpu_util(i) + task_util(p);
+
+                       /*
+                        * Ensure minimum capacity to grant the required boost.
+                        * The target CPU can be already at a capacity level higher
+                        * than the one required to boost the task.
+                        */
+                       new_util = max(min_util, new_util);
+
+                       if (new_util > capacity_orig_of(i))
+                               continue;
  
  #ifdef CONFIG_SCHED_WALT
-               if (walt_cpu_high_irqload(i))
-                       continue;
+                       if (walt_cpu_high_irqload(i))
+                               continue;
  #endif
-               /*
-                * Unconditionally favoring tasks that prefer idle cpus to
-                * improve latency.
-                */
-               if (idle_cpu(i) && prefer_idle) {
-                       if (best_idle_cpu < 0)
-                               best_idle_cpu = i;
-                       continue;
-               }
  
-               cur_capacity = capacity_curr_of(i);
-               rq = cpu_rq(i);
-               idle_idx = idle_get_state_idx(rq);
+                       /*
+                        * Unconditionally favoring tasks that prefer idle cpus to
+                        * improve latency.
+                        */
+                       if (idle_cpu(i) && prefer_idle)
+                               return i;
  
-               if (new_util < cur_capacity) {
-                       if (cpu_rq(i)->nr_running) {
-                               if (prefer_idle) {
-                                       /* Find a target cpu with highest
-                                        * utilization.
+                       cur_capacity = capacity_curr_of(i);
+
+                       if (new_util < cur_capacity) {
+                               if (cpu_rq(i)->nr_running) {
+                                       /*
+                                        * Find a target cpu with the lowest/highest
+                                        * utilization if prefer_idle/!prefer_idle.
                                          */
-                                       if (target_util == 0 ||
-                                               target_util < new_util) {
-                                               target_cpu = i;
+                                       if ((prefer_idle && target_util > new_util) ||
+                                           (!prefer_idle && target_util < new_util)) {
                                                 target_util = new_util;
-                                       }
-                               } else {
-                                       /* Find a target cpu with lowest
-                                        * utilization.
-                                        */
-                                       if (target_util == 0 ||
-                                               target_util > new_util) {
                                                 target_cpu = i;
-                                               target_util = new_util;
+                                       }
+                               } else if (!prefer_idle) {
+                                       int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+                                       if (best_idle_cpu < 0 ||
+                                               (sysctl_sched_cstate_aware &&
+                                                       best_idle_cstate > idle_idx)) {
+                                               best_idle_cstate = idle_idx;
+                                               best_idle_cpu = i;
                                         }
                                 }
-                       } else if (!prefer_idle) {
-                               if (best_idle_cpu < 0 ||
-                                       (sysctl_sched_cstate_aware &&
-                                               best_idle_cstate > idle_idx)) {
-                                       best_idle_cstate = idle_idx;
-                                       best_idle_cpu = i;
-                               }
+                       } else if (backup_capacity > cur_capacity) {
+                               /* Find a backup cpu with least capacity. */
+                               backup_capacity = cur_capacity;
+                               backup_cpu = i;
                         }
-               } else if (backup_capacity == 0 ||
-                               backup_capacity > cur_capacity) {
-                       // Find a backup cpu with least capacity.
-                       backup_capacity = cur_capacity;
-                       backup_cpu = i;
                 }
-       }
+       } while (sg = sg->next, sg != sd->groups);
  
-       if (prefer_idle && best_idle_cpu >= 0)
-               target_cpu = best_idle_cpu;
-       else if (target_cpu < 0)
+       if (target_cpu < 0)
                 target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
  
         return target_cpu;
@@ -9248,6 +9408,34 @@ static inline bool vruntime_normalized(struct task_struct *p)
         return false;
  }
  
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       /* Catch up with the cfs_rq and remove our load when we leave */
+       update_load_avg(se, 0);
+       detach_entity_load_avg(cfs_rq, se);
+       update_tg_load_avg(cfs_rq, false);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       /*
+        * Since the real-depth could have been changed (only FAIR
+        * class maintain depth value), reset depth properly.
+        */
+       se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+       /* Synchronize entity with its cfs_rq */
+       update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+       attach_entity_load_avg(cfs_rq, se);
+       update_tg_load_avg(cfs_rq, false);
+}
+
  static void detach_task_cfs_rq(struct task_struct *p)
  {
         struct sched_entity *se = &p->se;
@@ -9262,8 +9450,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
                 se->vruntime -= cfs_rq->min_vruntime;
         }
  
-       /* Catch up with the cfs_rq and remove our load when we leave */
-       detach_entity_load_avg(cfs_rq, se);
+       detach_entity_cfs_rq(se);
  }
  
  static void attach_task_cfs_rq(struct task_struct *p)
@@ -9271,16 +9458,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       /*
-        * Since the real-depth could have been changed (only FAIR
-        * class maintain depth value), reset depth properly.
-        */
-       se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
-       /* Synchronize task with its cfs_rq */
-       attach_entity_load_avg(cfs_rq, se);
+       attach_entity_cfs_rq(se);
  
         if (!vruntime_normalized(p))
                 se->vruntime += cfs_rq->min_vruntime;
@@ -9374,8 +9552,9 @@ void free_fair_sched_group(struct task_group *tg)
  
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
-       struct cfs_rq *cfs_rq;
         struct sched_entity *se;
+       struct cfs_rq *cfs_rq;
+       struct rq *rq;
         int i;
  
         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -9390,6 +9569,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         init_cfs_bandwidth(tg_cfs_bandwidth(tg));
  
         for_each_possible_cpu(i) {
+               rq = cpu_rq(i);
+
                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                       GFP_KERNEL, cpu_to_node(i));
                 if (!cfs_rq)
@@ -9403,6 +9584,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 init_cfs_rq(cfs_rq);
                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
                 init_entity_runnable_average(se);
+
+               raw_spin_lock_irq(&rq->lock);
+               post_init_entity_util_avg(se);
+               raw_spin_unlock_irq(&rq->lock);
         }
  
         return 1;