DEBUG: sched: add tracepoint for RD overutilized
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index 7444383c032f3c0a962bc33b47907c52276ff637..9139e153671a499495983331ba929d4e052534f5 100644 (file)
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/module.h>
 
 #include <trace/events/sched.h>
 
 #include "sched.h"
 #include "tune.h"
+#include "walt.h"
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
 unsigned int sysctl_sched_latency = 6000000ULL;
 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 
+unsigned int sysctl_sched_is_big_little = 0;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+    (10 * NSEC_PER_MSEC);
+#endif
 /*
  * The initial- and re-scaling of tunables is configurable
  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -683,7 +696,9 @@ void init_entity_runnable_average(struct sched_entity *se)
        sa->period_contrib = 1023;
        sa->load_avg = scale_load_down(se->load.weight);
        sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-       sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+       sa->util_avg =  sched_freq() ?
+               sysctl_sched_initial_task_util :
+               scale_load_down(SCHED_LOAD_SCALE);
        sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
        /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
@@ -2746,6 +2761,10 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 
        if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
                update_tg_load_avg(cfs_rq, 0);
+
+       if (entity_is_task(se))
+               trace_sched_load_avg_task(task_of(se), &se->avg);
+       trace_sched_load_avg_cpu(cpu, cfs_rq);
 }
 
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4162,8 +4181,14 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_SMP
+static bool cpu_overutilized(int cpu);
 static inline unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
 
+#ifdef CONFIG_SMP
 static void update_capacity_of(int cpu)
 {
        unsigned long req_cap;
@@ -4176,8 +4201,7 @@ static void update_capacity_of(int cpu)
        req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
        set_cfs_cpu_capacity(cpu, true, req_cap);
 }
-
-static bool cpu_overutilized(int cpu);
+#endif
 
 /*
  * The enqueue_task method is called before nr_running is
@@ -4189,8 +4213,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
        int task_new = flags & ENQUEUE_WAKEUP_NEW;
        int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
 
        for_each_sched_entity(se) {
                if (se->on_rq)
@@ -4207,6 +4233,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                flags = ENQUEUE_WAKEUP;
        }
@@ -4214,6 +4241,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4222,13 +4250,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                update_cfs_shares(cfs_rq);
        }
 
-       if (!se) {
+       if (!se)
                add_nr_running(rq, 1);
+
+#ifdef CONFIG_SMP
+
+       if (!se) {
+               walt_inc_cumulative_runnable_avg(rq, p);
                if (!task_new && !rq->rd->overutilized &&
-                   cpu_overutilized(rq->cpu))
+                   cpu_overutilized(rq->cpu)) {
                        rq->rd->overutilized = true;
-
-               schedtune_enqueue_task(p, cpu_of(rq));
+                       trace_sched_overutilized(true);
+               }
 
                /*
                 * We want to potentially trigger a freq switch
@@ -4240,6 +4273,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (task_new || task_wakeup)
                        update_capacity_of(cpu_of(rq));
        }
+
+       /* Update SchedTune accouting */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
+#endif /* CONFIG_SMP */
        hrtick_update(rq);
 }
 
@@ -4269,6 +4307,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
@@ -4289,6 +4328,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4297,9 +4337,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                update_cfs_shares(cfs_rq);
        }
 
-       if (!se) {
+       if (!se)
                sub_nr_running(rq, 1);
-               schedtune_dequeue_task(p, cpu_of(rq));
+
+#ifdef CONFIG_SMP
+
+       if (!se) {
+               walt_dec_cumulative_runnable_avg(rq, p);
 
                /*
                 * We want to potentially trigger a freq switch
@@ -4316,6 +4360,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                                set_cfs_cpu_capacity(cpu_of(rq), false, 0);
                }
        }
+
+       /* Update SchedTune accouting */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
+#endif /* CONFIG_SMP */
+
        hrtick_update(rq);
 }
 
@@ -4715,6 +4765,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
 #endif
 
+/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity_orig *
+              arch_scale_freq_capacity(NULL, cpu)
+              >> SCHED_CAPACITY_SHIFT;
+}
+
 static inline bool energy_aware(void)
 {
        return sched_feat(ENERGY_AWARE);
@@ -4942,6 +5003,7 @@ static int sched_group_energy(struct energy_env *eenv)
                        } while (sg = sg->next, sg != sd->groups);
                }
 next_cpu:
+               cpumask_clear_cpu(cpu, &visit_cpus);
                continue;
        }
 
@@ -4954,44 +5016,6 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
        return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
 }
 
-#ifdef CONFIG_SCHED_TUNE
-static int energy_diff_evaluate(struct energy_env *eenv)
-{
-       unsigned int boost;
-       int nrg_delta;
-
-       /* Return energy diff when boost margin is 0 */
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-       boost = schedtune_task_boost(eenv->task);
-#else
-       boost = get_sysctl_sched_cfs_boost();
-#endif
-       if (boost == 0)
-               return eenv->nrg.diff;
-
-       /* Compute normalized energy diff */
-       nrg_delta = schedtune_normalize_energy(eenv->nrg.diff);
-       eenv->nrg.delta = nrg_delta;
-
-       eenv->payoff = schedtune_accept_deltas(
-                       eenv->nrg.delta,
-                       eenv->cap.delta,
-                       eenv->task);
-
-       /*
-        * When SchedTune is enabled, the energy_diff() function will return
-        * the computed energy payoff value. Since the energy_diff() return
-        * value is expected to be negative by its callers, this evaluation
-        * function return a negative value each time the evaluation return a
-        * positive payoff, which is the condition for the acceptance of
-        * a scheduling decision
-        */
-       return -eenv->payoff;
-}
-#else /* CONFIG_SCHED_TUNE */
-#define energy_diff_evaluate(eenv) eenv->nrg.diff
-#endif
-
 /*
  * energy_diff(): Estimate the energy impact of changing the utilization
  * distribution. eenv specifies the change: utilisation amount, source, and
@@ -4999,7 +5023,7 @@ static int energy_diff_evaluate(struct energy_env *eenv)
  * utilization is removed from or added to the system (e.g. task wake-up). If
  * both are specified, the utilization is migrated.
  */
-static inenergy_diff(struct energy_env *eenv)
+static inline int __energy_diff(struct energy_env *eenv)
 {
        struct sched_domain *sd;
        struct sched_group *sg;
@@ -5047,9 +5071,86 @@ static int energy_diff(struct energy_env *eenv)
        eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
        eenv->payoff = 0;
 
-       return energy_diff_evaluate(eenv);
+       trace_sched_energy_diff(eenv->task,
+                       eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+                       eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+                       eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+                       eenv->nrg.delta, eenv->payoff);
+
+       return eenv->nrg.diff;
 }
 
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+       u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+       int max_delta;
+
+       /* Check for boundaries */
+       max_delta  = schedtune_target_nrg.max_power;
+       max_delta -= schedtune_target_nrg.min_power;
+       WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+       /* Do scaling using positive numbers to increase the range */
+       normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+       /* Scale by energy magnitude */
+       normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+       /* Normalize on max energy for target platform */
+       normalized_nrg = reciprocal_divide(
+                       normalized_nrg, schedtune_target_nrg.rdiv);
+
+       return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+       int boost = schedtune_task_boost(eenv->task);
+       int nrg_delta;
+
+       /* Conpute "absolute" energy diff */
+       __energy_diff(eenv);
+
+       /* Return energy diff when boost margin is 0 */
+       if (boost == 0)
+               return eenv->nrg.diff;
+
+       /* Compute normalized energy diff */
+       nrg_delta = normalize_energy(eenv->nrg.diff);
+       eenv->nrg.delta = nrg_delta;
+
+       eenv->payoff = schedtune_accept_deltas(
+                       eenv->nrg.delta,
+                       eenv->cap.delta,
+                       eenv->task);
+
+       /*
+        * When SchedTune is enabled, the energy_diff() function will return
+        * the computed energy payoff value. Since the energy_diff() return
+        * value is expected to be negative by its callers, this evaluation
+        * function return a negative value each time the evaluation return a
+        * positive payoff, which is the condition for the acceptance of
+        * a scheduling decision
+        */
+       return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  * A waker of many should wake a different task than the one last awakened
@@ -5143,6 +5244,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 
 static inline unsigned long task_util(struct task_struct *p)
 {
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+               unsigned long demand = p->ravg.demand;
+               return (demand << 10) / walt_ravg_window;
+       }
+#endif
        return p->se.avg.util_avg;
 }
 
@@ -5185,22 +5292,25 @@ static bool cpu_overutilized(int cpu)
 
 #ifdef CONFIG_SCHED_TUNE
 
-static unsigned long
-schedtune_margin(unsigned long signal, unsigned long boost)
+static long
+schedtune_margin(unsigned long signal, long boost)
 {
-       unsigned long long margin = 0;
+       long long margin = 0;
 
        /*
         * Signal proportional compensation (SPC)
         *
         * The Boost (B) value is used to compute a Margin (M) which is
         * proportional to the complement of the original Signal (S):
-        *   M = B * (SCHED_LOAD_SCALE - S)
+        *   M = B * (SCHED_LOAD_SCALE - S), if B is positive
+        *   M = B * S, if B is negative
         * The obtained M could be used by the caller to "boost" S.
         */
-       margin  = SCHED_LOAD_SCALE - signal;
-       margin *= boost;
-
+       if (boost >= 0) {
+               margin  = SCHED_LOAD_SCALE - signal;
+               margin *= boost;
+       } else
+               margin = -signal * boost;
        /*
         * Fast integer division by constant:
         *  Constant   :                 (C) = 100
@@ -5216,37 +5326,29 @@ schedtune_margin(unsigned long signal, unsigned long boost)
        margin  *= 1311;
        margin >>= 17;
 
+       if (boost < 0)
+               margin *= -1;
        return margin;
 }
 
-static inline unsigned int
+static inline int
 schedtune_cpu_margin(unsigned long util, int cpu)
 {
-       unsigned int boost;
+       int boost = schedtune_cpu_boost(cpu);
 
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-       boost = schedtune_cpu_boost(cpu);
-#else
-       boost = get_sysctl_sched_cfs_boost();
-#endif
        if (boost == 0)
                return 0;
 
        return schedtune_margin(util, boost);
 }
 
-static inline unsigned long
+static inline long
 schedtune_task_margin(struct task_struct *task)
 {
-       unsigned int boost;
+       int boost = schedtune_task_boost(task);
        unsigned long util;
-       unsigned long margin;
+       long margin;
 
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-       boost = schedtune_task_boost(task);
-#else
-       boost = get_sysctl_sched_cfs_boost();
-#endif
        if (boost == 0)
                return 0;
 
@@ -5258,13 +5360,13 @@ schedtune_task_margin(struct task_struct *task)
 
 #else /* CONFIG_SCHED_TUNE */
 
-static inline unsigned int
+static inline int
 schedtune_cpu_margin(unsigned long util, int cpu)
 {
        return 0;
 }
 
-static inline unsigned int
+static inline int
 schedtune_task_margin(struct task_struct *task)
 {
        return 0;
@@ -5276,7 +5378,9 @@ static inline unsigned long
 boosted_cpu_util(int cpu)
 {
        unsigned long util = cpu_util(cpu);
-       unsigned long margin = schedtune_cpu_margin(util, cpu);
+       long margin = schedtune_cpu_margin(util, cpu);
+
+       trace_sched_boost_cpu(cpu, util, margin);
 
        return util + margin;
 }
@@ -5285,7 +5389,9 @@ static inline unsigned long
 boosted_task_util(struct task_struct *task)
 {
        unsigned long util = task_util(task);
-       unsigned long margin = schedtune_task_margin(task);
+       long margin = schedtune_task_margin(task);
+
+       trace_sched_boost_task(task, util, margin);
 
        return util + margin;
 }
@@ -5441,15 +5547,20 @@ static int select_idle_sibling(struct task_struct *p, int target)
        struct sched_domain *sd;
        struct sched_group *sg;
        int i = task_cpu(p);
+       int best_idle = -1;
+       int best_idle_cstate = -1;
+       int best_idle_capacity = INT_MAX;
 
-       if (idle_cpu(target))
-               return target;
+       if (!sysctl_sched_cstate_aware) {
+               if (idle_cpu(target))
+                       return target;
 
-       /*
-        * If the prevous cpu is cache affine and idle, don't be stupid.
-        */
-       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-               return i;
+               /*
+                * If the prevous cpu is cache affine and idle, don't be stupid.
+                */
+               if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+                       return i;
+       }
 
        /*
         * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -5462,30 +5573,150 @@ static int select_idle_sibling(struct task_struct *p, int target)
                                                tsk_cpus_allowed(p)))
                                goto next;
 
-                       for_each_cpu(i, sched_group_cpus(sg)) {
-                               if (i == target || !idle_cpu(i))
-                                       goto next;
-                       }
+                       if (sysctl_sched_cstate_aware) {
+                               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+                                       struct rq *rq = cpu_rq(i);
+                                       int idle_idx = idle_get_state_idx(rq);
+                                       unsigned long new_usage = boosted_task_util(p);
+                                       unsigned long capacity_orig = capacity_orig_of(i);
+                                       if (new_usage > capacity_orig || !idle_cpu(i))
+                                               goto next;
+
+                                       if (i == target && new_usage <= capacity_curr_of(target))
+                                               return target;
+
+                                       if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
+                                               best_idle = i;
+                                               best_idle_cstate = idle_idx;
+                                               best_idle_capacity = capacity_orig;
+                                       }
+                               }
+                       } else {
+                               for_each_cpu(i, sched_group_cpus(sg)) {
+                                       if (i == target || !idle_cpu(i))
+                                               goto next;
+                               }
 
-                       target = cpumask_first_and(sched_group_cpus(sg),
+                               target = cpumask_first_and(sched_group_cpus(sg),
                                        tsk_cpus_allowed(p));
-                       goto done;
+                               goto done;
+                       }
 next:
                        sg = sg->next;
                } while (sg != sd->groups);
        }
+       if (best_idle > 0)
+               target = best_idle;
+
 done:
        return target;
 }
 
-static int energy_aware_wake_cpu(struct task_struct *p, int target)
+static inline int find_best_target(struct task_struct *p, bool prefer_idle)
+{
+       int iter_cpu;
+       int target_cpu = -1;
+       int target_util = 0;
+       int backup_capacity = 0;
+       int best_idle_cpu = -1;
+       int best_idle_cstate = INT_MAX;
+       int backup_cpu = -1;
+       unsigned long task_util_boosted, new_util;
+
+       task_util_boosted = boosted_task_util(p);
+       for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+               int cur_capacity;
+               struct rq *rq;
+               int idle_idx;
+
+               /*
+                * favor higher cpus for tasks that prefer idle cores
+                */
+               int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu;
+
+               if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+                       continue;
+
+               /*
+                * p's blocked utilization is still accounted for on prev_cpu
+                * so prev_cpu will receive a negative bias due to the double
+                * accounting. However, the blocked utilization may be zero.
+                */
+               new_util = cpu_util(i) + task_util_boosted;
+
+               /*
+                * Ensure minimum capacity to grant the required boost.
+                * The target CPU can be already at a capacity level higher
+                * than the one required to boost the task.
+                */
+               if (new_util > capacity_orig_of(i))
+                       continue;
+
+#ifdef CONFIG_SCHED_WALT
+               if (walt_cpu_high_irqload(i))
+                       continue;
+#endif
+               /*
+                * Unconditionally favoring tasks that prefer idle cpus to
+                * improve latency.
+                */
+               if (idle_cpu(i) && prefer_idle) {
+                       if (best_idle_cpu < 0)
+                               best_idle_cpu = i;
+                       continue;
+               }
+
+               cur_capacity = capacity_curr_of(i);
+               rq = cpu_rq(i);
+               idle_idx = idle_get_state_idx(rq);
+
+               if (new_util < cur_capacity) {
+                       if (cpu_rq(i)->nr_running) {
+                               if (target_util == 0 ||
+                                       target_util > new_util) {
+                                       target_cpu = i;
+                                       target_util = new_util;
+                               }
+                       } else if (!prefer_idle) {
+                               if (best_idle_cpu < 0 ||
+                                       (sysctl_sched_cstate_aware &&
+                                               best_idle_cstate > idle_idx)) {
+                                       best_idle_cstate = idle_idx;
+                                       best_idle_cpu = i;
+                               }
+                       }
+               } else if (backup_capacity == 0 ||
+                               backup_capacity > cur_capacity) {
+                       backup_capacity = cur_capacity;
+                       backup_cpu = i;
+               }
+       }
+
+       if (prefer_idle && best_idle_cpu >= 0)
+               target_cpu = best_idle_cpu;
+       else if (target_cpu < 0)
+               target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+       return target_cpu;
+}
+
+static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
 {
        struct sched_domain *sd;
        struct sched_group *sg, *sg_target;
        int target_max_cap = INT_MAX;
        int target_cpu = task_cpu(p);
+       unsigned long task_util_boosted, new_util;
        int i;
 
+       if (sysctl_sched_sync_hint_enable && sync) {
+               int cpu = smp_processor_id();
+               cpumask_t search_cpus;
+               cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
+               if (cpumask_test_cpu(cpu, &search_cpus))
+                       return cpu;
+       }
+
        sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
 
        if (!sd)
@@ -5494,50 +5725,76 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target)
        sg = sd->groups;
        sg_target = sg;
 
-       /*
-        * Find group with sufficient capacity. We only get here if no cpu is
-        * overutilized. We may end up overutilizing a cpu by adding the task,
-        * but that should not be any worse than select_idle_sibling().
-        * load_balance() should sort it out later as we get above the tipping
-        * point.
-        */
-       do {
-               /* Assuming all cpus are the same in group */
-               int max_cap_cpu = group_first_cpu(sg);
+       if (sysctl_sched_is_big_little) {
 
                /*
-                * Assume smaller max capacity means more energy-efficient.
-                * Ideally we should query the energy model for the right
-                * answer but it easily ends up in an exhaustive search.
+                * Find group with sufficient capacity. We only get here if no cpu is
+                * overutilized. We may end up overutilizing a cpu by adding the task,
+                * but that should not be any worse than select_idle_sibling().
+                * load_balance() should sort it out later as we get above the tipping
+                * point.
                 */
-               if (capacity_of(max_cap_cpu) < target_max_cap &&
-                   task_fits_max(p, max_cap_cpu)) {
-                       sg_target = sg;
-                       target_max_cap = capacity_of(max_cap_cpu);
-               }
-       } while (sg = sg->next, sg != sd->groups);
+               do {
+                       /* Assuming all cpus are the same in group */
+                       int max_cap_cpu = group_first_cpu(sg);
 
-       /* Find cpu with sufficient capacity */
-       for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
-               /*
-                * p's blocked utilization is still accounted for on prev_cpu
-                * so prev_cpu will receive a negative bias due to the double
-                * accounting. However, the blocked utilization may be zero.
-                */
-               int new_util = cpu_util(i) + boosted_task_util(p);
+                       /*
+                        * Assume smaller max capacity means more energy-efficient.
+                        * Ideally we should query the energy model for the right
+                        * answer but it easily ends up in an exhaustive search.
+                        */
+                       if (capacity_of(max_cap_cpu) < target_max_cap &&
+                           task_fits_max(p, max_cap_cpu)) {
+                               sg_target = sg;
+                               target_max_cap = capacity_of(max_cap_cpu);
+                       }
+               } while (sg = sg->next, sg != sd->groups);
 
-               if (new_util > capacity_orig_of(i))
-                       continue;
+               task_util_boosted = boosted_task_util(p);
+               /* Find cpu with sufficient capacity */
+               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+                       /*
+                        * p's blocked utilization is still accounted for on prev_cpu
+                        * so prev_cpu will receive a negative bias due to the double
+                        * accounting. However, the blocked utilization may be zero.
+                        */
+                       new_util = cpu_util(i) + task_util_boosted;
 
-               if (new_util < capacity_curr_of(i)) {
-                       target_cpu = i;
-                       if (cpu_rq(i)->nr_running)
-                               break;
-               }
+                       /*
+                        * Ensure minimum capacity to grant the required boost.
+                        * The target CPU can be already at a capacity level higher
+                        * than the one required to boost the task.
+                        */
+                       if (new_util > capacity_orig_of(i))
+                               continue;
 
-               /* cpu has capacity at higher OPP, keep it as fallback */
-               if (target_cpu == task_cpu(p))
-                       target_cpu = i;
+                       if (new_util < capacity_curr_of(i)) {
+                               target_cpu = i;
+                               if (cpu_rq(i)->nr_running)
+                                       break;
+                       }
+
+                       /* cpu has capacity at higher OPP, keep it as fallback */
+                       if (target_cpu == task_cpu(p))
+                               target_cpu = i;
+               }
+       } else {
+               /*
+                * Find a cpu with sufficient capacity
+                */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+               bool boosted = schedtune_task_boost(p) > 0;
+               bool prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+               bool boosted = 0;
+               bool prefer_idle = 0;
+#endif
+               int tmp_target = find_best_target(p, boosted || prefer_idle);
+               if (tmp_target >= 0) {
+                       target_cpu = tmp_target;
+                       if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+                               return target_cpu;
+               }
        }
 
        if (target_cpu != task_cpu(p)) {
@@ -5614,7 +5871,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        if (!sd) {
                if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
-                       new_cpu = energy_aware_wake_cpu(p, prev_cpu);
+                       new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
                else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
                        new_cpu = select_idle_sibling(p, new_cpu);
 
@@ -5685,6 +5942,8 @@ static void task_dead_fair(struct task_struct *p)
 {
        remove_entity_load_avg(&p->se);
 }
+#else
+#define task_fits_max(p, cpu) true
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -6390,7 +6649,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 
        deactivate_task(env->src_rq, p, 0);
        p->on_rq = TASK_ON_RQ_MIGRATING;
+       double_lock_balance(env->src_rq, env->dst_rq);
        set_task_cpu(p, env->dst_cpu);
+       double_unlock_balance(env->src_rq, env->dst_rq);
 }
 
 /*
@@ -7265,12 +7526,17 @@ next_group:
                        env->dst_rq->rd->overload = overload;
 
                /* Update over-utilization (tipping point, U >= 0) indicator */
-               if (env->dst_rq->rd->overutilized != overutilized)
+               if (env->dst_rq->rd->overutilized != overutilized) {
                        env->dst_rq->rd->overutilized = overutilized;
+                       trace_sched_overutilized(overutilized);
+               }
        } else {
-               if (!env->dst_rq->rd->overutilized && overutilized)
+               if (!env->dst_rq->rd->overutilized && overutilized) {
                        env->dst_rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
        }
+
 }
 
 /**
@@ -8709,10 +8975,15 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
        if (static_branch_unlikely(&sched_numa_balancing))
                task_tick_numa(rq, curr);
 
-       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+#ifdef CONFIG_SMP
+       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
                rq->rd->overutilized = true;
+               trace_sched_overutilized(true);
+       }
 
        rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
 }
 
 /*