#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/module.h>
#include <trace/events/sched.h>
#include "sched.h"
+#include "tune.h"
+#include "walt.h"
/*
* Targeted preemption latency for CPU-bound tasks:
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_is_big_little = 0;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+ (10 * NSEC_PER_MSEC);
+#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
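+	/*
+	 * With scheduler-guided frequency selection (sched_freq()), new
+	 * entities start at sysctl_sched_initial_task_util (0 by default)
+	 * instead of at full scale.
+	 */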
+ sa->util_avg = sched_freq() ?
+ sysctl_sched_initial_task_util :
+ scale_load_down(SCHED_LOAD_SCALE);
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->period_contrib;
if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
update_tg_load_avg(cfs_rq, 0);
+
+ if (entity_is_task(se))
+ trace_sched_load_avg_task(task_of(se), &se->avg);
+ trace_sched_load_avg_cpu(cpu, cfs_rq);
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#endif
+#ifdef CONFIG_SMP
static bool cpu_overutilized(int cpu);
+static inline unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
+
+#ifdef CONFIG_SMP
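+/*
+ * Illustrative example of the conversion below: with boosted_cpu_util() = 300
+ * on a cpu whose capacity_orig_of() is 600, the request passed to
+ * set_cfs_cpu_capacity() is 300 * SCHED_CAPACITY_SCALE / 600 = 512, i.e. half
+ * of full scale.
+ */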
+static void update_capacity_of(int cpu)
+{
+ unsigned long req_cap;
+
+ if (!sched_freq())
+ return;
+
+	/* Express the boosted utilization as a fraction of this cpu's original capacity. */
+ req_cap = boosted_cpu_util(cpu);
+ req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
+ set_cfs_cpu_capacity(cpu, true, req_cap);
+}
+#endif
/*
* The enqueue_task method is called before nr_running is
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int task_new = !(flags & ENQUEUE_WAKEUP);
+#ifdef CONFIG_SMP
+ int task_new = flags & ENQUEUE_WAKEUP_NEW;
+ int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
for_each_sched_entity(se) {
if (se->on_rq)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
update_cfs_shares(cfs_rq);
}
- if (!se) {
+ if (!se)
add_nr_running(rq, 1);
+
+#ifdef CONFIG_SMP
+
+ if (!se) {
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_new && !rq->rd->overutilized &&
- cpu_overutilized(rq->cpu))
+ cpu_overutilized(rq->cpu)) {
rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are waking up; this is
+ * because we get here also during load balancing, but
+	 * in these cases it seems wise to trigger a single
+ * request after load balancing is done.
+ */
+ if (task_new || task_wakeup)
+ update_capacity_of(cpu_of(rq));
}
+
+	/* Update SchedTune accounting */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
if (!se)
sub_nr_running(rq, 1);
+#ifdef CONFIG_SMP
+
+ if (!se) {
+ walt_dec_cumulative_runnable_avg(rq, p);
+
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are going to sleep;
+ * this is because we get here also during load
+ * balancing, but in these cases it seems wise to
+	 * trigger a single request after load balancing is
+ * done.
+ */
+ if (task_sleep) {
+ if (rq->cfs.nr_running)
+ update_capacity_of(cpu_of(rq));
+ else if (sched_freq())
+ set_cfs_cpu_capacity(cpu_of(rq), false, 0);
+ }
+ }
+
+	/* Update SchedTune accounting */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
+#endif /* CONFIG_SMP */
+
hrtick_update(rq);
}
return max(rq->cpu_load[type-1], total);
}
-static unsigned long capacity_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
static unsigned long cpu_avg_load_per_task(int cpu)
{
* Returns the current capacity of cpu after applying both
* cpu and freq scaling.
*/
-static unsigned long capacity_curr_of(int cpu)
+unsigned long capacity_curr_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity_orig *
arch_scale_freq_capacity(NULL, cpu)
>> SCHED_CAPACITY_SHIFT;
}
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static unsigned long __cpu_util(int cpu, int delta)
-{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
-
- delta += util;
- if (delta < 0)
- return 0;
-
- return (delta >= capacity) ? capacity : delta;
-}
-
-static unsigned long cpu_util(int cpu)
-{
- return __cpu_util(cpu, 0);
-}
-
static inline bool energy_aware(void)
{
return sched_feat(ENERGY_AWARE);
int src_cpu;
int dst_cpu;
int energy;
+ int payoff;
+ struct task_struct *task;
+ struct {
+ int before;
+ int after;
+ int delta;
+ int diff;
+ } nrg;
+ struct {
+ int before;
+ int after;
+ int delta;
+ } cap;
};
/*
eenv->sg_cap = sg;
cap_idx = find_new_capacity(eenv, sg->sge);
+
+ if (sg->group_weight == 1) {
+ /* Remove capacity of src CPU (before task move) */
+ if (eenv->util_delta == 0 &&
+ cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
+ eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta -= eenv->cap.before;
+ }
+ /* Add capacity of dst CPU (after task move) */
+ if (eenv->util_delta != 0 &&
+ cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
+ eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta += eenv->cap.after;
+ }
+ }
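+	/*
+	 * E.g. (illustrative numbers): moving a task away from a single-cpu
+	 * group at a 400-capacity state onto one that ends up at an
+	 * 800-capacity state leaves cap.delta = 800 - 400 = +400.
+	 */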
+
idle_idx = group_idle_state(sg);
group_util = group_norm_util(eenv, sg);
sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
} while (sg = sg->next, sg != sd->groups);
}
next_cpu:
+ cpumask_clear_cpu(cpu, &visit_cpus);
continue;
}
* utilization is removed from or added to the system (e.g. task wake-up). If
* both are specified, the utilization is migrated.
*/
-static int energy_diff(struct energy_env *eenv)
+static inline int __energy_diff(struct energy_env *eenv)
{
struct sched_domain *sd;
struct sched_group *sg;
.util_delta = 0,
.src_cpu = eenv->src_cpu,
.dst_cpu = eenv->dst_cpu,
+ .nrg = { 0, 0, 0, 0},
+ .cap = { 0, 0, 0 },
};
if (eenv->src_cpu == eenv->dst_cpu)
return 0; /* Invalid result abort */
energy_before += eenv_before.energy;
+ /* Keep track of SRC cpu (before) capacity */
+ eenv->cap.before = eenv_before.cap.before;
+ eenv->cap.delta = eenv_before.cap.delta;
+
if (sched_group_energy(eenv))
return 0; /* Invalid result abort */
energy_after += eenv->energy;
}
} while (sg = sg->next, sg != sd->groups);
- return energy_after-energy_before;
+ eenv->nrg.before = energy_before;
+ eenv->nrg.after = energy_after;
+ eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
+ eenv->payoff = 0;
+
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
+ return eenv->nrg.diff;
}
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+ u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+ int max_delta;
+
+ /* Check for boundaries */
+ max_delta = schedtune_target_nrg.max_power;
+ max_delta -= schedtune_target_nrg.min_power;
+ WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+ /* Do scaling using positive numbers to increase the range */
+ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+ /* Scale by energy magnitude */
+ normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+ /* Normalize on max energy for target platform */
+ normalized_nrg = reciprocal_divide(
+ normalized_nrg, schedtune_target_nrg.rdiv);
+
+ return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
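+/*
+ * Example of the normalization above (illustrative numbers, assuming
+ * SCHED_LOAD_SHIFT = 10 and that schedtune_target_nrg.rdiv encodes
+ * max_power - min_power): with a platform energy range of 1000 and
+ * energy_diff = 50, the result is roughly (50 << 10) / 1000 ~= 51.
+ */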
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+ int boost = schedtune_task_boost(eenv->task);
+ int nrg_delta;
+
+	/* Compute "absolute" energy diff */
+ __energy_diff(eenv);
+
+ /* Return energy diff when boost margin is 0 */
+ if (boost == 0)
+ return eenv->nrg.diff;
+
+ /* Compute normalized energy diff */
+ nrg_delta = normalize_energy(eenv->nrg.diff);
+ eenv->nrg.delta = nrg_delta;
+
+ eenv->payoff = schedtune_accept_deltas(
+ eenv->nrg.delta,
+ eenv->cap.delta,
+ eenv->task);
+
+ /*
+	 * When SchedTune is enabled, the energy_diff() function returns the
+	 * computed energy payoff value. Since its callers expect a negative
+	 * return value for an acceptable move, this function returns a
+	 * negative value whenever the evaluation produces a positive payoff,
+	 * which is the condition for accepting a scheduling decision.
+ */
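+	/*
+	 * E.g. (illustrative values) a payoff of +100 for an acceptable move
+	 * is returned as -100, while a rejected move with payoff -100 is
+	 * returned as +100.
+	 */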
+ return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
* A waker of many should wake a different task than the one last awakened
static inline unsigned long task_util(struct task_struct *p)
{
+#ifdef CONFIG_SCHED_WALT
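+	/*
+	 * WALT demand is in ns per walt_ravg_window; e.g. (illustrative
+	 * numbers) a demand of 10ms over a 20ms window yields
+	 * (10000000 << 10) / 20000000 = 512, i.e. half of full utilization.
+	 */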
+ if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+ unsigned long demand = p->ravg.demand;
+ return (demand << 10) / walt_ravg_window;
+ }
+#endif
return p->se.avg.util_avg;
}
unsigned int capacity_margin = 1280; /* ~20% margin */
+static inline unsigned long boosted_task_util(struct task_struct *task);
+
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
{
unsigned long capacity = capacity_of(cpu);
- util += task_util(p);
+ util += boosted_task_util(p);
return (capacity * 1024) > (util * capacity_margin);
}
return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
}
+#ifdef CONFIG_SCHED_TUNE
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+ long long margin = 0;
+
+ /*
+ * Signal proportional compensation (SPC)
+ *
+ * The Boost (B) value is used to compute a Margin (M) which is
+ * proportional to the complement of the original Signal (S):
+ * M = B * (SCHED_LOAD_SCALE - S), if B is positive
+ * M = B * S, if B is negative
+ * The obtained M could be used by the caller to "boost" S.
+ */
+ if (boost >= 0) {
+ margin = SCHED_LOAD_SCALE - signal;
+ margin *= boost;
+ } else
+ margin = -signal * boost;
+ /*
+ * Fast integer division by constant:
+ * Constant : (C) = 100
+ * Precision : 0.1% (P) = 0.1
+ * Reference : C * 100 / P (R) = 100000
+ *
+ * Thus:
+ * Shift bits : ceil(log(R,2)) (S) = 17
+ * Mult const : round(2^S/C) (M) = 1311
+ */
+ margin *= 1311;
+ margin >>= 17;
+
+ if (boost < 0)
+ margin *= -1;
+ return margin;
+}
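+/*
+ * Example of the margin computation above (illustrative numbers, assuming
+ * SCHED_LOAD_SCALE = 1024): boost = 10 and signal = 512 give
+ * margin = (1024 - 512) * 10 = 5120, and the fast divide yields
+ * 5120 * 1311 >> 17 = 51, i.e. ~10% of the signal's headroom.
+ */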
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ int boost = schedtune_cpu_boost(cpu);
+
+ if (boost == 0)
+ return 0;
+
+ return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+ int boost = schedtune_task_boost(task);
+ unsigned long util;
+ long margin;
+
+ if (boost == 0)
+ return 0;
+
+ util = task_util(task);
+ margin = schedtune_margin(util, boost);
+
+ return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ return 0;
+}
+
+static inline int
+schedtune_task_margin(struct task_struct *task)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+static inline unsigned long
+boosted_cpu_util(int cpu)
+{
+ unsigned long util = cpu_util(cpu);
+ long margin = schedtune_cpu_margin(util, cpu);
+
+ trace_sched_boost_cpu(cpu, util, margin);
+
+ return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+ unsigned long util = task_util(task);
+ long margin = schedtune_task_margin(task);
+
+ trace_sched_boost_task(task, util, margin);
+
+ return util + margin;
+}
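+/*
+ * E.g. (illustrative numbers, SCHED_LOAD_SCALE = 1024): a task with
+ * task_util() = 200 and a boost of 25 gets a margin of
+ * (1024 - 200) * 25 / 100 = 206, so boosted_task_util() returns ~406.
+ */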
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
struct sched_domain *sd;
struct sched_group *sg;
int i = task_cpu(p);
+ int best_idle = -1;
+ int best_idle_cstate = -1;
+ int best_idle_capacity = INT_MAX;
- if (idle_cpu(target))
- return target;
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target))
+ return target;
- /*
- * If the prevous cpu is cache affine and idle, don't be stupid.
- */
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ /*
+	 * If the previous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
+ }
/*
* Otherwise, iterate the domains and find an eligible idle cpu.
tsk_cpus_allowed(p)))
goto next;
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
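+		/*
+		 * C-state aware path: return target if it is idle and can
+		 * take the boosted task at its current capacity; otherwise
+		 * remember the idle cpu in the shallowest idle state (with
+		 * no larger original capacity) as a fallback.
+		 */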
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ struct rq *rq = cpu_rq(i);
+ int idle_idx = idle_get_state_idx(rq);
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target))
+ return target;
+
+ if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
+ best_idle = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
- target = cpumask_first_and(sched_group_cpus(sg),
+ target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
- goto done;
+ goto done;
+ }
next:
sg = sg->next;
} while (sg != sd->groups);
}
+	if (best_idle >= 0)
+ target = best_idle;
+
done:
return target;
}
-static int energy_aware_wake_cpu(struct task_struct *p, int target)
+static inline int find_best_target(struct task_struct *p, bool prefer_idle)
+{
+ int iter_cpu;
+ int target_cpu = -1;
+ int target_util = 0;
+ int backup_capacity = 0;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ int backup_cpu = -1;
+ unsigned long task_util_boosted, new_util;
+
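+	/*
+	 * Selection policy of the loop below: skip cpus that cannot fit the
+	 * boosted utilization, then prefer (in order) an idle cpu when
+	 * prefer_idle is set, the busy cpu with the lowest resulting
+	 * utilization that still has spare capacity at its current OPP, the
+	 * idle cpu in the shallowest C-state, and finally the
+	 * lowest-capacity cpu as a backup.
+	 */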
+ task_util_boosted = boosted_task_util(p);
+ for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+ int cur_capacity;
+ struct rq *rq;
+ int idle_idx;
+
+ /*
+		 * Favor higher-numbered cpus for tasks that prefer idle cores.
+ */
+ int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu;
+
+ if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+ continue;
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+		 * The target CPU may already be at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+#ifdef CONFIG_SCHED_WALT
+ if (walt_cpu_high_irqload(i))
+ continue;
+#endif
+ /*
+		 * Unconditionally favor tasks that prefer idle cpus, to
+		 * improve latency.
+ */
+ if (idle_cpu(i) && prefer_idle) {
+ if (best_idle_cpu < 0)
+ best_idle_cpu = i;
+ continue;
+ }
+
+ cur_capacity = capacity_curr_of(i);
+ rq = cpu_rq(i);
+ idle_idx = idle_get_state_idx(rq);
+
+ if (new_util < cur_capacity) {
+ if (cpu_rq(i)->nr_running) {
+ if (target_util == 0 ||
+ target_util > new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ } else if (!prefer_idle) {
+ if (best_idle_cpu < 0 ||
+ (sysctl_sched_cstate_aware &&
+ best_idle_cstate > idle_idx)) {
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ }
+ }
+ } else if (backup_capacity == 0 ||
+ backup_capacity > cur_capacity) {
+ backup_capacity = cur_capacity;
+ backup_cpu = i;
+ }
+ }
+
+ if (prefer_idle && best_idle_cpu >= 0)
+ target_cpu = best_idle_cpu;
+ else if (target_cpu < 0)
+ target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+ return target_cpu;
+}
+
+static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
{
struct sched_domain *sd;
struct sched_group *sg, *sg_target;
int target_max_cap = INT_MAX;
int target_cpu = task_cpu(p);
+ unsigned long task_util_boosted, new_util;
int i;
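+	/*
+	 * A sync wakeup hints that the waker is about to sleep, so when
+	 * sysctl_sched_sync_hint_enable is set place the wakee on the
+	 * waker's cpu if it is allowed and online.
+	 */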
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+ cpumask_t search_cpus;
+ cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
+ if (cpumask_test_cpu(cpu, &search_cpus))
+ return cpu;
+ }
+
sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
if (!sd)
sg = sd->groups;
sg_target = sg;
- /*
- * Find group with sufficient capacity. We only get here if no cpu is
- * overutilized. We may end up overutilizing a cpu by adding the task,
- * but that should not be any worse than select_idle_sibling().
- * load_balance() should sort it out later as we get above the tipping
- * point.
- */
- do {
- /* Assuming all cpus are the same in group */
- int max_cap_cpu = group_first_cpu(sg);
+ if (sysctl_sched_is_big_little) {
/*
- * Assume smaller max capacity means more energy-efficient.
- * Ideally we should query the energy model for the right
- * answer but it easily ends up in an exhaustive search.
+ * Find group with sufficient capacity. We only get here if no cpu is
+ * overutilized. We may end up overutilizing a cpu by adding the task,
+ * but that should not be any worse than select_idle_sibling().
+ * load_balance() should sort it out later as we get above the tipping
+ * point.
*/
- if (capacity_of(max_cap_cpu) < target_max_cap &&
- task_fits_max(p, max_cap_cpu)) {
- sg_target = sg;
- target_max_cap = capacity_of(max_cap_cpu);
- }
- } while (sg = sg->next, sg != sd->groups);
+ do {
+ /* Assuming all cpus are the same in group */
+ int max_cap_cpu = group_first_cpu(sg);
- /* Find cpu with sufficient capacity */
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- int new_util = cpu_util(i) + task_util(p);
+ /*
+ * Assume smaller max capacity means more energy-efficient.
+ * Ideally we should query the energy model for the right
+ * answer but it easily ends up in an exhaustive search.
+ */
+ if (capacity_of(max_cap_cpu) < target_max_cap &&
+ task_fits_max(p, max_cap_cpu)) {
+ sg_target = sg;
+ target_max_cap = capacity_of(max_cap_cpu);
+ }
+ } while (sg = sg->next, sg != sd->groups);
- if (new_util > capacity_orig_of(i))
- continue;
+ task_util_boosted = boosted_task_util(p);
+ /* Find cpu with sufficient capacity */
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
- if (new_util < capacity_curr_of(i)) {
- target_cpu = i;
- if (cpu_rq(i)->nr_running)
- break;
- }
+ /*
+ * Ensure minimum capacity to grant the required boost.
+			 * The target CPU may already be at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+ if (new_util < capacity_curr_of(i)) {
+ target_cpu = i;
+ if (cpu_rq(i)->nr_running)
+ break;
+ }
- /* cpu has capacity at higher OPP, keep it as fallback */
- if (target_cpu == task_cpu(p))
- target_cpu = i;
+ /* cpu has capacity at higher OPP, keep it as fallback */
+ if (target_cpu == task_cpu(p))
+ target_cpu = i;
+ }
+ } else {
+ /*
+ * Find a cpu with sufficient capacity
+ */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ bool boosted = schedtune_task_boost(p) > 0;
+ bool prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+ bool boosted = 0;
+ bool prefer_idle = 0;
+#endif
+ int tmp_target = find_best_target(p, boosted || prefer_idle);
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+ return target_cpu;
+ }
}
if (target_cpu != task_cpu(p)) {
.util_delta = task_util(p),
.src_cpu = task_cpu(p),
.dst_cpu = target_cpu,
+ .task = p,
};
/* Not enough spare capacity on previous cpu */
if (!sd) {
if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- new_cpu = energy_aware_wake_cpu(p, prev_cpu);
+ new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, new_cpu);
{
remove_entity_load_avg(&p->se);
}
+#else
+#define task_fits_max(p, cpu) true
#endif /* CONFIG_SMP */
static unsigned long
deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
+ /*
+ * We want to potentially raise target_cpu's OPP.
+ */
+ update_capacity_of(cpu_of(rq));
raw_spin_unlock(&rq->lock);
}
attach_task(env->dst_rq, p);
}
+ /*
+ * We want to potentially raise env.dst_cpu's OPP.
+ */
+ update_capacity_of(env->dst_cpu);
+
raw_spin_unlock(&env->dst_rq->lock);
}
used = div_u64(avg, total);
+ /*
+ * deadline bandwidth is defined at system level so we must
+ * weight this bandwidth with the max capacity of the system.
+	 * As a reminder, avg_bw is 20 bits wide and
+	 * scale_cpu_capacity is 10 bits wide.
+ */
+ used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
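+	/*
+	 * E.g. (illustrative numbers): rq->dl.avg_bw = 102400 on a cpu with
+	 * arch_scale_cpu_capacity() = 1024 adds 102400 / 1024 = 100 to
+	 * 'used', i.e. ~10% of SCHED_CAPACITY_SCALE.
+	 */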
+
if (likely(used < SCHED_CAPACITY_SCALE))
return SCHED_CAPACITY_SCALE - used;
env->dst_rq->rd->overload = overload;
/* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized)
+ if (env->dst_rq->rd->overutilized != overutilized) {
env->dst_rq->rd->overutilized = overutilized;
+ trace_sched_overutilized(overutilized);
+ }
} else {
- if (!env->dst_rq->rd->overutilized && overutilized)
+ if (!env->dst_rq->rd->overutilized && overutilized) {
env->dst_rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
}
}
/**
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ if (cur_ld_moved)
+ update_capacity_of(env.src_cpu);
/*
* We've detached some tasks from busiest_rq. Every
schedstat_inc(sd, alb_count);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ update_capacity_of(env.src_cpu);
+ }
else
schedstat_inc(sd, alb_failed);
}
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+#ifdef CONFIG_SMP
+ if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
}
/*