X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=kernel%2Fsched%2Ffair.c;h=15ccfbff1bde46473a9eb3116db2e97fc12edc78;hb=13fd688e51b865040d2683848a834920f5ca4969;hp=30d76a18ae1a7fe9c939f4501e9dea65aed3493b;hpb=bdcf7d8df4cfa7f99423da161019b964f7ce2520;p=firefly-linux-kernel-4.4.55.git diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 30d76a18ae1a..15ccfbff1bde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -53,7 +53,6 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; -unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; @@ -128,6 +127,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * margin < capacity * 1024 + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -300,19 +305,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. 
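+ * For example, with a hypothetical branch B (parent) / C (child)
+ * enqueued bottom-up while neither is on the list yet: C is linked
+ * at tmp_alone_branch first, then B right after it, so the child
+ * keeps preceding its parent; as soon as an ancestor already on the
+ * list (or the root cfs_rq) is reached, the branch is connected and
+ * tmp_alone_branch is reset to &rq->leaf_cfs_rq_list.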
+ */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; @@ -670,7 +715,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -696,18 +741,108 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = sched_freq() ? - sysctl_sched_initial_task_util : - scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * In previous Android versions, we used to have: + * sa->util_avg = sched_freq() ? + * sysctl_sched_initial_task_util : + * scale_load_down(SCHED_LOAD_SCALE); + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + * However, that functionality has been moved to enqueue. + * It is unclear if we should restore this in enqueue. + */ + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static void attach_entity_cfs_rq(struct sched_entity *se); + +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. + */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + /* + * If we wish to restore tuning via setting initial util, + * this is where we should do it. + */ + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. 
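+ * Here we only record last_update_time and skip the attach;
+ * attach_entity_cfs_rq() runs later, when the task actually
+ * switches to the fair class.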
+ */ + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); + return; + } + } + + attach_entity_cfs_rq(se); +} + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { } -#endif +void post_init_entity_util_avg(struct sched_entity *se) +{ +} +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -1404,7 +1539,8 @@ balance: * Call select_idle_sibling to maybe find a better one. */ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: assigned = true; @@ -2693,9 +2829,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2711,6 +2859,29 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_of(cfs_rq), 0); + } +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* @@ -2730,11 +2901,28 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. 
The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. + */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); @@ -2747,6 +2935,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + removed_util = 1; } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2757,11 +2946,24 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */ + if (cfs_rq == &rq_of(cfs_rq)->cfs) + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + return decayed || removed; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -2771,55 +2973,55 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); - trace_sched_load_avg_cpu(cpu, cfs_rq); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. 
- */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2827,34 +3029,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -2882,6 +3070,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) } #endif +/* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). 
@@ -2889,7 +3090,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * Newly created task or never used group entity should not be removed @@ -2898,9 +3098,7 @@ void remove_entity_load_avg(struct sched_entity *se) if (se->avg.last_update_time == 0) return; - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -2937,7 +3135,16 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -3080,6 +3287,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3155,6 +3363,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -3245,7 +3454,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3361,7 +3570,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. 
*/ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -3958,6 +4167,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; + /* Synchronize hierarchical throttle counter: */ + if (unlikely(!cfs_rq->throttle_uptodate)) { + struct rq *rq = rq_of(cfs_rq); + struct cfs_rq *pcfs_rq; + struct task_group *tg; + + cfs_rq->throttle_uptodate = 1; + + /* Get closest up-to-date node, because leaves go first: */ + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { + pcfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (pcfs_rq->throttle_uptodate) + break; + } + if (tg) { + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(rq); + } + } + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4199,7 +4428,7 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static bool cpu_overutilized(int cpu); -static inline unsigned long boosted_cpu_util(int cpu); +unsigned long boosted_cpu_util(int cpu); #else #define boosted_cpu_util(cpu) cpu_util(cpu) #endif @@ -4234,6 +4463,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_wakeup = flags & ENQUEUE_WAKEUP; #endif + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. + */ + if (p->in_iowait) + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + for_each_sched_entity(se) { if (se->on_rq) break; @@ -4262,7 +4499,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4343,15 +4580,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; @@ -4365,7 +4601,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4916,7 +5152,7 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg) } static int find_new_capacity(struct energy_env *eenv, - const struct sched_group_energy const *sge) + const struct sched_group_energy * const sge) { int idx; unsigned long util = group_max_util(eenv); @@ -5066,6 +5302,7 @@ static inline int __energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int diff, margin; struct energy_env eenv_before = { .util_delta = 0, @@ -5115,6 +5352,16 @@ static inline int __energy_diff(struct energy_env *eenv) eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); + /* + * Dead-zone margin preventing too many migrations. 
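+ * E.g. with nrg.before == 1024 the margin is 1024 >> 6 = 16, so an
+ * absolute energy delta smaller than 16 (~1.56%) is reported as
+ * zero and such borderline cases do not cause a migration.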
+ */ + + margin = eenv->nrg.before >> 6; /* ~1.56% */ + + diff = eenv->nrg.after - eenv->nrg.before; + + eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; + return eenv->nrg.diff; } @@ -5214,18 +5461,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -5291,8 +5538,6 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -unsigned int capacity_margin = 1280; /* ~20% margin */ - static inline unsigned long boosted_task_util(struct task_struct *task); static inline bool __task_fits(struct task_struct *p, int cpu, int util) @@ -5318,11 +5563,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static inline bool task_fits_spare(struct task_struct *p, int cpu) -{ - return __task_fits(p, cpu, cpu_util(cpu)); -} - static bool cpu_overutilized(int cpu) { return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); @@ -5412,7 +5652,7 @@ schedtune_task_margin(struct task_struct *task) #endif /* CONFIG_SCHED_TUNE */ -static inline unsigned long +unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); @@ -5434,6 +5674,13 @@ boosted_task_util(struct task_struct *task) return util + margin; } +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5443,10 +5690,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL, *spare_group = NULL; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; - unsigned long fit_capacity = ULONG_MAX; - unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5454,7 +5700,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_capacity; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -5466,8 +5712,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5478,24 +5728,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; - /* - * Look for most energy-efficient group that can fit - * that can fit the task. 
- */ - if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { - fit_capacity = capacity_of(i); - fit_group = group; - } + spare_cap = capacity_spare_wake(i, p); - /* - * Look for group which has most spare capacity on a - * single cpu. - */ - spare_capacity = capacity_of(i) - cpu_util(i); - if (spare_capacity > max_spare_capacity) { - max_spare_capacity = spare_capacity; - spare_group = group; - } + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -5503,17 +5739,32 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); - if (fit_group) - return fit_group; - - if (spare_group) - return spare_group; + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + */ + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; if (!idlest || 100*this_load < imbalance*min_load) return NULL; @@ -5533,9 +5784,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (task_fits_spare(p, i)) { + if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -5547,8 +5802,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (idle_cpu(i) && - (!idle || idle->exit_latency == min_exit_latency) && + } else if ((!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -5557,13 +5811,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (shallowest_idle_cpu == -1) { - /* - * If we haven't found an idle CPU yet - * pick a non-idle one that can fit the task as - * fallback. - */ - shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -5580,14 +5827,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. 
*/ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); - int best_idle = -1; - int best_idle_cstate = -1; - int best_idle_capacity = INT_MAX; + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + unsigned long best_idle_capacity = ULONG_MAX; if (!sysctl_sched_cstate_aware) { if (idle_cpu(target)) @@ -5596,8 +5842,8 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + return prev; } /* @@ -5607,24 +5853,26 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; if (sysctl_sched_cstate_aware) { for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + int idle_idx = idle_get_state_idx(cpu_rq(i)); unsigned long new_usage = boosted_task_util(p); unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) goto next; if (i == target && new_usage <= capacity_curr_of(target)) return target; - if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { - best_idle = i; + if (idle_idx < best_idle_cstate && + capacity_orig <= best_idle_capacity) { + best_idle_cpu = i; best_idle_cstate = idle_idx; best_idle_capacity = capacity_orig; } @@ -5643,229 +5891,216 @@ next: sg = sg->next; } while (sg != sd->groups); } - if (best_idle > 0) - target = best_idle; + + if (best_idle_cpu >= 0) + target = best_idle_cpu; done: return target; } +static int start_cpu(bool boosted) +{ + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + + RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; +} + static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { - int iter_cpu; int target_cpu = -1; - int target_util = 0; - int backup_capacity = 0; + unsigned long target_util = prefer_idle ? ULONG_MAX : 0; + unsigned long backup_capacity = ULONG_MAX; int best_idle_cpu = -1; int best_idle_cstate = INT_MAX; int backup_cpu = -1; - unsigned long task_util_boosted, new_util; + unsigned long min_util = boosted_task_util(p); + struct sched_domain *sd; + struct sched_group *sg; + int cpu = start_cpu(boosted); - task_util_boosted = boosted_task_util(p); - for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { - int cur_capacity; - struct rq *rq; - int idle_idx; + if (cpu < 0) + return target_cpu; - /* - * Iterate from higher cpus for boosted tasks. - */ - int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + sd = rcu_dereference(per_cpu(sd_ea, cpu)); - if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) - continue; + if (!sd) + return target_cpu; - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - new_util = cpu_util(i) + task_util_boosted; + sg = sd->groups; - /* - * Ensure minimum capacity to grant the required boost. 
- * The target CPU can be already at a capacity level higher - * than the one required to boost the task. - */ - if (new_util > capacity_orig_of(i)) - continue; + do { + int i; + + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + unsigned long cur_capacity, new_util; + + if (!cpu_online(i)) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util(p); + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + new_util = max(min_util, new_util); + + if (new_util > capacity_orig_of(i)) + continue; #ifdef CONFIG_SCHED_WALT - if (walt_cpu_high_irqload(i)) - continue; + if (walt_cpu_high_irqload(i)) + continue; #endif - /* - * Unconditionally favoring tasks that prefer idle cpus to - * improve latency. - */ - if (idle_cpu(i) && prefer_idle) { - if (best_idle_cpu < 0) - best_idle_cpu = i; - continue; - } - cur_capacity = capacity_curr_of(i); - rq = cpu_rq(i); - idle_idx = idle_get_state_idx(rq); + /* + * Unconditionally favoring tasks that prefer idle cpus to + * improve latency. + */ + if (idle_cpu(i) && prefer_idle) + return i; + + cur_capacity = capacity_curr_of(i); - if (new_util < cur_capacity) { - if (cpu_rq(i)->nr_running) { - if (prefer_idle) { - /* Find a target cpu with highest - * utilization. + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + /* + * Find a target cpu with the lowest/highest + * utilization if prefer_idle/!prefer_idle. */ - if (target_util == 0 || - target_util < new_util) { - target_cpu = i; + if ((prefer_idle && target_util > new_util) || + (!prefer_idle && target_util < new_util)) { target_util = new_util; - } - } else { - /* Find a target cpu with lowest - * utilization. - */ - if (target_util == 0 || - target_util > new_util) { target_cpu = i; - target_util = new_util; + } + } else if (!prefer_idle) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + + if (best_idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + best_idle_cpu = i; } } - } else if (!prefer_idle) { - if (best_idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - best_idle_cpu = i; - } + } else if (backup_capacity > cur_capacity) { + /* Find a backup cpu with least capacity. */ + backup_capacity = cur_capacity; + backup_cpu = i; } - } else if (backup_capacity == 0 || - backup_capacity > cur_capacity) { - // Find a backup cpu with least capacity. - backup_capacity = cur_capacity; - backup_cpu = i; } - } + } while (sg = sg->next, sg != sd->groups); - if (prefer_idle && best_idle_cpu >= 0) - target_cpu = best_idle_cpu; - else if (target_cpu < 0) + if (target_cpu < 0) target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; return target_cpu; } -static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. 
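+ * E.g. if the cfs util_avg of @cpu is 600 and task_util(p) is 250,
+ * wake-up placement sees 350 (clamped to capacity_orig_of(cpu));
+ * if p did not last run on @cpu, or is brand new, plain
+ * cpu_util(cpu) is returned instead.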
+ */ +static int cpu_util_wake(int cpu, struct task_struct *p) { - struct sched_domain *sd; - struct sched_group *sg, *sg_target; - int target_max_cap = INT_MAX; - int target_cpu = task_cpu(p); - unsigned long task_util_boosted, new_util; - int i; + unsigned long util, capacity; - if (sysctl_sched_sync_hint_enable && sync) { - int cpu = smp_processor_id(); - cpumask_t search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); - if (cpumask_test_cpu(cpu, &search_cpus)) - return cpu; - } + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); - sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); - if (!sd) - return target; + return (util >= capacity) ? capacity : util; +} - sg = sd->groups; - sg_target = sg; +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; - if (sysctl_sched_is_big_little) { + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; - /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. - */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); - task_util_boosted = boosted_task_util(p); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - new_util = cpu_util(i) + task_util_boosted; + return min_cap * 1024 < task_util(p) * capacity_margin; +} - /* - * Ensure minimum capacity to grant the required boost. - * The target CPU can be already at a capacity level higher - * than the one required to boost the task. 
- */ - if (new_util > capacity_orig_of(i)) - continue; +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) +{ + struct sched_domain *sd; + int target_cpu = prev_cpu, tmp_target; + bool boosted, prefer_idle; - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; - } + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; - } - } else { - /* - * Find a cpu with sufficient capacity - */ + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + return cpu; + } + + rcu_read_lock(); #ifdef CONFIG_CGROUP_SCHEDTUNE - bool boosted = schedtune_task_boost(p) > 0; - bool prefer_idle = schedtune_prefer_idle(p) > 0; + boosted = schedtune_task_boost(p) > 0; + prefer_idle = schedtune_prefer_idle(p) > 0; #else - bool boosted = 0; - bool prefer_idle = 0; + boosted = get_sysctl_sched_cfs_boost() > 0; + prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted, prefer_idle); - if (tmp_target >= 0) { - target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) - return target_cpu; - } + + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + /* Find a cpu with sufficient capacity */ + tmp_target = find_best_target(p, boosted, prefer_idle); + + if (!sd) + goto unlock; + if (tmp_target >= 0) { + target_cpu = tmp_target; + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) + goto unlock; } - if (target_cpu != task_cpu(p)) { + if (target_cpu != prev_cpu) { struct energy_env eenv = { - .util_delta = task_util(p), - .src_cpu = task_cpu(p), - .dst_cpu = target_cpu, - .task = p, + .util_delta = task_util(p), + .src_cpu = prev_cpu, + .dst_cpu = target_cpu, + .task = p, }; /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(task_cpu(p))) - return target_cpu; + if (cpu_overutilized(prev_cpu)) + goto unlock; if (energy_diff(&eenv) >= 0) - return task_cpu(p); + target_cpu = prev_cpu; } +unlock: + rcu_read_unlock(); return target_cpu; } @@ -5891,9 +6126,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || - energy_aware(); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) + return select_energy_cpu_brute(p, prev_cpu, sync); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5918,15 +6155,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { - if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); - else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { struct sched_group *group; @@ -6901,7 +7136,8 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, + true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6962,7 +7198,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -7142,13 +7378,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, max_capacity; + unsigned long capacity, max_capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7162,6 +7399,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; max_capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -7192,6 +7430,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } max_capacity = max(capacity, max_capacity); + min_capacity = min(capacity, min_capacity); } } else { /* @@ -7205,12 +7444,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = max_capacity; + sdg->sgc->min_capacity = min_capacity; } /* @@ -7438,14 +7679,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + /* - * Candiate sg has no more than one task per cpu and has higher - * per-cpu capacity. No reason to pull tasks to less capable cpus. + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && group_smaller_cpu_capacity(sds->local, sg)) return false; +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -9161,6 +9408,34 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Catch up with the cfs_rq and remove our load when we leave */ + update_load_avg(se, 0); + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); +} + +static void attach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? 
se->parent->depth + 1 : 0; +#endif + + /* Synchronize entity with its cfs_rq */ + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); +} + static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -9175,8 +9450,7 @@ static void detach_task_cfs_rq(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } - /* Catch up with the cfs_rq and remove our load when we leave */ - detach_entity_load_avg(cfs_rq, se); + detach_entity_cfs_rq(se); } static void attach_task_cfs_rq(struct task_struct *p) @@ -9184,16 +9458,7 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - - /* Synchronize task with its cfs_rq */ - attach_entity_load_avg(cfs_rq, se); + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -9287,8 +9552,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -9303,6 +9569,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -9316,6 +9584,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1;