Partial Revert: "WIP: sched: Add cpu capacity awareness to wakeup balancing"
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index 88eae79d1a3cd49f0c58017c079d07b6fda338bc..2150edce955a429371954625033c51e65e4c103f 100644 (file)
@@ -1207,8 +1207,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
        if (env->best_task)
                put_task_struct(env->best_task);
-       if (p)
-               get_task_struct(p);
 
        env->best_task = p;
        env->best_imp = imp;
@@ -1276,20 +1274,30 @@ static void task_numa_compare(struct task_numa_env *env,
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;
+       bool assigned = false;
 
        rcu_read_lock();
 
        raw_spin_lock_irq(&dst_rq->lock);
        cur = dst_rq->curr;
        /*
-        * No need to move the exiting task, and this ensures that ->curr
-        * wasn't reaped and thus get_task_struct() in task_numa_assign()
-        * is safe under RCU read lock.
-        * Note that rcu_read_lock() itself can't protect from the final
-        * put_task_struct() after the last schedule().
+        * No need to move the exiting task or idle task.
         */
        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                cur = NULL;
+       else {
+               /*
+                * Take a reference on cur: its numa_faults, accessed via
+                * task_weight(), could otherwise already be freed by the
+                * time we use them, via the following path:
+                * finish_task_switch()
+                *     --> put_task_struct()
+                *         --> __put_task_struct()
+                *             --> task_numa_free()
+                */
+               get_task_struct(cur);
+       }
+
        raw_spin_unlock_irq(&dst_rq->lock);
 
        /*
@@ -1373,6 +1381,7 @@ balance:
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
+                       put_task_struct(cur);
                        cur = NULL;
                        goto assign;
                }
@@ -1398,9 +1407,16 @@ balance:
                env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
 
 assign:
+       assigned = true;
        task_numa_assign(env, cur, imp);
 unlock:
        rcu_read_unlock();
+       /*
+        * cur was not handed off to task_numa_assign(), so drop the
+        * reference taken on it earlier.
+        */
+       if (cur && !assigned)
+               put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2695,6 +2711,29 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+        if (&this_rq()->cfs == cfs_rq) {
+                /*
+                 * There are a few boundary cases this might miss but it should
+                 * get called often enough that that should (hopefully) not be
+                 * a real problem -- added to that it only calls on the local
+                 * CPU, so if we enqueue remotely we'll miss an update, but
+                 * the next tick/schedule should update.
+                 *
+                 * It will not get called when we go idle, because the idle
+                 * thread is a different class (!fair), nor will the utilization
+                 * number include things like RT tasks.
+                 *
+                 * As is, the util number is not freq-invariant (we'd have to
+                 * implement arch_scale_freq_capacity() for that).
+                 *
+                 * See cpu_util().
+                 */
+                cpufreq_update_util(rq_of(cfs_rq), 0);
+        }
+}
+
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 
 /*
@@ -2715,10 +2754,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 } while (0)
 
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq,
+                                        bool update_freq)
 {
        struct sched_avg *sa = &cfs_rq->avg;
-       int decayed, removed = 0;
+       int decayed, removed = 0, removed_util = 0;
 
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2731,6 +2771,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
                sub_positive(&sa->util_avg, r);
                sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+               removed_util = 1;
        }
 
        decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2741,6 +2782,13 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
+       /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+       if (cfs_rq == &rq_of(cfs_rq)->cfs)
+               trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
+       if (update_freq && (decayed || removed_util))
+               cfs_rq_util_change(cfs_rq);
+
        return decayed || removed;
 }
 
@@ -2759,12 +2807,11 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
                          se->on_rq * scale_load_down(se->load.weight),
                          cfs_rq->curr == se, NULL);
 
-       if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+       if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
                update_tg_load_avg(cfs_rq, 0);
 
        if (entity_is_task(se))
                trace_sched_load_avg_task(task_of(se), &se->avg);
-       trace_sched_load_avg_cpu(cpu, cfs_rq);
 }
 
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2792,6 +2839,8 @@ skip_aging:
        cfs_rq->avg.load_sum += se->avg.load_sum;
        cfs_rq->avg.util_avg += se->avg.util_avg;
        cfs_rq->avg.util_sum += se->avg.util_sum;
+
+       cfs_rq_util_change(cfs_rq);
 }
 
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2804,6 +2853,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
        sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+
+       cfs_rq_util_change(cfs_rq);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -2821,7 +2872,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        cfs_rq->curr == se, NULL);
        }
 
-       decayed = update_cfs_rq_load_avg(now, cfs_rq);
+       decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
 
        cfs_rq->runnable_load_avg += sa->load_avg;
        cfs_rq->runnable_load_sum += sa->load_sum;
@@ -2921,7 +2972,11 @@ static int idle_balance(struct rq *this_rq);
 
 #else /* CONFIG_SMP */
 
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
+{
+       cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+}
+
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
@@ -3942,6 +3997,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
        if (!cfs_bandwidth_used())
                return;
 
+       /* Synchronize hierarchical throttle counter: */
+       if (unlikely(!cfs_rq->throttle_uptodate)) {
+               struct rq *rq = rq_of(cfs_rq);
+               struct cfs_rq *pcfs_rq;
+               struct task_group *tg;
+
+               cfs_rq->throttle_uptodate = 1;
+
+               /* Get closest up-to-date node, because leaves go first: */
+               for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+                       pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+                       if (pcfs_rq->throttle_uptodate)
+                               break;
+               }
+               if (tg) {
+                       cfs_rq->throttle_count = pcfs_rq->throttle_count;
+                       cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               }
+       }
+
        /* an active group must be handled by the update_curr()->put() path */
        if (!cfs_rq->runtime_enabled || cfs_rq->curr)
                return;
@@ -4183,7 +4258,7 @@ static inline void hrtick_update(struct rq *rq)
 
 #ifdef CONFIG_SMP
 static bool cpu_overutilized(int cpu);
-static inline unsigned long boosted_cpu_util(int cpu);
+unsigned long boosted_cpu_util(int cpu);
 #else
 #define boosted_cpu_util(cpu) cpu_util(cpu)
 #endif
@@ -4218,6 +4293,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        int task_wakeup = flags & ENQUEUE_WAKEUP;
 #endif
 
+       /*
+        * If in_iowait is set, the code below may not trigger any cpufreq
+        * utilization updates, so do it here explicitly with the IOWAIT flag
+        * passed.
+        */
+       if (p->in_iowait)
+               cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
        for_each_sched_entity(se) {
                if (se->on_rq)
                        break;
@@ -4255,6 +4338,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+       /*
+        * Update SchedTune accounting.
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        *
+        * We also do it when enqueueing a throttled task; one could
+        * argue that a throttled task should not boost a CPU.
+        * However:
+        * a) properly accounting for throttled tasks in CPU boosting
+        *    would add considerable complexity to the solution, and
+        * b) it is not easy to quantify the benefits that such a more
+        *    complex solution would bring.
+        * Thus, for the time being, we take the simple approach and
+        * boost throttled RQs as well.
+        */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
        if (!se) {
                walt_inc_cumulative_runnable_avg(rq, p);
                if (!task_new && !rq->rd->overutilized &&
@@ -4274,9 +4376,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                        update_capacity_of(cpu_of(rq));
        }
 
-       /* Update SchedTune accouting */
-       schedtune_enqueue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
        hrtick_update(rq);
 }
@@ -4311,15 +4410,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
+                       /* Avoid re-evaluating load for this entity: */
+                       se = parent_entity(se);
                        /*
                         * Bias pick_next to pick a task from this cfs_rq, as
                         * p is sleeping when it is within its sched_slice.
                         */
-                       if (task_sleep && parent_entity(se))
-                               set_next_buddy(parent_entity(se));
-
-                       /* avoid re-evaluating load for this entity */
-                       se = parent_entity(se);
+                       if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+                               set_next_buddy(se);
                        break;
                }
                flags |= DEQUEUE_SLEEP;
@@ -4342,6 +4440,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+       /*
+        * Update SchedTune accounting
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
        if (!se) {
                walt_dec_cumulative_runnable_avg(rq, p);
 
@@ -4361,9 +4468,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                }
        }
 
-       /* Update SchedTune accouting */
-       schedtune_dequeue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 
        hrtick_update(rq);
@@ -4878,7 +4982,7 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
 }
 
 static int find_new_capacity(struct energy_env *eenv,
-       const struct sched_group_energy const *sge)
+       const struct sched_group_energy * const sge)
 {
        int idx;
        unsigned long util = group_max_util(eenv);
@@ -5280,11 +5384,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu)
        return __task_fits(p, cpu, 0);
 }
 
-static inline bool task_fits_spare(struct task_struct *p, int cpu)
-{
-       return __task_fits(p, cpu, cpu_util(cpu));
-}
-
 static bool cpu_overutilized(int cpu)
 {
        return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
@@ -5374,7 +5473,7 @@ schedtune_task_margin(struct task_struct *task)
 
 #endif /* CONFIG_SCHED_TUNE */
 
-static inline unsigned long
+unsigned long
 boosted_cpu_util(int cpu)
 {
        unsigned long util = cpu_util(cpu);
@@ -5405,10 +5504,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                  int this_cpu, int sd_flag)
 {
        struct sched_group *idlest = NULL, *group = sd->groups;
-       struct sched_group *fit_group = NULL, *spare_group = NULL;
        unsigned long min_load = ULONG_MAX, this_load = 0;
-       unsigned long fit_capacity = ULONG_MAX;
-       unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
        int load_idx = sd->forkexec_idx;
        int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -5416,7 +5512,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                load_idx = sd->wake_idx;
 
        do {
-               unsigned long load, avg_load, spare_capacity;
+               unsigned long load, avg_load;
                int local_group;
                int i;
 
@@ -5439,25 +5535,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                                load = target_load(i, load_idx);
 
                        avg_load += load;
-
-                       /*
-                        * Look for most energy-efficient group that can fit
-                        * that can fit the task.
-                        */
-                       if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
-                               fit_capacity = capacity_of(i);
-                               fit_group = group;
-                       }
-
-                       /*
-                        * Look for group which has most spare capacity on a
-                        * single cpu.
-                        */
-                       spare_capacity = capacity_of(i) - cpu_util(i);
-                       if (spare_capacity > max_spare_capacity) {
-                               max_spare_capacity = spare_capacity;
-                               spare_group = group;
-                       }
                }
 
                /* Adjust by relative CPU capacity of the group */
@@ -5471,12 +5548,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                }
        } while (group = group->next, group != sd->groups);
 
-       if (fit_group)
-               return fit_group;
-
-       if (spare_group)
-               return spare_group;
-
        if (!idlest || 100*this_load < imbalance*min_load)
                return NULL;
        return idlest;
@@ -5497,7 +5568,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
-               if (task_fits_spare(p, i)) {
+               if (idle_cpu(i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
                        if (idle && idle->exit_latency < min_exit_latency) {
@@ -5509,8 +5580,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                min_exit_latency = idle->exit_latency;
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
-                       } else if (idle_cpu(i) &&
-                                  (!idle || idle->exit_latency == min_exit_latency) &&
+                       } else if ((!idle || idle->exit_latency == min_exit_latency) &&
                                   rq->idle_stamp > latest_idle_timestamp) {
                                /*
                                 * If equal or no active idle state, then
@@ -5519,13 +5589,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                 */
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
-                       } else if (shallowest_idle_cpu == -1) {
-                               /*
-                                * If we haven't found an idle CPU yet
-                                * pick a non-idle one that can fit the task as
-                                * fallback.
-                                */
-                               shallowest_idle_cpu = i;
                        }
                } else if (shallowest_idle_cpu == -1) {
                        load = weighted_cpuload(i);
@@ -5612,7 +5675,7 @@ done:
        return target;
 }
 
-static inline int find_best_target(struct task_struct *p, bool prefer_idle)
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
 {
        int iter_cpu;
        int target_cpu = -1;
@@ -5630,9 +5693,9 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
                int idle_idx;
 
                /*
-                * favor higher cpus for tasks that prefer idle cores
+                * Iterate from higher cpus for boosted tasks.
                 */
-               int i = prefer_idle ? NR_CPUS-iter_cpu-1 : iter_cpu;
+               int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
 
                if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
                        continue;
@@ -5672,17 +5735,19 @@ static inline int find_best_target(struct task_struct *p, bool prefer_idle)
 
                if (new_util < cur_capacity) {
                        if (cpu_rq(i)->nr_running) {
-                               if(prefer_idle) {
-                                       // Find a target cpu with lowest
-                                       // utilization.
+                               if (prefer_idle) {
+                                       /* Find a target cpu with highest
+                                        * utilization.
+                                        */
                                        if (target_util == 0 ||
                                                target_util < new_util) {
                                                target_cpu = i;
                                                target_util = new_util;
                                        }
                                } else {
-                                       // Find a target cpu with highest
-                                       // utilization.
+                                       /* Find a target cpu with lowest
+                                        * utilization.
+                                        */
                                        if (target_util == 0 ||
                                                target_util > new_util) {
                                                target_cpu = i;
@@ -5802,7 +5867,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
                bool boosted = 0;
                bool prefer_idle = 0;
 #endif
-               int tmp_target = find_best_target(p, boosted || prefer_idle);
+               int tmp_target = find_best_target(p, boosted, prefer_idle);
                if (tmp_target >= 0) {
                        target_cpu = tmp_target;
                        if ((boosted || prefer_idle) && idle_cpu(target_cpu))
@@ -5851,8 +5916,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
        int sync = wake_flags & WF_SYNC;
 
        if (sd_flag & SD_BALANCE_WAKE)
-               want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
-                             cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
+               want_affine = (!wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
                              energy_aware();
 
        rcu_read_lock();
@@ -6861,7 +6925,8 @@ static void update_blocked_averages(int cpu)
                if (throttled_hierarchy(cfs_rq))
                        continue;
 
-               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
+                                          true))
                        update_tg_load_avg(cfs_rq, 0);
        }
        raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6922,7 +6987,7 @@ static inline void update_blocked_averages(int cpu)
 
        raw_spin_lock_irqsave(&rq->lock, flags);
        update_rq_clock(rq);
-       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -7085,7 +7150,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
                mcc->cpu = cpu;
 #ifdef CONFIG_SCHED_DEBUG
                raw_spin_unlock_irqrestore(&mcc->lock, flags);
-               pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+               printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+                               cpu, capacity);
                goto skip_unlock;
 #endif
        }
@@ -7308,7 +7374,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        bool *overload, bool *overutilized)
 {
        unsigned long load;
-       int i;
+       int i, nr_running;
 
        memset(sgs, 0, sizeof(*sgs));
 
@@ -7325,7 +7391,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->group_util += cpu_util(i);
                sgs->sum_nr_running += rq->cfs.h_nr_running;
 
-               if (rq->nr_running > 1)
+               nr_running = rq->nr_running;
+               if (nr_running > 1)
                        *overload = true;
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -7333,7 +7400,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
                sgs->sum_weighted_load += weighted_cpuload(i);
-               if (idle_cpu(i))
+               /*
+                * No need to call idle_cpu() if nr_running is not 0
+                */
+               if (!nr_running && idle_cpu(i))
                        sgs->idle_cpus++;
 
                if (cpu_overutilized(i)) {