mm/migrate: correct failure handling if !hugepage_migration_support()

[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index fd773ade1a3141cd4b152cb8fce905866a7c223a..867b0a4b08935a4ad2ae3f49f56acee481ce07ba 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -178,59 +178,61 @@ void sched_init_granularity(void)
         update_sysctl();
  }
  
-#if BITS_PER_LONG == 32
-# define WMULT_CONST   (~0UL)
-#else
-# define WMULT_CONST   (1UL << 32)
-#endif
-
+#define WMULT_CONST    (~0U)
  #define WMULT_SHIFT    32
  
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+static void __update_inv_weight(struct load_weight *lw)
+{
+       unsigned long w;
+
+       if (likely(lw->inv_weight))
+               return;
+
+       w = scale_load_down(lw->weight);
+
+       if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+               lw->inv_weight = 1;
+       else if (unlikely(!w))
+               lw->inv_weight = WMULT_CONST;
+       else
+               lw->inv_weight = WMULT_CONST / w;
+}
  
  /*
- * delta *= weight / lw
+ * delta_exec * weight / lw.weight
+ *   OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw is in prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
   */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-               struct load_weight *lw)
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
  {
-       u64 tmp;
+       u64 fact = scale_load_down(weight);
+       int shift = WMULT_SHIFT;
  
-       /*
-        * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-        * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-        * 2^SCHED_LOAD_RESOLUTION.
-        */
-       if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-               tmp = (u64)delta_exec * scale_load_down(weight);
-       else
-               tmp = (u64)delta_exec;
+       __update_inv_weight(lw);
  
-       if (!lw->inv_weight) {
-               unsigned long w = scale_load_down(lw->weight);
-
-               if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-                       lw->inv_weight = 1;
-               else if (unlikely(!w))
-                       lw->inv_weight = WMULT_CONST;
-               else
-                       lw->inv_weight = WMULT_CONST / w;
+       if (unlikely(fact >> 32)) {
+               while (fact >> 32) {
+                       fact >>= 1;
+                       shift--;
+               }
         }
  
-       /*
-        * Check whether we'd overflow the 64-bit multiplication:
-        */
-       if (unlikely(tmp > WMULT_CONST))
-               tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-                       WMULT_SHIFT/2);
-       else
-               tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+       /* hint to use a 32x32->64 mul */
+       fact = (u64)(u32)fact * lw->inv_weight;
  
-       return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+       while (fact >> 32) {
+               fact >>= 1;
+               shift--;
+       }
+
+       return mul_u64_u32_shr(delta_exec, fact, shift);
  }
  
  
@@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
  
  /**************************************************************
   * Scheduling class tree data structure manipulation methods:
@@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
  /*
   * delta /= w
   */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
  {
         if (unlikely(se->load.weight != NICE_0_LOAD))
-               delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+               delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
  
         return delta;
  }
@@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         update_load_add(&lw, se->load.weight);
                         load = &lw;
                 }
-               slice = calc_delta_mine(slice, se->load.weight, load);
+               slice = __calc_delta(slice, se->load.weight, load);
         }
         return slice;
  }
@@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p)
  #endif
  
  /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
+ * Update the current task's runtime statistics.
   */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-             unsigned long delta_exec)
-{
-       unsigned long delta_exec_weighted;
-
-       schedstat_set(curr->statistics.exec_max,
-                     max((u64)delta_exec, curr->statistics.exec_max));
-
-       curr->sum_exec_runtime += delta_exec;
-       schedstat_add(cfs_rq, exec_clock, delta_exec);
-       delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-       curr->vruntime += delta_exec_weighted;
-       update_min_vruntime(cfs_rq);
-}
-
  static void update_curr(struct cfs_rq *cfs_rq)
  {
         struct sched_entity *curr = cfs_rq->curr;
         u64 now = rq_clock_task(rq_of(cfs_rq));
-       unsigned long delta_exec;
+       u64 delta_exec;
  
         if (unlikely(!curr))
                 return;
  
-       /*
-        * Get the amount of time the current task was running
-        * since the last time we changed load (this cannot
-        * overflow on 32 bits):
-        */
-       delta_exec = (unsigned long)(now - curr->exec_start);
-       if (!delta_exec)
+       delta_exec = now - curr->exec_start;
+       if (unlikely((s64)delta_exec <= 0))
                 return;
  
-       __update_curr(cfs_rq, curr, delta_exec);
         curr->exec_start = now;
  
+       schedstat_set(curr->statistics.exec_max,
+                     max(delta_exec, curr->statistics.exec_max));
+
+       curr->sum_exec_runtime += delta_exec;
+       schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+       curr->vruntime += calc_delta_fair(delta_exec, curr);
+       update_min_vruntime(cfs_rq);
+
         if (entity_is_task(curr)) {
                 struct task_struct *curtask = task_of(curr);
  
@@ -886,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
         return max(smin, smax);
  }
  
-/*
- * Once a preferred node is selected the scheduler balancer will prefer moving
- * a task to that node for sysctl_numa_balancing_settle_count number of PTE
- * scans. This will give the process the chance to accumulate more faults on
- * the preferred node but still allow the scheduler to move the task again if
- * the nodes CPUs are overloaded.
- */
-unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
-
  static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
  {
         rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -944,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
         if (!p->numa_group)
                 return 0;
  
-       return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
+       return p->numa_group->faults[task_faults_idx(nid, 0)] +
+               p->numa_group->faults[task_faults_idx(nid, 1)];
  }
  
  /*
@@ -1037,7 +1015,7 @@ struct task_numa_env {
  
         struct numa_stats src_stats, dst_stats;
  
-       int imbalance_pct, idx;
+       int imbalance_pct;
  
         struct task_struct *best_task;
         long best_imp;
@@ -1225,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
          * elsewhere, so there is no point in (re)trying.
          */
         if (unlikely(!sd)) {
-               p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+               p->numa_preferred_nid = task_node(p);
                 return -EINVAL;
         }
  
@@ -1272,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
         p->numa_scan_period = task_scan_min(p);
  
         if (env.best_task == NULL) {
-               int ret = migrate_task_to(p, env.best_cpu);
+               ret = migrate_task_to(p, env.best_cpu);
+               if (ret != 0)
+                       trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
                 return ret;
         }
  
         ret = migrate_swap(p, env.best_task);
+       if (ret != 0)
+               trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
         put_task_struct(env.best_task);
         return ret;
  }
@@ -1292,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p)
         p->numa_migrate_retry = jiffies + HZ;
  
         /* Success if task is already running on preferred CPU */
-       if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
+       if (task_node(p) == p->numa_preferred_nid)
                 return;
  
         /* Otherwise, try migrate to a CPU on the preferred node */
@@ -1364,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p,
                  * scanning faster if shared accesses dominate as it may
                  * simply bounce migrations uselessly
                  */
-               period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
                 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
                 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
         }
@@ -1752,6 +1733,13 @@ void task_numa_work(struct callback_head *work)
                     (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                         continue;
  
+               /*
+                * Skip inaccessible VMAs to avoid any confusion between
+                * PROT_NONE and NUMA hinting ptes
+                */
+               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                       continue;
+
                 do {
                         start = max(start, vma->vm_start);
                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -3015,8 +3003,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         }
  }
  
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                    unsigned long delta_exec)
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
  {
         /* dock delta_exec before expiring quota (as it could span periods) */
         cfs_rq->runtime_remaining -= delta_exec;
@@ -3034,7 +3021,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
  }
  
  static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
  {
         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
                 return;
@@ -3574,8 +3561,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
         return rq_clock_task(rq_of(cfs_rq));
  }
  
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                    unsigned long delta_exec) {}
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
  static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -3932,7 +3918,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  {
         struct sched_entity *se = tg->se[cpu];
  
-       if (!tg->parent || !wl) /* the trivial, non-cgroup case */
+       if (!tg->parent)        /* the trivial, non-cgroup case */
                 return wl;
  
         for_each_sched_entity(se) {
@@ -4110,12 +4096,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
   */
  static struct sched_group *
  find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int load_idx)
+                 int this_cpu, int sd_flag)
  {
         struct sched_group *idlest = NULL, *group = sd->groups;
         unsigned long min_load = ULONG_MAX, this_load = 0;
+       int load_idx = sd->forkexec_idx;
         int imbalance = 100 + (sd->imbalance_pct-100)/2;
  
+       if (sd_flag & SD_BALANCE_WAKE)
+               load_idx = sd->wake_idx;
+
         do {
                 unsigned long load, avg_load;
                 int local_group;
@@ -4283,7 +4273,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         }
  
         while (sd) {
-               int load_idx = sd->forkexec_idx;
                 struct sched_group *group;
                 int weight;
  
@@ -4292,10 +4281,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                         continue;
                 }
  
-               if (sd_flag & SD_BALANCE_WAKE)
-                       load_idx = sd->wake_idx;
-
-               group = find_idlest_group(sd, p, cpu, load_idx);
+               group = find_idlest_group(sd, p, cpu, sd_flag);
                 if (!group) {
                         sd = sd->child;
                         continue;
@@ -5521,7 +5507,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         struct sched_group *group, int load_idx,
                         int local_group, struct sg_lb_stats *sgs)
  {
-       unsigned long nr_running;
         unsigned long load;
         int i;
  
@@ -5530,8 +5515,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                 struct rq *rq = cpu_rq(i);
  
-               nr_running = rq->nr_running;
-
                 /* Bias balancing toward cpus of our domain */
                 if (local_group)
                         load = target_load(i, load_idx);
@@ -5539,7 +5522,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         load = source_load(i, load_idx);
  
                 sgs->group_load += load;
-               sgs->sum_nr_running += nr_running;
+               sgs->sum_nr_running += rq->nr_running;
  #ifdef CONFIG_NUMA_BALANCING
                 sgs->nr_numa_running += rq->nr_numa_running;
                 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6530,7 +6513,7 @@ static struct {
         unsigned long next_balance;     /* in jiffy units */
  } nohz ____cacheline_aligned;
  
-static inline int find_new_ilb(int call_cpu)
+static inline int find_new_ilb(void)
  {
         int ilb = cpumask_first(nohz.idle_cpus_mask);
  
@@ -6545,13 +6528,13 @@ static inline int find_new_ilb(int call_cpu)
   * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
   * CPU (if there is one).
   */
-static void nohz_balancer_kick(int cpu)
+static void nohz_balancer_kick(void)
  {
         int ilb_cpu;
  
         nohz.next_balance++;
  
-       ilb_cpu = find_new_ilb(cpu);
+       ilb_cpu = find_new_ilb();
  
         if (ilb_cpu >= nr_cpu_ids)
                 return;
@@ -6661,10 +6644,10 @@ void update_max_interval(void)
   *
   * Balancing parameters are set up in init_sched_domains.
   */
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
  {
         int continue_balancing = 1;
-       struct rq *rq = cpu_rq(cpu);
+       int cpu = rq->cpu;
         unsigned long interval;
         struct sched_domain *sd;
         /* Earliest time when we have to do rebalance again */
@@ -6761,9 +6744,9 @@ out:
   * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
   * rebalancing for all the cpus for whom scheduler ticks are stopped.
   */
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
  {
-       struct rq *this_rq = cpu_rq(this_cpu);
+       int this_cpu = this_rq->cpu;
         struct rq *rq;
         int balance_cpu;
  
@@ -6790,7 +6773,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
                 update_idle_cpu_load(rq);
                 raw_spin_unlock_irq(&rq->lock);
  
-               rebalance_domains(balance_cpu, CPU_IDLE);
+               rebalance_domains(rq, CPU_IDLE);
  
                 if (time_after(this_rq->next_balance, rq->next_balance))
                         this_rq->next_balance = rq->next_balance;
@@ -6809,14 +6792,14 @@ end:
   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
   *     domain span are idle.
   */
-static inline int nohz_kick_needed(struct rq *rq, int cpu)
+static inline int nohz_kick_needed(struct rq *rq)
  {
         unsigned long now = jiffies;
         struct sched_domain *sd;
         struct sched_group_power *sgp;
-       int nr_busy;
+       int nr_busy, cpu = rq->cpu;
  
-       if (unlikely(idle_cpu(cpu)))
+       if (unlikely(rq->idle_balance))
                 return 0;
  
         /*
@@ -6865,7 +6848,7 @@ need_kick:
         return 1;
  }
  #else
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
  #endif
  
  /*
@@ -6874,38 +6857,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
   */
  static void run_rebalance_domains(struct softirq_action *h)
  {
-       int this_cpu = smp_processor_id();
-       struct rq *this_rq = cpu_rq(this_cpu);
+       struct rq *this_rq = this_rq();
         enum cpu_idle_type idle = this_rq->idle_balance ?
                                                 CPU_IDLE : CPU_NOT_IDLE;
  
-       rebalance_domains(this_cpu, idle);
+       rebalance_domains(this_rq, idle);
  
         /*
          * If this cpu has a pending nohz_balance_kick, then do the
          * balancing on behalf of the other idle cpus whose ticks are
          * stopped.
          */
-       nohz_idle_balance(this_cpu, idle);
+       nohz_idle_balance(this_rq, idle);
  }
  
-static inline int on_null_domain(int cpu)
+static inline int on_null_domain(struct rq *rq)
  {
-       return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+       return !rcu_dereference_sched(rq->sd);
  }
  
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   */
-void trigger_load_balance(struct rq *rq, int cpu)
+void trigger_load_balance(struct rq *rq)
  {
         /* Don't need to rebalance while attached to NULL domain */
-       if (time_after_eq(jiffies, rq->next_balance) &&
-           likely(!on_null_domain(cpu)))
+       if (unlikely(on_null_domain(rq)))
+               return;
+
+       if (time_after_eq(jiffies, rq->next_balance))
                 raise_softirq(SCHED_SOFTIRQ);
  #ifdef CONFIG_NO_HZ_COMMON
-       if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
-               nohz_balancer_kick(cpu);
+       if (nohz_kick_needed(rq))
+               nohz_balancer_kick();
  #endif
  }