sched: Fix the relax_domain_level boot parameter
[firefly-linux-kernel-4.4.55.git] / kernel / sched / core.c
index 39eb6011bc38e3f20188c942cb31a173fce74093..d5594a4268d4a5bb531bcf86dfc66fe359e7c956 100644 (file)
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
 #define SCHED_FEAT(name, enabled)      \
        #name ,
 
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
 #include "features.h"
-       NULL
 };
 
 #undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
        sched_avg_update(this_rq);
 }
 
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading,
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
 /*
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
 void update_idle_cpu_load(struct rq *this_rq)
 {
-       unsigned long curr_jiffies = jiffies;
+       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
        unsigned long load = this_rq->load.weight;
        unsigned long pending_updates;
 
        /*
-        * Bloody broken means of dealing with nohz, but better than nothing..
-        * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
-        * update and see 0 difference the one time and 2 the next, even though
-        * we ticked at roughtly the same rate.
-        *
-        * Hence we only use this from nohz_idle_balance() and skip this
-        * nonsense when called from the scheduler_tick() since that's
-        * guaranteed a stable rate.
+        * Bail if there's load or we're actually up-to-date.
         */
        if (load || curr_jiffies == this_rq->last_load_update_tick)
                return;
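
The {0,2}-versus-{1,1} skew described above can be made concrete with a small userspace sketch (illustrative only; jiffies_at() and the sample times are made up): a reader that nominally samples once per tick can land just after one jiffies update and just before the next, so two real ticks show up as a delta of 0 followed by a delta of 2.

#include <stdio.h>

/* Toy model: the timekeeping CPU increments "jiffies" each time wall time
 * crosses an integer tick boundary. */
static unsigned long jiffies_at(double t)
{
	return (unsigned long)t;
}

int main(void)
{
	/* The reader samples roughly once per tick, but jitter puts one
	 * sample just after an update and the next just before one. */
	double sample_times[] = { 0.01, 1.01, 1.99, 3.01 };
	unsigned long prev = jiffies_at(sample_times[0]);
	int i;

	for (i = 1; i < 4; i++) {
		unsigned long cur = jiffies_at(sample_times[i]);

		printf("observed delta: %lu\n", cur - prev);	/* 1, then 0, then 2 */
		prev = cur;
	}
	return 0;
}
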
@@ -2546,13 +2552,39 @@ void update_idle_cpu_load(struct rq *this_rq)
        __update_cpu_load(this_rq, load, pending_updates);
 }
 
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+       struct rq *this_rq = this_rq();
+       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+       unsigned long pending_updates;
+
+       if (curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       raw_spin_lock(&this_rq->lock);
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       if (pending_updates) {
+               this_rq->last_load_update_tick = curr_jiffies;
+               /*
+                * We were idle, so the load we account is 0; the current load
+                * might be !0 due to remote wakeups and the like.
+                */
+               __update_cpu_load(this_rq, 0, pending_updates);
+       }
+       raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
 /*
  * Called from scheduler_tick()
  */
 static void update_cpu_load_active(struct rq *this_rq)
 {
        /*
-        * See the mess in update_idle_cpu_load().
+        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
         */
        this_rq->last_load_update_tick = jiffies;
        __update_cpu_load(this_rq, this_rq->load.weight, 1);
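
Both nohz paths above pass the number of missed ticks into __update_cpu_load() instead of pretending a single tick elapsed. A minimal userspace sketch of why folding works, assuming the usual 1/2^i running-average style of cpu_load[] (decay_idle() and the numbers are invented; the kernel's own decay handling differs in detail):

#include <stdio.h>

/* Decay a per-index load average as if 'missed' idle ticks had passed:
 * while idle the instantaneous load is 0, so each missed tick scales the
 * average by (2^idx - 1) / 2^idx. */
static unsigned long decay_idle(unsigned long load, unsigned long missed, int idx)
{
	while (missed--)
		load -= load >> idx;
	return load;
}

int main(void)
{
	unsigned long load = 2048;
	int idx;

	/* Waking up after 8 nohz-idle ticks: fold all 8 missed updates at once. */
	for (idx = 1; idx <= 4; idx++)
		printf("cpu_load[%d]: %lu -> %lu\n",
		       idx, load, decay_idle(load, 8, idx));
	return 0;
}
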
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                p->sched_class->set_cpus_allowed(p, new_mask);
 
        cpumask_copy(&p->cpus_allowed, new_mask);
-       p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+       p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
 /*
@@ -5524,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
 
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
 {
-       sched_domain_debug_enabled = 1;
+       sched_debug_enabled = 1;
 
        return 0;
 }
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+       return sched_debug_enabled;
+}
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  struct cpumask *groupmask)
@@ -5572,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }
 
-               if (!group->sgp->power) {
+               /*
+                * Even though we initialize ->power to something semi-sane,
+                * we leave power_orig unset. This allows us to detect if
+                * domain iteration is still funny without causing /0 traps.
+                */
+               if (!group->sgp->power_orig) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: domain->cpu_power not "
                                        "set\n");
@@ -5620,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
        int level = 0;
 
-       if (!sched_domain_debug_enabled)
+       if (!sched_debug_enabled)
                return;
 
        if (!sd) {
@@ -5641,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+       return false;
+}
 #endif /* CONFIG_SCHED_DEBUG */
 
 static int sd_degenerate(struct sched_domain *sd)
@@ -5962,6 +6008,44 @@ struct sched_domain_topology_level {
        struct sd_data      data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth; make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+       const struct cpumask *span = sched_domain_span(sd);
+       struct sd_data *sdd = sd->private;
+       struct sched_domain *sibling;
+       int i;
+
+       for_each_cpu(i, span) {
+               sibling = *per_cpu_ptr(sdd->sd, i);
+               if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+                       continue;
+
+               cpumask_set_cpu(i, sched_group_mask(sg));
+       }
+}
+
+/*
+ * Return the canonical balance cpu for this group; this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+       return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
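
group_balance_cpu() simply picks the first cpu that sits in both the group span and the new iteration mask. A tiny userspace sketch of that selection with plain bitmasks instead of struct cpumask (purely illustrative; first_and() and the example masks are made up):

#include <stdio.h>

/* Lowest bit set in both masks -- the analogue of cpumask_first_and(). */
static int first_and(unsigned long span, unsigned long mask)
{
	unsigned long both = span & mask;

	return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
	unsigned long sg_cpus = 0xf0;	/* the group spans cpus 4-7 */
	unsigned long sg_mask = 0x30;	/* but only cpus 4-5 may iterate it here */

	printf("balance cpu = %d\n", first_and(sg_cpus, sg_mask));	/* 4 */
	return 0;
}
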
@@ -5980,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                if (cpumask_test_cpu(i, covered))
                        continue;
 
+               child = *per_cpu_ptr(sdd->sd, i);
+
+               /* See the comment near build_group_mask(). */
+               if (!cpumask_test_cpu(i, sched_domain_span(child)))
+                       continue;
+
                sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
                                GFP_KERNEL, cpu_to_node(cpu));
 
@@ -5987,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                        goto fail;
 
                sg_span = sched_group_cpus(sg);
-
-               child = *per_cpu_ptr(sdd->sd, i);
                if (child->child) {
                        child = child->child;
                        cpumask_copy(sg_span, sched_domain_span(child));
@@ -5997,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
                cpumask_or(covered, covered, sg_span);
 
-               sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
-               atomic_inc(&sg->sgp->ref);
+               sg->sgp = *per_cpu_ptr(sdd->sgp, i);
+               if (atomic_inc_return(&sg->sgp->ref) == 1)
+                       build_group_mask(sd, sg);
 
-               if (cpumask_test_cpu(cpu, sg_span))
+               /*
+                * Initialize sgp->power so that even if we mess up the
+                * domains and no iteration ever gets us here, we won't
+                * die on a /0 trap.
+                */
+               sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+
+               /*
+                * Make sure the first group of this domain contains the
+                * canonical balance cpu. Otherwise the sched_domain iteration
+                * breaks. See update_sg_lb_stats().
+                */
+               if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+                   group_balance_cpu(sg) == cpu)
                        groups = sg;
 
                if (!first)
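
The seed value matters because the load balancer divides by group power; update_sg_lb_stats() ends up computing ratios of the form group_load * SCHED_POWER_SCALE / sgp->power, so a group the iteration never reaches must not sit at power == 0. A small worked example of the arithmetic (the numbers are illustrative):

#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL	/* same scale factor the kernel uses */

int main(void)
{
	unsigned long weight = 4;				/* cpus spanned by the group */
	unsigned long power = SCHED_POWER_SCALE * weight;	/* seeded value: 4096, never 0 */
	unsigned long group_load = 2048;

	/* The kind of ratio the balancer computes; it stays finite even
	 * before update_group_power() fills in the real value. */
	printf("avg_load = %lu\n", group_load * SCHED_POWER_SCALE / power);	/* 512 */
	return 0;
}
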
@@ -6074,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
                cpumask_clear(sched_group_cpus(sg));
                sg->sgp->power = 0;
+               cpumask_setall(sched_group_mask(sg));
 
                for_each_cpu(j, span) {
                        if (get_group(j, sdd, NULL) != group)
@@ -6115,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
                sg = sg->next;
        } while (sg != sd->groups);
 
-       if (cpu != group_first_cpu(sg))
+       if (cpu != group_balance_cpu(sg))
                return;
 
        update_group_power(sd, cpu);
@@ -6165,11 +6268,8 @@ int sched_domain_level_max;
 
 static int __init setup_relax_domain_level(char *str)
 {
-       unsigned long val;
-
-       val = simple_strtoul(str, NULL, 0);
-       if (val < sched_domain_level_max)
-               default_relax_domain_level = val;
+       if (kstrtoint(str, 0, &default_relax_domain_level))
+               pr_warn("Unable to set relax_domain_level\n");
 
        return 1;
 }
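
This hunk is the fix the commit title refers to: the old code compared the parsed value against sched_domain_level_max, which is still 0 when early_param handlers run, so the boot value was silently ignored; parsing with simple_strtoul() also made negative values impossible to express. The new code accepts whatever kstrtoint() parses and only warns about a malformed string. A hedged userspace sketch of the same signed-parse-and-accept behaviour (parse_relax_level() is a made-up stand-in, not a kernel helper):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Made-up userspace stand-in for the kstrtoint() call: signed parse,
 * reject trailing junk, and no comparison against a limit that is not
 * known yet at early-boot parameter time. */
static int parse_relax_level(const char *str, int *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(str, &end, 0);
	if (errno || end == str || *end != '\0')
		return -EINVAL;

	*out = (int)val;
	return 0;
}

int main(void)
{
	int level = -1;

	if (parse_relax_level("-1", &level))
		fprintf(stderr, "Unable to set relax_domain_level\n");
	else
		printf("default_relax_domain_level = %d\n", level);
	return 0;
}
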
@@ -6279,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
-static int sched_domains_numa_scale;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 
 static inline int sd_local_flags(int level)
 {
-       if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+       if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
                return 0;
 
        return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6344,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
        return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
 }
 
+static void sched_numa_warn(const char *str)
+{
+       static bool done = false;
+       int i, j;
+
+       if (done)
+               return;
+
+       done = true;
+
+       printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+       for (i = 0; i < nr_node_ids; i++) {
+               printk(KERN_WARNING "  ");
+               for (j = 0; j < nr_node_ids; j++)
+                       printk(KERN_CONT "%02d ", node_distance(i, j));
+               printk(KERN_CONT "\n");
+       }
+       printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+       int i;
+
+       if (distance == node_distance(0, 0))
+               return true;
+
+       for (i = 0; i < sched_domains_numa_levels; i++) {
+               if (sched_domains_numa_distance[i] == distance)
+                       return true;
+       }
+
+       return false;
+}
+
 static void sched_init_numa(void)
 {
        int next_distance, curr_distance = node_distance(0, 0);
@@ -6351,7 +6486,6 @@ static void sched_init_numa(void)
        int level = 0;
        int i, j, k;
 
-       sched_domains_numa_scale = curr_distance;
        sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
        if (!sched_domains_numa_distance)
                return;
@@ -6362,23 +6496,41 @@ static void sched_init_numa(void)
         *
         * Assumes node_distance(0,j) includes all distances in
         * node_distance(i,j) in order to avoid cubic time.
-        *
-        * XXX: could be optimized to O(n log n) by using sort()
         */
        next_distance = curr_distance;
        for (i = 0; i < nr_node_ids; i++) {
                for (j = 0; j < nr_node_ids; j++) {
-                       int distance = node_distance(0, j);
-                       if (distance > curr_distance &&
-                                       (distance < next_distance ||
-                                        next_distance == curr_distance))
-                               next_distance = distance;
+                       for (k = 0; k < nr_node_ids; k++) {
+                               int distance = node_distance(i, k);
+
+                               if (distance > curr_distance &&
+                                   (distance < next_distance ||
+                                    next_distance == curr_distance))
+                                       next_distance = distance;
+
+                               /*
+                                * While not a strong assumption, it would be nice to know
+                                * about cases where node A is connected to B but B is not
+                                * equally connected to A.
+                                */
+                               if (sched_debug() && node_distance(k, i) != distance)
+                                       sched_numa_warn("Node-distance not symmetric");
+
+                               if (sched_debug() && i && !find_numa_distance(distance))
+                                       sched_numa_warn("Node-0 not representative");
+                       }
+                       if (next_distance != curr_distance) {
+                               sched_domains_numa_distance[level++] = next_distance;
+                               sched_domains_numa_levels = level;
+                               curr_distance = next_distance;
+                       } else break;
                }
-               if (next_distance != curr_distance) {
-                       sched_domains_numa_distance[level++] = next_distance;
-                       sched_domains_numa_levels = level;
-                       curr_distance = next_distance;
-               } else break;
+
+               /*
+                * With sched_debug() we scan all nodes to verify the above
+                * assumption; otherwise node 0 is taken as representative.
+                */
+               if (!sched_debug())
+                       break;
        }
        /*
         * 'level' contains the number of unique distances, excluding the
@@ -6403,7 +6555,7 @@ static void sched_init_numa(void)
                        return;
 
                for (j = 0; j < nr_node_ids; j++) {
-                       struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                       struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
                        if (!mask)
                                return;
 
@@ -6490,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                        *per_cpu_ptr(sdd->sg, j) = sg;
 
-                       sgp = kzalloc_node(sizeof(struct sched_group_power),
+                       sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
                                        GFP_KERNEL, cpu_to_node(j));
                        if (!sgp)
                                return -ENOMEM;
@@ -6543,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
        if (!sd)
                return child;
 
-       set_domain_attribute(sd, attr);
        cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
        if (child) {
                sd->level = child->level + 1;
@@ -6551,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                child->parent = sd;
        }
        sd->child = child;
+       set_domain_attribute(sd, attr);
 
        return sd;
 }
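
Moving set_domain_attribute() after the child linkage is the other half of the relax_domain_level fix: the attribute handling decides whether to keep idle/wake balancing by comparing the requested relax level against sd->level, which is only assigned (to child->level + 1) once the child is linked in, so the earlier call effectively compared against an unset level. A simplified sketch of the comparison involved (toy_domain, apply_relax_level() and the flag values are invented; the upstream helper additionally falls back to default_relax_domain_level when no attribute is passed):

#include <stdio.h>

#define SD_BALANCE_NEWIDLE	0x01	/* illustrative flag values, not the kernel's */
#define SD_BALANCE_WAKE		0x02

struct toy_domain {
	int level;
	unsigned int flags;
};

/* Domains at or below the requested level keep wake/newidle balancing,
 * wider (higher-level) domains get it cleared -- which is only meaningful
 * once ->level has been assigned. */
static void apply_relax_level(struct toy_domain *sd, int request)
{
	if (request < sd->level)
		sd->flags &= ~(SD_BALANCE_WAKE | SD_BALANCE_NEWIDLE);
	else
		sd->flags |= SD_BALANCE_WAKE | SD_BALANCE_NEWIDLE;
}

int main(void)
{
	struct toy_domain smt = { .level = 0 }, numa = { .level = 3 };

	apply_relax_level(&smt, 1);	/* keeps idle balancing */
	apply_relax_level(&numa, 1);	/* drops it on the wide domain */
	printf("smt flags %#x, numa flags %#x\n", smt.flags, numa.flags);
	return 0;
}
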
@@ -6691,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
        if (!doms_cur)
                doms_cur = &fallback_doms;
        cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
-       dattr_cur = NULL;
        err = build_sched_domains(doms_cur[0], NULL);
        register_sched_domain_sysctl();