sched: Fix the relax_domain_level boot parameter

[firefly-linux-kernel-4.4.55.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index ea8a4769fea55bf2bd0d2b12ed6dae034dc62407..d5594a4268d4a5bb531bcf86dfc66fe359e7c956 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
  #define SCHED_FEAT(name, enabled)      \
         #name ,
  
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
  #include "features.h"
-       NULL
  };
  
  #undef SCHED_FEAT
@@ -693,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
  }
  #endif
  
-void update_cpu_load(struct rq *this_rq);
-
  static void set_load_weight(struct task_struct *p)
  {
         int prio = p->static_prio - MAX_RT_PRIO;
@@ -2488,22 +2485,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
   * scheduler tick (TICK_NSEC). With tickless idle this will not be called
   * every tick. We fix it up based on jiffies.
   */
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                             unsigned long pending_updates)
  {
-       unsigned long this_load = this_rq->load.weight;
-       unsigned long curr_jiffies = jiffies;
-       unsigned long pending_updates;
         int i, scale;
  
         this_rq->nr_load_updates++;
  
-       /* Avoid repeated calls on same jiffy, when moving in and out of idle */
-       if (curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
         /* Update our load: */
         this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
         for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2528,9 +2516,78 @@ void update_cpu_load(struct rq *this_rq)
         sched_avg_update(this_rq);
  }
  
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+       unsigned long load = this_rq->load.weight;
+       unsigned long pending_updates;
+
+       /*
+        * Bail if there's load or we were already updated on this jiffy.
+        */
+       if (load || curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       this_rq->last_load_update_tick = curr_jiffies;
+
+       __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try to fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+       struct rq *this_rq = this_rq();
+       unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+       unsigned long pending_updates;
+
+       if (curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       raw_spin_lock(&this_rq->lock);
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       if (pending_updates) {
+               this_rq->last_load_update_tick = curr_jiffies;
+               /*
+                * We were idle, which means load 0; the current load might be
+                * !0 due to remote wakeups and the like.
+                */
+               __update_cpu_load(this_rq, 0, pending_updates);
+       }
+       raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
  static void update_cpu_load_active(struct rq *this_rq)
  {
-       update_cpu_load(this_rq);
+       /*
+        * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+        */
+       this_rq->last_load_update_tick = jiffies;
+       __update_cpu_load(this_rq, this_rq->load.weight, 1);
  
         calc_load_account_active(this_rq);
  }
@@ -3115,6 +3172,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
         if (irqs_disabled())
                 print_irqtrace_events(prev);
         dump_stack();
+       add_taint(TAINT_WARN);
  }
  
  /*
@@ -4044,11 +4102,8 @@ static bool check_same_owner(struct task_struct *p)
  
         rcu_read_lock();
         pcred = __task_cred(p);
-       if (cred->user->user_ns == pcred->user->user_ns)
-               match = (cred->euid == pcred->euid ||
-                        cred->euid == pcred->uid);
-       else
-               match = false;
+       match = (uid_eq(cred->euid, pcred->euid) ||
+                uid_eq(cred->euid, pcred->uid));
         rcu_read_unlock();
         return match;
  }
@@ -4959,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                 p->sched_class->set_cpus_allowed(p, new_mask);
  
         cpumask_copy(&p->cpus_allowed, new_mask);
-       p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+       p->nr_cpus_allowed = cpumask_weight(new_mask);
  }
  
  /*
@@ -5501,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
  
  #ifdef CONFIG_SCHED_DEBUG
  
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
  
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
  {
-       sched_domain_debug_enabled = 1;
+       sched_debug_enabled = 1;
  
         return 0;
  }
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+       return sched_debug_enabled;
+}
  
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                   struct cpumask *groupmask)
@@ -5549,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                         break;
                 }
  
-               if (!group->sgp->power) {
+               /*
+                * Even though we initialize ->power to something semi-sane,
+                * we leave power_orig unset. This allows us to detect if
+                * domain iteration is still funny without causing /0 traps.
+                */
+               if (!group->sgp->power_orig) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: domain->cpu_power not "
                                         "set\n");
@@ -5562,7 +5627,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                         break;
                 }
  
-               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+               if (!(sd->flags & SD_OVERLAP) &&
+                   cpumask_intersects(groupmask, sched_group_cpus(group))) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: repeated CPUs\n");
                         break;
@@ -5596,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
  {
         int level = 0;
  
-       if (!sched_domain_debug_enabled)
+       if (!sched_debug_enabled)
                 return;
  
         if (!sd) {
@@ -5617,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
  }
  #else /* !CONFIG_SCHED_DEBUG */
  # define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+       return false;
+}
  #endif /* CONFIG_SCHED_DEBUG */
  
  static int sd_degenerate(struct sched_domain *sd)
@@ -5900,99 +5970,11 @@ static int __init isolated_cpu_setup(char *str)
  
  __setup("isolcpus=", isolated_cpu_setup);
  
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-       int i, n, val, min_val, best_node = -1;
-
-       min_val = INT_MAX;
-
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Start at @node */
-               n = (node + i) % nr_node_ids;
-
-               if (!nr_cpus_node(n))
-                       continue;
-
-               /* Skip already used nodes */
-               if (node_isset(n, *used_nodes))
-                       continue;
-
-               /* Simple min distance search */
-               val = node_distance(node, n);
-
-               if (val < min_val) {
-                       min_val = val;
-                       best_node = n;
-               }
-       }
-
-       if (best_node != -1)
-               node_set(best_node, *used_nodes);
-       return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-       nodemask_t used_nodes;
-       int i;
-
-       cpumask_clear(span);
-       nodes_clear(used_nodes);
-
-       cpumask_or(span, span, cpumask_of_node(node));
-       node_set(node, used_nodes);
-
-       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-               int next_node = find_next_best_node(node, &used_nodes);
-               if (next_node < 0)
-                       break;
-               cpumask_or(span, span, cpumask_of_node(next_node));
-       }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-       lockdep_assert_held(&sched_domains_mutex);
-
-       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-       return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-       return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
  static const struct cpumask *cpu_cpu_mask(int cpu)
  {
         return cpumask_of_node(cpu_to_node(cpu));
  }
  
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
  struct sd_data {
         struct sched_domain **__percpu sd;
         struct sched_group **__percpu sg;
@@ -6022,9 +6004,48 @@ struct sched_domain_topology_level {
         sched_domain_init_f init;
         sched_domain_mask_f mask;
         int                 flags;
+       int                 numa_level;
         struct sd_data      data;
  };
  
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+       const struct cpumask *span = sched_domain_span(sd);
+       struct sd_data *sdd = sd->private;
+       struct sched_domain *sibling;
+       int i;
+
+       for_each_cpu(i, span) {
+               sibling = *per_cpu_ptr(sdd->sd, i);
+               if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+                       continue;
+
+               cpumask_set_cpu(i, sched_group_mask(sg));
+       }
+}
+
+/*
+ * Return the canonical balance cpu for this group; this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+       return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
  static int
  build_overlap_sched_groups(struct sched_domain *sd, int cpu)
  {
@@ -6043,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 if (cpumask_test_cpu(i, covered))
                         continue;
  
+               child = *per_cpu_ptr(sdd->sd, i);
+
+               /* See the comment near build_group_mask(). */
+               if (!cpumask_test_cpu(i, sched_domain_span(child)))
+                       continue;
+
                 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
                                 GFP_KERNEL, cpu_to_node(cpu));
  
@@ -6050,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                         goto fail;
  
                 sg_span = sched_group_cpus(sg);
-
-               child = *per_cpu_ptr(sdd->sd, i);
                 if (child->child) {
                         child = child->child;
                         cpumask_copy(sg_span, sched_domain_span(child));
@@ -6060,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
  
                 cpumask_or(covered, covered, sg_span);
  
-               sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
-               atomic_inc(&sg->sgp->ref);
+               sg->sgp = *per_cpu_ptr(sdd->sgp, i);
+               if (atomic_inc_return(&sg->sgp->ref) == 1)
+                       build_group_mask(sd, sg);
+
+               /*
+                * Initialize sgp->power such that even if we mess up the
+                * domains and no possible iteration will get us here, we won't
+                * die on a /0 trap.
+                */
+               sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
  
-               if (cpumask_test_cpu(cpu, sg_span))
+               /*
+                * Make sure the first group of this domain contains the
+                * canonical balance cpu. Otherwise the sched_domain iteration
+                * breaks. See update_sg_lb_stats().
+                */
+               if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+                   group_balance_cpu(sg) == cpu)
                         groups = sg;
  
                 if (!first)
@@ -6137,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
  
                 cpumask_clear(sched_group_cpus(sg));
                 sg->sgp->power = 0;
+               cpumask_setall(sched_group_mask(sg));
  
                 for_each_cpu(j, span) {
                         if (get_group(j, sdd, NULL) != group)
@@ -6178,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
                 sg = sg->next;
         } while (sg != sd->groups);
  
-       if (cpu != group_first_cpu(sg))
+       if (cpu != group_balance_cpu(sg))
                 return;
  
         update_group_power(sd, cpu);
@@ -6213,10 +6253,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu)  \
  }
  
  SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
  #ifdef CONFIG_SCHED_SMT
   SD_INIT_FUNC(SIBLING)
  #endif
@@ -6232,11 +6268,8 @@ int sched_domain_level_max;
  
  static int __init setup_relax_domain_level(char *str)
  {
-       unsigned long val;
-
-       val = simple_strtoul(str, NULL, 0);
-       if (val < sched_domain_level_max)
-               default_relax_domain_level = val;
+       if (kstrtoint(str, 0, &default_relax_domain_level))
+               pr_warn("Unable to set relax_domain_level\n");
  
         return 1;
  }
@@ -6338,15 +6371,236 @@ static struct sched_domain_topology_level default_topology[] = {
         { sd_init_BOOK, cpu_book_mask, },
  #endif
         { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-       { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
         { NULL, },
  };
  
  static struct sched_domain_topology_level *sched_domain_topology = default_topology;
  
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+       if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
+               return 0;
+
+       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+       int level = tl->numa_level;
+       int sd_weight = cpumask_weight(
+                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+       *sd = (struct sched_domain){
+               .min_interval           = sd_weight,
+               .max_interval           = 2*sd_weight,
+               .busy_factor            = 32,
+               .imbalance_pct          = 125,
+               .cache_nice_tries       = 2,
+               .busy_idx               = 3,
+               .idle_idx               = 2,
+               .newidle_idx            = 0,
+               .wake_idx               = 0,
+               .forkexec_idx           = 0,
+
+               .flags                  = 1*SD_LOAD_BALANCE
+                                       | 1*SD_BALANCE_NEWIDLE
+                                       | 0*SD_BALANCE_EXEC
+                                       | 0*SD_BALANCE_FORK
+                                       | 0*SD_BALANCE_WAKE
+                                       | 0*SD_WAKE_AFFINE
+                                       | 0*SD_PREFER_LOCAL
+                                       | 0*SD_SHARE_CPUPOWER
+                                       | 0*SD_SHARE_PKG_RESOURCES
+                                       | 1*SD_SERIALIZE
+                                       | 0*SD_PREFER_SIBLING
+                                       | sd_local_flags(level)
+                                       ,
+               .last_balance           = jiffies,
+               .balance_interval       = sd_weight,
+       };
+       SD_INIT_NAME(sd, NUMA);
+       sd->private = &tl->data;
+
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+
+       return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+       return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+       static int done = false;
+       int i,j;
+
+       if (done)
+               return;
+
+       done = true;
+
+       printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+       for (i = 0; i < nr_node_ids; i++) {
+               printk(KERN_WARNING "  ");
+               for (j = 0; j < nr_node_ids; j++)
+                       printk(KERN_CONT "%02d ", node_distance(i,j));
+               printk(KERN_CONT "\n");
+       }
+       printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+       int i;
+
+       if (distance == node_distance(0, 0))
+               return true;
+
+       for (i = 0; i < sched_domains_numa_levels; i++) {
+               if (sched_domains_numa_distance[i] == distance)
+                       return true;
+       }
+
+       return false;
+}
+
+static void sched_init_numa(void)
+{
+       int next_distance, curr_distance = node_distance(0, 0);
+       struct sched_domain_topology_level *tl;
+       int level = 0;
+       int i, j, k;
+
+       sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+       if (!sched_domains_numa_distance)
+               return;
+
+       /*
+        * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+        * unique distances in the node_distance() table.
+        *
+        * Assumes node_distance(0,j) includes all distances in
+        * node_distance(i,j) in order to avoid cubic time.
+        */
+       next_distance = curr_distance;
+       for (i = 0; i < nr_node_ids; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       for (k = 0; k < nr_node_ids; k++) {
+                               int distance = node_distance(i, k);
+
+                               if (distance > curr_distance &&
+                                   (distance < next_distance ||
+                                    next_distance == curr_distance))
+                                       next_distance = distance;
+
+                               /*
+                                * While not a strong assumption it would be nice to know
+                                * about cases where if node A is connected to B, B is not
+                                * equally connected to A.
+                                */
+                               if (sched_debug() && node_distance(k, i) != distance)
+                                       sched_numa_warn("Node-distance not symmetric");
+
+                               if (sched_debug() && i && !find_numa_distance(distance))
+                                       sched_numa_warn("Node-0 not representative");
+                       }
+                       if (next_distance != curr_distance) {
+                               sched_domains_numa_distance[level++] = next_distance;
+                               sched_domains_numa_levels = level;
+                               curr_distance = next_distance;
+                       } else break;
+               }
+
+               /*
+                * In case of sched_debug() we verify the above assumption.
+                */
+               if (!sched_debug())
+                       break;
+       }
+       /*
+        * 'level' contains the number of unique distances, excluding the
+        * identity distance node_distance(i,i).
+        *
+        * The sched_domains_numa_distance[] array includes the actual distance
+        * numbers.
+        */
+
+       sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+       if (!sched_domains_numa_masks)
+               return;
+
+       /*
+        * Now for each level, construct a mask per node which contains all
+        * cpus of nodes that are at most that level's distance away from us.
+        */
+       for (i = 0; i < level; i++) {
+               sched_domains_numa_masks[i] =
+                       kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+               if (!sched_domains_numa_masks[i])
+                       return;
+
+               for (j = 0; j < nr_node_ids; j++) {
+                       struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+                       if (!mask)
+                               return;
+
+                       sched_domains_numa_masks[i][j] = mask;
+
+                       for (k = 0; k < nr_node_ids; k++) {
+                               if (node_distance(j, k) > sched_domains_numa_distance[i])
+                                       continue;
+
+                               cpumask_or(mask, mask, cpumask_of_node(k));
+                       }
+               }
+       }
+
+       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+                       sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+       if (!tl)
+               return;
+
+       /*
+        * Copy the default topology bits..
+        */
+       for (i = 0; default_topology[i].init; i++)
+               tl[i] = default_topology[i];
+
+       /*
+        * .. and append 'level' levels of NUMA goodness.
+        */
+       for (j = 0; j < level; i++, j++) {
+               tl[i] = (struct sched_domain_topology_level){
+                       .init = sd_numa_init,
+                       .mask = sd_numa_mask,
+                       .flags = SDTL_OVERLAP,
+                       .numa_level = j,
+               };
+       }
+
+       sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
  static int __sdt_alloc(const struct cpumask *cpu_map)
  {
         struct sched_domain_topology_level *tl;
@@ -6388,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
  
                         *per_cpu_ptr(sdd->sg, j) = sg;
  
-                       sgp = kzalloc_node(sizeof(struct sched_group_power),
+                       sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
                                         GFP_KERNEL, cpu_to_node(j));
                         if (!sgp)
                                 return -ENOMEM;
@@ -6441,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
         if (!sd)
                 return child;
  
-       set_domain_attribute(sd, attr);
         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
         if (child) {
                 sd->level = child->level + 1;
@@ -6449,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                 child->parent = sd;
         }
         sd->child = child;
+       set_domain_attribute(sd, attr);
  
         return sd;
  }
@@ -6589,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
         if (!doms_cur)
                 doms_cur = &fallback_doms;
         cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
-       dattr_cur = NULL;
         err = build_sched_domains(doms_cur[0], NULL);
         register_sched_domain_sysctl();
  
@@ -6714,97 +6967,6 @@ match2:
         mutex_unlock(&sched_domains_mutex);
  }
  
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
-       get_online_cpus();
-
-       /* Destroy domains first to force the rebuild */
-       partition_sched_domains(0, NULL, NULL);
-
-       rebuild_sched_domains();
-       put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
-       unsigned int level = 0;
-
-       if (sscanf(buf, "%u", &level) != 1)
-               return -EINVAL;
-
-       /*
-        * level is always be positive so don't check for
-        * level < POWERSAVINGS_BALANCE_NONE which is 0
-        * What happens on 0 or 1 byte write,
-        * need to check for count as well?
-        */
-
-       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-               return -EINVAL;
-
-       if (smt)
-               sched_smt_power_savings = level;
-       else
-               sched_mc_power_savings = level;
-
-       reinit_sched_domains();
-
-       return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
-                                          struct device_attribute *attr,
-                                          char *buf)
-{
-       return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
-                                           struct device_attribute *attr,
-                                           const char *buf, size_t count)
-{
-       return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
-                  sched_mc_power_savings_show,
-                  sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
-                                           struct device_attribute *attr,
-                                           char *buf)
-{
-       return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
-                                           struct device_attribute *attr,
-                                            const char *buf, size_t count)
-{
-       return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
-                  sched_smt_power_savings_show,
-                  sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
-       int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-       if (smt_capable())
-               err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
-       if (!err && mc_capable())
-               err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
-       return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
  /*
   * Update cpusets according to cpu_active mask.  If cpusets are
   * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6842,6 +7004,8 @@ void __init sched_init_smp(void)
         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
         alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  
+       sched_init_numa();
+
         get_online_cpus();
         mutex_lock(&sched_domains_mutex);
         init_sched_domains(cpu_active_mask);
@@ -7985,13 +8149,9 @@ static struct cftype cpu_files[] = {
                 .write_u64 = cpu_rt_period_write_uint,
         },
  #endif
+       { }     /* terminate */
  };
  
-static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-       return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
-}
-
  struct cgroup_subsys cpu_cgroup_subsys = {
         .name           = "cpu",
         .create         = cpu_cgroup_create,
@@ -7999,8 +8159,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
         .exit           = cpu_cgroup_exit,
-       .populate       = cpu_cgroup_populate,
         .subsys_id      = cpu_cgroup_subsys_id,
+       .base_cftypes   = cpu_files,
         .early_init     = 1,
  };
  
@@ -8185,13 +8345,9 @@ static struct cftype files[] = {
                 .name = "stat",
                 .read_map = cpuacct_stats_show,
         },
+       { }     /* terminate */
  };
  
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-       return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
-}
-
  /*
   * charge this task's execution time to its accounting group.
   *
@@ -8223,7 +8379,7 @@ struct cgroup_subsys cpuacct_subsys = {
         .name = "cpuacct",
         .create = cpuacct_create,
         .destroy = cpuacct_destroy,
-       .populate = cpuacct_populate,
         .subsys_id = cpuacct_subsys_id,
+       .base_cftypes = files,
  };
  #endif /* CONFIG_CGROUP_CPUACCT */