sched/walt: Accounting for number of irqs pending on each core
[firefly-linux-kernel-4.4.55.git] kernel/cpuset.c
index 10ae73611d80a560977aa554d6cbedee78bec19a..e2e294d997e0c9b8b5f26b5626649c984ffa44db 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -57,7 +57,6 @@
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
@@ -99,6 +98,7 @@ struct cpuset {
 
        /* user-configured CPUs and Memory Nodes allow to tasks */
        cpumask_var_t cpus_allowed;
+       cpumask_var_t cpus_requested;
        nodemask_t mems_allowed;
 
        /* effective CPUs and Memory Nodes allow to tasks */
@@ -286,6 +286,8 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_SPINLOCK(callback_lock);
 
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -385,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 {
-       return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+       return  cpumask_subset(p->cpus_requested, q->cpus_requested) &&
                nodes_subset(p->mems_allowed, q->mems_allowed) &&
                is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
                is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -485,7 +487,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
        cpuset_for_each_child(c, css, par) {
                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
                    c != cur &&
-                   cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+                   cpumask_intersects(trial->cpus_requested, c->cpus_requested))
                        goto out;
                if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
                    c != cur &&
@@ -944,17 +946,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
        if (!*buf) {
                cpumask_clear(trialcs->cpus_allowed);
        } else {
-               retval = cpulist_parse(buf, trialcs->cpus_allowed);
+               retval = cpulist_parse(buf, trialcs->cpus_requested);
                if (retval < 0)
                        return retval;
 
-               if (!cpumask_subset(trialcs->cpus_allowed,
-                                   top_cpuset.cpus_allowed))
+               if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
                        return -EINVAL;
+
+               cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
        }
 
        /* Nothing to do if the cpus didn't change */
-       if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+       if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
                return 0;
 
        retval = validate_change(cs, trialcs);
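Note on the hunk above: the user's CPU list is now parsed into cpus_requested, validated against cpu_present_mask, and only then masked down to the active CPUs to form cpus_allowed. A minimal sketch of that invariant, using the stock cpumask helpers from <linux/cpumask.h> (the helper name is hypothetical, not part of the patch):

	static void example_apply_requested_cpus(struct cpuset *cs,
						 const struct cpumask *requested)
	{
		/* remember exactly what the user wrote ... */
		cpumask_copy(cs->cpus_requested, requested);
		/* ... but only ever schedule on the currently active subset */
		cpumask_and(cs->cpus_allowed, cs->cpus_requested, cpu_active_mask);
	}

The hotplug hunk near the end of this diff re-derives the effective mask from cpus_requested, which is what lets a cpuset regain CPUs that were offline when the list was written.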
@@ -963,6 +966,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 
        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+       cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
        spin_unlock_irq(&callback_lock);
 
        /* use trialcs->cpus_allowed as a temp variable */
@@ -971,31 +975,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 }
 
 /*
- * cpuset_migrate_mm
- *
- *    Migrate memory region from one set of nodes to another.
- *
- *    Temporarilly set tasks mems_allowed to target nodes of migration,
- *    so that the migration code can allocate pages on these nodes.
- *
- *    While the mm_struct we are migrating is typically from some
- *    other task, the task_struct mems_allowed that we are hacking
- *    is for our current task, which must allocate new pages for that
- *    migrating memory region.
+ * Migrate memory region from one set of nodes to another.  This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management.  All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
  */
 
+struct cpuset_migrate_mm_work {
+       struct work_struct      work;
+       struct mm_struct        *mm;
+       nodemask_t              from;
+       nodemask_t              to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+       struct cpuset_migrate_mm_work *mwork =
+               container_of(work, struct cpuset_migrate_mm_work, work);
+
+       /* on a wq worker, no need to worry about %current's mems_allowed */
+       do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+       mmput(mwork->mm);
+       kfree(mwork);
+}
+
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                        const nodemask_t *to)
 {
-       struct task_struct *tsk = current;
-
-       tsk->mems_allowed = *to;
+       struct cpuset_migrate_mm_work *mwork;
 
-       do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+       mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+       if (mwork) {
+               mwork->mm = mm;
+               mwork->from = *from;
+               mwork->to = *to;
+               INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+               queue_work(cpuset_migrate_mm_wq, &mwork->work);
+       } else {
+               mmput(mm);
+       }
+}
 
-       rcu_read_lock();
-       guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
-       rcu_read_unlock();
+static void cpuset_post_attach(void)
+{
+       flush_workqueue(cpuset_migrate_mm_wq);
 }
 
 /*
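The hunk above replaces the synchronous do_migrate_pages() call with a queued work item: the caller copies the nodemasks by value into the work item, hands its mm reference over to the work function (which is why callers below now only mmput() when no migration was queued), and anyone who must wait simply flushes cpuset_migrate_mm_wq. A condensed sketch of the same pattern with illustrative names, assuming only the standard workqueue API:

	#include <linux/workqueue.h>
	#include <linux/slab.h>

	struct async_op {
		struct work_struct work;
		int arg;		/* parameters copied by value */
	};

	static struct workqueue_struct *async_wq;	/* from alloc_ordered_workqueue() */

	static void async_op_workfn(struct work_struct *work)
	{
		struct async_op *op = container_of(work, struct async_op, work);

		/* do the slow, sleepable part here, outside the caller's locks */
		kfree(op);
	}

	static void queue_async_op(int arg)
	{
		struct async_op *op = kzalloc(sizeof(*op), GFP_KERNEL);

		if (!op)
			return;	/* the real patch drops the mm reference instead */
		op->arg = arg;
		INIT_WORK(&op->work, async_op_workfn);
		queue_work(async_wq, &op->work);
	}

	static void wait_for_async_ops(void)
	{
		flush_workqueue(async_wq);	/* waits for all queued items */
	}

Using an ordered workqueue (see the alloc_ordered_workqueue() call in cpuset_init_smp() at the end of this diff) is what provides the "performed in the queued order" guarantee mentioned in the comment.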
@@ -1096,7 +1120,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
                        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
-               mmput(mm);
+               else
+                       mmput(mm);
        }
        css_task_iter_end(&it);
 
@@ -1429,15 +1454,16 @@ static int fmeter_getrate(struct fmeter *fmp)
 static struct cpuset *cpuset_attach_old_cs;
 
 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys_state *css,
-                            struct cgroup_taskset *tset)
+static int cpuset_can_attach(struct cgroup_taskset *tset)
 {
-       struct cpuset *cs = css_cs(css);
+       struct cgroup_subsys_state *css;
+       struct cpuset *cs;
        struct task_struct *task;
        int ret;
 
        /* used later by cpuset_attach() */
-       cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
+       cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+       cs = css_cs(css);
 
        mutex_lock(&cpuset_mutex);
 
@@ -1447,7 +1473,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;
 
-       cgroup_taskset_for_each(task, tset) {
+       cgroup_taskset_for_each(task, css, tset) {
                ret = task_can_attach(task, cs->cpus_allowed);
                if (ret)
                        goto out_unlock;
@@ -1467,9 +1493,14 @@ out_unlock:
        return ret;
 }
 
-static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
-                                struct cgroup_taskset *tset)
+static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
+       struct cgroup_subsys_state *css;
+       struct cpuset *cs;
+
+       cgroup_taskset_first(tset, &css);
+       cs = css_cs(css);
+
        mutex_lock(&cpuset_mutex);
        css_cs(css)->attach_in_progress--;
        mutex_unlock(&cpuset_mutex);
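The two hunks above follow the 4.4-era cgroup API change: attach-family callbacks no longer receive the destination css as a parameter; they recover it from the taskset via cgroup_taskset_first(tset, &css), and the iteration macros report the css alongside each task. The resulting callback shape, sketched with a hypothetical controller:

	static int example_can_attach(struct cgroup_taskset *tset)
	{
		struct cgroup_subsys_state *css;
		struct task_struct *task;

		/* the destination css comes back through the iterator */
		cgroup_taskset_for_each(task, css, tset) {
			/* per-task admission checks against css_cs(css)-style state */
		}
		return 0;
	}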
@@ -1482,16 +1513,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
  */
 static cpumask_var_t cpus_attach;
 
-static void cpuset_attach(struct cgroup_subsys_state *css,
-                         struct cgroup_taskset *tset)
+static void cpuset_attach(struct cgroup_taskset *tset)
 {
        /* static buf protected by cpuset_mutex */
        static nodemask_t cpuset_attach_nodemask_to;
        struct task_struct *task;
        struct task_struct *leader;
-       struct cpuset *cs = css_cs(css);
+       struct cgroup_subsys_state *css;
+       struct cpuset *cs;
        struct cpuset *oldcs = cpuset_attach_old_cs;
 
+       cgroup_taskset_first(tset, &css);
+       cs = css_cs(css);
+
        mutex_lock(&cpuset_mutex);
 
        /* prepare for attach */
@@ -1502,7 +1536,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
 
        guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 
-       cgroup_taskset_for_each(task, tset) {
+       cgroup_taskset_for_each(task, css, tset) {
                /*
                 * can_attach beforehand should guarantee that this doesn't
                 * fail.  TODO: have a better way to handle failure here
@@ -1518,7 +1552,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
         * sleep and should be moved outside migration path proper.
         */
        cpuset_attach_nodemask_to = cs->effective_mems;
-       cgroup_taskset_for_each_leader(leader, tset) {
+       cgroup_taskset_for_each_leader(leader, css, tset) {
                struct mm_struct *mm = get_task_mm(leader);
 
                if (mm) {
@@ -1532,11 +1566,11 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
                         * @old_mems_allowed is the right nodesets that we
                         * migrate mm from.
                         */
-                       if (is_memory_migrate(cs)) {
+                       if (is_memory_migrate(cs))
                                cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                                  &cpuset_attach_nodemask_to);
-                       }
-                       mmput(mm);
+                       else
+                               mmput(mm);
                }
        }
 
@@ -1701,6 +1735,7 @@ out_unlock:
        mutex_unlock(&cpuset_mutex);
        kernfs_unbreak_active_protection(of->kn);
        css_put(&cs->css);
+       flush_workqueue(cpuset_migrate_mm_wq);
        return retval ?: nbytes;
 }
 
@@ -1722,7 +1757,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 
        switch (type) {
        case FILE_CPULIST:
-               seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+               seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested));
                break;
        case FILE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
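With the hunk above, reading cpuset.cpus reports cpus_requested rather than cpus_allowed, so userspace gets back exactly the list it wrote even while some of those CPUs are offline. A small userspace sketch; the mount point and group name are assumptions, not part of the patch:

	#include <stdio.h>

	int main(void)
	{
		const char *path = "/sys/fs/cgroup/cpuset/mygroup/cpuset.cpus";
		char buf[256];
		FILE *f;

		f = fopen(path, "w");
		if (!f)
			return 1;
		fputs("0-7\n", f);	/* request CPUs 0-7, even if some are offline */
		fclose(f);

		f = fopen(path, "r");
		if (!f)
			return 1;
		if (fgets(buf, sizeof(buf), f))
			printf("requested: %s", buf);	/* echoes "0-7" with this patch */
		fclose(f);
		return 0;
	}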
@@ -1911,11 +1946,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
                return ERR_PTR(-ENOMEM);
        if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
                goto free_cs;
+       if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+               goto free_allowed;
        if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
-               goto free_cpus;
+               goto free_requested;
 
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        cpumask_clear(cs->cpus_allowed);
+       cpumask_clear(cs->cpus_requested);
        nodes_clear(cs->mems_allowed);
        cpumask_clear(cs->effective_cpus);
        nodes_clear(cs->effective_mems);
@@ -1924,7 +1962,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 
        return &cs->css;
 
-free_cpus:
+free_requested:
+       free_cpumask_var(cs->cpus_requested);
+free_allowed:
        free_cpumask_var(cs->cpus_allowed);
 free_cs:
        kfree(cs);
@@ -1987,6 +2027,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
        cs->mems_allowed = parent->mems_allowed;
        cs->effective_mems = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+       cpumask_copy(cs->cpus_requested, parent->cpus_requested);
        cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
        spin_unlock_irq(&callback_lock);
 out_unlock:
@@ -2021,6 +2062,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 
        free_cpumask_var(cs->effective_cpus);
        free_cpumask_var(cs->cpus_allowed);
+       free_cpumask_var(cs->cpus_requested);
        kfree(cs);
 }
 
@@ -2042,14 +2084,33 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
        mutex_unlock(&cpuset_mutex);
 }
 
+static int cpuset_allow_attach(struct cgroup_taskset *tset)
+{
+       const struct cred *cred = current_cred(), *tcred;
+       struct task_struct *task;
+       struct cgroup_subsys_state *css;
+
+       cgroup_taskset_for_each(task, css, tset) {
+               tcred = __task_cred(task);
+
+               if ((current != task) && !capable(CAP_SYS_ADMIN) &&
+                    cred->euid.val != tcred->uid.val && cred->euid.val != tcred->suid.val)
+                       return -EACCES;
+       }
+
+       return 0;
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
        .css_alloc      = cpuset_css_alloc,
        .css_online     = cpuset_css_online,
        .css_offline    = cpuset_css_offline,
        .css_free       = cpuset_css_free,
        .can_attach     = cpuset_can_attach,
+       .allow_attach   = cpuset_allow_attach,
        .cancel_attach  = cpuset_cancel_attach,
        .attach         = cpuset_attach,
+       .post_attach    = cpuset_post_attach,
        .bind           = cpuset_bind,
        .legacy_cftypes = files,
        .early_init     = 1,
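The allow_attach hook added above lets a non-root writer move a task whose uid or suid matches the writer's euid. It compares the raw .val fields, i.e. values in the initial user namespace; a namespace-aware variant would use the kuid helpers from <linux/uidgid.h>, roughly as sketched below (illustrative only, not what the patch does):

	static int example_allow_attach(struct cgroup_taskset *tset)
	{
		const struct cred *cred = current_cred(), *tcred;
		struct cgroup_subsys_state *css;
		struct task_struct *task;

		cgroup_taskset_for_each(task, css, tset) {
			tcred = __task_cred(task);
			if (current != task && !capable(CAP_SYS_ADMIN) &&
			    !uid_eq(cred->euid, tcred->uid) &&
			    !uid_eq(cred->euid, tcred->suid))
				return -EACCES;
		}
		return 0;
	}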
@@ -2069,8 +2130,11 @@ int __init cpuset_init(void)
                BUG();
        if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
                BUG();
+       if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL))
+               BUG();
 
        cpumask_setall(top_cpuset.cpus_allowed);
+       cpumask_setall(top_cpuset.cpus_requested);
        nodes_setall(top_cpuset.mems_allowed);
        cpumask_setall(top_cpuset.effective_cpus);
        nodes_setall(top_cpuset.effective_mems);
@@ -2204,7 +2268,7 @@ retry:
                goto retry;
        }
 
-       cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+       cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
        nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
 
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
@@ -2346,6 +2410,9 @@ void __init cpuset_init_smp(void)
        top_cpuset.effective_mems = node_states[N_MEMORY];
 
        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+       cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+       BUG_ON(!cpuset_migrate_mm_wq);
 }
 
 /**