cgroup: enable task_cg_lists on the first cgroup mount

[firefly-linux-kernel-4.4.55.git] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index cffdb6e2ad08935afbdcd593b10898ce52c8fb64..506f6da67ad1c0e1361db63d11c0ecd91e340802 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -53,6 +53,7 @@
  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
  #include <linux/flex_array.h> /* used in cgroup_attach_task */
  #include <linux/kthread.h>
+#include <linux/delay.h>
  
  #include <linux/atomic.h>
  
@@ -172,6 +173,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
  static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
                               bool is_add);
  static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
+static void cgroup_enable_task_cg_lists(void);
  
  /**
   * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -374,7 +376,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
   * fork()/exit() overhead for people who have cgroups compiled into their
   * kernel but not actually in use.
   */
-static int use_task_css_set_links __read_mostly;
+static bool use_task_css_set_links __read_mostly;
  
  static void __put_css_set(struct css_set *cset, int taskexit)
  {
@@ -728,45 +730,19 @@ static void cgroup_free_root(struct cgroupfs_root *root)
         }
  }
  
-static void cgroup_get_root(struct cgroupfs_root *root)
-{
-       /*
-        * The caller must ensure that @root is alive, which can be
-        * achieved by holding a ref on one of the member cgroups or
-        * following a registered reference to @root while holding
-        * cgroup_tree_mutex.
-        */
-       WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0);
-       atomic_inc(&root->refcnt);
-}
-
-static void cgroup_put_root(struct cgroupfs_root *root)
+static void cgroup_destroy_root(struct cgroupfs_root *root)
  {
         struct cgroup *cgrp = &root->top_cgroup;
         struct cgrp_cset_link *link, *tmp_link;
-       int ret;
  
-       /*
-        * @root's refcnt reaching zero and its deregistration should be
-        * atomic w.r.t. cgroup_tree_mutex.  This ensures that
-        * cgroup_get_root() is safe to invoke if @root is registered.
-        */
         mutex_lock(&cgroup_tree_mutex);
-       if (!atomic_dec_and_test(&root->refcnt)) {
-               mutex_unlock(&cgroup_tree_mutex);
-               return;
-       }
         mutex_lock(&cgroup_mutex);
  
-       BUG_ON(atomic_read(&root->nr_cgrps) != 1);
+       BUG_ON(atomic_read(&root->nr_cgrps));
         BUG_ON(!list_empty(&cgrp->children));
  
         /* Rebind all subsystems back to the default hierarchy */
-       if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
-               ret = rebind_subsystems(root, 0, root->subsys_mask);
-               /* Shouldn't be able to fail ... */
-               BUG_ON(ret);
-       }
+       WARN_ON(rebind_subsystems(root, 0, root->subsys_mask));
  
         /*
          * Release all the links from cset_links to this hierarchy's
@@ -929,21 +905,24 @@ static void cgroup_free_fn(struct work_struct *work)
         struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
  
         atomic_dec(&cgrp->root->nr_cgrps);
-
-       /*
-        * We get a ref to the parent, and put the ref when this cgroup is
-        * being freed, so it's guaranteed that the parent won't be
-        * destroyed before its children.
-        */
-       cgroup_put(cgrp->parent);
-
-       /* put the root reference that we took when we created the cgroup */
-       cgroup_put_root(cgrp->root);
-
         cgroup_pidlist_destroy_all(cgrp);
  
-       kernfs_put(cgrp->kn);
-       kfree(cgrp);
+       if (cgrp->parent) {
+               /*
+                * We get a ref to the parent, and put the ref when this
+                * cgroup is being freed, so it's guaranteed that the
+                * parent won't be destroyed before its children.
+                */
+               cgroup_put(cgrp->parent);
+               kernfs_put(cgrp->kn);
+               kfree(cgrp);
+       } else {
+               /*
+                * This is the top cgroup's refcnt reaching zero, which
+                * indicates that the root should be released.
+                */
+               cgroup_destroy_root(cgrp->root);
+       }
  }
  
  static void cgroup_free_rcu(struct rcu_head *head)
@@ -965,7 +944,7 @@ static void cgroup_put(struct cgroup *cgrp)
  {
         if (!atomic_dec_and_test(&cgrp->refcnt))
                 return;
-       if (WARN_ON_ONCE(!cgroup_is_dead(cgrp)))
+       if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
                 return;
  
         /*
@@ -1072,13 +1051,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                 }
         }
  
-       /*
-        * Mark @root has finished binding subsystems.  @root->subsys_mask
-        * now matches the bound subsystems.
-        */
-       root->flags |= CGRP_ROOT_SUBSYS_BOUND;
         kernfs_activate(cgrp->kn);
-
         return 0;
  }
  
@@ -1243,18 +1216,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
         if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
                 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
  
-               if (opts->flags & CGRP_ROOT_NOPREFIX) {
-                       pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
+               if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+                   opts->cpuset_clone_children || opts->release_agent ||
+                   opts->name) {
+                       pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
                         return -EINVAL;
                 }
-
-               if (opts->cpuset_clone_children) {
-                       pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
-                       return -EINVAL;
-               }
-
-               if (opts->flags & CGRP_ROOT_XATTR)
-                       pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n");
         }
  
         /*
@@ -1356,7 +1323,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
  {
         struct cgroup *cgrp = &root->top_cgroup;
  
-       atomic_set(&root->refcnt, 1);
         INIT_LIST_HEAD(&root->root_list);
         atomic_set(&root->nr_cgrps, 1);
         cgrp->root = root;
@@ -1377,15 +1343,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
  
         init_cgroup_root(root);
  
-       /*
-        * We need to set @root->subsys_mask now so that @root can be
-        * matched by cgroup_test_super() before it finishes
-        * initialization; otherwise, competing mounts with the same
-        * options may try to bind the same subsystems instead of waiting
-        * for the first one leading to unexpected mount errors.
-        * SUBSYS_BOUND will be set once actual binding is complete.
-        */
-       root->subsys_mask = opts->subsys_mask;
         root->flags = opts->flags;
         if (opts->release_agent)
                 strcpy(root->release_agent_path, opts->release_agent);
@@ -1396,7 +1353,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
         return root;
  }
  
-static int cgroup_setup_root(struct cgroupfs_root *root)
+static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask)
  {
         LIST_HEAD(tmp_links);
         struct cgroup *root_cgrp = &root->top_cgroup;
@@ -1439,7 +1396,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root)
         if (ret)
                 goto destroy_root;
  
-       ret = rebind_subsystems(root, root->subsys_mask, 0);
+       ret = rebind_subsystems(root, ss_mask, 0);
         if (ret)
                 goto destroy_root;
  
@@ -1486,6 +1443,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         struct dentry *dentry;
         int ret;
  
+       /*
+        * The first time anyone tries to mount a cgroup, enable the list
+        * linking each css_set to its tasks and fix up all existing tasks.
+        */
+       if (!use_task_css_set_links)
+               cgroup_enable_task_cg_lists();
+retry:
         mutex_lock(&cgroup_tree_mutex);
         mutex_lock(&cgroup_mutex);
  
@@ -1531,7 +1495,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                         }
                 }
  
-               cgroup_get_root(root);
+               /*
+                * A root's lifetime is governed by its top cgroup.  Zero
+                * ref indicates that the root is being destroyed.  Wait for
+                * destruction to complete so that the subsystems are free.
+                * We could use a wait_queue for the wait, but this path is
+                * super cold.  Let's just sleep for a bit and retry.
+                */
+               if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) {
+                       mutex_unlock(&cgroup_mutex);
+                       mutex_unlock(&cgroup_tree_mutex);
+                       msleep(10);
+                       goto retry;
+               }
+
+               ret = 0;
                 goto out_unlock;
         }
  
@@ -1542,7 +1520,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 goto out_unlock;
         }
  
-       ret = cgroup_setup_root(root);
+       ret = cgroup_setup_root(root, opts.subsys_mask);
         if (ret)
                 cgroup_free_root(root);
  
@@ -1558,7 +1536,7 @@ out_unlock:
  
         dentry = kernfs_mount(fs_type, flags, root->kf_root);
         if (IS_ERR(dentry))
-               cgroup_put_root(root);
+               cgroup_put(&root->top_cgroup);
         return dentry;
  }
  
@@ -1567,7 +1545,7 @@ static void cgroup_kill_sb(struct super_block *sb)
         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
         struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
  
-       cgroup_put_root(root);
+       cgroup_put(&root->top_cgroup);
         kernfs_kill_sb(sb);
  }
  
@@ -1722,10 +1700,8 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
         rcu_assign_pointer(tsk->cgroups, new_cset);
         task_unlock(tsk);
  
-       /* Update the css_set linked lists if we're using them */
         write_lock(&css_set_lock);
-       if (!list_empty(&tsk->cg_list))
-               list_move(&tsk->cg_list, &new_cset->tasks);
+       list_move(&tsk->cg_list, &new_cset->tasks);
         write_unlock(&css_set_lock);
  
         /*
@@ -2392,13 +2368,19 @@ int cgroup_task_count(const struct cgroup *cgrp)
   * To reduce the fork() overhead for systems that are not actually using
   * their cgroups capability, we don't maintain the lists running through
   * each css_set to its tasks until we see the list actually used - in other
- * words after the first call to css_task_iter_start().
+ * words after the first mount.
   */
  static void cgroup_enable_task_cg_lists(void)
  {
         struct task_struct *p, *g;
+
         write_lock(&css_set_lock);
-       use_task_css_set_links = 1;
+
+       if (use_task_css_set_links)
+               goto out_unlock;
+
+       use_task_css_set_links = true;
+
         /*
          * We need tasklist_lock because RCU is not safe against
          * while_each_thread(). Besides, a forking task that has passed
@@ -2409,16 +2391,22 @@ static void cgroup_enable_task_cg_lists(void)
         read_lock(&tasklist_lock);
         do_each_thread(g, p) {
                 task_lock(p);
+
+               WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+                            task_css_set(p) != &init_css_set);
+
                 /*
                  * We should check if the process is exiting, otherwise
                  * it will race with cgroup_exit() in that the list
                  * entry won't be deleted though the process has exited.
                  */
-               if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
+               if (!(p->flags & PF_EXITING))
                         list_add(&p->cg_list, &task_css_set(p)->tasks);
+
                 task_unlock(p);
         } while_each_thread(g, p);
         read_unlock(&tasklist_lock);
+out_unlock:
         write_unlock(&css_set_lock);
  }
  
@@ -2651,13 +2639,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
                          struct css_task_iter *it)
         __acquires(css_set_lock)
  {
-       /*
-        * The first time anyone tries to iterate across a css, we need to
-        * enable the list linking each css_set to its tasks, and fix up
-        * all existing tasks.
-        */
-       if (!use_task_css_set_links)
-               cgroup_enable_task_cg_lists();
+       /* no one should try to iterate before mounting cgroups */
+       WARN_ON_ONCE(!use_task_css_set_links);
  
         read_lock(&css_set_lock);
  
@@ -3708,12 +3691,6 @@ static long cgroup_create(struct cgroup *parent, const char *name,
         /* allocation complete, commit to creation */
         list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
         atomic_inc(&root->nr_cgrps);
-
-       /*
-        * Grab a reference on the root and parent so that they don't get
-        * deleted while there are child cgroups.
-        */
-       cgroup_get_root(root);
         cgroup_get(parent);
  
         /*