#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
+#include <linux/delay.h>
#include <linux/atomic.h>
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
-static struct cgroup_name root_cgroup_name = { .name = "/" };
-
/*
* Assign a monotonically increasing serial number to cgroups. It
* guarantees cgroups with bigger numbers are newer than those with smaller
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
+static void cgroup_enable_task_cg_lists(void);
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* fork()/exit() overhead for people who have cgroups compiled into their
* kernel but not actually in use.
*/
-static int use_task_css_set_links __read_mostly;
+static bool use_task_css_set_links __read_mostly;
static void __put_css_set(struct css_set *cset, int taskexit)
{
}
}
-static void cgroup_get_root(struct cgroupfs_root *root)
-{
- /*
- * The caller must ensure that @root is alive, which can be
- * achieved by holding a ref on one of the member cgroups or
- * following a registered reference to @root while holding
- * cgroup_tree_mutex.
- */
- WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0);
- atomic_inc(&root->refcnt);
-}
-
-static void cgroup_put_root(struct cgroupfs_root *root)
+static void cgroup_destroy_root(struct cgroupfs_root *root)
{
struct cgroup *cgrp = &root->top_cgroup;
struct cgrp_cset_link *link, *tmp_link;
- int ret;
- /*
- * @root's refcnt reaching zero and its deregistration should be
- * atomic w.r.t. cgroup_tree_mutex. This ensures that
- * cgroup_get_root() is safe to invoke if @root is registered.
- */
mutex_lock(&cgroup_tree_mutex);
- if (!atomic_dec_and_test(&root->refcnt)) {
- mutex_unlock(&cgroup_tree_mutex);
- return;
- }
mutex_lock(&cgroup_mutex);
- BUG_ON(root->number_of_cgroups != 1);
+ BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->children));
/* Rebind all subsystems back to the default hierarchy */
- if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
- ret = rebind_subsystems(root, 0, root->subsys_mask);
- /* Shouldn't be able to fail ... */
- BUG_ON(ret);
- }
+ WARN_ON(rebind_subsystems(root, 0, root->subsys_mask));
/*
* Release all the links from cset_links to this hierarchy's
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
-static struct cgroup_name *cgroup_alloc_name(const char *name_str)
-{
- struct cgroup_name *name;
-
- name = kmalloc(sizeof(*name) + strlen(name_str) + 1, GFP_KERNEL);
- if (!name)
- return NULL;
- strcpy(name->name, name_str);
- return name;
-}
-
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
{
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
- mutex_lock(&cgroup_mutex);
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
-
- /*
- * We get a ref to the parent, and put the ref when this cgroup is
- * being freed, so it's guaranteed that the parent won't be
- * destroyed before its children.
- */
- cgroup_put(cgrp->parent);
-
- /* put the root reference that we took when we created the cgroup */
- cgroup_put_root(cgrp->root);
-
+ atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
- kernfs_put(cgrp->kn);
-
- kfree(rcu_dereference_raw(cgrp->name));
- kfree(cgrp);
+ if (cgrp->parent) {
+ /*
+ * We get a ref to the parent, and put the ref when this
+ * cgroup is being freed, so it's guaranteed that the
+ * parent won't be destroyed before its children.
+ */
+ cgroup_put(cgrp->parent);
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is the top cgroup's refcnt reaching zero, which
+ * indicates that the root should be released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
}
static void cgroup_free_rcu(struct rcu_head *head)
{
if (!atomic_dec_and_test(&cgrp->refcnt))
return;
- if (WARN_ON_ONCE(!cgroup_is_dead(cgrp)))
+ if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
return;
/*
}
}
- /*
- * Mark @root has finished binding subsystems. @root->subsys_mask
- * now matches the bound subsystems.
- */
- root->flags |= CGRP_ROOT_SUBSYS_BOUND;
kernfs_activate(cgrp->kn);
-
return 0;
}
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (opts->flags & CGRP_ROOT_NOPREFIX) {
- pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
- return -EINVAL;
- }
-
- if (opts->cpuset_clone_children) {
- pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+ if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+ opts->cpuset_clone_children || opts->release_agent ||
+ opts->name) {
+ pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
return -EINVAL;
}
-
- if (opts->flags & CGRP_ROOT_XATTR)
- pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n");
}
/*
}
/* remounting is not allowed for populated hierarchies */
- if (root->number_of_cgroups > 1) {
+ if (!list_empty(&root->top_cgroup.children)) {
ret = -EBUSY;
goto out_unlock;
}
{
struct cgroup *cgrp = &root->top_cgroup;
- atomic_set(&root->refcnt, 1);
INIT_LIST_HEAD(&root->root_list);
- root->number_of_cgroups = 1;
+ atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
- RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
init_cgroup_housekeeping(cgrp);
idr_init(&root->cgroup_idr);
}
init_cgroup_root(root);
- /*
- * We need to set @root->subsys_mask now so that @root can be
- * matched by cgroup_test_super() before it finishes
- * initialization; otherwise, competing mounts with the same
- * options may try to bind the same subsystems instead of waiting
- * for the first one leading to unexpected mount errors.
- * SUBSYS_BOUND will be set once actual binding is complete.
- */
- root->subsys_mask = opts->subsys_mask;
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
return root;
}
-static int cgroup_setup_root(struct cgroupfs_root *root)
+static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->top_cgroup;
if (ret)
goto destroy_root;
- ret = rebind_subsystems(root, root->subsys_mask, 0);
+ ret = rebind_subsystems(root, ss_mask, 0);
if (ret)
goto destroy_root;
write_unlock(&css_set_lock);
BUG_ON(!list_empty(&root_cgrp->children));
- BUG_ON(root->number_of_cgroups != 1);
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
kernfs_activate(root_cgrp->kn);
ret = 0;
struct dentry *dentry;
int ret;
+ /*
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
+ */
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+retry:
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
}
}
- cgroup_get_root(root);
+ /*
+ * A root's lifetime is governed by its top cgroup. A zero
+ * ref indicates that the root is being destroyed. Wait for
+ * destruction to complete so that the subsystems are free.
+ * We could use a wait_queue for the wait, but this path is
+ * super cold. Let's just sleep for a bit and retry.
+ */
+ if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
+ msleep(10);
+ goto retry;
+ }
+
+ ret = 0;
goto out_unlock;
}
goto out_unlock;
}
- ret = cgroup_setup_root(root);
+ ret = cgroup_setup_root(root, opts.subsys_mask);
if (ret)
cgroup_free_root(root);
dentry = kernfs_mount(fs_type, flags, root->kf_root);
if (IS_ERR(dentry))
- cgroup_put_root(root);
+ cgroup_put(&root->top_cgroup);
return dentry;
}
struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
- cgroup_put_root(root);
+ cgroup_put(&root->top_cgroup);
kernfs_kill_sb(sb);
}
static struct kobject *cgroup_kobj;
-/**
- * cgroup_path - generate the path of a cgroup
- * @cgrp: the cgroup in question
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
- *
- * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
- *
- * We can't generate cgroup path using dentry->d_name, as accessing
- * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
- * inode's i_mutex, while on the other hand cgroup_path() can be called
- * with some irq-safe spinlocks held.
- */
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
-{
- int ret = -ENAMETOOLONG;
- char *start;
-
- if (!cgrp->parent) {
- if (strlcpy(buf, "/", buflen) >= buflen)
- return -ENAMETOOLONG;
- return 0;
- }
-
- start = buf + buflen - 1;
- *start = '\0';
-
- rcu_read_lock();
- do {
- const char *name = cgroup_name(cgrp);
- int len;
-
- len = strlen(name);
- if ((start -= len) < buf)
- goto out;
- memcpy(start, name, len);
-
- if (--start < buf)
- goto out;
- *start = '/';
-
- cgrp = cgrp->parent;
- } while (cgrp->parent);
- ret = 0;
- memmove(buf, start, buf + buflen - start);
-out:
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(cgroup_path);
-
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
* function grabs cgroup_mutex and shouldn't be used inside locks used by
* cgroup controller callbacks.
*
- * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
+ * Return value is the same as kernfs_path().
*/
-int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
struct cgroupfs_root *root;
struct cgroup *cgrp;
- int hierarchy_id = 1, ret = 0;
-
- if (buflen < 2)
- return -ENAMETOOLONG;
+ int hierarchy_id = 1;
+ char *path = NULL;
mutex_lock(&cgroup_mutex);
if (root) {
cgrp = task_cgroup_from_root(task, root);
- ret = cgroup_path(cgrp, buf, buflen);
+ path = cgroup_path(cgrp, buf, buflen);
} else {
/* if no hierarchy exists, everyone is in "/" */
- memcpy(buf, "/", 2);
+ if (strlcpy(buf, "/", buflen) < buflen)
+ path = buf;
}
mutex_unlock(&cgroup_mutex);
- return ret;
+ return path;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
rcu_assign_pointer(tsk->cgroups, new_cset);
task_unlock(tsk);
- /* Update the css_set linked lists if we're using them */
write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_move(&tsk->cg_list, &new_cset->tasks);
+ list_move(&tsk->cg_list, &new_cset->tasks);
write_unlock(&css_set_lock);
/*
const char *new_name_str)
{
struct cgroup *cgrp = kn->priv;
- struct cgroup_name *name, *old_name;
int ret;
if (kernfs_type(kn) != KERNFS_DIR)
if (cgroup_sane_behavior(cgrp))
return -EPERM;
- name = cgroup_alloc_name(new_name_str);
- if (!name)
- return -ENOMEM;
-
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
ret = kernfs_rename(kn, new_parent, new_name_str);
- if (!ret) {
- old_name = rcu_dereference_protected(cgrp->name, true);
- rcu_assign_pointer(cgrp->name, name);
- } else {
- old_name = name;
- }
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
-
- kfree_rcu(old_name, rcu_head);
return ret;
}
* To reduce the fork() overhead for systems that are not actually using
* their cgroups capability, we don't maintain the lists running through
* each css_set to its tasks until we see the list actually used - in other
- * words after the first call to css_task_iter_start().
+ * words after the first mount.
*/
static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
+
write_lock(&css_set_lock);
- use_task_css_set_links = 1;
+
+ if (use_task_css_set_links)
+ goto out_unlock;
+
+ use_task_css_set_links = true;
+
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
read_lock(&tasklist_lock);
do_each_thread(g, p) {
task_lock(p);
+
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
+
/*
* We should check if the process is exiting, otherwise
* it will race with cgroup_exit() in that the list
* entry won't be deleted though the process has exited.
*/
- if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
+ if (!(p->flags & PF_EXITING))
list_add(&p->cg_list, &task_css_set(p)->tasks);
+
task_unlock(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
+out_unlock:
write_unlock(&css_set_lock);
}
struct css_task_iter *it)
__acquires(css_set_lock)
{
- /*
- * The first time anyone tries to iterate across a css, we need to
- * enable the list linking each css_set to its tasks, and fix up
- * all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
read_lock(&css_set_lock);
/**
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
- * @name_str: name of the new cgroup
+ * @name: name of the new cgroup
* @mode: mode to set on new cgroup
*/
-static long cgroup_create(struct cgroup *parent, const char *name_str,
+static long cgroup_create(struct cgroup *parent, const char *name,
umode_t mode)
{
struct cgroup *cgrp;
- struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
int ssid, err;
struct cgroup_subsys *ss;
if (!cgrp)
return -ENOMEM;
- name = cgroup_alloc_name(name_str);
- if (!name) {
- err = -ENOMEM;
- goto err_free_cgrp;
- }
- rcu_assign_pointer(cgrp->name, name);
-
mutex_lock(&cgroup_tree_mutex);
/*
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
/* create the directory */
- kn = kernfs_create_dir(parent->kn, name->name, mode, cgrp);
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
if (IS_ERR(kn)) {
err = PTR_ERR(kn);
goto err_free_id;
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
- root->number_of_cgroups++;
-
- /*
- * Grab a reference on the root and parent so that they don't get
- * deleted while there are child cgroups.
- */
- cgroup_get_root(root);
+ atomic_inc(&root->nr_cgrps);
cgroup_get(parent);
/*
mutex_unlock(&cgroup_mutex);
err_unlock_tree:
mutex_unlock(&cgroup_tree_mutex);
- kfree(rcu_dereference_raw(cgrp->name));
-err_free_cgrp:
kfree(cgrp);
return err;
{
struct pid *pid;
struct task_struct *tsk;
- char *buf;
+ char *buf, *path;
int retval;
struct cgroupfs_root *root;
retval = -ENOMEM;
- buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
root->name);
seq_putc(m, ':');
cgrp = task_cgroup_from_root(tsk, root);
- retval = cgroup_path(cgrp, buf, PAGE_SIZE);
- if (retval < 0)
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
goto out_unlock;
- seq_puts(m, buf);
+ }
+ seq_puts(m, path);
seq_putc(m, '\n');
}
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
- ss->root->number_of_cgroups, !ss->disabled);
+ atomic_read(&ss->root->nr_cgrps), !ss->disabled);
mutex_unlock(&cgroup_mutex);
return 0;
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
- char *pathbuf = NULL, *agentbuf = NULL;
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
- if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
i = 0;
argv[i++] = agentbuf;
- argv[i++] = pathbuf;
+ argv[i++] = path;
argv[i] = NULL;
i = 0;
{
struct cgrp_cset_link *link;
struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
read_lock(&css_set_lock);
rcu_read_lock();
struct cgroup *c = link->cgrp;
const char *name = "?";
- if (c != cgroup_dummy_top)
- name = cgroup_name(c);
+ if (c != cgroup_dummy_top) {
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ name = name_buf;
+ }
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name);
}
rcu_read_unlock();
read_unlock(&css_set_lock);
+ kfree(name_buf);
return 0;
}