From: Tejun Heo <tj@kernel.org>
Date: Fri, 11 Sep 2015 19:00:19 +0000 (-0400)
Subject: cgroup, memcg, cpuset: implement cgroup_taskset_for_each_leader()
X-Git-Tag: firefly_0821_release~176^2~787^2~23
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=4530eddb59494b89650d6bcd980fc7f7717ad80c;p=firefly-linux-kernel-4.4.55.git

cgroup, memcg, cpuset: implement cgroup_taskset_for_each_leader()

It wasn't explicitly documented but, when a process is being migrated,
cpuset and memcg depend on cgroup_taskset_first() returning the
threadgroup leader; however, this approach is somewhat ghetto and
would no longer work for the planned multi-process migration.

This patch introduces explicit cgroup_taskset_for_each_leader() which
iterates over only the threadgroup leaders and replaces
cgroup_taskset_first() usages for accessing the leader with it.

This prepares both memcg and cpuset for multi-process migration.  This
patch also updates the documentation for cgroup_taskset_for_each() to
clarify the iteration rules and removes comments mentioning task
ordering in tasksets.

v2: A previous patch which added threadgroup leader test was dropped.
    Patch updated accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index fb717f2cba5b..e9c3eac074e2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -232,11 +232,33 @@ void css_task_iter_end(struct css_task_iter *it);
  * cgroup_taskset_for_each - iterate cgroup_taskset
  * @task: the loop cursor
  * @tset: taskset to iterate
+ *
+ * @tset may contain multiple tasks and they may belong to multiple
+ * processes.  When there are multiple tasks in @tset, if a task of a
+ * process is in @tset, all tasks of the process are in @tset.  Also, all
+ * are guaranteed to share the same source and destination csses.
+ *
+ * Iteration is not in any specific order.
  */
 #define cgroup_taskset_for_each(task, tset)				\
 	for ((task) = cgroup_taskset_first((tset)); (task);		\
 	     (task) = cgroup_taskset_next((tset)))
 
+/**
+ * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
+ * @leader: the loop cursor
+ * @tset: takset to iterate
+ *
+ * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
+ * may not contain any.
+ */
+#define cgroup_taskset_for_each_leader(leader, tset)			\
+	for ((leader) = cgroup_taskset_first((tset)); (leader);		\
+	     (leader) = cgroup_taskset_next((tset)))			\
+		if ((leader) != (leader)->group_leader)			\
+			;						\
+		else
+
 /*
  * Inline functions.
  */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0be276ffe08a..7f4b85af03dc 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2217,13 +2217,6 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 
 	get_css_set(new_cset);
 	rcu_assign_pointer(tsk->cgroups, new_cset);
-
-	/*
-	 * Use move_tail so that cgroup_taskset_first() still returns the
-	 * leader after migration.  This works because cgroup_migrate()
-	 * ensures that the dst_cset of the leader is the first on the
-	 * tset's dst_csets list.
-	 */
 	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
 
 	/*
@@ -2419,10 +2412,6 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
 		if (!cset->mg_src_cgrp)
 			goto next;
 
-		/*
-		 * cgroup_taskset_first() must always return the leader.
-		 * Take care to avoid disturbing the ordering.
-		 */
 		list_move_tail(&task->cg_list, &cset->mg_tasks);
 		if (list_empty(&cset->mg_node))
 			list_add_tail(&cset->mg_node, &tset.src_csets);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b361a0b58f6..e4d999929903 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1488,7 +1488,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
 	/* static buf protected by cpuset_mutex */
 	static nodemask_t cpuset_attach_nodemask_to;
 	struct task_struct *task;
-	struct task_struct *leader = cgroup_taskset_first(tset);
+	struct task_struct *leader;
 	struct cpuset *cs = css_cs(css);
 	struct cpuset *oldcs = cpuset_attach_old_cs;
 
@@ -1514,12 +1514,11 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
 	}
 
 	/*
-	 * Change mm, possibly for multiple threads in a threadgroup. This
-	 * is expensive and may sleep and should be moved outside migration
-	 * path proper.
+	 * Change mm for all threadgroup leaders. This is expensive and may
+	 * sleep and should be moved outside migration path proper.
 	 */
 	cpuset_attach_nodemask_to = cs->effective_mems;
-	if (thread_group_leader(leader)) {
+	cgroup_taskset_for_each_leader(leader, tset) {
 		struct mm_struct *mm = get_task_mm(leader);
 
 		if (mm) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9f331402e502..33c8dad6830f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4828,7 +4828,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup *from;
-	struct task_struct *p;
+	struct task_struct *leader, *p;
 	struct mm_struct *mm;
 	unsigned long move_flags;
 	int ret = 0;
@@ -4842,7 +4842,20 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 	if (!move_flags)
 		return 0;
 
-	p = cgroup_taskset_first(tset);
+	/*
+	 * Multi-process migrations only happen on the default hierarchy
+	 * where charge immigration is not used.  Perform charge
+	 * immigration if @tset contains a leader and whine if there are
+	 * multiple.
+	 */
+	p = NULL;
+	cgroup_taskset_for_each_leader(leader, tset) {
+		WARN_ON_ONCE(p);
+		p = leader;
+	}
+	if (!p)
+		return 0;
+
 	from = mem_cgroup_from_task(p);
 
 	VM_BUG_ON(from == memcg);