From 414e73880e4a57b75b177f1f7aa6980c87150a80 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Thu, 28 Jul 2016 18:44:40 +0100
Subject: [PATCH] FIXUP: sched/tune: fix accounting for runnable tasks

Contains:

sched/tune: fix accounting for runnable tasks (1/5)

The accounting for tasks into boost groups of different CPUs is currently
broken mainly because:
a) we do not properly track the change of boost group of a RUNNABLE task
b) there are race conditions between migration code and accounting code

This patch provides a fixes to ensure enqueue/dequeue
accounting also for throttled tasks.

Without this patch is can happen that a task is enqueued into a throttled
RQ thus not being accounted for the boosting of the corresponding RQ.
We could argue that a throttled task should not boost a CPU, however:
a) properly implementing CPU boosting considering throttled tasks will
   increase a lot the complexity of the solution
b) it's not easy to quantify the benefits introduced by such a more
   complex solution

Since task throttling requires the usage of the CFS bandwidth controller,
which is not widely used on mobile systems (at least not by Android kernels
so far), for the time being we go for the simple solution and boost also
for throttled RQs.

sched/tune: fix accounting for runnable tasks (2/5)

This patch provides the code required to enforce proper locking.
A per boost group spinlock has been added to grant atomic
accounting of tasks as well as to serialise enqueue/dequeue operations,
triggered by tasks migrations, with cgroups's attach/detach operations.

sched/tune: fix accounting for runnable tasks (3/5)

This patch adds cgroups {allow,can,cancel}_attach callbacks.

Since a task can be migrated between boost groups while it's running,
the CGroups's attach callbacks have been added to properly migrate
boost contributions of RUNNABLE tasks.

The RQ's lock is used to serialise enqueue/dequeue operations, triggered
by tasks migrations, with cgroups's attach/detach operations. While the
SchedTune's CPU lock is used to grant atrocity of the accounting within
the CPU.

NOTE: the current implementation does not allows a concurrent CPU migration
      and CGroups change.

sched/tune: fix accounting for runnable tasks (4/5)

This fixes accounting for exiting tasks by adding a dedicated call early
in the do_exit() syscall, which disables SchedTune accounting as soon as a
task is flagged PF_EXITING.

This flag is set before the multiple dequeue/enqueue dance triggered
by cgroup_exit() which is useful only to inject useless tasks movements
thus increasing possibilities for race conditions with the migration code.
The schedtune_exit_task() call does the last dequeue of a task from its
current boost group. This is a solution more aligned with what happens in
mainline kernels (>v4.4) where the exit_cgroup does not move anymore a dying
task to the root control group.

sched/tune: fix accounting for runnable tasks (5/5)

To avoid accounting issues at startup, this patch disable the SchedTune
accounting until the required data structures have been properly
initialized.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/exit.c        |   5 ++
 kernel/sched/core.c  |  12 +++
 kernel/sched/fair.c  |  11 ++-
 kernel/sched/sched.h |   3 +
 kernel/sched/tune.c  | 178 ++++++++++++++++++++++++++++++++++++++-----
 kernel/sched/tune.h  |   6 ++
 6 files changed, 193 insertions(+), 22 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index ffba5df4abd5..62c4bd4abd3a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,6 +54,8 @@
 #include <linux/writeback.h>
 #include <linux/shm.h>
 
+#include "sched/tune.h"
+
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
@@ -699,6 +701,9 @@ void do_exit(long code)
 	}
 
 	exit_signals(tsk);  /* sets PF_EXITING */
+
+	schedtune_exit_task(tsk);
+
 	/*
 	 * tsk->flags are checked in the futex code to protect against
 	 * an exiting task cleaning up the robust pi futexes.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 07f389a0f22f..b814c13f850f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -287,6 +287,18 @@ int sysctl_sched_rt_runtime = 950000;
 /* cpus with isolated domains */
 cpumask_var_t cpu_isolated_map;
 
+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+	return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+	task_rq_unlock(rq, p, flags);
+}
+
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9c717c3be75d..736adab1a503 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4250,8 +4250,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		    cpu_overutilized(rq->cpu))
 			rq->rd->overutilized = true;
 
-		schedtune_enqueue_task(p, cpu_of(rq));
-
 		/*
 		 * We want to potentially trigger a freq switch
 		 * request only for tasks that are waking up; this is
@@ -4262,6 +4260,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (task_new || task_wakeup)
 			update_capacity_of(cpu_of(rq));
 	}
+
+	/* Update SchedTune accouting */
+	schedtune_enqueue_task(p, cpu_of(rq));
+
 #endif /* CONFIG_SMP */
 
 	hrtick_update(rq);
@@ -4327,7 +4329,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 #ifdef CONFIG_SMP
 
 	if (!se) {
-		schedtune_dequeue_task(p, cpu_of(rq));
 
 		/*
 		 * We want to potentially trigger a freq switch
@@ -4345,6 +4346,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		}
 	}
 
+	/* Update SchedTune accouting */
+	schedtune_dequeue_task(p, cpu_of(rq));
+
 #endif /* CONFIG_SMP */
 
 	hrtick_update(rq);
@@ -5625,7 +5629,6 @@ static inline int find_best_target(struct task_struct *p)
 		 * The target CPU can be already at a capacity level higher
 		 * than the one required to boost the task.
 		 */
-
 		if (new_util > capacity_orig_of(i))
 			continue;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cd947923e11..1b838cff2f20 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1701,6 +1701,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 
+extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
+extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
 
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index a691b8db2888..4c77cc23e65b 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -11,6 +11,10 @@
 #include "sched.h"
 #include "tune.h"
 
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+static bool schedtune_initialized = false;
+#endif
+
 unsigned int sysctl_sched_cfs_boost __read_mostly;
 
 extern struct target_nrg schedtune_target_nrg;
@@ -222,6 +226,8 @@ struct boost_groups {
 		/* Count of RUNNABLE tasks on that boost group */
 		unsigned tasks;
 	} group[BOOSTGROUPS_COUNT];
+	/* CPU's boost group locking */
+	raw_spinlock_t lock;
 };
 
 /* Boost groups affecting each CPU in the system */
@@ -298,28 +304,24 @@ schedtune_boostgroup_update(int idx, int boost)
 	return 0;
 }
 
+#define ENQUEUE_TASK  1
+#define DEQUEUE_TASK -1
+
 static inline void
 schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
 {
-	struct boost_groups *bg;
-	int tasks;
-
-	bg = &per_cpu(cpu_boost_groups, cpu);
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	int tasks = bg->group[idx].tasks + task_count;
 
 	/* Update boosted tasks count while avoiding to make it negative */
-	if (task_count < 0 && bg->group[idx].tasks <= -task_count)
-		bg->group[idx].tasks = 0;
-	else
-		bg->group[idx].tasks += task_count;
-
-	/* Boost group activation or deactivation on that RQ */
-	tasks = bg->group[idx].tasks;
-	if (tasks == 1 || tasks == 0)
-		schedtune_cpu_update(cpu);
+	bg->group[idx].tasks = max(0, tasks);
 
 	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
 			bg->group[idx].boost, bg->boost_max);
 
+	/* Boost group activation or deactivation on that RQ */
+	if (tasks == 1 || tasks == 0)
+		schedtune_cpu_update(cpu);
 }
 
 /*
@@ -327,9 +329,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
  */
 void schedtune_enqueue_task(struct task_struct *p, int cpu)
 {
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	unsigned long irq_flags;
 	struct schedtune *st;
 	int idx;
 
+	if (!unlikely(schedtune_initialized))
+		return;
+
 	/*
 	 * When a task is marked PF_EXITING by do_exit() it's going to be
 	 * dequeued and enqueued multiple times in the exit path.
@@ -339,13 +346,109 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
 	if (p->flags & PF_EXITING)
 		return;
 
-	/* Get task boost group */
+	/*
+	 * Boost group accouting is protected by a per-cpu lock and requires
+	 * interrupt to be disabled to avoid race conditions for example on
+	 * do_exit()::cgroup_exit() and task migration.
+	 */
+	raw_spin_lock_irqsave(&bg->lock, irq_flags);
 	rcu_read_lock();
+
 	st = task_schedtune(p);
 	idx = st->idx;
+
+	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
 	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
 
-	schedtune_tasks_update(p, cpu, idx, 1);
+int schedtune_allow_attach(struct cgroup_taskset *tset)
+{
+	/* We always allows tasks to be moved between existing CGroups */
+	return 0;
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
+	struct boost_groups *bg;
+	unsigned long irq_flags;
+	unsigned int cpu;
+	struct rq *rq;
+	int src_bg; /* Source boost group index */
+	int dst_bg; /* Destination boost group index */
+	int tasks;
+
+	if (!unlikely(schedtune_initialized))
+		return 0;
+
+
+	cgroup_taskset_for_each(task, css, tset) {
+
+		/*
+		 * Lock the CPU's RQ the task is enqueued to avoid race
+		 * conditions with migration code while the task is being
+		 * accounted
+		 */
+		rq = lock_rq_of(task, &irq_flags);
+
+		if (!task->on_rq) {
+			unlock_rq_of(rq, task, &irq_flags);
+			continue;
+		}
+
+		/*
+		 * Boost group accouting is protected by a per-cpu lock and requires
+		 * interrupt to be disabled to avoid race conditions on...
+		 */
+		cpu = cpu_of(rq);
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		raw_spin_lock(&bg->lock);
+
+		dst_bg = css_st(css)->idx;
+		src_bg = task_schedtune(task)->idx;
+
+		/*
+		 * Current task is not changing boostgroup, which can
+		 * happen when the new hierarchy is in use.
+		 */
+		if (unlikely(dst_bg == src_bg)) {
+			raw_spin_unlock(&bg->lock);
+			unlock_rq_of(rq, task, &irq_flags);
+			continue;
+		}
+
+		/*
+		 * This is the case of a RUNNABLE task which is switching its
+		 * current boost group.
+		 */
+
+		/* Move task from src to dst boost group */
+		tasks = bg->group[src_bg].tasks - 1;
+		bg->group[src_bg].tasks = max(0, tasks);
+		bg->group[dst_bg].tasks += 1;
+
+		raw_spin_unlock(&bg->lock);
+		unlock_rq_of(rq, task, &irq_flags);
+
+		/* Update CPU boost group */
+		if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+			schedtune_cpu_update(task_cpu(task));
+
+	}
+
+	return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+	/* This can happen only if SchedTune controller is mounted with
+	 * other hierarchies ane one of them fails. Since usually SchedTune is
+	 * mouted on its own hierarcy, for the time being we do not implement
+	 * a proper rollback mechanism */
+	WARN(1, "SchedTune cancel attach not implemented");
 }
 
 /*
@@ -353,26 +456,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
  */
 void schedtune_dequeue_task(struct task_struct *p, int cpu)
 {
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	unsigned long irq_flags;
 	struct schedtune *st;
 	int idx;
 
+	if (!unlikely(schedtune_initialized))
+		return;
+
 	/*
 	 * When a task is marked PF_EXITING by do_exit() it's going to be
 	 * dequeued and enqueued multiple times in the exit path.
 	 * Thus we avoid any further update, since we do not want to change
 	 * CPU boosting while the task is exiting.
-	 * The last dequeue will be done by cgroup exit() callback.
+	 * The last dequeue is already enforce by the do_exit() code path
+	 * via schedtune_exit_task().
 	 */
 	if (p->flags & PF_EXITING)
 		return;
 
-	/* Get task boost group */
+	/*
+	 * Boost group accouting is protected by a per-cpu lock and requires
+	 * interrupt to be disabled to avoid race conditions on...
+	 */
+	raw_spin_lock_irqsave(&bg->lock, irq_flags);
 	rcu_read_lock();
+
 	st = task_schedtune(p);
 	idx = st->idx;
+
+	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
 	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+void schedtune_exit_task(struct task_struct *tsk)
+{
+	struct schedtune *st;
+	unsigned long irq_flags;
+	unsigned int cpu;
+	struct rq *rq;
+	int idx;
+
+	if (!unlikely(schedtune_initialized))
+		return;
 
-	schedtune_tasks_update(p, cpu, idx, -1);
+	rq = lock_rq_of(tsk, &irq_flags);
+	rcu_read_lock();
+
+	cpu = cpu_of(rq);
+	st = task_schedtune(tsk);
+	idx = st->idx;
+	schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+	rcu_read_unlock();
+	unlock_rq_of(rq, tsk, &irq_flags);
 }
 
 int schedtune_cpu_boost(int cpu)
@@ -518,6 +657,9 @@ schedtune_css_free(struct cgroup_subsys_state *css)
 struct cgroup_subsys schedtune_cgrp_subsys = {
 	.css_alloc	= schedtune_css_alloc,
 	.css_free	= schedtune_css_free,
+//	.allow_attach   = schedtune_allow_attach,
+	.can_attach     = schedtune_can_attach,
+	.cancel_attach  = schedtune_cancel_attach,
 	.legacy_cftypes	= files,
 	.early_init	= 1,
 };
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
index 99637758a8af..be1785eb1c5b 100644
--- a/kernel/sched/tune.h
+++ b/kernel/sched/tune.h
@@ -17,6 +17,8 @@ struct target_nrg {
 int schedtune_cpu_boost(int cpu);
 int schedtune_task_boost(struct task_struct *tsk);
 
+void schedtune_exit_task(struct task_struct *tsk);
+
 void schedtune_enqueue_task(struct task_struct *p, int cpu);
 void schedtune_dequeue_task(struct task_struct *p, int cpu);
 
@@ -25,6 +27,8 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu);
 #define schedtune_cpu_boost(cpu)  get_sysctl_sched_cfs_boost()
 #define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
 
+#define schedtune_exit_task(task) do { } while (0)
+
 #define schedtune_enqueue_task(task, cpu) do { } while (0)
 #define schedtune_dequeue_task(task, cpu) do { } while (0)
 
@@ -39,6 +43,8 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta,
 #define schedtune_cpu_boost(cpu)  0
 #define schedtune_task_boost(tsk) 0
 
+#define schedtune_exit_task(task) do { } while (0)
+
 #define schedtune_enqueue_task(task, cpu) do { } while (0)
 #define schedtune_dequeue_task(task, cpu) do { } while (0)
 
-- 
2.34.1