From 1c4d0672bebacc1f362cfe9c0cef9755dbd77a8a Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann
Date: Fri, 13 Jan 2017 17:54:34 +0000
Subject: [PATCH] sched/fair: Change cpu iteration order in find_best_target()

The schedtune task parameter 'boosted' is mapped into the cpu iteration
order. Currently, for 'boosted' equal true the iteration starts at the
last cpu (NR_CPUS-1), whereas for 'boosted' equal false it starts at the
first cpu (0).

This only has the desired effect if the cpu topology ordering matches
the underlying assumption. This is the case, e.g., for the Qc Snapdragon
821 with its [L0 L1 b0 b1] cpu topology layout (L=lower max freq,
b=higher max freq), where the cpus with the higher maximum capacity get
the highest logical cpu ids. However, not all big.LITTLE systems
enumerate their cpus in the same way. For example, the ARM Versatile
Express Juno board has 6 cpus for which the default configuration has
the topology [L0 b0 b1 L1 L2 L3].

To make this approach independent of the cpu topology layout,
find_best_target() now iterates over the cpus in the order of the
sched_groups of the EAS sched_domain (sd_ea). The order of cpu
iteration differs between the cpu types, depending on which cpu is
used to dereference sd_ea.

Considering the Qc Snapdragon 821 again, for cpu L0 and L1 the order is
'L0->L1->b0->b1', whereas for b0 and b1 the order is 'b0->b1->L0->L1'.

This approach does not provide exactly the same iteration order as the
current flat iteration over [0 .. NR_CPUS-1], but the cpus are still
ordered by original cpu capacity.

The cpu iteration is now done in the sd_ea sched_group order required
by the 'boosted' value ['L0->L1->b0->b1'/'b0->b1->L0->L1'] rather than
forward/backward over the flat cpu space ['L0->L1->b0->b1'/
'b1->b0->L1->L0'].

Change-Id: I8fbe2073dedd2ecb1c750620c6000c11a5ff4358
Signed-off-by: Dietmar Eggemann
(cherry picked from commit a0c6a4272c3968c0ff50d3fed65f5865b72d777b)
Signed-off-by: Chris Redpath
---
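
Note (kept below the '---' marker, so it does not enter the git history):
the following is a minimal, self-contained userspace sketch of the new
iteration pattern, for illustration only. It is not kernel code: the
two-group layout, the cpu names and the capacity values are assumptions
modelled on the Snapdragon 821 example above, and the hypothetical
start_group() helper merely stands in for start_cpu() plus the sd_ea
lookup. It shows how walking the circular group list from the group of
the max- or min-capacity cpu yields 'b0->b1->L0->L1' for boosted tasks
and 'L0->L1->b0->b1' otherwise.

/*
 * Illustration only -- plain userspace C, not kernel code.
 * Two "sched groups" on a circular list, walked the same way
 * find_best_target() walks sd_ea->groups after this patch.
 */
#include <stdio.h>

struct group {
	const char *cpus[4];	/* cpu names in this group */
	int nr_cpus;
	int max_cap;		/* assumed original capacity of these cpus */
	struct group *next;	/* circular list, like sched_group::next */
};

/*
 * Stand-in for start_cpu()/sd_ea: boosted tasks start at the
 * highest-capacity group, non-boosted tasks at the lowest.
 */
static struct group *start_group(struct group *a, struct group *b, int boosted)
{
	struct group *hi = (a->max_cap >= b->max_cap) ? a : b;
	struct group *lo = (a->max_cap >= b->max_cap) ? b : a;

	return boosted ? hi : lo;
}

int main(void)
{
	/* Assumed Snapdragon-821-like layout: [L0 L1 b0 b1] */
	struct group little = { { "L0", "L1" }, 2, 512, NULL };
	struct group big    = { { "b0", "b1" }, 2, 1024, NULL };
	int boosted;

	little.next = &big;
	big.next = &little;

	for (boosted = 0; boosted <= 1; boosted++) {
		struct group *start = start_group(&little, &big, boosted);
		struct group *sg = start;
		int i;

		printf("boosted=%d:", boosted);
		/* Same walk as the patch: do { ... } while (sg = sg->next, sg != start); */
		do {
			for (i = 0; i < sg->nr_cpus; i++)
				printf(" %s", sg->cpus[i]);
		} while (sg = sg->next, sg != start);
		printf("\n");
	}

	return 0;
}

Built with a plain 'cc', this prints 'boosted=0: L0 L1 b0 b1' and
'boosted=1: b0 b1 L0 L1', matching the two sd_ea iteration orders quoted
in the commit message.
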
 kernel/sched/fair.c | 169 +++++++++++++++++++++++++-------------------
 1 file changed, 97 insertions(+), 72 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 770d0ab957cc..d9c732de3295 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5743,100 +5743,125 @@ done:
 	return target;
 }
 
+static int start_cpu(bool boosted)
+{
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+
+	RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
+
+	return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+}
+
 static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
 {
-	int iter_cpu;
 	int target_cpu = -1;
 	int target_util = 0;
 	int backup_capacity = 0;
 	int best_idle_cpu = -1;
 	int best_idle_cstate = INT_MAX;
 	int backup_cpu = -1;
-	unsigned long task_util_boosted, new_util;
+	unsigned long min_util;
+	unsigned long new_util;
+	struct sched_domain *sd;
+	struct sched_group *sg;
+	int cpu = start_cpu(boosted);
 
-	task_util_boosted = boosted_task_util(p);
-	for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
-		int cur_capacity;
-		struct rq *rq;
-		int idle_idx;
+	if (cpu < 0)
+		return target_cpu;
 
-		/*
-		 * Iterate from higher cpus for boosted tasks.
-		 */
-		int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+	sd = rcu_dereference(per_cpu(sd_ea, cpu));
 
-		if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
-			continue;
+	if (!sd)
+		return target_cpu;
 
-		/*
-		 * p's blocked utilization is still accounted for on prev_cpu
-		 * so prev_cpu will receive a negative bias due to the double
-		 * accounting. However, the blocked utilization may be zero.
-		 */
-		new_util = cpu_util(i) + task_util_boosted;
+	sg = sd->groups;
 
-		/*
-		 * Ensure minimum capacity to grant the required boost.
-		 * The target CPU can be already at a capacity level higher
-		 * than the one required to boost the task.
-		 */
-		if (new_util > capacity_orig_of(i))
-			continue;
+	min_util = boosted_task_util(p);
+
+	do {
+		int i;
+
+		for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+			int cur_capacity;
+			struct rq *rq;
+			int idle_idx;
+
+			if (!cpu_online(i))
+				continue;
+
+			/*
+			 * p's blocked utilization is still accounted for on prev_cpu
+			 * so prev_cpu will receive a negative bias due to the double
+			 * accounting. However, the blocked utilization may be zero.
+			 */
+			new_util = cpu_util(i) + task_util(p);
+
+			/*
+			 * Ensure minimum capacity to grant the required boost.
+			 * The target CPU can be already at a capacity level higher
+			 * than the one required to boost the task.
+			 */
+			new_util = max(min_util, new_util);
+			if (new_util > capacity_orig_of(i))
+				continue;
 
 #ifdef CONFIG_SCHED_WALT
-		if (walt_cpu_high_irqload(i))
-			continue;
+			if (walt_cpu_high_irqload(i))
+				continue;
 #endif
-		/*
-		 * Unconditionally favoring tasks that prefer idle cpus to
-		 * improve latency.
-		 */
-		if (idle_cpu(i) && prefer_idle) {
-			if (best_idle_cpu < 0)
-				best_idle_cpu = i;
-			continue;
-		}
-		cur_capacity = capacity_curr_of(i);
-		rq = cpu_rq(i);
-		idle_idx = idle_get_state_idx(rq);
-
-		if (new_util < cur_capacity) {
-			if (cpu_rq(i)->nr_running) {
-				if (prefer_idle) {
-					/* Find a target cpu with highest
-					 * utilization.
-					 */
-					if (target_util == 0 ||
-						target_util < new_util) {
-						target_cpu = i;
-						target_util = new_util;
+			/*
+			 * Unconditionally favoring tasks that prefer idle cpus to
+			 * improve latency.
+			 */
+			if (idle_cpu(i) && prefer_idle) {
+				if (best_idle_cpu < 0)
+					best_idle_cpu = i;
+				continue;
+			}
+
+			cur_capacity = capacity_curr_of(i);
+			rq = cpu_rq(i);
+			idle_idx = idle_get_state_idx(rq);
+
+			if (new_util < cur_capacity) {
+				if (cpu_rq(i)->nr_running) {
+					if (!prefer_idle) {
+						/* Find a target cpu with highest
						 * utilization.
+						 */
+						if (target_util == 0 ||
+						    target_util < new_util) {
+							target_cpu = i;
+							target_util = new_util;
+						}
+					} else {
+						/* Find a target cpu with lowest
+						 * utilization.
+						 */
+						if (target_util == 0 ||
+						    target_util > new_util) {
+							target_cpu = i;
+							target_util = new_util;
+						}
 					}
-				} else {
-					/* Find a target cpu with lowest
-					 * utilization.
-					 */
-					if (target_util == 0 ||
-						target_util > new_util) {
-						target_cpu = i;
-						target_util = new_util;
+				} else if (!prefer_idle) {
+					if (best_idle_cpu < 0 ||
+					    (sysctl_sched_cstate_aware &&
+					     best_idle_cstate > idle_idx)) {
+						best_idle_cstate = idle_idx;
+						best_idle_cpu = i;
+					}
 					}
-				}
-			} else if (!prefer_idle) {
-				if (best_idle_cpu < 0 ||
-					(sysctl_sched_cstate_aware &&
-						best_idle_cstate > idle_idx)) {
-					best_idle_cstate = idle_idx;
-					best_idle_cpu = i;
-				}
+			} else if (backup_capacity == 0 ||
+				   backup_capacity > cur_capacity) {
+				// Find a backup cpu with least capacity.
+				backup_capacity = cur_capacity;
+				backup_cpu = i;
 			}
-		} else if (backup_capacity == 0 ||
-				backup_capacity > cur_capacity) {
-			// Find a backup cpu with least capacity.
-			backup_capacity = cur_capacity;
-			backup_cpu = i;
 		}
-	}
+	} while (sg = sg->next, sg != sd->groups);
 
 	if (prefer_idle && best_idle_cpu >= 0)
 		target_cpu = best_idle_cpu;
-- 
2.34.1