sched/fair: Clean up the __clear_buddies_*() functions

[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 966cc2bfcb77586d2ce9c55986aae2a3f2dce521..846172107ba5939153d7c129dd6a4683412343d5 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
  
  /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
  is_same_group(struct sched_entity *se, struct sched_entity *pse)
  {
         if (se->cfs_rq == pse->cfs_rq)
-               return 1;
+               return se->cfs_rq;
  
-       return 0;
+       return NULL;
  }
  
  static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return se->parent;
  }
  
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-       int depth = 0;
-
-       for_each_sched_entity(se)
-               depth++;
-
-       return depth;
-}
-
  static void
  find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
          */
  
         /* First walk up until both entities are at same depth */
-       se_depth = depth_se(*se);
-       pse_depth = depth_se(*pse);
+       se_depth = (*se)->depth;
+       pse_depth = (*pse)->depth;
  
         while (se_depth > pse_depth) {
                 se_depth--;
@@ -426,10 +415,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
  
-static inline int
+static inline struct cfs_rq *
  is_same_group(struct sched_entity *se, struct sched_entity *pse)
  {
-       return 1;
+       return cfs_rq_of(se); /* always the same rq */
  }
  
  static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -819,14 +808,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
  /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
  unsigned int sysctl_numa_balancing_scan_delay = 1000;
  
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
  static unsigned int task_nr_scan_windows(struct task_struct *p)
  {
         unsigned long rss = 0;
@@ -893,10 +874,26 @@ struct numa_group {
         struct list_head task_list;
  
         struct rcu_head rcu;
+       nodemask_t active_nodes;
         unsigned long total_faults;
+       /*
+        * Faults_cpu is used to decide whether memory should move
+        * towards the CPU. As a consequence, these stats are weighted
+        * more by CPU use than by memory faults.
+        */
+       unsigned long *faults_cpu;
         unsigned long faults[0];
  };
  
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
  pid_t task_numa_group_id(struct task_struct *p)
  {
         return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +901,16 @@ pid_t task_numa_group_id(struct task_struct *p)
  
  static inline int task_faults_idx(int nid, int priv)
  {
-       return 2 * nid + priv;
+       return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
  }
  
  static inline unsigned long task_faults(struct task_struct *p, int nid)
  {
-       if (!p->numa_faults)
+       if (!p->numa_faults_memory)
                 return 0;
  
-       return p->numa_faults[task_faults_idx(nid, 0)] +
-               p->numa_faults[task_faults_idx(nid, 1)];
+       return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+               p->numa_faults_memory[task_faults_idx(nid, 1)];
  }
  
  static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +922,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
                 p->numa_group->faults[task_faults_idx(nid, 1)];
  }
  
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+       return group->faults_cpu[task_faults_idx(nid, 0)] +
+               group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
  /*
   * These return the fraction of accesses done by a particular task, or
   * task group, on a particular numa node.  The group weight is given a
@@ -935,7 +938,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
  {
         unsigned long total_faults;
  
-       if (!p->numa_faults)
+       if (!p->numa_faults_memory)
                 return 0;
  
         total_faults = p->total_numa_faults;
@@ -954,6 +957,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
         return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
  }
  
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+                               int src_nid, int dst_cpu)
+{
+       struct numa_group *ng = p->numa_group;
+       int dst_nid = cpu_to_node(dst_cpu);
+       int last_cpupid, this_cpupid;
+
+       this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+       /*
+        * Multi-stage node selection is used in conjunction with a periodic
+        * migration fault to build a temporal task<->page relation. By using
+        * a two-stage filter we remove short/unlikely relations.
+        *
+        * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+        * a task's usage of a particular page (n_p) per total usage of this
+        * page (n_t) (in a given time-span) to a probability.
+        *
+        * Our periodic faults will sample this probability and getting the
+        * same result twice in a row, given these samples are fully
+        * independent, is then given by P(n)^2, provided our sample period
+        * is sufficiently short compared to the usage pattern.
+        *
+        * This quadric squishes small probabilities, making it less likely we
+        * act on an unlikely task<->page relation.
+        */
+       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+       if (!cpupid_pid_unset(last_cpupid) &&
+                               cpupid_to_nid(last_cpupid) != dst_nid)
+               return false;
+
+       /* Always allow migrate on private faults */
+       if (cpupid_match_pid(p, last_cpupid))
+               return true;
+
+       /* A shared fault, but p->numa_group has not been set up yet. */
+       if (!ng)
+               return true;
+
+       /*
+        * Do not migrate if the destination is not a node that
+        * is actively used by this numa group.
+        */
+       if (!node_isset(dst_nid, ng->active_nodes))
+               return false;
+
+       /*
+        * Source is a node that is not actively used by this
+        * numa group, while the destination is. Migrate.
+        */
+       if (!node_isset(src_nid, ng->active_nodes))
+               return true;
+
+       /*
+        * Both source and destination are nodes in active
+        * use by this numa group. Maximize memory bandwidth
+        * by migrating from more heavily used groups, to less
+        * heavily used ones, spreading the load around.
+        * Use a 1/4 hysteresis to avoid spurious page movement.
+        */
+       return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
  static unsigned long weighted_cpuload(const int cpu);
  static unsigned long source_load(int cpu, int type);
  static unsigned long target_load(int cpu, int type);
@@ -1267,7 +1333,7 @@ static int task_numa_migrate(struct task_struct *p)
  static void numa_migrate_preferred(struct task_struct *p)
  {
         /* This task has no NUMA fault statistics yet */
-       if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+       if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
                 return;
  
         /* Periodically retry migrating the task to the preferred node */
@@ -1281,6 +1347,38 @@ static void numa_migrate_preferred(struct task_struct *p)
         task_numa_migrate(p);
  }
  
+/*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations. To prevent flip-flopping, and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+       unsigned long faults, max_faults = 0;
+       int nid;
+
+       for_each_online_node(nid) {
+               faults = group_faults_cpu(numa_group, nid);
+               if (faults > max_faults)
+                       max_faults = faults;
+       }
+
+       for_each_online_node(nid) {
+               faults = group_faults_cpu(numa_group, nid);
+               if (!node_isset(nid, numa_group->active_nodes)) {
+                       if (faults > max_faults * 6 / 16)
+                               node_set(nid, numa_group->active_nodes);
+               } else if (faults < max_faults * 3 / 16)
+                       node_clear(nid, numa_group->active_nodes);
+       }
+}
+
  /*
   * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
   * increments. The more local the fault statistics are, the higher the scan
@@ -1355,11 +1453,41 @@ static void update_task_scan_period(struct task_struct *p,
         memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
  }
  
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+       u64 runtime, delta, now;
+       /* Use the start of this time slice to avoid calculations. */
+       now = p->se.exec_start;
+       runtime = p->se.sum_exec_runtime;
+
+       if (p->last_task_numa_placement) {
+               delta = runtime - p->last_sum_exec_runtime;
+               *period = now - p->last_task_numa_placement;
+       } else {
+               delta = p->se.avg.runnable_avg_sum;
+               *period = p->se.avg.runnable_avg_period;
+       }
+
+       p->last_sum_exec_runtime = runtime;
+       p->last_task_numa_placement = now;
+
+       return delta;
+}
+
  static void task_numa_placement(struct task_struct *p)
  {
         int seq, nid, max_nid = -1, max_group_nid = -1;
         unsigned long max_faults = 0, max_group_faults = 0;
         unsigned long fault_types[2] = { 0, 0 };
+       unsigned long total_faults;
+       u64 runtime, period;
         spinlock_t *group_lock = NULL;
  
         seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1496,10 @@ static void task_numa_placement(struct task_struct *p)
         p->numa_scan_seq = seq;
         p->numa_scan_period_max = task_scan_max(p);
  
+       total_faults = p->numa_faults_locality[0] +
+                      p->numa_faults_locality[1];
+       runtime = numa_get_avg_runtime(p, &period);
+
         /* If the task is part of a group prevent parallel updates to group stats */
         if (p->numa_group) {
                 group_lock = &p->numa_group->lock;
@@ -1379,24 +1511,37 @@ static void task_numa_placement(struct task_struct *p)
                 unsigned long faults = 0, group_faults = 0;
                 int priv, i;
  
-               for (priv = 0; priv < 2; priv++) {
-                       long diff;
+               for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+                       long diff, f_diff, f_weight;
  
                         i = task_faults_idx(nid, priv);
-                       diff = -p->numa_faults[i];
  
                         /* Decay existing window, copy faults since last scan */
-                       p->numa_faults[i] >>= 1;
-                       p->numa_faults[i] += p->numa_faults_buffer[i];
-                       fault_types[priv] += p->numa_faults_buffer[i];
-                       p->numa_faults_buffer[i] = 0;
+                       diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+                       fault_types[priv] += p->numa_faults_buffer_memory[i];
+                       p->numa_faults_buffer_memory[i] = 0;
  
-                       faults += p->numa_faults[i];
-                       diff += p->numa_faults[i];
+                       /*
+                        * Normalize the faults_cpu, so all tasks in a group
+                        * count according to CPU use, instead of by the raw
+                        * number of faults. Tasks with little runtime have
+                        * little over-all impact on throughput, and thus their
+                        * faults are less important.
+                        */
+                       f_weight = div64_u64(runtime << 16, period + 1);
+                       f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+                                  (total_faults + 1);
+                       f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+                       p->numa_faults_buffer_cpu[i] = 0;
+
+                       p->numa_faults_memory[i] += diff;
+                       p->numa_faults_cpu[i] += f_diff;
+                       faults += p->numa_faults_memory[i];
                         p->total_numa_faults += diff;
                         if (p->numa_group) {
                                 /* safe because we can only change our own group */
                                 p->numa_group->faults[i] += diff;
+                               p->numa_group->faults_cpu[i] += f_diff;
                                 p->numa_group->total_faults += diff;
                                 group_faults += p->numa_group->faults[i];
                         }
@@ -1416,6 +1561,7 @@ static void task_numa_placement(struct task_struct *p)
         update_task_scan_period(p, fault_types[0], fault_types[1]);
  
         if (p->numa_group) {
+               update_numa_active_node_mask(p->numa_group);
                 /*
                  * If the preferred task and group nids are different,
                  * iterate over the nodes again to find the best place.
@@ -1465,7 +1611,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
  
         if (unlikely(!p->numa_group)) {
                 unsigned int size = sizeof(struct numa_group) +
-                                   2*nr_node_ids*sizeof(unsigned long);
+                                   4*nr_node_ids*sizeof(unsigned long);
  
                 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
                 if (!grp)
@@ -1475,9 +1621,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
                 spin_lock_init(&grp->lock);
                 INIT_LIST_HEAD(&grp->task_list);
                 grp->gid = p->pid;
+               /* Second half of the array tracks nids where faults happen */
+               grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+                                               nr_node_ids;
  
-               for (i = 0; i < 2*nr_node_ids; i++)
-                       grp->faults[i] = p->numa_faults[i];
+               node_set(task_node(current), grp->active_nodes);
+
+               for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+                       grp->faults[i] = p->numa_faults_memory[i];
  
                 grp->total_faults = p->total_numa_faults;
  
@@ -1534,9 +1685,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
  
         double_lock(&my_grp->lock, &grp->lock);
  
-       for (i = 0; i < 2*nr_node_ids; i++) {
-               my_grp->faults[i] -= p->numa_faults[i];
-               grp->faults[i] += p->numa_faults[i];
+       for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+               my_grp->faults[i] -= p->numa_faults_memory[i];
+               grp->faults[i] += p->numa_faults_memory[i];
         }
         my_grp->total_faults -= p->total_numa_faults;
         grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1713,12 @@ void task_numa_free(struct task_struct *p)
  {
         struct numa_group *grp = p->numa_group;
         int i;
-       void *numa_faults = p->numa_faults;
+       void *numa_faults = p->numa_faults_memory;
  
         if (grp) {
                 spin_lock(&grp->lock);
-               for (i = 0; i < 2*nr_node_ids; i++)
-                       grp->faults[i] -= p->numa_faults[i];
+               for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+                       grp->faults[i] -= p->numa_faults_memory[i];
                 grp->total_faults -= p->total_numa_faults;
  
                 list_del(&p->numa_entry);
@@ -1577,18 +1728,21 @@ void task_numa_free(struct task_struct *p)
                 put_numa_group(grp);
         }
  
-       p->numa_faults = NULL;
-       p->numa_faults_buffer = NULL;
+       p->numa_faults_memory = NULL;
+       p->numa_faults_buffer_memory = NULL;
+       p->numa_faults_cpu= NULL;
+       p->numa_faults_buffer_cpu = NULL;
         kfree(numa_faults);
  }
  
  /*
   * Got a PROT_NONE fault for a page on @node.
   */
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
  {
         struct task_struct *p = current;
         bool migrated = flags & TNF_MIGRATED;
+       int cpu_node = task_node(current);
         int priv;
  
         if (!numabalancing_enabled)
@@ -1603,16 +1757,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
                 return;
  
         /* Allocate buffer to track faults on a per-node basis */
-       if (unlikely(!p->numa_faults)) {
-               int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+       if (unlikely(!p->numa_faults_memory)) {
+               int size = sizeof(*p->numa_faults_memory) *
+                          NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
  
-               /* numa_faults and numa_faults_buffer share the allocation */
-               p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
-               if (!p->numa_faults)
+               p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+               if (!p->numa_faults_memory)
                         return;
  
-               BUG_ON(p->numa_faults_buffer);
-               p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+               BUG_ON(p->numa_faults_buffer_memory);
+               /*
+                * The averaged statistics, shared & private, memory & cpu,
+                * occupy the first half of the array. The second half of the
+                * array is for current counters, which are averaged into the
+                * first set by task_numa_placement.
+                */
+               p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
+               p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
+               p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
                 p->total_numa_faults = 0;
                 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
         }
@@ -1641,7 +1803,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
         if (migrated)
                 p->numa_pages_migrated += pages;
  
-       p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+       p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
+       p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
         p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
  }
  
@@ -2576,10 +2739,10 @@ static void __clear_buddies_last(struct sched_entity *se)
  {
         for_each_sched_entity(se) {
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
-               if (cfs_rq->last == se)
-                       cfs_rq->last = NULL;
-               else
+               if (cfs_rq->last != se)
                         break;
+
+               cfs_rq->last = NULL;
         }
  }
  
@@ -2587,10 +2750,10 @@ static void __clear_buddies_next(struct sched_entity *se)
  {
         for_each_sched_entity(se) {
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
-               if (cfs_rq->next == se)
-                       cfs_rq->next = NULL;
-               else
+               if (cfs_rq->next != se)
                         break;
+
+               cfs_rq->next = NULL;
         }
  }
  
@@ -2598,10 +2761,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
  {
         for_each_sched_entity(se) {
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
-               if (cfs_rq->skip == se)
-                       cfs_rq->skip = NULL;
-               else
+               if (cfs_rq->skip != se)
                         break;
+
+               cfs_rq->skip = NULL;
         }
  }
  
@@ -4492,7 +4655,8 @@ preempt:
                 set_last_buddy(se);
  }
  
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
  {
         struct task_struct *p;
         struct cfs_rq *cfs_rq = &rq->cfs;
@@ -4501,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
         if (!cfs_rq->nr_running)
                 return NULL;
  
+       if (prev)
+               prev->sched_class->put_prev_task(rq, prev);
+
         do {
                 se = pick_next_entity(cfs_rq);
                 set_next_entity(cfs_rq, se);
@@ -4783,7 +4950,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
  {
         int src_nid, dst_nid;
  
-       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
             !(env->sd->flags & SD_NUMA)) {
                 return false;
         }
@@ -4814,7 +4981,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
         if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
                 return false;
  
-       if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+       if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
                 return false;
  
         src_nid = cpu_to_node(env->src_cpu);
@@ -6357,17 +6524,16 @@ out:
   * idle_balance is called by schedule() if this_cpu is about to become
   * idle. Attempts to pull tasks from other CPUs.
   */
-void idle_balance(int this_cpu, struct rq *this_rq)
+int idle_balance(struct rq *this_rq)
  {
         struct sched_domain *sd;
         int pulled_task = 0;
         unsigned long next_balance = jiffies + HZ;
         u64 curr_cost = 0;
-
-       this_rq->idle_stamp = rq_clock(this_rq);
+       int this_cpu = this_rq->cpu;
  
         if (this_rq->avg_idle < sysctl_sched_migration_cost)
-               return;
+               return 0;
  
         /*
          * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6405,15 +6571,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
                         next_balance = sd->last_balance + interval;
-               if (pulled_task) {
-                       this_rq->idle_stamp = 0;
+               if (pulled_task)
                         break;
-               }
         }
         rcu_read_unlock();
  
         raw_spin_lock(&this_rq->lock);
  
+       /*
+        * While browsing the domains, we released the rq lock.
+        * A task could have been enqueued in the meantime
+        */
+       if (this_rq->nr_running && !pulled_task)
+               return 1;
+
         if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
                 /*
                  * We are going idle. next_balance may be set based on
@@ -6424,6 +6595,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
  
         if (curr_cost > this_rq->max_idle_balance_cost)
                 this_rq->max_idle_balance_cost = curr_cost;
+
+       return pulled_task;
  }
  
  /*
@@ -7082,7 +7255,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void task_move_group_fair(struct task_struct *p, int on_rq)
  {
+       struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq;
+
         /*
          * If the task was not on the rq at the time of this cgroup movement
          * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7108,23 +7283,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
          * To prevent boost or penalty in the new cfs_rq caused by delta
          * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
          */
-       if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+       if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
                 on_rq = 1;
  
         if (!on_rq)
-               p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+               se->vruntime -= cfs_rq_of(se)->min_vruntime;
         set_task_rq(p, task_cpu(p));
+       se->depth = se->parent ? se->parent->depth + 1 : 0;
         if (!on_rq) {
-               cfs_rq = cfs_rq_of(&p->se);
-               p->se.vruntime += cfs_rq->min_vruntime;
+               cfs_rq = cfs_rq_of(se);
+               se->vruntime += cfs_rq->min_vruntime;
  #ifdef CONFIG_SMP
                 /*
                  * migrate_task_rq_fair() will have removed our previous
                  * contribution, but we must synchronize for ongoing future
                  * decay.
                  */
-               p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
-               cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+               se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+               cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
  #endif
         }
  }
@@ -7220,10 +7396,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
         if (!se)
                 return;
  
-       if (!parent)
+       if (!parent) {
                 se->cfs_rq = &rq->cfs;
-       else
+               se->depth = 0;
+       } else {
                 se->cfs_rq = parent->my_q;
+               se->depth = parent->depth + 1;
+       }
  
         se->my_q = cfs_rq;
         /* guarantee group entities always have weight */