sched/numa: Favour moving tasks towards the preferred node
author Mel Gorman <mgorman@suse.de>
Mon, 7 Oct 2013 10:29:00 +0000 (11:29 +0100)
committer Ingo Molnar <mingo@kernel.org>
Wed, 9 Oct 2013 10:40:26 +0000 (12:40 +0200)
This patch favours moving tasks towards the NUMA node that recorded a higher
number of NUMA faults during active load balancing.  Ideally this is
self-reinforcing as the longer the task runs on that node, the more faults
it should incur, causing task_numa_placement to keep the task running on that
node. In reality a big weakness is that the node's CPUs can be overloaded
and it would be more efficient to queue tasks on an idle node and migrate
to the new node. This would require additional smarts in the balancer so
for now the balancer will simply prefer to place the task on the preferred
node for a number of PTE scans, which is controlled by the
numa_balancing_settle_count sysctl. Once the settle_count number of scans
has completed, the scheduler is free to place the task on an alternative
node if the load is imbalanced.

[srikar@linux.vnet.ibm.com: Fixed statistics]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
[ Tunable and use higher faults instead of preferred. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-23-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Documentation/sysctl/kernel.txt
include/linux/sched.h
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sysctl.c

index 8cd7e5fc79da96c6534681a8ae28cc1bf1cc1fd5..d48bca45b6f2bf0e415f0a56b4b84b802cfcd948 100644 (file)
@@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_reset,
-numa_balancing_scan_period_max_ms and numa_balancing_scan_size_mb sysctls.
+numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb and
+numa_balancing_settle_count sysctls.
 
 ==============================================================
 
@@ -420,6 +421,11 @@ scanned for a given scan.
 numa_balancing_scan_period_reset is a blunt instrument that controls how
 often a tasks scan delay is reset to detect sudden changes in task behaviour.
 
+numa_balancing_settle_count is how many scan periods must complete before
+the scheduler balancer stops pushing the task towards a preferred node. This
+gives the scheduler a chance to place the task on an alternative node if the
+preferred node is overloaded.
+
 ==============================================================
 
 osrelease, ostype & version:
index a463bc3ad4377ad69b3b58aedcfb4bd64038855b..aecdc5a1877341e355ebe19633f2a8173a2fd75c 100644 (file)
@@ -777,6 +777,7 @@ enum cpu_idle_type {
 #define SD_ASYM_PACKING                0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING      0x1000  /* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP             0x2000  /* sched_domains of this level overlap */
+#define SD_NUMA                        0x4000  /* cross-node balancing */
 
 extern int __weak arch_sd_sibiling_asym_packing(void);
 
index 064a0af44540ec58ad60d5d82ebe51f29a26c73f..b7e6b6f9c5f6b888c93e38f735c969ae6d7ead19 100644 (file)
@@ -1631,7 +1631,7 @@ static void __sched_fork(struct task_struct *p)
 
        p->node_stamp = 0ULL;
        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+       p->numa_migrate_seq = 0;
        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
        p->numa_preferred_nid = -1;
        p->numa_work.next = &p->numa_work;
@@ -5656,6 +5656,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                        | sd_local_flags(level)
                                        ,
                .last_balance           = jiffies,
index 3abc651bc38a2bdd156d0dec3e8bfc9be007d2ee..6ffddca687feafb005250200857212c9a48f2d8c 100644 (file)
@@ -877,6 +877,15 @@ static unsigned int task_scan_max(struct task_struct *p)
        return max(smin, smax);
 }
 
+/*
+ * Once a preferred node is selected the scheduler balancer will prefer moving
+ * a task to that node for sysctl_numa_balancing_settle_count number of PTE
+ * scans. This will give the process the chance to accumulate more faults on
+ * the preferred node but still allow the scheduler to move the task again if
+ * the node's CPUs are overloaded.
+ */
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
+
 static void task_numa_placement(struct task_struct *p)
 {
        int seq, nid, max_nid = -1;
@@ -888,6 +897,7 @@ static void task_numa_placement(struct task_struct *p)
        if (p->numa_scan_seq == seq)
                return;
        p->numa_scan_seq = seq;
+       p->numa_migrate_seq++;
        p->numa_scan_period_max = task_scan_max(p);
 
        /* Find the node with the highest number of faults */
@@ -907,8 +917,10 @@ static void task_numa_placement(struct task_struct *p)
        }
 
        /* Update the tasks preferred node if necessary */
-       if (max_faults && max_nid != p->numa_preferred_nid)
+       if (max_faults && max_nid != p->numa_preferred_nid) {
                p->numa_preferred_nid = max_nid;
+               p->numa_migrate_seq = 0;
+       }
 }
 
 /*
@@ -4071,6 +4083,38 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        return delta < (s64)sysctl_sched_migration_cost;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+           !(env->sd->flags & SD_NUMA)) {
+               return false;
+       }
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid ||
+           p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+               return false;
+
+       if (dst_nid == p->numa_preferred_nid ||
+           p->numa_faults[dst_nid] > p->numa_faults[src_nid])
+               return true;
+
+       return false;
+}
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false;
+}
+#endif
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -4128,11 +4172,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
        /*
         * Aggressive migration if:
-        * 1) task is cache cold, or
-        * 2) too many balance attempts have failed.
+        * 1) destination numa is preferred
+        * 2) task is cache cold, or
+        * 3) too many balance attempts have failed.
         */
-
        tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+
+       if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+               if (tsk_cache_hot) {
+                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+               }
+#endif
+               return 1;
+       }
+
        if (!tsk_cache_hot ||
                env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 
index cba5c616a15731ddbd2e980b3a9e8bb7c1edcd1d..d9278ce2c4b458d765871814c122c08b3c9faa8c 100644 (file)
@@ -67,4 +67,11 @@ SCHED_FEAT(LB_MIN, false)
  */
 #ifdef CONFIG_NUMA_BALANCING
 SCHED_FEAT(NUMA,       false)
+
+/*
+ * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
+ * higher number of hinting faults are recorded during active load
+ * balancing.
+ */
+SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
 #endif
index b2f06f3c6a3ff32ec9f8f2b92bea0c4766ac6f7d..42f616a74f40d8ce857d284262fdeefc12188d63 100644 (file)
@@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "numa_balancing_settle_count",
+               .data           = &sysctl_numa_balancing_settle_count,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
        {