mm: numa: Add fault driven placement and migration
author	Peter Zijlstra <a.p.zijlstra@chello.nl>
	Thu, 25 Oct 2012 12:16:43 +0000 (14:16 +0200)
committer	Mel Gorman <mgorman@suse.de>
	Tue, 11 Dec 2012 14:42:45 +0000 (14:42 +0000)
NOTE: This patch is based on "sched, numa, mm: Add fault driven
placement and migration policy" but, as it throws away all of the
policy and leaves only a basic foundation, I had to drop the
Signed-off-bys.

This patch creates a bare-bones method for setting PTEs pte_numa in the
context of the scheduler so that, when such a PTE is faulted later, the
fault is attributed to the node the faulting CPU is running on.  In
itself this does nothing useful, but any placement policy will
fundamentally depend on receiving these placement hints from fault
context and doing something intelligent about them.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
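
For illustration only (not part of the patch): a minimal user-space
analogue of the mechanism, where mprotect(PROT_NONE) plays the role of
change_prot_numa() marking PTEs and the SIGSEGV handler plays the role
of the NUMA hinting fault that calls task_numa_fault() with the node
the faulting CPU is running on.  getcpu() needs _GNU_SOURCE and a
reasonably recent glibc, and calling getcpu()/mprotect() from a signal
handler is a simplification tolerated here for brevity.

/*
 * Sketch of fault-driven placement hints in user space.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *buf;
static long page_size;
static volatile sig_atomic_t fault_node = -1;

static void hinting_fault(int sig, siginfo_t *si, void *uc)
{
	unsigned int cpu = 0, node = 0;

	getcpu(&cpu, &node);		/* which node took the fault? */
	fault_node = node;		/* a placement policy would act on this */
	/* clear the artificial protection so the access can complete */
	mprotect(buf, page_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
	struct sigaction sa;

	page_size = sysconf(_SC_PAGESIZE);
	buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = hinting_fault;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	buf[0] = 1;				/* normal access, no hint */
	mprotect(buf, page_size, PROT_NONE);	/* "scan": mark the page */
	buf[0] = 2;				/* hinting fault fires here */

	printf("hinting fault attributed to node %d\n", (int)fault_node);
	return 0;
}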
arch/sh/mm/Kconfig
arch/x86/Kconfig
include/linux/mm_types.h
include/linux/sched.h
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/sched.h
kernel/sysctl.c
mm/huge_memory.c
mm/memory.c

diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd873b69050e81c4a46fc7e40f5007..0f7c852f355c25a77e20badccc65862ff83eb469 100644
@@ -111,6 +111,7 @@ config VSYSCALL
 config NUMA
        bool "Non Uniform Memory Access (NUMA) Support"
        depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+       select ARCH_WANT_NUMA_VARIABLE_LOCALITY
        default n
        help
          Some SH systems have many various memories scattered around
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff3ced20e31c379bdde1c9a50c95206ff8e..1137028fc6d90126c9aa6435f6849e3fdc8f735a 100644
@@ -22,6 +22,8 @@ config X86
        def_bool y
        select HAVE_AOUT if X86_32
        select HAVE_UNSTABLE_SCHED_CLOCK
+       select ARCH_SUPPORTS_NUMA_BALANCING
+       select ARCH_WANTS_PROT_NUMA_PROT_NONE
        select HAVE_IDE
        select HAVE_OPROFILE
        select HAVE_PCSPKR_PLATFORM
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3af7d942545562afd1fcacf11f52a4f82a0..ed8638c29b3ea7d94946edb1d7af5f3c78e25f0f 100644
@@ -397,6 +397,17 @@ struct mm_struct {
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
        struct cpumask cpumask_allocation;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+       /*
+        * numa_next_scan is the next time when the PTEs will be marked
+        * pte_numa to gather statistics and migrate pages to new nodes
+        * if necessary
+        */
+       unsigned long numa_next_scan;
+
+       /* numa_scan_seq prevents two threads setting pte_numa */
+       int numa_scan_seq;
 #endif
        struct uprobes_state uprobes_state;
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e851e0847df640dd559f3689c36ed..844af5b12cb2c15a1ec5676f7f9173d63f4e93ae 100644
@@ -1479,6 +1479,14 @@ struct task_struct {
        short il_next;
        short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+       int numa_scan_seq;
+       int numa_migrate_seq;
+       unsigned int numa_scan_period;
+       u64 node_stamp;                 /* migration stamp  */
+       struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
        struct rcu_head rcu;
 
        /*
@@ -1553,6 +1561,14 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712f5ee1e19f1fe364fbad557d8a736..cad0d092ce3b02d1e118d0e165072ab65e7dccbf 100644
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+       if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+               p->mm->numa_next_scan = jiffies;
+               p->mm->numa_scan_seq = 0;
+       }
+
+       p->node_stamp = 0ULL;
+       p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+       p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+       p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b9903c81d7bc9b2176b03e750c951108..6831abb5dbefd74a5d75fa125a3fd89a92fbf5c6 100644
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+       int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+       if (p->numa_scan_seq == seq)
+               return;
+       p->numa_scan_seq = seq;
+
+       /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+       struct task_struct *p = current;
+
+       /* FIXME: Allocate task-specific structure for placement policy here */
+
+       task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+       unsigned long migrate, next_scan, now = jiffies;
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+
+       WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+       work->next = work; /* protect against double add */
+       /*
+        * Who cares about NUMA placement when they're dying.
+        *
+        * NOTE: make sure not to dereference p->mm before this check,
+        * exit_task_work() happens _after_ exit_mm() so we could be called
+        * without p->mm even though we still had it when we enqueued this
+        * work.
+        */
+       if (p->flags & PF_EXITING)
+               return;
+
+       /*
+        * Enforce maximal scan/migration frequency..
+        */
+       migrate = mm->numa_next_scan;
+       if (time_before(now, migrate))
+               return;
+
+       if (p->numa_scan_period == 0)
+               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+       next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+       if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+               return;
+
+       ACCESS_ONCE(mm->numa_scan_seq)++;
+       {
+               struct vm_area_struct *vma;
+
+               down_read(&mm->mmap_sem);
+               for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                       if (!vma_migratable(vma))
+                               continue;
+                       change_prot_numa(vma, vma->vm_start, vma->vm_end);
+               }
+               up_read(&mm->mmap_sem);
+       }
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+       struct callback_head *work = &curr->numa_work;
+       u64 period, now;
+
+       /*
+        * We don't care about NUMA placement if we don't have memory.
+        */
+       if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+               return;
+
+       /*
+        * Using runtime rather than walltime has the dual advantage that
+        * we (mostly) drive the selection from busy threads and that the
+        * task needs to have done some actual work before we bother with
+        * NUMA placement.
+        */
+       now = curr->se.sum_exec_runtime;
+       period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+       if (now - curr->node_stamp > period) {
+               curr->node_stamp = now;
+
+               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+                       task_work_add(curr, work, true);
+               }
+       }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                cfs_rq = cfs_rq_of(se);
                entity_tick(cfs_rq, se, queued);
        }
+
+       if (sched_feat_numa(NUMA))
+               task_tick_numa(rq, curr);
 }
 
 /*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027a0587977fb1cecaeb0d13cac79a0..5fb7aefbec80284f25f716ff52184846069d5c0a 100644
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA,       true)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc18af1b35dd2bee57bf60d4103cf4..ae31c051ff2f1847db72ed3c18ae75a36377a1bf 100644
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9c94366aa5156f9c304bbc589808f..025e1ae50ef1942247485939eab28ef805c94ba4 100644
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000;              /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;    /* 1 second */
 static int min_wakeup_granularity_ns;                  /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;   /* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_wakeup_granularity_ns,
                .extra2         = &max_wakeup_granularity_ns,
        },
+#ifdef CONFIG_SMP
        {
                .procname       = "sched_tunable_scaling",
                .data           = &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+       {
+               .procname       = "numa_balancing_scan_period_min_ms",
+               .data           = &sysctl_numa_balancing_scan_period_min,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "numa_balancing_scan_period_max_ms",
+               .data           = &sysctl_numa_balancing_scan_period_max,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
        {
                .procname       = "sched_rt_period_us",
                .data           = &sysctl_sched_rt_period,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d79f7a55bf6f7cdfc677eecbbbcc422d4fd36046..ee8133794a564ae8aaf263631339853752d99512 100644
@@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        split_huge_page(page);
        put_page(page);
+
        return 0;
 
 clear_pmdnuma:
@@ -1060,8 +1061,10 @@ clear_pmdnuma:
 
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       if (page)
+       if (page) {
                put_page(page);
+               task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+       }
        return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index d52542680e108d42bcb9f4dcbe43238a078ceab5..8012c1907895612008d0f58eb85c44a05b6d6bc9 100644
@@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct page *page = NULL;
        spinlock_t *ptl;
-       int current_nid, target_nid;
+       int current_nid = -1;
+       int target_nid;
 
        /*
        * The "pte" at this point cannot be used safely without
@@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                current_nid = target_nid;
 
 out:
+       task_numa_fault(current_nid, 1);
        return 0;
 }
 
@@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
                pte_t pteval = *pte;
                struct page *page;
+               int curr_nid;
                if (!pte_present(pteval))
                        continue;
                if (!pte_numa(pteval))
@@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                page = vm_normal_page(vma, addr, pteval);
                if (unlikely(!page))
                        continue;
+               /* only check non-shared pages */
+               if (unlikely(page_mapcount(page) != 1))
+                       continue;
+               pte_unmap_unlock(pte, ptl);
+
+               curr_nid = page_to_nid(page);
+               task_numa_fault(curr_nid, 1);
+
+               pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
        }
        pte_unmap_unlock(orig_pte, ptl);