mm: numa: Add fault driven placement and migration
author	Peter Zijlstra <a.p.zijlstra@chello.nl>
	Thu, 25 Oct 2012 12:16:43 +0000 (14:16 +0200)
committer	Mel Gorman <mgorman@suse.de>
	Tue, 11 Dec 2012 14:42:45 +0000 (14:42 +0000)
NOTE: This patch is based on "sched, numa, mm: Add fault driven
placement and migration policy" but, as it throws away all of the
policy and leaves only a basic foundation, I had to drop the
Signed-off-bys.

This patch creates a bare-bones method for setting PTEs pte_numa in the
context of the scheduler so that, when such a PTE is faulted later, the
fault is attributed to the node the faulting CPU is running on.  In
itself this does nothing useful, but any placement policy will
fundamentally depend on receiving these placement hints from fault
context and doing something intelligent about them.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
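
For illustration only (not part of the patch): a minimal user-space
analogue of the mechanism, where mprotect(PROT_NONE) plays the role of
change_prot_numa() marking PTEs and the SIGSEGV handler plays the role
of the NUMA hinting fault that calls task_numa_fault() with the node
the faulting CPU is running on.  getcpu() needs _GNU_SOURCE and a
reasonably recent glibc, and calling getcpu()/mprotect() from a signal
handler is a simplification tolerated here for brevity.

/*
 * Sketch of fault-driven placement hints in user space.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *buf;
static long page_size;
static volatile sig_atomic_t fault_node = -1;

static void hinting_fault(int sig, siginfo_t *si, void *uc)
{
	unsigned int cpu = 0, node = 0;

	getcpu(&cpu, &node);		/* which node took the fault? */
	fault_node = node;		/* a placement policy would act on this */
	/* clear the artificial protection so the access can complete */
	mprotect(buf, page_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
	struct sigaction sa;

	page_size = sysconf(_SC_PAGESIZE);
	buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = hinting_fault;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	buf[0] = 1;				/* normal access, no hint */
	mprotect(buf, page_size, PROT_NONE);	/* "scan": mark the page */
	buf[0] = 2;				/* hinting fault fires here */

	printf("hinting fault attributed to node %d\n", (int)fault_node);
	return 0;
}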
arch/sh/mm/Kconfig
arch/x86/Kconfig
include/linux/mm_types.h
include/linux/sched.h
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/sched.h
kernel/sysctl.c
mm/huge_memory.c
mm/memory.c

diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd873b69050e81c4a46fc7e40f5007..0f7c852f355c25a77e20badccc65862ff83eb469 100644
@@ -111,6 +111,7 @@ config VSYSCALL
 config NUMA
        bool "Non Uniform Memory Access (NUMA) Support"
        depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+       select ARCH_WANT_NUMA_VARIABLE_LOCALITY
        default n
        help
          Some SH systems have many various memories scattered around
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 46c3bff3ced20e31c379bdde1c9a50c95206ff8e..1137028fc6d90126c9aa6435f6849e3fdc8f735a 100644
@@ -22,6 +22,8 @@ config X86
        def_bool y
        select HAVE_AOUT if X86_32
        select HAVE_UNSTABLE_SCHED_CLOCK
+       select ARCH_SUPPORTS_NUMA_BALANCING
+       select ARCH_WANTS_PROT_NUMA_PROT_NONE
        select HAVE_IDE
        select HAVE_OPROFILE
        select HAVE_PCSPKR_PLATFORM
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3af7d942545562afd1fcacf11f52a4f82a0..ed8638c29b3ea7d94946edb1d7af5f3c78e25f0f 100644
@@ -397,6 +397,17 @@ struct mm_struct {
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
        struct cpumask cpumask_allocation;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+       /*
+        * numa_next_scan is the next time when the PTEs will be marked
+        * pte_numa to gather statistics and migrate pages to new nodes
+        * if necessary
+        */
+       unsigned long numa_next_scan;
+
+       /* numa_scan_seq prevents two threads setting pte_numa */
+       int numa_scan_seq;
 #endif
        struct uprobes_state uprobes_state;
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2e851e0847df640dd559f3689c36ed..844af5b12cb2c15a1ec5676f7f9173d63f4e93ae 100644
@@ -1479,6 +1479,14 @@ struct task_struct {
        short il_next;
        short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+       int numa_scan_seq;
+       int numa_migrate_seq;
+       unsigned int numa_scan_period;
+       u64 node_stamp;                 /* migration stamp  */
+       struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
        struct rcu_head rcu;
 
        /*
@@ -1553,6 +1561,14 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712f5ee1e19f1fe364fbad557d8a736..cad0d092ce3b02d1e118d0e165072ab65e7dccbf 100644
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+       if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+               p->mm->numa_next_scan = jiffies;
+               p->mm->numa_scan_seq = 0;
+       }
+
+       p->node_stamp = 0ULL;
+       p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+       p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+       p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b9903c81d7bc9b2176b03e750c951108..6831abb5dbefd74a5d75fa125a3fd89a92fbf5c6 100644
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+       int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+       if (p->numa_scan_seq == seq)
+               return;
+       p->numa_scan_seq = seq;
+
+       /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+       struct task_struct *p = current;
+
+       /* FIXME: Allocate task-specific structure for placement policy here */
+
+       task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+       unsigned long migrate, next_scan, now = jiffies;
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+
+       WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+       work->next = work; /* protect against double add */
+       /*
+        * Who cares about NUMA placement when they're dying.
+        *
+        * NOTE: make sure not to dereference p->mm before this check,
+        * exit_task_work() happens _after_ exit_mm() so we could be called
+        * without p->mm even though we still had it when we enqueued this
+        * work.
+        */
+       if (p->flags & PF_EXITING)
+               return;
+
+       /*
+        * Enforce maximal scan/migration frequency..
+        */
+       migrate = mm->numa_next_scan;
+       if (time_before(now, migrate))
+               return;
+
+       if (p->numa_scan_period == 0)
+               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+       next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+       if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+               return;
+
+       ACCESS_ONCE(mm->numa_scan_seq)++;
+       {
+               struct vm_area_struct *vma;
+
+               down_read(&mm->mmap_sem);
+               for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                       if (!vma_migratable(vma))
+                               continue;
+                       change_prot_numa(vma, vma->vm_start, vma->vm_end);
+               }
+               up_read(&mm->mmap_sem);
+       }
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+       struct callback_head *work = &curr->numa_work;
+       u64 period, now;
+
+       /*
+        * We don't care about NUMA placement if we don't have memory.
+        */
+       if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+               return;
+
+       /*
+        * Using runtime rather than walltime has the dual advantage that
+        * we (mostly) drive the selection from busy threads and that the
+        * task needs to have done some actual work before we bother with
+        * NUMA placement.
+        */
+       now = curr->se.sum_exec_runtime;
+       period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+       if (now - curr->node_stamp > period) {
+               curr->node_stamp = now;
+
+               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+                       init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+                       task_work_add(curr, work, true);
+               }
+       }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                cfs_rq = cfs_rq_of(se);
                entity_tick(cfs_rq, se, queued);
        }
+
+       if (sched_feat_numa(NUMA))
+               task_tick_numa(rq, curr);
 }
 
 /*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027a0587977fb1cecaeb0d13cac79a0..5fb7aefbec80284f25f716ff52184846069d5c0a 100644
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA,       true)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc18af1b35dd2bee57bf60d4103cf4..ae31c051ff2f1847db72ed3c18ae75a36377a1bf 100644
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9c94366aa5156f9c304bbc589808f..025e1ae50ef1942247485939eab28ef805c94ba4 100644
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000;              /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;    /* 1 second */
 static int min_wakeup_granularity_ns;                  /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;   /* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_wakeup_granularity_ns,
                .extra2         = &max_wakeup_granularity_ns,
        },
+#ifdef CONFIG_SMP
        {
                .procname       = "sched_tunable_scaling",
                .data           = &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+       {
+               .procname       = "numa_balancing_scan_period_min_ms",
+               .data           = &sysctl_numa_balancing_scan_period_min,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "numa_balancing_scan_period_max_ms",
+               .data           = &sysctl_numa_balancing_scan_period_max,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
        {
                .procname       = "sched_rt_period_us",
                .data           = &sysctl_sched_rt_period,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d79f7a55bf6f7cdfc677eecbbbcc422d4fd36046..ee8133794a564ae8aaf263631339853752d99512 100644
@@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        split_huge_page(page);
        put_page(page);
+
        return 0;
 
 clear_pmdnuma:
@@ -1060,8 +1061,10 @@ clear_pmdnuma:
 
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       if (page)
+       if (page) {
                put_page(page);
+               task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+       }
        return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index d52542680e108d42bcb9f4dcbe43238a078ceab5..8012c1907895612008d0f58eb85c44a05b6d6bc9 100644
@@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct page *page = NULL;
        spinlock_t *ptl;
-       int current_nid, target_nid;
+       int current_nid = -1;
+       int target_nid;
 
        /*
        * The "pte" at this point cannot be used safely without
@@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                current_nid = target_nid;
 
 out:
+       task_numa_fault(current_nid, 1);
        return 0;
 }
 
@@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
                pte_t pteval = *pte;
                struct page *page;
+               int curr_nid;
                if (!pte_present(pteval))
                        continue;
                if (!pte_numa(pteval))
@@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                page = vm_normal_page(vma, addr, pteval);
                if (unlikely(!page))
                        continue;
+               /* only check non-shared pages */
+               if (unlikely(page_mapcount(page) != 1))
+                       continue;
+               pte_unmap_unlock(pte, ptl);
+
+               curr_nid = page_to_nid(page);
+               task_numa_fault(curr_nid, 1);
+
+               pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
        }
        pte_unmap_unlock(orig_pte, ptl);