[firefly-linux-kernel-4.4.55.git] / mm / migrate.c
index 666e4e677414e6d790de715761e395116441e6c4..a88c12f2235de8768bdd29063d7175510f03b45f 100644 (file)
@@ -13,7 +13,7 @@
  */
 
 #include <linux/migrate.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pagemap.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
 #include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
+#include <linux/balloon_compaction.h>
 
 #include <asm/tlbflush.h>
 
-#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/migrate.h>
 
-#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#include "internal.h"
 
 /*
  * migrate_prep() needs to be called before we start compiling a list of pages
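
Defining CREATE_TRACE_POINTS before including trace/events/migrate.h above is
the standard tracepoint pattern: exactly one translation unit defines it, which
turns the TRACE_EVENT() declarations in the header into definitions; every
other user includes the header plainly. A minimal usage sketch, with the event
name and argument order taken from the trace_mm_migrate_pages() call that
appears later in this diff:

        /* in exactly one .c file (here, mm/migrate.c): */
        #define CREATE_TRACE_POINTS
        #include <trace/events/migrate.h>

        /* other callers include the header normally and simply do: */
        trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
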
@@ -80,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
                list_del(&page->lru);
                dec_zone_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
                putback_lru_page(page);
+       }
+}
+
+/*
+ * Put previously isolated pages back onto the appropriate lists
+ * from where they were once taken off for compaction/migration.
+ *
+ * This function must be used instead of putback_lru_pages() whenever the
+ * isolated pageset has been built by isolate_migratepages_range().
+ */
+void putback_movable_pages(struct list_head *l)
+{
+       struct page *page;
+       struct page *page2;
+
+       list_for_each_entry_safe(page, page2, l, lru) {
+               list_del(&page->lru);
+               dec_zone_page_state(page, NR_ISOLATED_ANON +
+                               page_is_file_cache(page));
+               if (unlikely(isolated_balloon_page(page)))
+                       balloon_page_putback(page);
+               else
+                       putback_lru_page(page);
        }
 }
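
The intended pairing for putback_movable_pages() is a caller that built its
list with isolate_migratepages_range(); a hedged sketch modelled on the
mm/compaction.c error path (compaction_alloc() and the compact_control
migratepages list are assumed to keep their usual shape):

        /* migrate the isolated set; on failure, return pages to their lists */
        err = migrate_pages(&cc->migratepages, compaction_alloc,
                            (unsigned long)cc,
                            cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
                            MR_COMPACTION);
        if (err)
                putback_movable_pages(&cc->migratepages);
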
 
@@ -92,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        swp_entry_t entry;
-       pgd_t *pgd;
-       pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;
        spinlock_t *ptl;
@@ -104,26 +128,18 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                        goto out;
                ptl = &mm->page_table_lock;
        } else {
-               pgd = pgd_offset(mm, addr);
-               if (!pgd_present(*pgd))
+               pmd = mm_find_pmd(mm, addr);
+               if (!pmd)
                        goto out;
-
-               pud = pud_offset(pgd, addr);
-               if (!pud_present(*pud))
-                       goto out;
-
-               pmd = pmd_offset(pud, addr);
                if (pmd_trans_huge(*pmd))
                        goto out;
-               if (!pmd_present(*pmd))
-                       goto out;
 
                ptep = pte_offset_map(pmd, addr);
 
-               if (!is_swap_pte(*ptep)) {
-                       pte_unmap(ptep);
-                       goto out;
-               }
+               /*
+                * Peek to check is_swap_pte() before taking ptlock?  No, we
+                * can race mremap's move_ptes(), which skips anon_vma lock.
+                */
 
                ptl = pte_lockptr(mm, pmd);
        }
@@ -144,10 +160,12 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
        if (is_write_migration_entry(entry))
                pte = pte_mkwrite(pte);
 #ifdef CONFIG_HUGETLB_PAGE
-       if (PageHuge(new))
+       if (PageHuge(new)) {
                pte = pte_mkhuge(pte);
+               pte = arch_make_huge_pte(pte, vma, new, 0);
+       }
 #endif
-       flush_cache_page(vma, addr, pte_pfn(pte));
+       flush_dcache_page(new);
        set_pte_at(mm, addr, ptep, pte);
 
        if (PageHuge(new)) {
@@ -181,18 +199,15 @@ static void remove_migration_ptes(struct page *old, struct page *new)
  * Something used the pte of a page under migration. We need to
  * get to the page and wait until migration is finished.
  * When we return from this function the fault will be retried.
- *
- * This function is called from do_swap_page().
  */
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
-                               unsigned long address)
+static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+                               spinlock_t *ptl)
 {
-       pte_t *ptep, pte;
-       spinlock_t *ptl;
+       pte_t pte;
        swp_entry_t entry;
        struct page *page;
 
-       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       spin_lock(ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
                goto out;
@@ -220,6 +235,70 @@ out:
        pte_unmap_unlock(ptep, ptl);
 }
 
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+                               unsigned long address)
+{
+       spinlock_t *ptl = pte_lockptr(mm, pmd);
+       pte_t *ptep = pte_offset_map(pmd, address);
+       __migration_entry_wait(mm, ptep, ptl);
+}
+
+void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
+{
+       spinlock_t *ptl = &(mm)->page_table_lock;
+       __migration_entry_wait(mm, pte, ptl);
+}
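
__migration_entry_wait() now backs two entry points: migration_entry_wait()
for the normal fault path and migration_entry_wait_huge() for hugetlb faults,
where only a huge-page-level pte exists. The classic consumer is
do_swap_page() in mm/memory.c; a hedged sketch of that call site:

        entry = pte_to_swp_entry(orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        /* sleep until migration completes, then retry fault */
                        migration_entry_wait(mm, pmd, address);
                        goto out;
                }
                /* hwpoison and bad-pte handling elided */
        }
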
+
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+                                                       enum migrate_mode mode)
+{
+       struct buffer_head *bh = head;
+
+       /* Simple case, sync compaction */
+       if (mode != MIGRATE_ASYNC) {
+               do {
+                       get_bh(bh);
+                       lock_buffer(bh);
+                       bh = bh->b_this_page;
+
+               } while (bh != head);
+
+               return true;
+       }
+
+       /* async case, we cannot block on lock_buffer so use trylock_buffer */
+       do {
+               get_bh(bh);
+               if (!trylock_buffer(bh)) {
+                       /*
+                        * We failed to lock the buffer and cannot stall in
+                        * async migration. Release the taken locks
+                        */
+                       struct buffer_head *failed_bh = bh;
+                       put_bh(failed_bh);
+                       bh = head;
+                       while (bh != failed_bh) {
+                               unlock_buffer(bh);
+                               put_bh(bh);
+                               bh = bh->b_this_page;
+                       }
+                       return false;
+               }
+
+               bh = bh->b_this_page;
+       } while (bh != head);
+       return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+                                                       enum migrate_mode mode)
+{
+       return true;
+}
+#endif /* CONFIG_BLOCK */
+
 /*
  * Replace the page in the mapping.
  *
@@ -229,16 +308,17 @@ out:
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
 static int migrate_page_move_mapping(struct address_space *mapping,
-               struct page *newpage, struct page *page)
+               struct page *newpage, struct page *page,
+               struct buffer_head *head, enum migrate_mode mode)
 {
-       int expected_count;
+       int expected_count = 0;
        void **pslot;
 
        if (!mapping) {
                /* Anonymous page without mapping */
                if (page_count(page) != 1)
                        return -EAGAIN;
-               return 0;
+               return MIGRATEPAGE_SUCCESS;
        }
 
        spin_lock_irq(&mapping->tree_lock);
@@ -258,6 +338,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
                return -EAGAIN;
        }
 
+       /*
+        * In the async migration case of moving a page with buffers, lock the
+        * buffers using trylock before the mapping is moved. If the mapping
+        * were moved and we then failed to lock the buffers, we could not
+        * move the mapping back due to an elevated page count and would have
+        * to block waiting on other references to be dropped.
+        */
+       if (mode == MIGRATE_ASYNC && head &&
+                       !buffer_migrate_lock_buffers(head, mode)) {
+               page_unfreeze_refs(page, expected_count);
+               spin_unlock_irq(&mapping->tree_lock);
+               return -EAGAIN;
+       }
+
        /*
         * Now we know that no one else is looking at the page.
         */
@@ -269,12 +363,12 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
        radix_tree_replace_slot(pslot, newpage);
 
-       page_unfreeze_refs(page, expected_count);
        /*
-        * Drop cache reference from old page.
+        * Drop cache reference from old page by unfreezing
+        * to one less reference.
         * We know this isn't the last reference.
         */
-       __put_page(page);
+       page_unfreeze_refs(page, expected_count - 1);
 
        /*
         * If moved to a different zone then also account
@@ -294,7 +388,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
        }
        spin_unlock_irq(&mapping->tree_lock);
 
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
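
MIGRATEPAGE_SUCCESS and the balloon variant used further down are defined in
include/linux/migrate.h, not in this file; from memory of the v3.8-era header
they are plain return codes along these lines:

        #define MIGRATEPAGE_SUCCESS             0
        #define MIGRATEPAGE_BALLOON_SUCCESS     1  /* non-LRU balloon page moved */

so "rc != MIGRATEPAGE_SUCCESS" is equivalent to the old "rc != 0" checks,
while negative errnos continue to signal failure.
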
 
 /*
@@ -310,7 +404,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
        if (!mapping) {
                if (page_count(page) != 1)
                        return -EAGAIN;
-               return 0;
+               return MIGRATEPAGE_SUCCESS;
        }
 
        spin_lock_irq(&mapping->tree_lock);
@@ -334,12 +428,10 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 
        radix_tree_replace_slot(pslot, newpage);
 
-       page_unfreeze_refs(page, expected_count);
-
-       __put_page(page);
+       page_unfreeze_refs(page, expected_count - 1);
 
        spin_unlock_irq(&mapping->tree_lock);
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
 
 /*
@@ -347,7 +439,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
-       if (PageHuge(page))
+       if (PageHuge(page) || PageTransHuge(page))
                copy_huge_page(newpage, page);
        else
                copy_highpage(newpage, page);
@@ -377,16 +469,21 @@ void migrate_page_copy(struct page *newpage, struct page *page)
                 * is actually a signal that all of the page has become dirty.
                 * Whereas only part of our page may be dirty.
                 */
-               __set_page_dirty_nobuffers(newpage);
+               if (PageSwapBacked(page))
+                       SetPageDirty(newpage);
+               else
+                       __set_page_dirty_nobuffers(newpage);
        }
 
        mlock_migrate_page(newpage, page);
        ksm_migrate_page(newpage, page);
-
+       /*
+        * Please do not reorder this without considering how mm/ksm.c's
+        * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
+        */
        ClearPageSwapCache(page);
        ClearPagePrivate(page);
        set_page_private(page, 0);
-       page->mapping = NULL;
 
        /*
         * If any waiters have accumulated on the new page then
@@ -415,19 +512,20 @@ EXPORT_SYMBOL(fail_migrate_page);
  * Pages are locked upon entry and exit.
  */
 int migrate_page(struct address_space *mapping,
-               struct page *newpage, struct page *page)
+               struct page *newpage, struct page *page,
+               enum migrate_mode mode)
 {
        int rc;
 
        BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 
-       rc = migrate_page_move_mapping(mapping, newpage, page);
+       rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
 
-       if (rc)
+       if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
 
        migrate_page_copy(newpage, page);
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
 EXPORT_SYMBOL(migrate_page);
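
The migrate_mode argument threaded through all of these signatures comes from
include/linux/migrate_mode.h; a hedged reproduction of that small header's
intent:

        enum migrate_mode {
                MIGRATE_ASYNC,          /* never block */
                MIGRATE_SYNC_LIGHT,     /* may block, but not on ->writepage */
                MIGRATE_SYNC,           /* may block and may write pages out */
        };
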
 
@@ -438,28 +536,28 @@ EXPORT_SYMBOL(migrate_page);
  * exist.
  */
 int buffer_migrate_page(struct address_space *mapping,
-               struct page *newpage, struct page *page)
+               struct page *newpage, struct page *page, enum migrate_mode mode)
 {
        struct buffer_head *bh, *head;
        int rc;
 
        if (!page_has_buffers(page))
-               return migrate_page(mapping, newpage, page);
+               return migrate_page(mapping, newpage, page, mode);
 
        head = page_buffers(page);
 
-       rc = migrate_page_move_mapping(mapping, newpage, page);
+       rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
 
-       if (rc)
+       if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
 
-       bh = head;
-       do {
-               get_bh(bh);
-               lock_buffer(bh);
-               bh = bh->b_this_page;
-
-       } while (bh != head);
+       /*
+        * In the async case, migrate_page_move_mapping locked the buffers
+        * with an IRQ-safe spinlock held. In the sync case, the buffers
+        * need to be locked now
+        */
+       if (mode != MIGRATE_ASYNC)
+               BUG_ON(!buffer_migrate_lock_buffers(head, mode));
 
        ClearPagePrivate(page);
        set_page_private(newpage, page_private(page));
@@ -486,7 +584,7 @@ int buffer_migrate_page(struct address_space *mapping,
 
        } while (bh != head);
 
-       return 0;
+       return MIGRATEPAGE_SUCCESS;
 }
 EXPORT_SYMBOL(buffer_migrate_page);
 #endif
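
Filesystems opt in to this helper through their address_space_operations;
block-backed filesystems typically wire it up as below (a sketch, with ext2
used as an assumed example and unrelated fields elided):

        static const struct address_space_operations ext2_aops = {
                .readpage       = ext2_readpage,
                .writepage      = ext2_writepage,
                /* ... */
                .migratepage    = buffer_migrate_page,
        };
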
@@ -536,10 +634,14 @@ static int writeout(struct address_space *mapping, struct page *page)
  * Default handling if a filesystem does not provide a migration function.
  */
 static int fallback_migrate_page(struct address_space *mapping,
-       struct page *newpage, struct page *page)
+       struct page *newpage, struct page *page, enum migrate_mode mode)
 {
-       if (PageDirty(page))
+       if (PageDirty(page)) {
+               /* Only writeback pages in full synchronous migration */
+               if (mode != MIGRATE_SYNC)
+                       return -EBUSY;
                return writeout(mapping, page);
+       }
 
        /*
         * Buffers may be managed in a filesystem specific way.
@@ -549,7 +651,7 @@ static int fallback_migrate_page(struct address_space *mapping,
            !try_to_release_page(page, GFP_KERNEL))
                return -EAGAIN;
 
-       return migrate_page(mapping, newpage, page);
+       return migrate_page(mapping, newpage, page, mode);
 }
 
 /*
@@ -561,10 +663,10 @@ static int fallback_migrate_page(struct address_space *mapping,
  *
  * Return value:
  *   < 0 - error code
- *  == 0 - success
+ *  MIGRATEPAGE_SUCCESS - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-                                       int remap_swapcache, bool sync)
+                               int remap_swapcache, enum migrate_mode mode)
 {
        struct address_space *mapping;
        int rc;
@@ -585,35 +687,25 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 
        mapping = page_mapping(page);
        if (!mapping)
-               rc = migrate_page(mapping, newpage, page);
-       else {
+               rc = migrate_page(mapping, newpage, page, mode);
+       else if (mapping->a_ops->migratepage)
                /*
-                * Do not writeback pages if !sync and migratepage is
-                * not pointing to migrate_page() which is nonblocking
-                * (swapcache/tmpfs uses migratepage = migrate_page).
+                * Most pages have a mapping and most filesystems provide a
+                * migratepage callback. Anonymous pages are part of swap
+                * space which also has its own migratepage callback. This
+                * is the most common path for page migration.
                 */
-               if (PageDirty(page) && !sync &&
-                   mapping->a_ops->migratepage != migrate_page)
-                       rc = -EBUSY;
-               else if (mapping->a_ops->migratepage)
-                       /*
-                        * Most pages have a mapping and most filesystems
-                        * should provide a migration function. Anonymous
-                        * pages are part of swap space which also has its
-                        * own migration function. This is the most common
-                        * path for page migration.
-                        */
-                       rc = mapping->a_ops->migratepage(mapping,
-                                                       newpage, page);
-               else
-                       rc = fallback_migrate_page(mapping, newpage, page);
-       }
+               rc = mapping->a_ops->migratepage(mapping,
+                                               newpage, page, mode);
+       else
+               rc = fallback_migrate_page(mapping, newpage, page, mode);
 
-       if (rc) {
+       if (rc != MIGRATEPAGE_SUCCESS) {
                newpage->mapping = NULL;
        } else {
                if (remap_swapcache)
                        remove_migration_ptes(page, newpage);
+               page->mapping = NULL;
        }
 
        unlock_page(newpage);
@@ -621,38 +713,17 @@ static int move_to_new_page(struct page *newpage, struct page *page,
        return rc;
 }
 
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-                       struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+                               int force, enum migrate_mode mode)
 {
-       int rc = 0;
-       int *result = NULL;
-       struct page *newpage = get_new_page(page, private, &result);
+       int rc = -EAGAIN;
        int remap_swapcache = 1;
-       int charge = 0;
        struct mem_cgroup *mem;
        struct anon_vma *anon_vma = NULL;
 
-       if (!newpage)
-               return -ENOMEM;
-
-       if (page_count(page) == 1) {
-               /* page was freed from under us. So we are done. */
-               goto move_newpage;
-       }
-       if (unlikely(PageTransHuge(page)))
-               if (unlikely(split_huge_page(page)))
-                       goto move_newpage;
-
-       /* prepare cgroup just returns 0 or -ENOMEM */
-       rc = -EAGAIN;
-
        if (!trylock_page(page)) {
-               if (!force || !sync)
-                       goto move_newpage;
+               if (!force || mode == MIGRATE_ASYNC)
+                       goto out;
 
                /*
                 * It's not safe for direct compaction to call lock_page.
@@ -668,39 +739,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                 * altogether.
                 */
                if (current->flags & PF_MEMALLOC)
-                       goto move_newpage;
+                       goto out;
 
                lock_page(page);
        }
 
-       /*
-        * Only memory hotplug's offline_pages() caller has locked out KSM,
-        * and can safely migrate a KSM page.  The other cases have skipped
-        * PageKsm along with PageReserved - but it is only now when we have
-        * the page lock that we can be certain it will not go KSM beneath us
-        * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
-        * its pagecount raised, but only here do we take the page lock which
-        * serializes that).
-        */
-       if (PageKsm(page) && !offlining) {
-               rc = -EBUSY;
-               goto unlock;
-       }
-
        /* charge against new page */
-       charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
-       if (charge == -ENOMEM) {
-               rc = -ENOMEM;
-               goto unlock;
-       }
-       BUG_ON(charge);
+       mem_cgroup_prepare_migration(page, newpage, &mem);
 
        if (PageWriteback(page)) {
                /*
-                * For !sync, there is no point retrying as the retry loop
-                * is expected to be too short for PageWriteback to be cleared
+                * Only in the case of a full synchronous migration is it
+                * necessary to wait for PageWriteback. In the async case,
+                * the retry loop is too short and in the sync-light case,
+                * the overhead of stalling is too much
                 */
-               if (!sync) {
+               if (mode != MIGRATE_SYNC) {
                        rc = -EBUSY;
                        goto uncharge;
                }
@@ -716,9 +770,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
         * File Caches may use write_page() or lock_page() in migration, then,
         * just care Anon page here.
         */
-       if (PageAnon(page)) {
+       if (PageAnon(page) && !PageKsm(page)) {
                /*
-                * Only page_lock_anon_vma() understands the subtleties of
+                * Only page_lock_anon_vma_read() understands the subtleties of
                 * getting a hold on an anon_vma from outside one of its mms.
                 */
                anon_vma = page_get_anon_vma(page);
@@ -745,6 +799,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                }
        }
 
+       if (unlikely(balloon_page_movable(page))) {
+               /*
+                * A ballooned page does not need any special attention from
+                * physical to virtual reverse mapping procedures.
+                * Skip any attempt to unmap PTEs or to remap swap cache,
+                * in order to avoid burning cycles at rmap level, and perform
+                * the page migration right away (protected by page lock).
+                */
+               rc = balloon_page_migrate(newpage, page, mode);
+               goto uncharge;
+       }
+
        /*
         * Corner case handling:
         * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -771,7 +837,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 
 skip_unmap:
        if (!page_mapped(page))
-               rc = move_to_new_page(newpage, page, remap_swapcache, sync);
+               rc = move_to_new_page(newpage, page, remap_swapcache, mode);
 
        if (rc && remap_swapcache)
                remove_migration_ptes(page, page);
@@ -781,31 +847,68 @@ skip_unmap:
                put_anon_vma(anon_vma);
 
 uncharge:
-       if (!charge)
-               mem_cgroup_end_migration(mem, page, newpage, rc == 0);
-unlock:
+       mem_cgroup_end_migration(mem, page, newpage,
+                                (rc == MIGRATEPAGE_SUCCESS ||
+                                 rc == MIGRATEPAGE_BALLOON_SUCCESS));
        unlock_page(page);
+out:
+       return rc;
+}
 
-move_newpage:
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+                       struct page *page, int force, enum migrate_mode mode)
+{
+       int rc = 0;
+       int *result = NULL;
+       struct page *newpage = get_new_page(page, private, &result);
+
+       if (!newpage)
+               return -ENOMEM;
+
+       if (page_count(page) == 1) {
+               /* page was freed from under us. So we are done. */
+               goto out;
+       }
+
+       if (unlikely(PageTransHuge(page)))
+               if (unlikely(split_huge_page(page)))
+                       goto out;
+
+       rc = __unmap_and_move(page, newpage, force, mode);
+
+       if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
+               /*
+                * A ballooned page has been migrated already.
+                * Now, it's the time to wrap-up counters,
+                * handle the page back to Buddy and return.
+                */
+               dec_zone_page_state(page, NR_ISOLATED_ANON +
+                                   page_is_file_cache(page));
+               balloon_page_free(page);
+               return MIGRATEPAGE_SUCCESS;
+       }
+out:
        if (rc != -EAGAIN) {
-               /*
-                * A page that has been migrated has all references
-                * removed and will be freed. A page that has not been
-                * migrated will have kepts its references and be
-                * restored.
-                */
-               list_del(&page->lru);
+               /*
+                * A page that has been migrated has all references
+                * removed and will be freed. A page that has not been
+                * migrated will have kepts its references and be
+                * restored.
+                */
+               list_del(&page->lru);
                dec_zone_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
                putback_lru_page(page);
        }
-
        /*
         * Move the new page to the LRU. If migration was not successful
         * then this will free the page.
         */
        putback_lru_page(newpage);
-
        if (result) {
                if (rc)
                        *result = rc;
@@ -835,7 +938,7 @@ move_newpage:
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
                                unsigned long private, struct page *hpage,
-                               int force, bool offlining, bool sync)
+                               int force, enum migrate_mode mode)
 {
        int rc = 0;
        int *result = NULL;
@@ -848,7 +951,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        rc = -EAGAIN;
 
        if (!trylock_page(hpage)) {
-               if (!force || !sync)
+               if (!force || mode != MIGRATE_SYNC)
                        goto out;
                lock_page(hpage);
        }
@@ -859,23 +962,20 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 
        if (!page_mapped(hpage))
-               rc = move_to_new_page(new_hpage, hpage, 1, sync);
+               rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
        if (rc)
                remove_migration_ptes(hpage, hpage);
 
        if (anon_vma)
                put_anon_vma(anon_vma);
-out:
-       unlock_page(hpage);
 
-       if (rc != -EAGAIN) {
-               list_del(&hpage->lru);
-               put_page(hpage);
-       }
+       if (!rc)
+               hugetlb_cgroup_migrate(hpage, new_hpage);
 
+       unlock_page(hpage);
+out:
        put_page(new_hpage);
-
        if (result) {
                if (rc)
                        *result = rc;
@@ -886,26 +986,30 @@ out:
 }
 
 /*
- * migrate_pages
+ * migrate_pages - migrate the pages specified in a list to the free pages
+ *                supplied as the target for the page migration
  *
- * The function takes one list of pages to migrate and a function
- * that determines from the page to be migrated and the private data
- * the target of the move and allocates the page.
+ * @from:              The list of pages to be migrated.
+ * @get_new_page:      The function used to allocate free pages to be used
+ *                     as the target of the page migration.
+ * @private:           Private data to be passed on to get_new_page()
+ * @mode:              The migration mode that specifies the constraints for
+ *                     page migration, if any.
+ * @reason:            The reason for page migration.
  *
- * The function returns after 10 attempts or if no pages
- * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
- * Caller should call putback_lru_pages to return pages to the LRU
+ * The function returns after 10 attempts or if no pages are movable any more
+ * because the list has become empty or no retryable pages exist any more.
+ * The caller should call putback_lru_pages() to return pages to the LRU
  * or free list only if ret != 0.
  *
- * Return: Number of pages not migrated or error code.
+ * Returns the number of pages that were not migrated, or an error code.
  */
-int migrate_pages(struct list_head *from,
-               new_page_t get_new_page, unsigned long private, bool offlining,
-               bool sync)
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+               unsigned long private, enum migrate_mode mode, int reason)
 {
        int retry = 1;
        int nr_failed = 0;
+       int nr_succeeded = 0;
        int pass = 0;
        struct page *page;
        struct page *page2;
@@ -922,8 +1026,7 @@ int migrate_pages(struct list_head *from,
                        cond_resched();
 
                        rc = unmap_and_move(get_new_page, private,
-                                               page, pass > 2, offlining,
-                                               sync);
+                                               page, pass > 2, mode);
 
                        switch(rc) {
                        case -ENOMEM:
@@ -931,7 +1034,8 @@ int migrate_pages(struct list_head *from,
                        case -EAGAIN:
                                retry++;
                                break;
-                       case 0:
+                       case MIGRATEPAGE_SUCCESS:
+                               nr_succeeded++;
                                break;
                        default:
                                /* Permanent failure */
@@ -940,59 +1044,44 @@ int migrate_pages(struct list_head *from,
                        }
                }
        }
-       rc = 0;
+       rc = nr_failed + retry;
 out:
+       if (nr_succeeded)
+               count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+       if (nr_failed)
+               count_vm_events(PGMIGRATE_FAIL, nr_failed);
+       trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;
 
-       if (rc)
-               return rc;
-
-       return nr_failed + retry;
+       return rc;
 }
 
-int migrate_huge_pages(struct list_head *from,
-               new_page_t get_new_page, unsigned long private, bool offlining,
-               bool sync)
+int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
+                     unsigned long private, enum migrate_mode mode)
 {
-       int retry = 1;
-       int nr_failed = 0;
-       int pass = 0;
-       struct page *page;
-       struct page *page2;
-       int rc;
-
-       for (pass = 0; pass < 10 && retry; pass++) {
-               retry = 0;
+       int pass, rc;
 
-               list_for_each_entry_safe(page, page2, from, lru) {
+       for (pass = 0; pass < 10; pass++) {
+               rc = unmap_and_move_huge_page(get_new_page, private,
+                                               hpage, pass > 2, mode);
+               switch (rc) {
+               case -ENOMEM:
+                       goto out;
+               case -EAGAIN:
+                       /* try again */
                        cond_resched();
-
-                       rc = unmap_and_move_huge_page(get_new_page,
-                                       private, page, pass > 2, offlining,
-                                       sync);
-
-                       switch(rc) {
-                       case -ENOMEM:
-                               goto out;
-                       case -EAGAIN:
-                               retry++;
-                               break;
-                       case 0:
-                               break;
-                       default:
-                               /* Permanent failure */
-                               nr_failed++;
-                               break;
-                       }
+                       break;
+               case MIGRATEPAGE_SUCCESS:
+                       goto out;
+               default:
+                       rc = -EIO;
+                       goto out;
                }
        }
-       rc = 0;
 out:
-       if (rc)
-               return rc;
-
-       return nr_failed + retry;
+       return rc;
 }
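
migrate_huge_page() replaces the removed list-based migrate_huge_pages(); its
expected caller is the soft-offline path in mm/memory-failure.c, roughly (a
hedged sketch, with new_page() being that file's allocation callback):

        ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
                                MIGRATE_SYNC);
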
 
 #ifdef CONFIG_NUMA
@@ -1062,7 +1151,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                        goto set_status;
 
                /* Use PageReserved to check for zero page */
-               if (PageReserved(page) || PageKsm(page))
+               if (PageReserved(page))
                        goto put_and_set;
 
                pp->page = page;
@@ -1099,7 +1188,7 @@ set_status:
        err = 0;
        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, new_page_node,
-                               (unsigned long)pm, 0, true);
+                               (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
                if (err)
                        putback_lru_pages(&pagelist);
        }
@@ -1112,20 +1201,17 @@ set_status:
  * Migrate an array of page address onto an array of nodes and fill
  * the corresponding array of status.
  */
-static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                         unsigned long nr_pages,
                         const void __user * __user *pages,
                         const int __user *nodes,
                         int __user *status, int flags)
 {
        struct page_to_node *pm;
-       nodemask_t task_nodes;
        unsigned long chunk_nr_pages;
        unsigned long chunk_start;
        int err;
 
-       task_nodes = cpuset_mems_allowed(task);
-
        err = -ENOMEM;
        pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
        if (!pm)
@@ -1164,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
                        if (node < 0 || node >= MAX_NUMNODES)
                                goto out_pm;
 
-                       if (!node_state(node, N_HIGH_MEMORY))
+                       if (!node_state(node, N_MEMORY))
                                goto out_pm;
 
                        err = -EACCES;
@@ -1226,7 +1312,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 
                err = -ENOENT;
                /* Use PageReserved to check for zero page */
-               if (!page || PageReserved(page) || PageKsm(page))
+               if (!page || PageReserved(page))
                        goto set_status;
 
                err = page_to_nid(page);
@@ -1287,6 +1373,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
        struct task_struct *task;
        struct mm_struct *mm;
        int err;
+       nodemask_t task_nodes;
 
        /* Check flags */
        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1302,11 +1389,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
                rcu_read_unlock();
                return -ESRCH;
        }
-       mm = get_task_mm(task);
-       rcu_read_unlock();
-
-       if (!mm)
-               return -EINVAL;
+       get_task_struct(task);
 
        /*
         * Check if this process has the right to modify the specified
@@ -1314,10 +1397,9 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
-       rcu_read_lock();
        tcred = __task_cred(task);
-       if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
-           cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
+       if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+           !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
            !capable(CAP_SYS_NICE)) {
                rcu_read_unlock();
                err = -EPERM;
@@ -1329,16 +1411,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
        if (err)
                goto out;
 
-       if (nodes) {
-               err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
-                                   flags);
-       } else {
+       task_nodes = cpuset_mems_allowed(task);
+       mm = get_task_mm(task);
+       put_task_struct(task);
+
+       if (!mm)
+               return -EINVAL;
+
+       if (nodes)
+               err = do_pages_move(mm, task_nodes, nr_pages, pages,
+                                   nodes, status, flags);
+       else
                err = do_pages_stat(mm, nr_pages, pages, status);
-       }
 
-out:
        mmput(mm);
        return err;
+
+out:
+       put_task_struct(task);
+       return err;
 }
 
 /*
@@ -1361,4 +1452,325 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
        }
        return err;
 }
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks which crude
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+                                  unsigned long nr_migrate_pages)
+{
+       int z;
+       for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+               struct zone *zone = pgdat->node_zones + z;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone->all_unreclaimable)
+                       continue;
+
+               /* Avoid waking kswapd by allocating nr_migrate_pages pages. */
+               if (!zone_watermark_ok(zone, 0,
+                                      high_wmark_pages(zone) +
+                                      nr_migrate_pages,
+                                      0, 0))
+                       continue;
+               return true;
+       }
+       return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+                                          unsigned long data,
+                                          int **result)
+{
+       int nid = (int) data;
+       struct page *newpage;
+
+       newpage = alloc_pages_exact_node(nid,
+                                        (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
+                                         __GFP_NOMEMALLOC | __GFP_NORETRY |
+                                         __GFP_NOWARN) &
+                                        ~GFP_IOFS, 0);
+       if (newpage)
+               page_nid_xchg_last(newpage, page_nid_last(page));
+
+       return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs
+ * window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited. However,
+ * as it is faults that reset the window, PTE updates will happen unconditionally
+ * if there has not been a fault for @pteupdate_interval_millisecs after the
+ * throttle window closed.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
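
As a sanity check of these defaults against the comment above: with 4 KiB
pages, ratelimit_pages = 128 << (20 - 12) = 32768 pages, i.e. 128 MiB allowed
per 100 ms window, which is the quoted ceiling of 1280M per second.
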
+
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+       pg_data_t *pgdat = NODE_DATA(node);
+
+       if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+                               msecs_to_jiffies(pteupdate_interval_millisecs)))
+               return false;
+
+       if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+               return false;
+
+       return true;
+}
+
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+{
+       bool rate_limited = false;
+
+       /*
+        * Rate-limit the amount of data that is being migrated to a node.
+        * Optimal placement is no good if the memory bus is saturated and
+        * all the time is being spent migrating!
+        */
+       spin_lock(&pgdat->numabalancing_migrate_lock);
+       if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+               pgdat->numabalancing_migrate_nr_pages = 0;
+               pgdat->numabalancing_migrate_next_window = jiffies +
+                       msecs_to_jiffies(migrate_interval_millisecs);
+       }
+       if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+               rate_limited = true;
+       else
+               pgdat->numabalancing_migrate_nr_pages += nr_pages;
+       spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+       return rate_limited;
+}
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+       int page_lru;
+
+       VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
+
+       /* Avoid migrating to a node that is nearly full */
+       if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+               return 0;
+
+       if (isolate_lru_page(page))
+               return 0;
+
+       /*
+        * migrate_misplaced_transhuge_page() skips page migration's usual
+        * check on page_count(), so we must do it here, now that the page
+        * has been isolated: a GUP pin, or any other pin, prevents migration.
+        * The expected page count is 3: 1 for the page's mapcount, 1 for the
+        * caller's pin, and 1 for the reference taken by isolate_lru_page().
+        */
+       if (PageTransHuge(page) && page_count(page) != 3) {
+               putback_lru_page(page);
+               return 0;
+       }
+
+       page_lru = page_is_file_cache(page);
+       mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+                               hpage_nr_pages(page));
+
+       /*
+        * Isolating the page has taken another reference, so the
+        * caller's reference can be safely dropped without the page
+        * disappearing underneath us during migration.
+        */
+       put_page(page);
+       return 1;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+       pg_data_t *pgdat = NODE_DATA(node);
+       int isolated;
+       int nr_remaining;
+       LIST_HEAD(migratepages);
+
+       /*
+        * Don't migrate pages that are mapped in multiple processes.
+        * TODO: Handle false sharing detection instead of this hammer
+        */
+       if (page_mapcount(page) != 1)
+               goto out;
+
+       /*
+        * Rate-limit the amount of data that is being migrated to a node.
+        * Optimal placement is no good if the memory bus is saturated and
+        * all the time is being spent migrating!
+        */
+       if (numamigrate_update_ratelimit(pgdat, 1))
+               goto out;
+
+       isolated = numamigrate_isolate_page(pgdat, page);
+       if (!isolated)
+               goto out;
+
+       list_add(&page->lru, &migratepages);
+       nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+                                    node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+       if (nr_remaining) {
+               putback_lru_pages(&migratepages);
+               isolated = 0;
+       } else
+               count_vm_numa_event(NUMA_PAGE_MIGRATE);
+       BUG_ON(!list_empty(&migratepages));
+       return isolated;
+
+out:
+       put_page(page);
+       return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
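
The consumer of migrate_misplaced_page() is the NUMA-hinting fault path; a
hedged sketch of the do_numa_page() call site in mm/memory.c:

        /* page carries an extra reference; migrate_misplaced_page() drops it */
        migrated = migrate_misplaced_page(page, target_nid);
        if (migrated)
                current_nid = target_nid;
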
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+/*
+ * Migrates a THP to a given target node. page must be locked and is unlocked
+ * before returning.
+ */
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+                               struct vm_area_struct *vma,
+                               pmd_t *pmd, pmd_t entry,
+                               unsigned long address,
+                               struct page *page, int node)
+{
+       unsigned long haddr = address & HPAGE_PMD_MASK;
+       pg_data_t *pgdat = NODE_DATA(node);
+       int isolated = 0;
+       struct page *new_page = NULL;
+       struct mem_cgroup *memcg = NULL;
+       int page_lru = page_is_file_cache(page);
+
+       /*
+        * Don't migrate pages that are mapped in multiple processes.
+        * TODO: Handle false sharing detection instead of this hammer
+        */
+       if (page_mapcount(page) != 1)
+               goto out_dropref;
+
+       /*
+        * Rate-limit the amount of data that is being migrated to a node.
+        * Optimal placement is no good if the memory bus is saturated and
+        * all the time is being spent migrating!
+        */
+       if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
+               goto out_dropref;
+
+       new_page = alloc_pages_node(node,
+               (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+       if (!new_page)
+               goto out_fail;
+
+       page_nid_xchg_last(new_page, page_nid_last(page));
+
+       isolated = numamigrate_isolate_page(pgdat, page);
+       if (!isolated) {
+               put_page(new_page);
+               goto out_fail;
+       }
+
+       /* Prepare a page as a migration target */
+       __set_page_locked(new_page);
+       SetPageSwapBacked(new_page);
+
+       /* anon mapping, we can simply copy page->mapping to the new page: */
+       new_page->mapping = page->mapping;
+       new_page->index = page->index;
+       migrate_page_copy(new_page, page);
+       WARN_ON(PageLRU(new_page));
+
+       /* Recheck the target PMD */
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(*pmd, entry))) {
+               spin_unlock(&mm->page_table_lock);
+
+               /* Reverse changes made by migrate_page_copy() */
+               if (TestClearPageActive(new_page))
+                       SetPageActive(page);
+               if (TestClearPageUnevictable(new_page))
+                       SetPageUnevictable(page);
+               mlock_migrate_page(page, new_page);
+
+               unlock_page(new_page);
+               put_page(new_page);             /* Free it */
+
+               /* Retake the callers reference and putback on LRU */
+               get_page(page);
+               putback_lru_page(page);
+               mod_zone_page_state(page_zone(page),
+                        NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+
+               goto out_unlock;
+       }
+
+       /*
+        * Traditional migration needs to prepare the memcg charge
+        * transaction early to prevent the old page from being
+        * uncharged when installing migration entries.  Here we can
+        * save the potential rollback and start the charge transfer
+        * only when migration is already known to end successfully.
+        */
+       mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+       entry = mk_pmd(new_page, vma->vm_page_prot);
+       entry = pmd_mknonnuma(entry);
+       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+       entry = pmd_mkhuge(entry);
+
+       pmdp_clear_flush(vma, haddr, pmd);
+       set_pmd_at(mm, haddr, pmd, entry);
+       page_add_new_anon_rmap(new_page, vma, haddr);
+       update_mmu_cache_pmd(vma, address, &entry);
+       page_remove_rmap(page);
+       /*
+        * Finish the charge transaction under the page table lock to
+        * prevent split_huge_page() from dividing up the charge
+        * before it's fully transferred to the new page.
+        */
+       mem_cgroup_end_migration(memcg, page, new_page, true);
+       spin_unlock(&mm->page_table_lock);
+
+       unlock_page(new_page);
+       unlock_page(page);
+       put_page(page);                 /* Drop the rmap reference */
+       put_page(page);                 /* Drop the LRU isolation reference */
+
+       count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+       count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+       mod_zone_page_state(page_zone(page),
+                       NR_ISOLATED_ANON + page_lru,
+                       -HPAGE_PMD_NR);
+       return isolated;
+
+out_fail:
+       count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+out_dropref:
+       entry = pmd_mknonnuma(entry);
+       set_pmd_at(mm, haddr, pmd, entry);
+       update_mmu_cache_pmd(vma, address, &entry);
+
+out_unlock:
+       unlock_page(page);
+       put_page(page);
+       return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE */
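
The THP variant is driven from the huge-pmd NUMA fault handler; a hedged
sketch of the do_huge_pmd_numa_page() call site in mm/huge_memory.c (pmdp is
the pmd pointer, pmd the snapshotted value checked by pmd_same() above):

        migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd,
                                                    addr, page, target_nid);
        if (migrated)
                task_numa_fault(target_nid, HPAGE_PMD_NR, true);
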
+
+#endif /* CONFIG_NUMA */