Revert "netfilter: xt_qtaguid: Allow tracking loopback"

[firefly-linux-kernel-4.4.55.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 70d43bf00dc6c74fc8757e3c88ce91eda06f1cc5..eaf40d6d220846628ef647fe1111d6fcaa05040d 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
  #include <linux/pagemap.h>
  #include <linux/jiffies.h>
  #include <linux/bootmem.h>
+#include <linux/memblock.h>
  #include <linux/compiler.h>
  #include <linux/kernel.h>
  #include <linux/kmemcheck.h>
@@ -29,6 +30,7 @@
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
+#include <linux/ratelimit.h>
  #include <linux/oom.h>
  #include <linux/notifier.h>
  #include <linux/topology.h>
@@ -38,6 +40,7 @@
  #include <linux/memory_hotplug.h>
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
+#include <linux/vmstat.h>
  #include <linux/mempolicy.h>
  #include <linux/stop_machine.h>
  #include <linux/sort.h>
@@ -48,12 +51,33 @@
  #include <linux/page_cgroup.h>
  #include <linux/debugobjects.h>
  #include <linux/kmemleak.h>
+#include <linux/memory.h>
+#include <linux/compaction.h>
  #include <trace/events/kmem.h>
+#include <linux/ftrace_event.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
  #include "internal.h"
  
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_);               /* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
  /*
   * Array of node states.
   */
@@ -75,6 +99,50 @@ unsigned long totalreserve_pages __read_mostly;
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended.  To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+       WARN_ON(!mutex_is_locked(&pm_mutex));
+       if (saved_gfp_mask) {
+               gfp_allowed_mask = saved_gfp_mask;
+               saved_gfp_mask = 0;
+       }
+}
+
+void pm_restrict_gfp_mask(void)
+{
+       WARN_ON(!mutex_is_locked(&pm_mutex));
+       WARN_ON(saved_gfp_mask);
+       saved_gfp_mask = gfp_allowed_mask;
+       gfp_allowed_mask &= ~GFP_IOFS;
+}
+
+static bool pm_suspending(void)
+{
+       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+               return false;
+       return true;
+}
+
+#else
+
+static bool pm_suspending(void)
+{
+       return false;
+}
+#endif /* CONFIG_PM_SLEEP */
+
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
  int pageblock_order __read_mostly;
  #endif
@@ -237,7 +305,7 @@ static void bad_page(struct page *page)
  
         /* Don't complain about poisoned pages */
         if (PageHWPoison(page)) {
-               __ClearPageBuddy(page);
+               reset_page_mapcount(page); /* remove PageBuddy */
                 return;
         }
  
@@ -263,15 +331,12 @@ static void bad_page(struct page *page)
  
         printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
                 current->comm, page_to_pfn(page));
-       printk(KERN_ALERT
-               "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
-               page, (void *)page->flags, page_count(page),
-               page_mapcount(page), page->mapping, page->index);
+       dump_page(page);
  
         dump_stack();
  out:
         /* Leave bad fields for debug, except PageBuddy could make trouble */
-       __ClearPageBuddy(page);
+       reset_page_mapcount(page); /* remove PageBuddy */
         add_taint(TAINT_BAD_PAGE);
  }
  
@@ -305,12 +370,13 @@ void prep_compound_page(struct page *page, unsigned long order)
         __SetPageHead(page);
         for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
-
                 __SetPageTail(p);
+               set_page_count(p, 0);
                 p->first_page = page;
         }
  }
  
+/* update __split_huge_page_refcount if you change this function */
  static int destroy_compound_page(struct page *page, unsigned long order)
  {
         int i;
@@ -380,18 +446,10 @@ static inline void rmv_page_order(struct page *page)
   *
   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
   */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-       unsigned long buddy_idx = page_idx ^ (1 << order);
-
-       return page + (buddy_idx - page_idx);
-}
-
  static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
  {
-       return (page_idx & ~(1 << order));
+       return page_idx ^ (1 << order);
  }
  
  /*
@@ -402,8 +460,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
   * (c) a page and its buddy have the same order &&
   * (d) a page and its buddy are in the same zone.
   *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
   *
   * For recording page's order, we use page_private(page).
   */
@@ -436,7 +494,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
   * order is recorded in page_private(page) field.
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
@@ -452,6 +510,9 @@ static inline void __free_one_page(struct page *page,
                 int migratetype)
  {
         unsigned long page_idx;
+       unsigned long combined_idx;
+       unsigned long uninitialized_var(buddy_idx);
+       struct page *buddy;
  
         if (unlikely(PageCompound(page)))
                 if (unlikely(destroy_compound_page(page, order)))
@@ -465,10 +526,8 @@ static inline void __free_one_page(struct page *page,
         VM_BUG_ON(bad_range(zone, page));
  
         while (order < MAX_ORDER-1) {
-               unsigned long combined_idx;
-               struct page *buddy;
-
-               buddy = __page_find_buddy(page, page_idx, order);
+               buddy_idx = __find_buddy_index(page_idx, order);
+               buddy = page + (buddy_idx - page_idx);
                 if (!page_is_buddy(page, buddy, order))
                         break;
  
@@ -476,18 +535,39 @@ static inline void __free_one_page(struct page *page,
                 list_del(&buddy->lru);
                 zone->free_area[order].nr_free--;
                 rmv_page_order(buddy);
-               combined_idx = __find_combined_index(page_idx, order);
+               combined_idx = buddy_idx & page_idx;
                 page = page + (combined_idx - page_idx);
                 page_idx = combined_idx;
                 order++;
         }
         set_page_order(page, order);
-       list_add(&page->lru,
-               &zone->free_area[order].free_list[migratetype]);
+
+       /*
+        * If this is not the largest possible page, check if the buddy
+        * of the next-highest order is free. If it is, it's possible
+        * that pages are being freed that will coalesce soon. If that is
+        * happening, add the free page to the tail of the list so it's
+        * less likely to be used soon and more likely to be merged as a
+        * higher order page.
+        */
+       if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+               struct page *higher_page, *higher_buddy;
+               combined_idx = buddy_idx & page_idx;
+               higher_page = page + (combined_idx - page_idx);
+               buddy_idx = __find_buddy_index(combined_idx, order + 1);
+               higher_buddy = higher_page + (buddy_idx - combined_idx);
+               if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+                       list_add_tail(&page->lru,
+                               &zone->free_area[order].free_list[migratetype]);
+                       goto out;
+               }
+       }
+
+       list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
         zone->free_area[order].nr_free++;
  }
  
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
  /*
   * free_page_mlock() -- clean up attempts to free and mlocked() page.
   * Page should not be on lru, so no need to fix that up.
@@ -498,16 +578,14 @@ static inline void free_page_mlock(struct page *page)
         __dec_zone_page_state(page, NR_MLOCK);
         __count_vm_event(UNEVICTABLE_MLOCKFREED);
  }
-#else
-static void free_page_mlock(struct page *page) { }
-#endif
  
  static inline int free_pages_check(struct page *page)
  {
         if (unlikely(page_mapcount(page) |
                 (page->mapping != NULL)  |
                 (atomic_read(&page->_count) != 0) |
-               (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
+               (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
+               (mem_cgroup_bad_page_check(page)))) {
                 bad_page(page);
                 return 1;
         }
@@ -532,13 +610,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
  {
         int migratetype = 0;
         int batch_free = 0;
+       int to_free = count;
  
         spin_lock(&zone->lock);
-       zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+       zone->all_unreclaimable = 0;
         zone->pages_scanned = 0;
  
-       __mod_zone_page_state(zone, NR_FREE_PAGES, count);
-       while (count) {
+       while (to_free) {
                 struct page *page;
                 struct list_head *list;
  
@@ -556,14 +634,20 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                         list = &pcp->lists[migratetype];
                 } while (list_empty(list));
  
+               /* This is the only non-empty list. Free them all. */
+               if (batch_free == MIGRATE_PCPTYPES)
+                       batch_free = to_free;
+
                 do {
                         page = list_entry(list->prev, struct page, lru);
                         /* must delete as __free_one_page list manipulates */
                         list_del(&page->lru);
-                       __free_one_page(page, zone, 0, migratetype);
-                       trace_mm_page_pcpu_drain(page, 0, migratetype);
-               } while (--count && --batch_free && !list_empty(list));
+                       /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+                       __free_one_page(page, zone, 0, page_private(page));
+                       trace_mm_page_pcpu_drain(page, 0, page_private(page));
+               } while (--to_free && --batch_free && !list_empty(list));
         }
+       __mod_zone_page_state(zone, NR_FREE_PAGES, count);
         spin_unlock(&zone->lock);
  }
  
@@ -571,27 +655,28 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
                                 int migratetype)
  {
         spin_lock(&zone->lock);
-       zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+       zone->all_unreclaimable = 0;
         zone->pages_scanned = 0;
  
-       __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
         __free_one_page(page, zone, order, migratetype);
+       __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
         spin_unlock(&zone->lock);
  }
  
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
  {
-       unsigned long flags;
         int i;
         int bad = 0;
-       int wasMlocked = __TestClearPageMlocked(page);
  
+       trace_mm_page_free_direct(page, order);
         kmemcheck_free_shadow(page, order);
  
-       for (i = 0 ; i < (1 << order) ; ++i)
+       if (PageAnon(page))
+               page->mapping = NULL;
+       for (i = 0; i < (1 << order); i++)
                 bad += free_pages_check(page + i);
         if (bad)
-               return;
+               return false;
  
         if (!PageHighMem(page)) {
                 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -601,6 +686,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
         arch_free_page(page, order);
         kernel_map_pages(page, 1 << order, 0);
  
+       return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+       unsigned long flags;
+       int wasMlocked = __TestClearPageMlocked(page);
+
+       if (!free_pages_prepare(page, order))
+               return;
+
         local_irq_save(flags);
         if (unlikely(wasMlocked))
                 free_page_mlock(page);
@@ -678,7 +774,8 @@ static inline int check_new_page(struct page *page)
         if (unlikely(page_mapcount(page) |
                 (page->mapping != NULL)  |
                 (atomic_read(&page->_count) != 0)  |
-               (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
+               (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
+               (mem_cgroup_bad_page_check(page)))) {
                 bad_page(page);
                 return 1;
         }
@@ -791,9 +888,8 @@ static int move_freepages(struct zone *zone,
                 }
  
                 order = page_order(page);
-               list_del(&page->lru);
-               list_add(&page->lru,
-                       &zone->free_area[order].free_list[migratetype]);
+               list_move(&page->lru,
+                         &zone->free_area[order].free_list[migratetype]);
                 page += 1 << order;
                 pages_moved += 1 << order;
         }
@@ -864,7 +960,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
                          * If breaking a large block of pages, move all free
                          * pages to the preferred allocation list. If falling
                          * back for a reclaimable kernel allocation, be more
-                        * agressive about taking ownership of free pages
+                        * aggressive about taking ownership of free pages
                          */
                         if (unlikely(current_order >= (pageblock_order >> 1)) ||
                                         start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1012,12 +1108,14 @@ static void drain_pages(unsigned int cpu)
                 struct per_cpu_pageset *pset;
                 struct per_cpu_pages *pcp;
  
-               pset = zone_pcp(zone, cpu);
+               local_irq_save(flags);
+               pset = per_cpu_ptr(zone->pageset, cpu);
  
                 pcp = &pset->pcp;
-               local_irq_save(flags);
-               free_pcppages_bulk(zone, pcp->count, pcp);
-               pcp->count = 0;
+               if (pcp->count) {
+                       free_pcppages_bulk(zone, pcp->count, pcp);
+                       pcp->count = 0;
+               }
                 local_irq_restore(flags);
         }
  }
@@ -1076,8 +1174,9 @@ void mark_free_pages(struct zone *zone)
  
  /*
   * Free a 0-order page
+ * cold == 1 ? free a cold page : free a hot page
   */
-static void free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, int cold)
  {
         struct zone *zone = page_zone(page);
         struct per_cpu_pages *pcp;
@@ -1085,21 +1184,9 @@ static void free_hot_cold_page(struct page *page, int cold)
         int migratetype;
         int wasMlocked = __TestClearPageMlocked(page);
  
-       kmemcheck_free_shadow(page, 0);
-
-       if (PageAnon(page))
-               page->mapping = NULL;
-       if (free_pages_check(page))
+       if (!free_pages_prepare(page, 0))
                 return;
  
-       if (!PageHighMem(page)) {
-               debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
-               debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
-       }
-       arch_free_page(page, 0);
-       kernel_map_pages(page, 1, 0);
-
-       pcp = &zone_pcp(zone, get_cpu())->pcp;
         migratetype = get_pageblock_migratetype(page);
         set_page_private(page, migratetype);
         local_irq_save(flags);
@@ -1122,6 +1209,7 @@ static void free_hot_cold_page(struct page *page, int cold)
                 migratetype = MIGRATE_MOVABLE;
         }
  
+       pcp = &this_cpu_ptr(zone->pageset)->pcp;
         if (cold)
                 list_add_tail(&page->lru, &pcp->lists[migratetype]);
         else
@@ -1134,15 +1222,8 @@ static void free_hot_cold_page(struct page *page, int cold)
  
  out:
         local_irq_restore(flags);
-       put_cpu();
  }
  
-void free_hot_page(struct page *page)
-{
-       trace_mm_page_free_direct(page, 0);
-       free_hot_cold_page(page, 0);
-}
-       
  /*
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
@@ -1171,6 +1252,51 @@ void split_page(struct page *page, unsigned int order)
                 set_page_refcounted(page + i);
  }
  
+/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_pages() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+       unsigned int order;
+       unsigned long watermark;
+       struct zone *zone;
+
+       BUG_ON(!PageBuddy(page));
+
+       zone = page_zone(page);
+       order = page_order(page);
+
+       /* Obey watermarks as if the page was being allocated */
+       watermark = low_wmark_pages(zone) + (1 << order);
+       if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+               return 0;
+
+       /* Remove page from free list */
+       list_del(&page->lru);
+       zone->free_area[order].nr_free--;
+       rmv_page_order(page);
+       __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+       /* Split into individual pages */
+       set_page_refcounted(page);
+       split_page(page, order);
+
+       if (order >= pageblock_order - 1) {
+               struct page *endpage = page + (1 << order) - 1;
+               for (; page < endpage; page += pageblock_nr_pages)
+                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+       }
+
+       return 1 << order;
+}
+
  /*
   * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
   * we cheat by calling it from here, in the order > 0 path.  Saves a branch
@@ -1184,17 +1310,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
         unsigned long flags;
         struct page *page;
         int cold = !!(gfp_flags & __GFP_COLD);
-       int cpu;
  
  again:
-       cpu  = get_cpu();
         if (likely(order == 0)) {
                 struct per_cpu_pages *pcp;
                 struct list_head *list;
  
-               pcp = &zone_pcp(zone, cpu)->pcp;
-               list = &pcp->lists[migratetype];
                 local_irq_save(flags);
+               pcp = &this_cpu_ptr(zone->pageset)->pcp;
+               list = &pcp->lists[migratetype];
                 if (list_empty(list)) {
                         pcp->count += rmqueue_bulk(zone, 0,
                                         pcp->batch, list,
@@ -1226,16 +1350,15 @@ again:
                 }
                 spin_lock_irqsave(&zone->lock, flags);
                 page = __rmqueue(zone, order, migratetype);
-               __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
                 spin_unlock(&zone->lock);
                 if (!page)
                         goto failed;
+               __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
         }
  
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
-       zone_statistics(preferred_zone, zone);
+       zone_statistics(preferred_zone, zone, gfp_flags);
         local_irq_restore(flags);
-       put_cpu();
  
         VM_BUG_ON(bad_range(zone, page));
         if (prep_new_page(page, order, gfp_flags))
@@ -1244,7 +1367,6 @@ again:
  
  failed:
         local_irq_restore(flags);
-       put_cpu();
         return NULL;
  }
  
@@ -1357,24 +1479,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
  
  /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
   * of the allocation.
   */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags, long free_pages)
  {
         /* free_pages my go negative - that's OK */
         long min = mark;
-       long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
         int o;
  
+       free_pages -= (1 << order) - 1; /* i.e. free - (1 << order) + 1 */
         if (alloc_flags & ALLOC_HIGH)
                 min -= min / 2;
         if (alloc_flags & ALLOC_HARDER)
                 min -= min / 4;
  
         if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-               return 0;
+               return false;
         for (o = 0; o < order; o++) {
                 /* At the next order, this order's pages become unavailable */
                 free_pages -= z->free_area[o].nr_free << o;
@@ -1383,9 +1505,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                 min >>= min_free_order_shift;
  
                 if (free_pages <= min)
-                       return 0;
+                       return false;
         }
-       return 1;
+       return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                                               free_pages);
  }
  
  #ifdef CONFIG_NUMA
@@ -1490,6 +1631,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
         set_bit(i, zlc->fullzones);
  }
  
+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
+
+       zlc = zonelist->zlcache_ptr;
+       if (!zlc)
+               return;
+
+       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
  #else  /* CONFIG_NUMA */
  
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1506,6 +1662,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
  {
  }
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
  #endif /* CONFIG_NUMA */
  
  /*
@@ -1538,7 +1698,7 @@ zonelist_scan:
                                 continue;
                 if ((alloc_flags & ALLOC_CPUSET) &&
                         !cpuset_zone_allowed_softwall(zone, gfp_mask))
-                               goto try_next_zone;
+                               continue;
  
                 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1550,17 +1710,36 @@ zonelist_scan:
                                     classzone_idx, alloc_flags))
                                 goto try_this_zone;
  
+                       if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+                               /*
+                                * we do zlc_setup if there are multiple nodes
+                                * and before considering the first zone allowed
+                                * by the cpuset.
+                                */
+                               allowednodes = zlc_setup(zonelist, alloc_flags);
+                               zlc_active = 1;
+                               did_zlc_setup = 1;
+                       }
+
                         if (zone_reclaim_mode == 0)
                                 goto this_zone_full;
  
+                       /*
+                        * As we may have just activated ZLC, check if the first
+                        * eligible zone has failed zone_reclaim recently.
+                        */
+                       if (NUMA_BUILD && zlc_active &&
+                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
+                               continue;
+
                         ret = zone_reclaim(zone, gfp_mask, order);
                         switch (ret) {
                         case ZONE_RECLAIM_NOSCAN:
                                 /* did not scan */
-                               goto try_next_zone;
+                               continue;
                         case ZONE_RECLAIM_FULL:
                                 /* scanned but unreclaimable */
-                               goto this_zone_full;
+                               continue;
                         default:
                                 /* did we reclaim enough */
                                 if (!zone_watermark_ok(zone, order, mark,
@@ -1577,16 +1756,6 @@ try_this_zone:
  this_zone_full:
                 if (NUMA_BUILD)
                         zlc_mark_zone_full(zonelist, z);
-try_next_zone:
-               if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
-                       /*
-                        * we do zlc_setup after the first zone is tried but only
-                        * if there are multiple nodes make it worthwhile
-                        */
-                       allowednodes = zlc_setup(zonelist, alloc_flags);
-                       zlc_active = 1;
-                       did_zlc_setup = 1;
-               }
         }
  
         if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1597,6 +1766,59 @@ try_next_zone:
         return page;
  }
  
+/*
+ * Large machines with many possible nodes should not always dump per-node
+ * meminfo in irq context.
+ */
+static inline bool should_suppress_show_mem(void)
+{
+       bool ret = false;
+
+#if NODES_SHIFT > 8
+       ret = in_interrupt();
+#endif
+       return ret;
+}
+
+static DEFINE_RATELIMIT_STATE(nopage_rs,
+               DEFAULT_RATELIMIT_INTERVAL,
+               DEFAULT_RATELIMIT_BURST);
+
+void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+{
+       va_list args;
+       unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+               return;
+
+       /*
+        * This documents exceptions given to allocations in certain
+        * contexts that are allowed to allocate outside current's set
+        * of allowed nodes.
+        */
+       if (!(gfp_mask & __GFP_NOMEMALLOC))
+               if (test_thread_flag(TIF_MEMDIE) ||
+                   (current->flags & (PF_MEMALLOC | PF_EXITING)))
+                       filter &= ~SHOW_MEM_FILTER_NODES;
+       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+               filter &= ~SHOW_MEM_FILTER_NODES;
+
+       if (fmt) {
+               printk(KERN_WARNING);
+               va_start(args, fmt);
+               vprintk(fmt, args);
+               va_end(args);
+       }
+
+       pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
+                  current->comm, order, gfp_mask);
+
+       dump_stack();
+       if (!should_suppress_show_mem())
+               show_mem(filter);
+}
+
  static inline int
  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
                                 unsigned long pages_reclaimed)
@@ -1642,7 +1864,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
         struct page *page;
  
         /* Acquire the OOM killer lock for the zones in zonelist */
-       if (!try_set_zone_oom(zonelist, gfp_mask)) {
+       if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
                 schedule_timeout_uninterruptible(1);
                 return NULL;
         }
@@ -1659,18 +1881,104 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
         if (page)
                 goto out;
  
-       /* The OOM killer will not help higher order allocs */
-       if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
-               goto out;
-
+       if (!(gfp_mask & __GFP_NOFAIL)) {
+               /* The OOM killer will not help higher order allocs */
+               if (order > PAGE_ALLOC_COSTLY_ORDER)
+                       goto out;
+               /* The OOM killer does not needlessly kill tasks for lowmem */
+               if (high_zoneidx < ZONE_NORMAL)
+                       goto out;
+               /*
+                * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+                * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+                * The caller should handle page allocation failure by itself if
+                * it specifies __GFP_THISNODE.
+                * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+                */
+               if (gfp_mask & __GFP_THISNODE)
+                       goto out;
+       }
         /* Exhausted what can be done so it's blamo time */
-       out_of_memory(zonelist, gfp_mask, order);
+       out_of_memory(zonelist, gfp_mask, order, nodemask);
  
  out:
         clear_zonelist_oom(zonelist, gfp_mask);
         return page;
  }
  
+#ifdef CONFIG_COMPACTION
+/*
+ * Try memory compaction for high-order allocations before reclaim.
+ *
+ * Returns the page obtained from get_page_from_freelist() after a
+ * compaction run, or NULL when nothing was allocated: order is 0,
+ * compaction is currently deferred for preferred_zone (in which case
+ * *deferred_compaction is set to true), compaction reported
+ * COMPACT_SKIPPED, or the allocation attempt after compaction failed.
+ *
+ * *did_some_progress receives the return value of try_to_compact_pages();
+ * it is not written on the two early-return paths.
+ */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+       struct zonelist *zonelist, enum zone_type high_zoneidx,
+       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
+{
+       struct page *page;
+
+       if (!order) /* order-0 requests are never compacted for */
+               return NULL;
+
+       if (compaction_deferred(preferred_zone)) {
+               *deferred_compaction = true; /* tell the caller why we gave up */
+               return NULL;
+       }
+
+       current->flags |= PF_MEMALLOC;
+       *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+                                               nodemask, sync_migration);
+       current->flags &= ~PF_MEMALLOC; /* NOTE(review): cleared unconditionally, not restored; assumes the flag was clear on entry */
+       if (*did_some_progress != COMPACT_SKIPPED) {
+
+               /*
+                * Page migration frees to the PCP lists but we want merging,
+                * so drain this CPU's lists before retrying the allocation.
+                */
+               drain_pages(get_cpu());
+               put_cpu();
+
+               page = get_page_from_freelist(gfp_mask, nodemask,
+                               order, zonelist, high_zoneidx,
+                               alloc_flags, preferred_zone,
+                               migratetype);
+               if (page) {
+                       preferred_zone->compact_considered = 0; /* success: reset the zone's deferral counters */
+                       preferred_zone->compact_defer_shift = 0;
+                       count_vm_event(COMPACTSUCCESS);
+                       return page;
+               }
+
+               /*
+                * It's bad if compaction run occurs and fails.
+                * The most likely reason is that pages exist,
+                * but not enough to satisfy watermarks.
+                */
+               count_vm_event(COMPACTFAIL);
+
+               /*
+                * As async compaction considers a subset of pageblocks, only
+                * defer if the failure was a sync compaction failure.
+                */
+               if (sync_migration)
+                       defer_compaction(preferred_zone);
+
+               cond_resched();
+       }
+
+       return NULL;
+}
+#else /* !CONFIG_COMPACTION */
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+       struct zonelist *zonelist, enum zone_type high_zoneidx,
+       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
+{
+       return NULL; /* stub: never allocates and leaves both out-parameters untouched */
+}
+#endif /* CONFIG_COMPACTION */
+
  /* The really slow allocator path where we enter direct reclaim */
  static inline struct page *
  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1680,33 +1988,48 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
  {
         struct page *page = NULL;
         struct reclaim_state reclaim_state;
-       struct task_struct *p = current;
+       bool drained = false;
  
         cond_resched();
  
         /* We now go into synchronous reclaim */
         cpuset_memory_pressure_bump();
-       p->flags |= PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
         lockdep_set_current_reclaim_state(gfp_mask);
         reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       current->reclaim_state = &reclaim_state;
  
         *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
  
-       p->reclaim_state = NULL;
+       current->reclaim_state = NULL;
         lockdep_clear_current_reclaim_state();
-       p->flags &= ~PF_MEMALLOC;
+       current->flags &= ~PF_MEMALLOC;
  
         cond_resched();
  
-       if (order != 0)
-               drain_all_pages();
+       if (unlikely(!(*did_some_progress)))
+               return NULL;
  
-       if (likely(*did_some_progress))
-               page = get_page_from_freelist(gfp_mask, nodemask, order,
+       /* After successful reclaim, reconsider all zones for allocation */
+       if (NUMA_BUILD)
+               zlc_clear_zones_full(zonelist);
+
+retry:
+       page = get_page_from_freelist(gfp_mask, nodemask, order,
                                         zonelist, high_zoneidx,
                                         alloc_flags, preferred_zone,
                                         migratetype);
+
+       /*
+        * If an allocation failed after direct reclaim, it could be because
+        * pages are pinned on the per-cpu lists. Drain them and try again
+        */
+       if (!page && !drained) {
+               drain_all_pages();
+               drained = true;
+               goto retry;
+       }
+
         return page;
  }
  
@@ -1728,7 +2051,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
                         preferred_zone, migratetype);
  
                 if (!page && gfp_mask & __GFP_NOFAIL)
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
         } while (!page && (gfp_mask & __GFP_NOFAIL));
  
         return page;
@@ -1736,24 +2059,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
  
  static inline
  void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-                                               enum zone_type high_zoneidx)
+                                               enum zone_type high_zoneidx,
+                                               enum zone_type classzone_idx)
  {
         struct zoneref *z;
         struct zone *zone;
  
         for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-               wakeup_kswapd(zone, order);
+               wakeup_kswapd(zone, order, classzone_idx);
  }
  
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
-       struct task_struct *p = current;
         int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
         const gfp_t wait = gfp_mask & __GFP_WAIT;
  
         /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
-       BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+       BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
  
         /*
          * The caller may dip into page reserves a bit more if the caller
@@ -1761,21 +2084,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
          * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
          */
-       alloc_flags |= (gfp_mask & __GFP_HIGH);
+       alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
         if (!wait) {
-               alloc_flags |= ALLOC_HARDER;
+               /*
+                * Not worth trying to allocate harder for
+                * __GFP_NOMEMALLOC even if it can't schedule.
+                */
+               if  (!(gfp_mask & __GFP_NOMEMALLOC))
+                       alloc_flags |= ALLOC_HARDER;
                 /*
                  * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
                  * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
                  */
                 alloc_flags &= ~ALLOC_CPUSET;
-       } else if (unlikely(rt_task(p)) && !in_interrupt())
+       } else if (unlikely(rt_task(current)) && !in_interrupt())
                 alloc_flags |= ALLOC_HARDER;
  
         if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
                 if (!in_interrupt() &&
-                   ((p->flags & PF_MEMALLOC) ||
+                   ((current->flags & PF_MEMALLOC) ||
                      unlikely(test_thread_flag(TIF_MEMDIE))))
                         alloc_flags |= ALLOC_NO_WATERMARKS;
         }
@@ -1794,7 +2122,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         int alloc_flags;
         unsigned long pages_reclaimed = 0;
         unsigned long did_some_progress;
-       struct task_struct *p = current;
+       bool sync_migration = false;
+       bool deferred_compaction = false;
  
         /*
          * In the slowpath, we sanity check order to avoid ever trying to
@@ -1819,7 +2148,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                 goto nopage;
  
  restart:
-       wake_all_kswapd(order, zonelist, high_zoneidx);
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapd(order, zonelist, high_zoneidx,
+                                               zone_idx(preferred_zone));
  
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -1828,6 +2159,15 @@ restart:
          */
         alloc_flags = gfp_to_alloc_flags(gfp_mask);
  
+       /*
+        * Find the true preferred zone if the allocation is unconstrained by
+        * cpusets.
+        */
+       if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+               first_zones_zonelist(zonelist, high_zoneidx, NULL,
+                                       &preferred_zone);
+
+rebalance:
         /* This is the last chance, in general, before the goto nopage. */
         page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                         high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1835,7 +2175,6 @@ restart:
         if (page)
                 goto got_pg;
  
-rebalance:
         /* Allocate without watermarks if the context allows */
         if (alloc_flags & ALLOC_NO_WATERMARKS) {
                 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -1850,13 +2189,37 @@ rebalance:
                 goto nopage;
  
         /* Avoid recursion of direct reclaim */
-       if (p->flags & PF_MEMALLOC)
+       if (current->flags & PF_MEMALLOC)
                 goto nopage;
  
         /* Avoid allocations with no watermarks from looping endlessly */
         if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
                 goto nopage;
  
+       /*
+        * Try direct compaction. The first pass is asynchronous. Subsequent
+        * attempts after direct reclaim are synchronous
+        */
+       page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
+       if (page)
+               goto got_pg;
+       sync_migration = true;
+
+       /*
+        * If compaction is deferred for high-order allocations, it is because
+        * sync compaction recently failed. If this is the case and the caller
+        * has requested the system not be heavily disrupted, fail the
+        * allocation now instead of entering direct reclaim
+        */
+       if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+               goto nopage;
+
         /* Try direct reclaim and then allocating */
         page = __alloc_pages_direct_reclaim(gfp_mask, order,
                                         zonelist, high_zoneidx,
@@ -1881,36 +2244,61 @@ rebalance:
                         if (page)
                                 goto got_pg;
  
-                       /*
-                        * The OOM killer does not trigger for high-order
-                        * ~__GFP_NOFAIL allocations so if no progress is being
-                        * made, there are no other options and retrying is
-                        * unlikely to help.
-                        */
-                       if (order > PAGE_ALLOC_COSTLY_ORDER &&
-                                               !(gfp_mask & __GFP_NOFAIL))
-                               goto nopage;
+                       if (!(gfp_mask & __GFP_NOFAIL)) {
+                               /*
+                                * The oom killer is not called for high-order
+                                * allocations that may fail, so if no progress
+                                * is being made, there are no other options and
+                                * retrying is unlikely to help.
+                                */
+                               if (order > PAGE_ALLOC_COSTLY_ORDER)
+                                       goto nopage;
+                               /*
+                                * The oom killer is not called for lowmem
+                                * allocations to prevent needlessly killing
+                                * innocent tasks.
+                                */
+                               if (high_zoneidx < ZONE_NORMAL)
+                                       goto nopage;
+                       }
  
                         goto restart;
                 }
+
+               /*
+                * Suspend converts GFP_KERNEL to __GFP_WAIT which can
+                * prevent reclaim making forward progress without
+                * invoking OOM. Bail if we are suspending
+                */
+               if (pm_suspending())
+                       goto nopage;
         }
  
         /* Check if we should retry the allocation */
         pages_reclaimed += did_some_progress;
         if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                 /* Wait for some write requests to complete then retry */
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                 goto rebalance;
+       } else {
+               /*
+                * High-order allocations do not necessarily loop after
+                * direct reclaim and reclaim/compaction depends on compaction
+                * being called after reclaim so call directly if necessary
+                */
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
+               if (page)
+                       goto got_pg;
         }
  
  nopage:
-       if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
-               printk(KERN_WARNING "%s: page allocation failure."
-                       " order:%d, mode:0x%x\n",
-                       p->comm, order, gfp_mask);
-               dump_stack();
-               show_mem();
-       }
+       warn_alloc_failed(gfp_mask, order, NULL);
         return page;
  got_pg:
         if (kmemcheck_enabled)
@@ -1928,8 +2316,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  {
         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
         struct zone *preferred_zone;
-       struct page *page;
+       struct page *page = NULL;
         int migratetype = allocflags_to_migratetype(gfp_mask);
+       unsigned int cpuset_mems_cookie;
  
         gfp_mask &= gfp_allowed_mask;
  
@@ -1948,10 +2337,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
         if (unlikely(!zonelist->_zonerefs->zone))
                 return NULL;
  
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
         /* The preferred zone is used for statistics later */
-       first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+       first_zones_zonelist(zonelist, high_zoneidx,
+                               nodemask ? : &cpuset_current_mems_allowed,
+                               &preferred_zone);
         if (!preferred_zone)
-               return NULL;
+               goto out;
  
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1963,6 +2357,17 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                                 preferred_zone, migratetype);
  
         trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+       /*
+        * When updating a task's mems_allowed, it is possible to race with
+        * parallel threads in such a way that an allocation can fail while
+        * the mask is being updated. If a page allocation is about to fail,
+        * check if the cpuset changed during allocation and if so, retry.
+        */
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
         return page;
  }
  EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2006,9 +2411,8 @@ void __pagevec_free(struct pagevec *pvec)
  void __free_pages(struct page *page, unsigned int order)
  {
         if (put_page_testzero(page)) {
-               trace_mm_page_free_direct(page, order);
                 if (order == 0)
-                       free_hot_page(page);
+                       free_hot_cold_page(page, 0);
                 else
                         __free_pages_ok(page, order);
         }
@@ -2026,6 +2430,21 @@ void free_pages(unsigned long addr, unsigned int order)
  
  EXPORT_SYMBOL(free_pages);
  
+/*
+ * Trim a (PAGE_SIZE << order) allocation starting at addr down to the
+ * page-aligned size: split it into single pages and free every page
+ * past PAGE_ALIGN(size). A zero addr is passed through as NULL.
+ */
+static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+{
+       unsigned long tail, end;
+
+       if (!addr)
+               return NULL;
+
+       end = addr + (PAGE_SIZE << order);
+
+       split_page(virt_to_page((void *)addr), order);
+       for (tail = addr + PAGE_ALIGN(size); tail < end; tail += PAGE_SIZE)
+               free_page(tail);
+
+       return (void *)addr;
+}
+
  /**
   * alloc_pages_exact - allocate an exact number physically-contiguous pages.
   * @size: the number of bytes to allocate
@@ -2045,21 +2464,32 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
         unsigned long addr;
  
         addr = __get_free_pages(gfp_mask, order);
-       if (addr) {
-               unsigned long alloc_end = addr + (PAGE_SIZE << order);
-               unsigned long used = addr + PAGE_ALIGN(size);
-
-               split_page(virt_to_page((void *)addr), order);
-               while (used < alloc_end) {
-                       free_page(used);
-                       used += PAGE_SIZE;
-               }
-       }
-
-       return (void *)addr;
+       return make_alloc_exact(addr, order, size);
  }
  EXPORT_SYMBOL(alloc_pages_exact);
  
+/**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ *                        pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * Node-aware variant of alloc_pages_exact(): the allocation is attempted on
+ * node nid first before falling back. Returns NULL if the underlying page
+ * allocation fails.
+ * Do not confuse this with alloc_pages_exact_node(), which allocates on a
+ * specific node but is not exact.
+ */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+{
+       unsigned order = get_order(size);
+       struct page *page;
+       unsigned long addr;
+
+       page = alloc_pages_node(nid, gfp_mask, order);
+       if (page == NULL)
+               return NULL;
+
+       /* Hand the surplus pages of the order-sized block back. */
+       addr = (unsigned long)page_address(page);
+       return make_alloc_exact(addr, order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact_nid);
+
  /**
   * free_pages_exact - release memory allocated via alloc_pages_exact()
   * @virt: the value returned by alloc_pages_exact.
@@ -2154,26 +2584,50 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  }
  #endif
  
+/*
+ * Tell show_free_areas() whether node nid should be left out of the dump.
+ * A node is skipped only when SHOW_MEM_FILTER_NODES is set in flags and the
+ * node is not in cpuset_current_mems_allowed.
+ */
+bool skip_free_areas_node(unsigned int flags, int nid)
+{
+       unsigned int cookie;
+       bool skip;
+
+       /* No filtering requested: every node is shown. */
+       if (!(flags & SHOW_MEM_FILTER_NODES))
+               return false;
+
+       /* Repeat the lookup until put_mems_allowed() accepts the cookie. */
+       do {
+               cookie = get_mems_allowed();
+               skip = !node_isset(nid, cpuset_current_mems_allowed);
+       } while (!put_mems_allowed(cookie));
+
+       return skip;
+}
+
  #define K(x) ((x) << (PAGE_SHIFT-10))
  
  /*
   * Show free area list (used inside shift_scroll-lock stuff)
   * We also calculate the percentage fragmentation. We do this by counting the
   * memory on each free list with the exception of the first item on the list.
+ * Suppresses nodes that are not allowed by current's cpuset if
+ * SHOW_MEM_FILTER_NODES is passed.
   */
-void show_free_areas(void)
+void show_free_areas(unsigned int filter)
  {
         int cpu;
         struct zone *zone;
  
         for_each_populated_zone(zone) {
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+                       continue;
                 show_node(zone);
                 printk("%s per-cpu:\n", zone->name);
  
                 for_each_online_cpu(cpu) {
                         struct per_cpu_pageset *pageset;
  
-                       pageset = zone_pcp(zone, cpu);
+                       pageset = per_cpu_ptr(zone->pageset, cpu);
  
                         printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
                                cpu, pageset->pcp.high,
@@ -2208,6 +2662,8 @@ void show_free_areas(void)
         for_each_populated_zone(zone) {
                 int i;
  
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+                       continue;
                 show_node(zone);
                 printk("%s"
                         " free:%lukB"
@@ -2264,7 +2720,7 @@ void show_free_areas(void)
                         K(zone_page_state(zone, NR_BOUNCE)),
                         K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
                         zone->pages_scanned,
-                       (zone_is_all_unreclaimable(zone) ? "yes" : "no")
+                       (zone->all_unreclaimable ? "yes" : "no")
                         );
                 printk("lowmem_reserve[]:");
                 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2275,6 +2731,8 @@ void show_free_areas(void)
         for_each_populated_zone(zone) {
                 unsigned long nr[MAX_ORDER], flags, order, total = 0;
  
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
+                       continue;
                 show_node(zone);
                 printk("%s: ", zone->name);
  
@@ -2381,9 +2839,16 @@ static int __parse_numa_zonelist_order(char *s)
  
  static __init int setup_numa_zonelist_order(char *s)
  {
-       if (s)
-               return __parse_numa_zonelist_order(s);
-       return 0;
+       int ret;
+
+       if (!s)
+               return 0;
+
+       ret = __parse_numa_zonelist_order(s);
+       if (ret == 0)
+               strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+       return ret;
  }
  early_param("numa_zonelist_order", setup_numa_zonelist_order);
  
@@ -2396,13 +2861,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
  {
         char saved_string[NUMA_ZONELIST_ORDER_LEN];
         int ret;
+       static DEFINE_MUTEX(zl_order_mutex);
  
+       mutex_lock(&zl_order_mutex);
         if (write)
-               strncpy(saved_string, (char*)table->data,
-                       NUMA_ZONELIST_ORDER_LEN);
+               strcpy(saved_string, (char*)table->data);
         ret = proc_dostring(table, write, buffer, length, ppos);
         if (ret)
-               return ret;
+               goto out;
         if (write) {
                 int oldval = user_zonelist_order;
                 if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2412,10 +2878,15 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
                         strncpy((char*)table->data, saved_string,
                                 NUMA_ZONELIST_ORDER_LEN);
                         user_zonelist_order = oldval;
-               } else if (oldval != user_zonelist_order)
-                       build_all_zonelists();
+               } else if (oldval != user_zonelist_order) {
+                       mutex_lock(&zonelists_mutex);
+                       build_all_zonelists(NULL);
+                       mutex_unlock(&zonelists_mutex);
+               }
         }
-       return 0;
+out:
+       mutex_unlock(&zl_order_mutex);
+       return ret;
  }
  
  
@@ -2555,10 +3026,10 @@ static int default_zonelist_order(void)
         struct zone *z;
         int average_size;
         /*
-         * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+         * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
          * If they are really small and used heavily, the system can fall
          * into OOM very easily.
-        * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+        * This function detects ZONE_DMA/DMA32 size and configures zone order.
          */
         /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
         low_kmem_size = 0;
@@ -2570,6 +3041,15 @@ static int default_zonelist_order(void)
                                 if (zone_type < ZONE_NORMAL)
                                         low_kmem_size += z->present_pages;
                                 total_size += z->present_pages;
+                       } else if (zone_type == ZONE_NORMAL) {
+                               /*
+                                * If any node has only lowmem, then node order
+                                * is preferred to allow kernel allocations
+                                * locally; otherwise, they can easily infringe
+                                * on other nodes when there is an abundance of
+                                * lowmem available to allocate from.
+                                */
+                               return ZONELIST_ORDER_NODE;
                         }
                 }
         }
@@ -2683,6 +3163,24 @@ static void build_zonelist_cache(pg_data_t *pgdat)
                 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
  }
  
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Node id to use for "local" allocations on behalf of the given node:
+ * the node of the first zone in that node's generic zonelist.
+ * This seeds the percpu 'numa_mem' value, which mostly serves kernel
+ * allocations, hence the zonelist is looked up with GFP_KERNEL.
+ */
+int local_memory_node(int node)
+{
+       struct zonelist *zl = node_zonelist(node, GFP_KERNEL);
+       struct zone *first;
+
+       (void)first_zones_zonelist(zl, gfp_zone(GFP_KERNEL), NULL, &first);
+       return first->node;
+}
+#endif
  
  #else  /* CONFIG_NUMA */
  
@@ -2735,10 +3233,36 @@ static void build_zonelist_cache(pg_data_t *pgdat)
  
  #endif /* CONFIG_NUMA */
  
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ *
+ * setup_pageset() and setup_zone_pageset() are only forward-declared here
+ * so that the zonelist build code below can call them.
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ * numa_zonelist_order_handler() takes it around build_all_zonelists().
+ */
+DEFINE_MUTEX(zonelists_mutex);
+
  /* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
  {
         int nid;
+       int cpu;
  
  #ifdef CONFIG_NUMA
         memset(node_load, 0, sizeof(node_load));
@@ -2749,10 +3273,45 @@ static int __build_all_zonelists(void *dummy)
                 build_zonelists(pgdat);
                 build_zonelist_cache(pgdat);
         }
+
+       /*
+        * Initialize the boot_pagesets that are going to be used
+        * for bootstrapping processors. The real pagesets for
+        * each zone will be allocated later when the per cpu
+        * allocator is available.
+        *
+        * boot_pagesets are used also for bootstrapping offline
+        * cpus if the system is already booted because the pagesets
+        * are needed to initialize allocators on a specific cpu too.
+        * F.e. the percpu allocator needs the page allocator which
+        * needs the percpu allocator in order to allocate its pagesets
+        * (a chicken-egg dilemma).
+        */
+       for_each_possible_cpu(cpu) {
+               setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+               /*
+                * We now know the "local memory node" for each node--
+                * i.e., the node of the first zone in the generic zonelist.
+                * Set up numa_mem percpu variable for on-line cpus.  During
+                * boot, only the boot cpu should be on-line;  we'll init the
+                * secondary cpus' numa_mem as they come on-line.  During
+                * node/memory hotplug, we'll fixup all on-line cpus.
+                */
+               if (cpu_online(cpu))
+                       set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+       }
+
         return 0;
  }
  
-void build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void __ref build_all_zonelists(void *data)
  {
         set_zonelist_order();
  
@@ -2763,6 +3322,10 @@ void build_all_zonelists(void)
         } else {
                 /* we have to stop all cpus to guarantee there is no user
                    of zonelist */
+#ifdef CONFIG_MEMORY_HOTPLUG
+               if (data)
+                       setup_zone_pageset((struct zone *)data);
+#endif
                 stop_machine(__build_all_zonelists, NULL, NULL);
                 /* cpuset refresh routine should be here */
         }
@@ -2858,6 +3421,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
  
  #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
  
+/*
+ * Check if a pageblock contains reserved pages
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long pfn;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+               if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+                       return 1;
+       }
+       return 0;
+}
+
  /*
   * Mark a number of pageblocks as MIGRATE_RESERVE. The number
   * of blocks reserved is based on min_wmark_pages(zone). The memory within
@@ -2867,14 +3444,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
   */
  static void setup_zone_migrate_reserve(struct zone *zone)
  {
-       unsigned long start_pfn, pfn, end_pfn;
+       unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
         struct page *page;
         unsigned long block_migratetype;
         int reserve;
  
-       /* Get the start pfn, end pfn and the number of blocks to reserve */
+       /*
+        * Get the start pfn, end pfn and the number of blocks to reserve.
+        * We have to be careful to be aligned to pageblock_nr_pages to
+        * make sure that we always check pfn_valid for the first page in
+        * the block.
+        */
         start_pfn = zone->zone_start_pfn;
         end_pfn = start_pfn + zone->spanned_pages;
+       start_pfn = roundup(start_pfn, pageblock_nr_pages);
         reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                         pageblock_order;
  
@@ -2896,24 +3479,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                 if (page_to_nid(page) != zone_to_nid(zone))
                         continue;
  
-               /* Blocks with reserved pages will never free, skip them. */
-               if (PageReserved(page))
-                       continue;
-
                 block_migratetype = get_pageblock_migratetype(page);
  
-               /* If this block is reserved, account for it */
-               if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-                       reserve--;
-                       continue;
-               }
+               /* Only test what is necessary when the reserves are not met */
+               if (reserve > 0) {
+                       /*
+                        * Blocks with reserved pages will never free, skip
+                        * them.
+                        */
+                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+                       if (pageblock_is_reserved(pfn, block_end_pfn))
+                               continue;
  
-               /* Suitable for reserving if this block is movable */
-               if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-                       set_pageblock_migratetype(page, MIGRATE_RESERVE);
-                       move_freepages_block(zone, page, MIGRATE_RESERVE);
-                       reserve--;
-                       continue;
+                       /* If this block is reserved, account for it */
+                       if (block_migratetype == MIGRATE_RESERVE) {
+                               reserve--;
+                               continue;
+                       }
+
+                       /* Suitable for reserving if this block is movable */
+                       if (block_migratetype == MIGRATE_MOVABLE) {
+                               set_pageblock_migratetype(page,
+                                                       MIGRATE_RESERVE);
+                               move_freepages_block(zone, page,
+                                                       MIGRATE_RESERVE);
+                               reserve--;
+                               continue;
+                       }
                 }
  
                 /*
@@ -3086,121 +3678,36 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
                 pcp->batch = PAGE_SHIFT * 8;
  }
  
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
+static void setup_zone_pageset(struct zone *zone)
  {
-       struct zone *zone, *dzone;
-       int node = cpu_to_node(cpu);
+       int cpu;
  
-       node_set_state(node, N_CPU);    /* this node has a cpu */
+       zone->pageset = alloc_percpu(struct per_cpu_pageset);
  
-       for_each_populated_zone(zone) {
-               zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-                                        GFP_KERNEL, node);
-               if (!zone_pcp(zone, cpu))
-                       goto bad;
+       for_each_possible_cpu(cpu) {
+               struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
  
-               setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+               setup_pageset(pcp, zone_batchsize(zone));
  
                 if (percpu_pagelist_fraction)
-                       setup_pagelist_highmark(zone_pcp(zone, cpu),
-                               (zone->present_pages / percpu_pagelist_fraction));
-       }
-
-       return 0;
-bad:
-       for_each_zone(dzone) {
-               if (!populated_zone(dzone))
-                       continue;
-               if (dzone == zone)
-                       break;
-               kfree(zone_pcp(dzone, cpu));
-               zone_pcp(dzone, cpu) = &boot_pageset[cpu];
-       }
-       return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
-       struct zone *zone;
-
-       for_each_zone(zone) {
-               struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
-               /* Free per_cpu_pageset if it is slab allocated */
-               if (pset != &boot_pageset[cpu])
-                       kfree(pset);
-               zone_pcp(zone, cpu) = &boot_pageset[cpu];
-       }
-}
-
-static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
-               unsigned long action,
-               void *hcpu)
-{
-       int cpu = (long)hcpu;
-       int ret = NOTIFY_OK;
-
-       switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               if (process_zones(cpu))
-                       ret = NOTIFY_BAD;
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               free_zone_pagesets(cpu);
-               break;
-       default:
-               break;
+                       setup_pagelist_highmark(pcp,
+                               (zone->present_pages /
+                                       percpu_pagelist_fraction));
         }
-       return ret;
  }
  
-static struct notifier_block __cpuinitdata pageset_notifier =
-       { &pageset_cpuup_callback, NULL, 0 };
-
+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ */
  void __init setup_per_cpu_pageset(void)
  {
-       int err;
+       struct zone *zone;
  
-       /* Initialize per_cpu_pageset for cpu 0.
-        * A cpuup callback will do this for every cpu
-        * as it comes online
-        */
-       err = process_zones(smp_processor_id());
-       BUG_ON(err);
-       register_cpu_notifier(&pageset_notifier);
+       for_each_populated_zone(zone)
+               setup_zone_pageset(zone);
  }
  
-#endif
-
  static noinline __init_refok
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
  {
@@ -3221,7 +3728,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
  
         if (!slab_is_available()) {
                 zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node(pgdat, alloc_size);
+                       alloc_bootmem_node_nopanic(pgdat, alloc_size);
         } else {
                 /*
                  * This case means that a zone whose size was 0 gets new memory
@@ -3250,11 +3757,11 @@ static int __zone_pcp_update(void *data)
         int cpu;
         unsigned long batch = zone_batchsize(zone), flags;
  
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+       for_each_possible_cpu(cpu) {
                 struct per_cpu_pageset *pset;
                 struct per_cpu_pages *pcp;
  
-               pset = zone_pcp(zone, cpu);
+               pset = per_cpu_ptr(zone->pageset, cpu);
                 pcp = &pset->pcp;
  
                 local_irq_save(flags);
@@ -3272,21 +3779,17 @@ void zone_pcp_update(struct zone *zone)
  
  static __meminit void zone_pcp_init(struct zone *zone)
  {
-       int cpu;
-       unsigned long batch = zone_batchsize(zone);
+       /*
+        * per cpu subsystem is not up at this point. The following code
+        * relies on the ability of the linker to provide the
+        * offset of a (static) per cpu variable into the per cpu area.
+        */
+       zone->pageset = &boot_pageset;
  
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-               /* Early boot. Slab allocator not functional yet */
-               zone_pcp(zone, cpu) = &boot_pageset[cpu];
-               setup_pageset(&boot_pageset[cpu],0);
-#else
-               setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-       }
         if (zone->present_pages)
-               printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-                       zone->name, zone->present_pages, batch);
+               printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
+                       zone->name, zone->present_pages,
+                                        zone_batchsize(zone));
  }
  
  __meminit int init_currently_empty_zone(struct zone *zone,
@@ -3425,6 +3928,88 @@ void __init free_bootmem_with_active_regions(int nid,
         }
  }
  
+#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+       int i;
+
+       for (i = nr_nodemap_entries - 1; i >= 0; i--)
+               if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+                       return i;
+
+       return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns previous region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+       for (index = index - 1; index >= 0; index--)
+               if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+                       return index;
+
+       return -1;
+}
+
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+       for (i = last_active_region_index_in_nid(nid); i != -1; \
+                               i = previous_active_region_index_in_nid(i, nid))
+
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+                                       u64 goal, u64 limit)
+{
+       int i;
+
+       /* Need to go over early_node_map to find out good range for node */
+       for_each_active_range_index_in_nid_reverse(i, nid) {
+               u64 addr;
+               u64 ei_start, ei_last;
+               u64 final_start, final_end;
+
+               ei_last = early_node_map[i].end_pfn;
+               ei_last <<= PAGE_SHIFT;
+               ei_start = early_node_map[i].start_pfn;
+               ei_start <<= PAGE_SHIFT;
+
+               final_start = max(ei_start, goal);
+               final_end = min(ei_last, limit);
+
+               if (final_start >= final_end)
+                       continue;
+
+               addr = memblock_find_in_range(final_start, final_end, size, align);
+
+               if (addr == MEMBLOCK_ERROR)
+                       continue;
+
+               return addr;
+       }
+
+       return MEMBLOCK_ERROR;
+}
+#endif
+
+int __init add_from_early_node_map(struct range *range, int az,
+                                  int nr_range, int nid)
+{
+       int i;
+       u64 start, end;
+
+       /* need to go over early_node_map to find out good range for node */
+       for_each_active_range_index_in_nid(i, nid) {
+               start = early_node_map[i].start_pfn;
+               end = early_node_map[i].end_pfn;
+               nr_range = add_range(range, az, nr_range, start, end);
+       }
+       return nr_range;
+}
+
  void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
  {
         int i;
@@ -3505,7 +4090,7 @@ static void __init find_usable_zone_for_movable(void)
  
  /*
   * The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independant of architecture. Unlike the other zones,
+ * because it is sized independent of architecture. Unlike the other zones,
   * the starting point for ZONE_MOVABLE is not fixed. It may be different
   * in each node depending on the size of each node and how evenly kernelcore
   * is distributed. This helper function adjusts the zone ranges
@@ -3574,7 +4159,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
   * then all holes in the requested range will be accounted for.
   */
-static unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __meminit __absent_pages_in_range(int nid,
                                 unsigned long range_start_pfn,
                                 unsigned long range_end_pfn)
  {
@@ -3720,10 +4305,11 @@ static void __init setup_usemap(struct pglist_data *pgdat,
         unsigned long usemapsize = usemap_size(zonesize);
         zone->pageblock_flags = NULL;
         if (usemapsize)
-               zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+               zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+                                                                  usemapsize);
  }
  #else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
                                 struct zone *zone, unsigned long zonesize) {}
  #endif /* CONFIG_SPARSEMEM */
  
@@ -3839,13 +4425,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                 zone_seqlock_init(zone);
                 zone->zone_pgdat = pgdat;
  
-               zone->prev_priority = DEF_PRIORITY;
-
                 zone_pcp_init(zone);
-               for_each_lru(l) {
+               for_each_lru(l)
                         INIT_LIST_HEAD(&zone->lru[l].list);
-                       zone->reclaim_stat.nr_saved_scan[l] = 0;
-               }
                 zone->reclaim_stat.recent_rotated[0] = 0;
                 zone->reclaim_stat.recent_rotated[1] = 0;
                 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -3888,7 +4470,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                 size =  (end - start) * sizeof(struct page);
                 map = alloc_remap(pgdat->node_id, size);
                 if (!map)
-                       map = alloc_bootmem_node(pgdat, size);
+                       map = alloc_bootmem_node_nopanic(pgdat, size);
                 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
         }
  #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -3989,7 +4571,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
                 }
  
                 /* Merge backward if suitable */
-               if (start_pfn < early_node_map[i].end_pfn &&
+               if (start_pfn < early_node_map[i].start_pfn &&
                                 end_pfn >= early_node_map[i].start_pfn) {
                         early_node_map[i].start_pfn = start_pfn;
                         return;
@@ -4103,7 +4685,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
  }
  
  /* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
+void __init sort_node_map(void)
  {
         sort(early_node_map, (size_t)nr_nodemap_entries,
                         sizeof(struct node_active_region),
@@ -4367,8 +4949,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
         for (i = 0; i < MAX_NR_ZONES; i++) {
                 if (i == ZONE_MOVABLE)
                         continue;
-               printk("  %-8s %0#10lx -> %0#10lx\n",
-                               zone_names[i],
+               printk("  %-8s ", zone_names[i]);
+               if (arch_zone_lowest_possible_pfn[i] ==
+                               arch_zone_highest_possible_pfn[i])
+                       printk("empty\n");
+               else
+                       printk("%0#10lx -> %0#10lx\n",
                                 arch_zone_lowest_possible_pfn[i],
                                 arch_zone_highest_possible_pfn[i]);
         }
@@ -4456,11 +5042,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
         dma_reserve = new_dma_reserve;
  }
  
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
  void __init free_area_init(unsigned long *zones_size)
  {
         free_area_init_node(0, zones_size,
@@ -4654,7 +5235,7 @@ void setup_per_zone_wmarks(void)
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
-void calculate_zone_inactive_ratio(struct zone *zone)
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
  {
         unsigned int gb, ratio;
  
@@ -4668,7 +5249,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
         zone->inactive_ratio = ratio;
  }
  
-static void __init setup_per_zone_inactive_ratio(void)
+static void __meminit setup_per_zone_inactive_ratio(void)
  {
         struct zone *zone;
  
@@ -4700,7 +5281,7 @@ static void __init setup_per_zone_inactive_ratio(void)
   * 8192MB:     11584k
   * 16384MB:    16384k
   */
-static int __init init_per_zone_wmark_min(void)
+int __meminit init_per_zone_wmark_min(void)
  {
         unsigned long lowmem_kbytes;
  
@@ -4712,6 +5293,7 @@ static int __init init_per_zone_wmark_min(void)
         if (min_free_kbytes > 65536)
                 min_free_kbytes = 65536;
         setup_per_zone_wmarks();
+       refresh_zone_stat_thresholds();
         setup_per_zone_lowmem_reserve();
         setup_per_zone_inactive_ratio();
         return 0;
@@ -4800,10 +5382,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
         if (!write || (ret == -EINVAL))
                 return ret;
         for_each_populated_zone(zone) {
-               for_each_online_cpu(cpu) {
+               for_each_possible_cpu(cpu) {
                         unsigned long  high;
                         high = zone->present_pages / percpu_pagelist_fraction;
-                       setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+                       setup_pagelist_highmark(
+                               per_cpu_ptr(zone->pageset, cpu), high);
                 }
         }
         return 0;
@@ -4901,9 +5484,9 @@ void *__init alloc_large_system_hash(const char *tablename,
         if (!table)
                 panic("Failed to allocate %s hash table\n", tablename);
  
-       printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
+       printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
                tablename,
-              (1U << log2qty),
+              (1UL << log2qty),
                ilog2(size) - PAGE_SHIFT,
                size);
  
@@ -4932,7 +5515,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
         pfn &= (PAGES_PER_SECTION-1);
         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
  #else
-       pfn = pfn - zone->zone_start_pfn;
+       pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
  #endif /* CONFIG_SPARSEMEM */
  }
@@ -5000,26 +5583,121 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
   * page allocater never alloc memory from ISOLATE block.
   */
  
+static int
+__count_immobile_pages(struct zone *zone, struct page *page, int count)
+{
+       unsigned long pfn, iter, found;
+       /*
+        * To avoid noisy data, lru_add_drain_all() should be called.
+        * If ZONE_MOVABLE, the zone never contains immobile pages.
+        */
+       if (zone_idx(zone) == ZONE_MOVABLE)
+               return true;
+
+       if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+               return true;
+
+       pfn = page_to_pfn(page);
+       for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+               unsigned long check = pfn + iter;
+
+               if (!pfn_valid_within(check))
+                       continue;
+
+               page = pfn_to_page(check);
+               if (!page_count(page)) {
+                       if (PageBuddy(page))
+                               iter += (1 << page_order(page)) - 1;
+                       continue;
+               }
+               if (!PageLRU(page))
+                       found++;
+               /*
+                * If there are RECLAIMABLE pages, we need to check them.
+                * But now, memory offline itself doesn't call shrink_slab()
+                * and that is still to be fixed.
+                */
+               /*
+                * If the page is not RAM, page_count() should be 0.
+                * We don't need more checks. This is a _used_ not-movable page.
+                *
+                * The problematic thing here is PG_reserved pages. PG_reserved
+                * is set to both of a memory hole page and a _used_ kernel
+                * page at boot.
+                */
+               if (found > count)
+                       return false;
+       }
+       return true;
+}
+
+bool is_pageblock_removable_nolock(struct page *page)
+{
+       struct zone *zone = page_zone(page);
+       unsigned long pfn = page_to_pfn(page);
+
+       /*
+        * We have to be careful here because we are iterating over memory
+        * sections which are not zone aware so we might end up outside of
+        * the zone but still within the section.
+        */
+       if (!zone || zone->zone_start_pfn > pfn ||
+                       zone->zone_start_pfn + zone->spanned_pages <= pfn)
+               return false;
+
+       return __count_immobile_pages(zone, page, 0);
+}
+
  int set_migratetype_isolate(struct page *page)
  {
         struct zone *zone;
-       unsigned long flags;
+       unsigned long flags, pfn;
+       struct memory_isolate_notify arg;
+       int notifier_ret;
         int ret = -EBUSY;
-       int zone_idx;
  
         zone = page_zone(page);
-       zone_idx = zone_idx(zone);
+
         spin_lock_irqsave(&zone->lock, flags);
+
+       pfn = page_to_pfn(page);
+       arg.start_pfn = pfn;
+       arg.nr_pages = pageblock_nr_pages;
+       arg.pages_found = 0;
+
         /*
-        * In future, more migrate types will be able to be isolation target.
+        * It may be possible to isolate a pageblock even if the
+        * migratetype is not MIGRATE_MOVABLE. The memory isolation
+        * notifier chain is used by balloon drivers to return the
+        * number of pages in a range that are held by the balloon
+        * driver to shrink memory. If all the pages are accounted for
+        * by balloons, are free, or on the LRU, isolation can continue.
+        * Later, for example, when memory hotplug notifier runs, these
+        * pages reported as "can be isolated" should be isolated(freed)
+        * by the balloon driver through the memory notifier chain.
          */
-       if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
-           zone_idx != ZONE_MOVABLE)
+       notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+       notifier_ret = notifier_to_errno(notifier_ret);
+       if (notifier_ret)
                 goto out;
-       set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-       move_freepages_block(zone, page, MIGRATE_ISOLATE);
-       ret = 0;
+       /*
+        * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+        * We just check MOVABLE pages.
+        */
+       if (__count_immobile_pages(zone, page, arg.pages_found))
+               ret = 0;
+
+       /*
+        * immobile means "not-on-lru" pages. If immobile is larger than
+        * removable-by-driver pages reported by notifier, we'll fail.
+        */
+
  out:
+       if (!ret) {
+               set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+               move_freepages_block(zone, page, MIGRATE_ISOLATE);
+       }
+
         spin_unlock_irqrestore(&zone->lock, flags);
         if (!ret)
                 drain_all_pages();
@@ -5086,3 +5764,101 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
         spin_unlock_irqrestore(&zone->lock, flags);
  }
  #endif
+
+#ifdef CONFIG_MEMORY_FAILURE
+bool is_free_buddy_page(struct page *page)
+{
+       struct zone *zone = page_zone(page);
+       unsigned long pfn = page_to_pfn(page);
+       unsigned long flags;
+       int order;
+
+       spin_lock_irqsave(&zone->lock, flags);
+       for (order = 0; order < MAX_ORDER; order++) {
+               struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+               if (PageBuddy(page_head) && page_order(page_head) >= order)
+                       break;
+       }
+       spin_unlock_irqrestore(&zone->lock, flags);
+
+       return order < MAX_ORDER;
+}
+#endif
+
+static struct trace_print_flags pageflag_names[] = {
+       {1UL << PG_locked,              "locked"        },
+       {1UL << PG_error,               "error"         },
+       {1UL << PG_referenced,          "referenced"    },
+       {1UL << PG_uptodate,            "uptodate"      },
+       {1UL << PG_dirty,               "dirty"         },
+       {1UL << PG_lru,                 "lru"           },
+       {1UL << PG_active,              "active"        },
+       {1UL << PG_slab,                "slab"          },
+       {1UL << PG_owner_priv_1,        "owner_priv_1"  },
+       {1UL << PG_arch_1,              "arch_1"        },
+       {1UL << PG_reserved,            "reserved"      },
+       {1UL << PG_private,             "private"       },
+       {1UL << PG_private_2,           "private_2"     },
+       {1UL << PG_writeback,           "writeback"     },
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+       {1UL << PG_head,                "head"          },
+       {1UL << PG_tail,                "tail"          },
+#else
+       {1UL << PG_compound,            "compound"      },
+#endif
+       {1UL << PG_swapcache,           "swapcache"     },
+       {1UL << PG_mappedtodisk,        "mappedtodisk"  },
+       {1UL << PG_reclaim,             "reclaim"       },
+       {1UL << PG_swapbacked,          "swapbacked"    },
+       {1UL << PG_unevictable,         "unevictable"   },
+#ifdef CONFIG_MMU
+       {1UL << PG_mlocked,             "mlocked"       },
+#endif
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
+       {1UL << PG_uncached,            "uncached"      },
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+       {1UL << PG_hwpoison,            "hwpoison"      },
+#endif
+       {-1UL,                          NULL            },
+};
+
+static void dump_page_flags(unsigned long flags)
+{
+       const char *delim = "";
+       unsigned long mask;
+       int i;
+
+       printk(KERN_ALERT "page flags: %#lx(", flags);
+
+       /* remove zone id */
+       flags &= (1UL << NR_PAGEFLAGS) - 1;
+
+       for (i = 0; pageflag_names[i].name && flags; i++) {
+
+               mask = pageflag_names[i].mask;
+               if ((flags & mask) != mask)
+                       continue;
+
+               flags &= ~mask;
+               printk("%s%s", delim, pageflag_names[i].name);
+               delim = "|";
+       }
+
+       /* check for left over flags */
+       if (flags)
+               printk("%s%#lx", delim, flags);
+
+       printk(")\n");
+}
+
+void dump_page(struct page *page)
+{
+       printk(KERN_ALERT
+              "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+               page, atomic_read(&page->_count), page_mapcount(page),
+               page->mapping, page->index);
+       dump_page_flags(page->flags);
+       mem_cgroup_print_bad_page(page);
+}