mm: set zone->present_pages to number of existing pages in the zone
[firefly-linux-kernel-4.4.55.git] / mm / page_alloc.c
index d1107adf174a5390c597414ea335494e857f1af2..07fe78d01ffd8f9e50ae41f855671fa1fd7b54b6 100644 (file)
@@ -202,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/* Movable memory ranges, will also be used by memblock subsystem. */
+struct movablemem_map movablemem_map = {
+       .acpi = false,
+       .nr_map = 0,
+};
+
 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -2801,7 +2808,7 @@ static unsigned int nr_free_zone_pages(int offset)
        struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
 
        for_each_zone_zonelist(zone, z, zonelist, offset) {
-               unsigned long size = zone->present_pages;
+               unsigned long size = zone->managed_pages;
                unsigned long high = high_wmark_pages(zone);
                if (size > high)
                        sum += size - high;
@@ -2854,7 +2861,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
        val->totalram = pgdat->node_present_pages;
        val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
-       val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+       val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
        val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
                        NR_FREE_PAGES);
 #else
@@ -3932,7 +3939,7 @@ static int __meminit zone_batchsize(struct zone *zone)
         *
         * OK, so we don't know how big the cache is.  So guess.
         */
-       batch = zone->present_pages / 1024;
+       batch = zone->managed_pages / 1024;
        if (batch * PAGE_SIZE > 512 * 1024)
                batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;             /* We effectively *= 4 below */
@@ -4016,7 +4023,7 @@ static void __meminit setup_zone_pageset(struct zone *zone)
 
                if (percpu_pagelist_fraction)
                        setup_pagelist_highmark(pcp,
-                               (zone->present_pages /
+                               (zone->managed_pages /
                                        percpu_pagelist_fraction));
        }
 }
@@ -4372,6 +4379,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
        return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
+/**
+ * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
+ *
+ * zone_movable_limit is initialized as 0. This function tries to get
+ * the first ZONE_MOVABLE pfn of each node from movablemem_map, and
+ * assigns it to zone_movable_limit[nid].
+ * zone_movable_limit[nid] == 0 means no limit for the node.
+ *
+ * Note: Each range is represented as [start_pfn, end_pfn)
+ */
+static void __meminit sanitize_zone_movable_limit(void)
+{
+       int map_pos = 0, i, nid;
+       unsigned long start_pfn, end_pfn;
+
+       if (!movablemem_map.nr_map)
+               return;
+
+       /* Iterate all ranges from minimum to maximum */
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+               /*
+                * If the lowest user-specified ZONE_MOVABLE pfn of this node
+                * has already been found, just go on to check the next range.
+                */
+               if (zone_movable_limit[nid])
+                       continue;
+
+#ifdef CONFIG_ZONE_DMA
+               /* Skip DMA memory. */
+               if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
+                       start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+               /* Skip DMA32 memory. */
+               if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
+                       start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
+#endif
+
+#ifdef CONFIG_HIGHMEM
+               /* Skip lowmem if ZONE_MOVABLE is highmem. */
+               if (zone_movable_is_highmem() &&
+                   start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
+                       start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
+#endif
+
+               if (start_pfn >= end_pfn)
+                       continue;
+
+               while (map_pos < movablemem_map.nr_map) {
+                       if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
+                               break;
+
+                       if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
+                               map_pos++;
+                               continue;
+                       }
+
+                       /*
+                        * The start_pfn of ZONE_MOVABLE is either the minimum
+                        * pfn specified by movablemem_map, or 0, which means
+                        * the node has no ZONE_MOVABLE.
+                        */
+                       zone_movable_limit[nid] = max(start_pfn,
+                                       movablemem_map.map[map_pos].start_pfn);
+
+                       break;
+               }
+       }
+}
+
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                        unsigned long zone_type,
@@ -4389,7 +4467,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 
        return zholes_size[zone_type];
 }
-
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4573,7 +4650,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                nr_all_pages += freesize;
 
                zone->spanned_pages = size;
-               zone->present_pages = freesize;
+               zone->present_pages = realsize;
                /*
                 * Set an approximate value for lowmem here, it will be adjusted
                 * when the bootmem allocator frees pages into the buddy system.
@@ -4831,12 +4908,19 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                required_kernelcore = max(required_kernelcore, corepages);
        }
 
-       /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-       if (!required_kernelcore)
+       /*
+        * If neither kernelcore/movablecore nor movablemem_map is specified,
+        * there is no ZONE_MOVABLE. But if movablemem_map is specified, the
+        * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
+        */
+       if (!required_kernelcore) {
+               if (movablemem_map.nr_map)
+                       memcpy(zone_movable_pfn, zone_movable_limit,
+                               sizeof(zone_movable_pfn));
                goto out;
+       }
 
        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-       find_usable_zone_for_movable();
        usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
 restart:
@@ -4864,10 +4948,24 @@ restart:
                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
                        unsigned long size_pages;
 
+                       /*
+                        * Find more memory for kernelcore in
+                        * [zone_movable_pfn[nid], zone_movable_limit[nid]).
+                        */
                        start_pfn = max(start_pfn, zone_movable_pfn[nid]);
                        if (start_pfn >= end_pfn)
                                continue;
 
+                       if (zone_movable_limit[nid]) {
+                               end_pfn = min(end_pfn, zone_movable_limit[nid]);
+                               /* No range left for kernelcore in this node */
+                               if (start_pfn >= end_pfn) {
+                                       zone_movable_pfn[nid] =
+                                                       zone_movable_limit[nid];
+                                       break;
+                               }
+                       }
+
                        /* Account for what is only usable for kernelcore */
                        if (start_pfn < usable_startpfn) {
                                unsigned long kernel_pages;
@@ -4927,12 +5025,12 @@ restart:
        if (usable_nodes && required_kernelcore > usable_nodes)
                goto restart;
 
+out:
        /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
        for (nid = 0; nid < MAX_NUMNODES; nid++)
                zone_movable_pfn[nid] =
                        roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
 
-out:
        /* restore the node_state */
        node_states[N_MEMORY] = saved_node_state;
 }
@@ -4995,6 +5093,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
        /* Find the PFNs that ZONE_MOVABLE begins at in each node */
        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+       find_usable_zone_for_movable();
+       sanitize_zone_movable_limit();
        find_zone_movable_pfns_for_nodes();
 
        /* Print out the zone ranges */
@@ -5078,6 +5178,181 @@ static int __init cmdline_parse_movablecore(char *p)
 early_param("kernelcore", cmdline_parse_kernelcore);
 early_param("movablecore", cmdline_parse_movablecore);
 
+/**
+ * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
+ * @start_pfn: start pfn of the range to be checked
+ * @end_pfn:   end pfn of the range to be checked (exclusive)
+ *
+ * This function checks if a given memory range [start_pfn, end_pfn) overlaps
+ * any entry of movablemem_map.map[]; it relies on map[] being sorted by pfn.
+ *
+ * Return: index of the first overlapped element in movablemem_map.map[]
+ *         or -1 if they don't overlap each other.
+ */
+int __init movablemem_map_overlap(unsigned long start_pfn,
+                                  unsigned long end_pfn)
+{
+       int overlap;
+
+       if (!movablemem_map.nr_map)
+               return -1;
+
+       for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
+               if (start_pfn < movablemem_map.map[overlap].end_pfn)
+                       break;
+
+       if (overlap == movablemem_map.nr_map ||
+           end_pfn <= movablemem_map.map[overlap].start_pfn)
+               return -1;
+
+       return overlap;
+}
+
+/**
+ * insert_movablemem_map - Insert a memory range into movablemem_map.map.
+ * @start_pfn: start pfn of the range
+ * @end_pfn:   end pfn of the range (exclusive)
+ *
+ * This function also merges overlapping or adjacent ranges, and keeps the
+ * array sorted by start_pfn in monotonic increasing order.
+ */
+void __init insert_movablemem_map(unsigned long start_pfn,
+                                 unsigned long end_pfn)
+{
+       int pos, overlap;
+
+       /*
+        * pos will be at the 1st overlapped range, or the position
+        * where the element should be inserted.
+        */
+       for (pos = 0; pos < movablemem_map.nr_map; pos++)
+               if (start_pfn <= movablemem_map.map[pos].end_pfn)
+                       break;
+
+       /* If there is no overlapped range, just insert the element. */
+       if (pos == movablemem_map.nr_map ||
+           end_pfn < movablemem_map.map[pos].start_pfn) {
+               /*
+                * If pos is not the end of the array, shift all the
+                * remaining elements one slot towards the end.
+                */
+               if (pos < movablemem_map.nr_map)
+                       memmove(&movablemem_map.map[pos+1],
+                               &movablemem_map.map[pos],
+                               sizeof(struct movablemem_entry) *
+                               (movablemem_map.nr_map - pos));
+               movablemem_map.map[pos].start_pfn = start_pfn;
+               movablemem_map.map[pos].end_pfn = end_pfn;
+               movablemem_map.nr_map++;
+               return;
+       }
+
+       /* overlap will stop one past the last overlapped range */
+       for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
+               if (end_pfn < movablemem_map.map[overlap].start_pfn)
+                       break;
+
+       /*
+        * Step back to the last overlapped range, merge map[pos..overlap]
+        * into map[pos], and move the remaining elements forward.
+        */
+       overlap--;
+       movablemem_map.map[pos].start_pfn = min(start_pfn,
+                                       movablemem_map.map[pos].start_pfn);
+       movablemem_map.map[pos].end_pfn = max(end_pfn,
+                                       movablemem_map.map[overlap].end_pfn);
+
+       if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
+               memmove(&movablemem_map.map[pos+1],
+                       &movablemem_map.map[overlap+1],
+                       sizeof(struct movablemem_entry) *
+                       (movablemem_map.nr_map - overlap - 1));
+
+       movablemem_map.nr_map -= overlap - pos;
+}
+
+/**
+ * movablemem_map_add_region - Add a memory range into movablemem_map.
+ * @start:     physical start address of range
+ * @size:      size of the range, added to @start to get its end address
+ *
+ * This function transforms the physical addresses into pfns, and then adds
+ * the range into movablemem_map by calling insert_movablemem_map().
+ */
+static void __init movablemem_map_add_region(u64 start, u64 size)
+{
+       unsigned long start_pfn, end_pfn;
+
+       /* In case size == 0 or start + size overflows */
+       if (start + size <= start)
+               return;
+
+       if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
+               pr_err("movablemem_map: too many entries;"
+                       " ignoring [mem %#010llx-%#010llx]\n",
+                       (unsigned long long) start,
+                       (unsigned long long) (start + size - 1));
+               return;
+       }
+
+       start_pfn = PFN_DOWN(start);
+       end_pfn = PFN_UP(start + size);
+       insert_movablemem_map(start_pfn, end_pfn);
+}
+
+/**
+ * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
+ * @p: The boot option of the following format:
+ *     movablemem_map=nn[KMG]@ss[KMG], or movablemem_map=acpi
+ *
+ * This option sets the memory range [ss, ss+nn) to be used as movable memory.
+ *
+ * Return: 0 on success or -EINVAL on failure.
+ */
+static int __init cmdline_parse_movablemem_map(char *p)
+{
+       char *oldp;
+       u64 start_at, mem_size;
+
+       if (!p)
+               goto err;
+
+       if (!strcmp(p, "acpi"))
+               movablemem_map.acpi = true;
+
+       /*
+        * If the user decides to use info from BIOS, all the other
+        * user-specified ranges will be ignored.
+        */
+       if (movablemem_map.acpi) {
+               if (movablemem_map.nr_map) {
+                       memset(movablemem_map.map, 0,
+                               sizeof(struct movablemem_entry)
+                               * movablemem_map.nr_map);
+                       movablemem_map.nr_map = 0;
+               }
+               return 0;
+       }
+
+       oldp = p;
+       mem_size = memparse(p, &p);
+       if (p == oldp)
+               goto err;
+
+       if (*p == '@') {
+               oldp = ++p;
+               start_at = memparse(p, &p);
+               if (p == oldp || *p != '\0')
+                       goto err;
+
+               movablemem_map_add_region(start_at, mem_size);
+               return 0;
+       }
+err:
+       return -EINVAL;
+}
+early_param("movablemem_map", cmdline_parse_movablemem_map);
+
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
@@ -5160,8 +5435,8 @@ static void calculate_totalreserve_pages(void)
                        /* we treat the high watermark as reserved pages. */
                        max += high_wmark_pages(zone);
 
-                       if (max > zone->present_pages)
-                               max = zone->present_pages;
+                       if (max > zone->managed_pages)
+                               max = zone->managed_pages;
                        reserve_pages += max;
                        /*
                         * Lowmem reserves are not available to
@@ -5193,7 +5468,7 @@ static void setup_per_zone_lowmem_reserve(void)
        for_each_online_pgdat(pgdat) {
                for (j = 0; j < MAX_NR_ZONES; j++) {
                        struct zone *zone = pgdat->node_zones + j;
-                       unsigned long present_pages = zone->present_pages;
+                       unsigned long managed_pages = zone->managed_pages;
 
                        zone->lowmem_reserve[j] = 0;
 
@@ -5207,9 +5482,9 @@ static void setup_per_zone_lowmem_reserve(void)
                                        sysctl_lowmem_reserve_ratio[idx] = 1;
 
                                lower_zone = pgdat->node_zones + idx;
-                               lower_zone->lowmem_reserve[j] = present_pages /
+                               lower_zone->lowmem_reserve[j] = managed_pages /
                                        sysctl_lowmem_reserve_ratio[idx];
-                               present_pages += lower_zone->present_pages;
+                               managed_pages += lower_zone->managed_pages;
                        }
                }
        }
@@ -5228,14 +5503,14 @@ static void __setup_per_zone_wmarks(void)
        /* Calculate total number of !ZONE_HIGHMEM pages */
        for_each_zone(zone) {
                if (!is_highmem(zone))
-                       lowmem_pages += zone->present_pages;
+                       lowmem_pages += zone->managed_pages;
        }
 
        for_each_zone(zone) {
                u64 tmp;
 
                spin_lock_irqsave(&zone->lock, flags);
-               tmp = (u64)pages_min * zone->present_pages;
+               tmp = (u64)pages_min * zone->managed_pages;
                do_div(tmp, lowmem_pages);
                if (is_highmem(zone)) {
                        /*
@@ -5247,13 +5522,10 @@ static void __setup_per_zone_wmarks(void)
                         * deltas controls asynch page reclaim, and so should
                         * not be capped for highmem.
                         */
-                       int min_pages;
+                       unsigned long min_pages;
 
-                       min_pages = zone->present_pages / 1024;
-                       if (min_pages < SWAP_CLUSTER_MAX)
-                               min_pages = SWAP_CLUSTER_MAX;
-                       if (min_pages > 128)
-                               min_pages = 128;
+                       min_pages = zone->managed_pages / 1024;
+                       min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
                        zone->watermark[WMARK_MIN] = min_pages;
                } else {
                        /*
@@ -5314,7 +5586,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
        unsigned int gb, ratio;
 
        /* Zone size in gigabytes */
-       gb = zone->present_pages >> (30 - PAGE_SHIFT);
+       gb = zone->managed_pages >> (30 - PAGE_SHIFT);
        if (gb)
                ratio = int_sqrt(10 * gb);
        else
@@ -5400,7 +5672,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
                return rc;
 
        for_each_zone(zone)
-               zone->min_unmapped_pages = (zone->present_pages *
+               zone->min_unmapped_pages = (zone->managed_pages *
                                sysctl_min_unmapped_ratio) / 100;
        return 0;
 }
@@ -5416,7 +5688,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
                return rc;
 
        for_each_zone(zone)
-               zone->min_slab_pages = (zone->present_pages *
+               zone->min_slab_pages = (zone->managed_pages *
                                sysctl_min_slab_ratio) / 100;
        return 0;
 }
@@ -5458,7 +5730,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
        for_each_populated_zone(zone) {
                for_each_possible_cpu(cpu) {
                        unsigned long  high;
-                       high = zone->present_pages / percpu_pagelist_fraction;
+                       high = zone->managed_pages / percpu_pagelist_fraction;
                        setup_pagelist_highmark(
                                per_cpu_ptr(zone->pageset, cpu), high);
                }
@@ -5806,9 +6078,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
                                    0, false, MIGRATE_SYNC,
                                    MR_CMA);
        }
-
-       putback_movable_pages(&cc->migratepages);
-       return ret > 0 ? 0 : ret;
+       if (ret < 0) {
+               putback_movable_pages(&cc->migratepages);
+               return ret;
+       }
+       return 0;
 }
 
 /**