Merge branch 'linaro-android-3.10-lsk' of git://android.git.linaro.org/kernel/linaro...
index 0db0de1c2fbee21e0e9919af66021b190d789731..25abb881c9e989e0deb7684a216cd1fcd8c87615 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,6 +6,7 @@
  * Address space accounting code       <alan@lxorguk.ukuu.org.uk>
  */
 
+#include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
 #include <linux/mm.h>
@@ -33,6 +34,8 @@
 #include <linux/uprobes.h>
 #include <linux/rbtree_augmented.h>
 #include <linux/sched/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -84,6 +87,8 @@ EXPORT_SYMBOL(vm_get_page_prot);
 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio __read_mostly = 50;        /* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
 /*
  * Make sure vm_committed_as is in one cacheline and not one shared with
  * other variables. It can be updated by several CPUs frequently.
@@ -122,7 +127,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
  */
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
-       unsigned long free, allowed;
+       unsigned long free, allowed, reserve;
 
        vm_acct_memory(pages);
 
@@ -163,10 +168,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                        free -= totalreserve_pages;
 
                /*
-                * Leave the last 3% for root
+                * Reserve some for root
                 */
                if (!cap_sys_admin)
-                       free -= free / 32;
+                       free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
 
                if (free > pages)
                        return 0;
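
The shift converts the kbytes sysctl into pages: a page is 2^PAGE_SHIFT bytes
and a kilobyte is 2^10 bytes, so dividing by 2^(PAGE_SHIFT - 10) moves between
the two units. A minimal sketch of the arithmetic, assuming the common 4KB
page size (PAGE_SHIFT == 12):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumption: 4KB pages, the common case */

    int main(void)
    {
        unsigned long admin_kbytes = 1UL << 13;   /* 8MB default */
        unsigned long user_kbytes  = 1UL << 17;   /* 128MB default */

        /* kbytes -> pages: each page holds 1 << (PAGE_SHIFT - 10) == 4 KB */
        printf("admin reserve: %lu pages\n", admin_kbytes >> (PAGE_SHIFT - 10));
        printf("user reserve:  %lu pages\n", user_kbytes >> (PAGE_SHIFT - 10));
        return 0;
    }

With 4KB pages the two defaults come out to 2048 and 32768 pages respectively.
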
@@ -177,16 +182,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
        allowed = (totalram_pages - hugetlb_total_pages())
                * sysctl_overcommit_ratio / 100;
        /*
-        * Leave the last 3% for root
+        * Reserve some for root
         */
        if (!cap_sys_admin)
-               allowed -= allowed / 32;
+               allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
        allowed += total_swap_pages;
 
-       /* Don't let a single process grow too big:
-          leave 3% of the size of this process for other processes */
-       if (mm)
-               allowed -= mm->total_vm / 32;
+       /*
+        * Don't let a single process grow so big a user can't recover
+        */
+       if (mm) {
+               reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+               allowed -= min(mm->total_vm / 32, reserve);
+       }
 
        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
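
The min() preserves the old behaviour for modest processes (deduct 3% of the
process's own size, total_vm / 32, so the user keeps room to spawn a shell and
kill a runaway hog) while capping the deduction at sysctl_user_reserve_kbytes,
so the reserve no longer grows without bound with process size. A standalone
model of just that clamp, with made-up numbers:

    #include <stdio.h>

    /* Hypothetical model of the "allowed -= min(total_vm / 32, reserve)" step. */
    static unsigned long process_reserve(unsigned long total_vm_pages,
                                         unsigned long reserve_pages)
    {
        unsigned long three_pct = total_vm_pages / 32;  /* ~3% of the process */

        return three_pct < reserve_pages ? three_pct : reserve_pages;
    }

    int main(void)
    {
        /* 128MB reserve at 4KB pages == 32768 pages (see the shift above) */
        unsigned long reserve = 32768;

        /* small process: 3% of 10000 pages == 312 pages, well under the cap */
        printf("small: %lu pages\n", process_reserve(10000, reserve));
        /* huge process: 3% would be 262144 pages; the cap wins */
        printf("huge:  %lu pages\n", process_reserve(1UL << 23, reserve));
        return 0;
    }
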
@@ -543,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
        return 0;
 }
 
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+               unsigned long addr, unsigned long end)
+{
+       unsigned long nr_pages = 0;
+       struct vm_area_struct *vma;
+
+       /* Find the first overlapping mapping */
+       vma = find_vma_intersection(mm, addr, end);
+       if (!vma)
+               return 0;
+
+       nr_pages = (min(end, vma->vm_end) -
+               max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+       /* Iterate over the rest of the overlaps */
+       for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+               unsigned long overlap_len;
+
+               if (vma->vm_start > end)
+                       break;
+
+               overlap_len = min(end, vma->vm_end) - vma->vm_start;
+               nr_pages += overlap_len >> PAGE_SHIFT;
+       }
+
+       return nr_pages;
+}
+
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
 {
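
count_vma_pages_range() clamps the first overlap on both ends, since the
requested range can begin inside it, but each later VMA only on the right,
because everything after the first intersection starts at or above addr. The
same walk over a toy sorted list, as a userspace sketch (the ranges are
invented):

    #include <stdio.h>

    #define PAGE_SHIFT 12                       /* assumes 4KB pages */
    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* Toy stand-in for vm_area_struct: [vm_start, vm_end), sorted, linked. */
    struct toy_vma {
        unsigned long vm_start, vm_end;
        struct toy_vma *vm_next;
    };

    /* Same clamping as count_vma_pages_range(), minus the rbtree lookup;
     * first must be the first VMA that intersects [addr, end). */
    static unsigned long count_pages(struct toy_vma *first,
                                     unsigned long addr, unsigned long end)
    {
        unsigned long nr = (MIN(end, first->vm_end) -
                            MAX(addr, first->vm_start)) >> PAGE_SHIFT;
        struct toy_vma *vma;

        for (vma = first->vm_next; vma; vma = vma->vm_next) {
            if (vma->vm_start > end)
                break;
            nr += (MIN(end, vma->vm_end) - vma->vm_start) >> PAGE_SHIFT;
        }
        return nr;
    }

    int main(void)
    {
        struct toy_vma c = { 0x9000, 0xa000, NULL };
        struct toy_vma b = { 0x4000, 0x6000, &c };
        struct toy_vma a = { 0x1000, 0x3000, &b };

        /* [0x2000, 0x5000) overlaps one page of a and one page of b */
        printf("%lu pages\n", count_pages(&a, 0x2000, 0x5000));   /* 2 */
        return 0;
    }
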
@@ -829,7 +865,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
                if (next->anon_vma)
                        anon_vma_merge(vma, next);
                mm->map_count--;
-               mpol_put(vma_policy(next));
+               vma_set_policy(vma, vma_policy(next));
                kmem_cache_free(vm_area_cachep, next);
                /*
                 * In mprotect's case 6 (see comments on vma_merge),
@@ -857,7 +893,8 @@ again:                      remove_next = 1 + (end > next->vm_end);
  * per-vma resources, so we don't attempt to merge those.
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
-                       struct file *file, unsigned long vm_flags)
+                       struct file *file, unsigned long vm_flags,
+                       const char __user *anon_name)
 {
        if (vma->vm_flags ^ vm_flags)
                return 0;
@@ -865,6 +902,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
                return 0;
        if (vma->vm_ops && vma->vm_ops->close)
                return 0;
+       if (vma_get_anon_name(vma) != anon_name)
+               return 0;
        return 1;
 }
 
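The anon_name parameter threaded through these predicates comes from the
Android "name anonymous VMAs" patch that this merge pulls in; note that the
check compares the raw userspace pointers, so two mappings only merge when
they were named through the same user buffer. A sketch of how userspace
assigns such a name, assuming the PR_SET_VMA prctl from the same patchset:

    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/prctl.h>

    /* From the Android patchset; not in mainline uapi headers of this era. */
    #ifndef PR_SET_VMA
    #define PR_SET_VMA            0x53564d41
    #define PR_SET_VMA_ANON_NAME  0
    #endif

    int main(void)
    {
        /* the name must stay mapped: the kernel keeps the user pointer */
        static const char name[] = "myheap";
        size_t len = 1 << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
            return 1;
        if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                  (unsigned long)p, len, (unsigned long)name))
            perror("prctl");   /* EINVAL on kernels without the patch */
        /* the region now shows up as [anon:myheap] in /proc/self/maps */
        return 0;
    }
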
@@ -895,9 +934,10 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  */
 static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
-       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+       const char __user *anon_name)
 {
-       if (is_mergeable_vma(vma, file, vm_flags) &&
+       if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                if (vma->vm_pgoff == vm_pgoff)
                        return 1;
@@ -914,9 +954,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  */
 static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
-       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+       const char __user *anon_name)
 {
-       if (is_mergeable_vma(vma, file, vm_flags) &&
+       if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                pgoff_t vm_pglen;
                vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
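
Besides flags, file, and now anon_name, merging requires file-offset
continuity: a mapping can be appended to vma only if its pgoff equals vma's
pgoff plus vma's length in pages. A worked example with hypothetical numbers:

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* assumes 4KB pages */

    int main(void)
    {
        /* existing VMA: file pages [10, 14) mapped at [0x10000, 0x14000) */
        unsigned long vm_start = 0x10000, vm_end = 0x14000, vm_pgoff = 10;
        unsigned long vm_pglen = (vm_end - vm_start) >> PAGE_SHIFT; /* 4 pages */
        unsigned long new_pgoff = 14;   /* the mapping being appended */

        /* merge is allowed only when the file offsets are contiguous too */
        printf("mergeable: %d\n", vm_pgoff + vm_pglen == new_pgoff);  /* 1 */
        return 0;
    }
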
@@ -927,9 +968,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
  *
  * In most cases - when called for mmap, brk or mremap - [addr,end) is
  * certain not to be mapped by the time vma_merge is called; but when
@@ -959,7 +1000,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        struct vm_area_struct *prev, unsigned long addr,
                        unsigned long end, unsigned long vm_flags,
                        struct anon_vma *anon_vma, struct file *file,
-                       pgoff_t pgoff, struct mempolicy *policy)
+                       pgoff_t pgoff, struct mempolicy *policy,
+                       const char __user *anon_name)
 {
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
@@ -985,15 +1027,15 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
         */
        if (prev && prev->vm_end == addr &&
                        mpol_equal(vma_policy(prev), policy) &&
-                       can_vma_merge_after(prev, vm_flags,
-                                               anon_vma, file, pgoff)) {
+                       can_vma_merge_after(prev, vm_flags, anon_vma,
+                                               file, pgoff, anon_name)) {
                /*
                 * OK, it can.  Can we now merge in the successor as well?
                 */
                if (next && end == next->vm_start &&
                                mpol_equal(policy, vma_policy(next)) &&
-                               can_vma_merge_before(next, vm_flags,
-                                       anon_vma, file, pgoff+pglen) &&
+                               can_vma_merge_before(next, vm_flags, anon_vma,
+                                               file, pgoff+pglen, anon_name) &&
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
@@ -1013,8 +1055,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
         */
        if (next && end == next->vm_start &&
                        mpol_equal(policy, vma_policy(next)) &&
-                       can_vma_merge_before(next, vm_flags,
-                                       anon_vma, file, pgoff+pglen)) {
+                       can_vma_merge_before(next, vm_flags, anon_vma,
+                                       file, pgoff+pglen, anon_name)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
                        err = vma_adjust(prev, prev->vm_start,
                                addr, prev->vm_pgoff, NULL);
@@ -1327,15 +1369,24 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                file = fget(fd);
                if (!file)
                        goto out;
+               if (is_file_hugepages(file))
+                       len = ALIGN(len, huge_page_size(hstate_file(file)));
        } else if (flags & MAP_HUGETLB) {
                struct user_struct *user = NULL;
+               struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) &
+                                                  SHM_HUGE_MASK);
+
+               if (!hs)
+                       return -EINVAL;
+
+               len = ALIGN(len, huge_page_size(hs));
                /*
                 * VM_NORESERVE is used because the reservations will be
                 * taken when vm_ops->mmap() is called
                 * A dummy user value is used because we are not locking
                 * memory so no accounting is necessary
                 */
-               file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+               file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                VM_NORESERVE,
                                &user, HUGETLB_ANONHUGE_INODE,
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
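
Two hugetlb fixes land in this hunk: len is rounded up to the huge page size
before the mapping is set up, so an unaligned request no longer fails deep in
the hugetlbfs code, and hstate_sizelog() validates the page-size log encoded
in the high mmap flag bits, turning a bogus size into -EINVAL. A caller's-eye
sketch, assuming the uapi constants of this era (the MAP_HUGETLB value shown
is the x86 one):

    #include <stdio.h>
    #include <sys/mman.h>

    /* define the constants if the libc headers lack them */
    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB    0x40000
    #endif
    #ifndef MAP_HUGE_SHIFT
    #define MAP_HUGE_SHIFT 26
    #endif
    #define MAP_HUGE_2MB   (21 << MAP_HUGE_SHIFT)   /* log2(2MB) == 21 */

    int main(void)
    {
        /* deliberately not 2MB-aligned: the kernel now rounds len up */
        size_t len = 3 * 1024 * 1024;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
                       -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");    /* e.g. no 2MB pages reserved */
            return 1;
        }
        munmap(p, 4 * 1024 * 1024);  /* the mapping was rounded up to 4MB */
        return 0;
    }
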
@@ -1435,6 +1486,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long charged = 0;
        struct inode *inode =  file ? file_inode(file) : NULL;
 
+       /* Check against address space limit. */
+       if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+               unsigned long nr_pages;
+
+               /*
+                * MAP_FIXED may remove pages from mappings that intersect with
+                * the requested mapping. Account for the pages it would unmap.
+                */
+               if (!(vm_flags & MAP_FIXED))
+                       return -ENOMEM;
+
+               nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+               if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+                       return -ENOMEM;
+       }
+
        /* Clear old maps */
        error = -ENOMEM;
 munmap_back:
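
The limit check moves above the "clear old maps" step so a MAP_FIXED request
that replaces existing pages is not rejected for memory it is about to free:
the pages the fixed mapping will unmap are counted and credited before
may_expand_vm() is retried. (A subtlety worth flagging: MAP_FIXED belongs to
the mmap flag namespace while vm_flags holds VM_* bits; on most architectures
MAP_FIXED's 0x10 aliases VM_MAYREAD, which is always set, so the early -ENOMEM
branch never fires and non-fixed requests are instead rejected by the recheck,
count_vma_pages_range() returning 0 for a freshly chosen address.) A
hypothetical model of the accounting, not kernel code:

    #include <stdio.h>

    /* Toy model of the reordered check; all numbers are made up. */
    struct toy_mm { unsigned long total_vm, map_limit; };

    static int may_expand(struct toy_mm *mm, unsigned long grow)
    {
        return mm->total_vm + grow <= mm->map_limit;
    }

    int main(void)
    {
        struct toy_mm mm = { .total_vm = 1000, .map_limit = 1024 };
        unsigned long len_pages = 100, overlapped = 90;

        /* naive check: 1000 + 100 > 1024, rejected */
        printf("without credit: %s\n",
               may_expand(&mm, len_pages) ? "ok" : "ENOMEM");
        /* MAP_FIXED replaces 90 of those pages, so only 10 are really new */
        printf("with credit:    %s\n",
               may_expand(&mm, len_pages - overlapped) ? "ok" : "ENOMEM");
        return 0;
    }
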
@@ -1444,10 +1512,6 @@ munmap_back:
                goto munmap_back;
        }
 
-       /* Check against address space limit. */
-       if (!may_expand_vm(mm, len >> PAGE_SHIFT))
-               return -ENOMEM;
-
        /*
         * Private writable mapping: check memory availability
         */
@@ -1461,7 +1525,8 @@ munmap_back:
        /*
         * Can we just expand an old mapping?
         */
-       vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+       vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+                       NULL, NULL);
        if (vma)
                goto out;
 
@@ -1935,9 +2000,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
        struct vm_area_struct *vma = NULL;
 
-       if (WARN_ON_ONCE(!mm))          /* Remove this in linux-3.6 */
-               return NULL;
-
        /* Check the cache first. */
        /* (Cache hit rate is typically around 35%.) */
        vma = ACCESS_ONCE(mm->mmap_cache);
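
The WARN_ON_ONCE(!mm) guard, which its own comment scheduled for removal in
3.6, finally goes; callers must pass a valid mm. What remains is the
single-entry lookaside cache, read with ACCESS_ONCE() so the unlocked pointer
is loaded exactly once. A generic sketch of that pattern, with hypothetical
types:

    #include <stdio.h>

    /* Generic sketch of find_vma()'s one-entry cache (types invented). */
    struct obj { long key; };

    static struct obj table[3] = { {1}, {2}, {3} };
    static struct obj *cache;           /* analogous to mm->mmap_cache */

    static struct obj *lookup(long key)
    {
        /* like ACCESS_ONCE(): one untorn load of the shared pointer */
        struct obj *hit = *(struct obj * volatile *)&cache;
        int i;

        if (hit && hit->key == key)     /* fast path, ~35% of calls */
            return hit;
        for (i = 0; i < 3; i++)         /* slow path: the real search */
            if (table[i].key == key)
                return cache = &table[i];
        return NULL;
    }

    int main(void)
    {
        printf("%ld\n", lookup(2)->key);   /* slow path, primes the cache */
        printf("%ld\n", lookup(2)->key);   /* fast path */
        return 0;
    }
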
@@ -2305,7 +2367,7 @@ static void unmap_region(struct mm_struct *mm,
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, vma, start, end);
        free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-                                next ? next->vm_start : 0);
+                                next ? next->vm_start : USER_PGTABLES_CEILING);
        tlb_finish_mmu(&tlb, start, end);
 }
 
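Passing 0 as the ceiling let free_pgtables() tear down page-table pages all
the way to the top of the address space; on architectures that keep mappings
above TASK_SIZE (ARM's vectors page being the motivating case) that could free
still-needed tables. The series that introduces USER_PGTABLES_CEILING gives
unaffected architectures a fallback along these lines (a sketch of the
asm-generic side, which is outside this diff):

    /* asm-generic fallback: architectures that don't care keep the old 0 */
    #ifndef USER_PGTABLES_CEILING
    #define USER_PGTABLES_CEILING 0UL
    #endif

The ARM side of the series sets it to TASK_SIZE, so user page-table freeing
stops short of the kernel-owned region.
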
@@ -2608,7 +2670,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
        /* Can we just expand an old private anonymous mapping? */
        vma = vma_merge(mm, prev, addr, addr + len, flags,
-                                       NULL, NULL, pgoff, NULL);
+                                       NULL, NULL, pgoff, NULL, NULL);
        if (vma)
                goto out;
 
@@ -2685,7 +2747,7 @@ void exit_mmap(struct mm_struct *mm)
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, vma, 0, -1);
 
-       free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+       free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
        tlb_finish_mmu(&tlb, 0, -1);
 
        /*
@@ -2766,7 +2828,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
                return NULL;    /* should never get here */
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
-                       vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+                       vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+                       vma_get_anon_name(vma));
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
@@ -3097,3 +3160,115 @@ void __init mmap_init(void)
        ret = percpu_counter_init(&vm_committed_as, 0);
        VM_BUG_ON(ret);
 }
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory-hogging
+ * process so large that they cannot recover from it (kill the hog) in
+ * OVERCOMMIT_NEVER mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int init_user_reserve(void)
+{
+       unsigned long free_kbytes;
+
+       free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+       sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+       return 0;
+}
+module_init(init_user_reserve)
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int init_admin_reserve(void)
+{
+       unsigned long free_kbytes;
+
+       free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+       sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+       return 0;
+}
+module_init(init_admin_reserve)
+
+/*
+ * Reinitialise user and admin reserves if memory is added or removed.
+ *
+ * The default user reserve max is 128MB, and the default max for the
+ * admin reserve is 8MB. These are usually, but not always, enough to
+ * enable recovery from a memory hogging process using login/sshd, a shell,
+ * and tools like top. It may make sense to increase or even disable the
+ * reserves, depending on whether swap is available and on which recovery
+ * tools are in use; in short, the admin may have changed them.
+ *
+ * If memory is added and the reserves have been eliminated or increased above
+ * the default max, then we'll trust the admin.
+ *
+ * If memory is removed and there isn't enough free memory, then we
+ * need to reset the reserves.
+ *
+ * Otherwise keep the reserve set by the admin.
+ */
+static int reserve_mem_notifier(struct notifier_block *nb,
+                            unsigned long action, void *data)
+{
+       unsigned long tmp, free_kbytes;
+
+       switch (action) {
+       case MEM_ONLINE:
+               /* Default max is 128MB. Leave alone if modified by operator. */
+               tmp = sysctl_user_reserve_kbytes;
+               if (0 < tmp && tmp < (1UL << 17))
+                       init_user_reserve();
+
+               /* Default max is 8MB.  Leave alone if modified by operator. */
+               tmp = sysctl_admin_reserve_kbytes;
+               if (0 < tmp && tmp < (1UL << 13))
+                       init_admin_reserve();
+
+               break;
+       case MEM_OFFLINE:
+               free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+               if (sysctl_user_reserve_kbytes > free_kbytes) {
+                       init_user_reserve();
+                       pr_info("vm.user_reserve_kbytes reset to %lu\n",
+                               sysctl_user_reserve_kbytes);
+               }
+
+               if (sysctl_admin_reserve_kbytes > free_kbytes) {
+                       init_admin_reserve();
+                       pr_info("vm.admin_reserve_kbytes reset to %lu\n",
+                               sysctl_admin_reserve_kbytes);
+               }
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block reserve_mem_nb = {
+       .notifier_call = reserve_mem_notifier,
+};
+
+static int __meminit init_reserve_notifier(void)
+{
+       if (register_hotmemory_notifier(&reserve_mem_nb))
+               printk("Failed registering memory add/remove notifier for admin reserve");
+
+       return 0;
+}
+module_init(init_reserve_notifier)
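
Both initcalls use the same formula, min(free_kbytes / 32, cap), so the caps
bind once roughly 4GB (user, 128MB cap) or 256MB (admin, 8MB cap) is free at
boot; below that, the reserve is about 3% of free memory. The MEM_ONLINE case
only recomputes a reserve that still sits between zero and its default cap,
treating anything else as an operator override. Assuming the companion sysctl
table entries from the same patchset, the knobs surface as
/proc/sys/vm/user_reserve_kbytes and /proc/sys/vm/admin_reserve_kbytes; a
userspace sketch that replays the boot formula and reads the live values:

    #include <stdio.h>

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    /* Boot-time formula from init_user_reserve()/init_admin_reserve(). */
    static unsigned long boot_reserve(unsigned long free_kbytes,
                                      unsigned long cap)
    {
        return min_ul(free_kbytes / 32, cap);
    }

    static unsigned long read_kbytes(const char *path)
    {
        unsigned long val = 0;
        FILE *f = fopen(path, "r");

        if (f) {
            if (fscanf(f, "%lu", &val) != 1)
                val = 0;
            fclose(f);
        }
        return val;
    }

    int main(void)
    {
        /* hypothetical machine with 2GB free at boot */
        unsigned long free_kb = 2UL << 20;

        printf("user reserve for 2GB free:  %lu KB\n",
               boot_reserve(free_kb, 1UL << 17));   /* 65536 KB: 3% wins */
        printf("admin reserve for 2GB free: %lu KB\n",
               boot_reserve(free_kb, 1UL << 13));   /* 8192 KB: cap wins */

        /* live values, via the sysctl files added alongside this code */
        printf("vm.user_reserve_kbytes  = %lu\n",
               read_kbytes("/proc/sys/vm/user_reserve_kbytes"));
        printf("vm.admin_reserve_kbytes = %lu\n",
               read_kbytes("/proc/sys/vm/admin_reserve_kbytes"));
        return 0;
    }
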