[firefly-linux-kernel-4.4.55.git] mm/nommu.c
diff --git a/mm/nommu.c b/mm/nommu.c
index 9edc897a3970e3a22753e6bbed4ce0bdde9d0f19..298884dcd6e71e4723a203a82f1613cbfcec19ec 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,7 +13,7 @@
  *  Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
@@ -22,7 +22,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/tracehook.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
@@ -30,6 +29,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
+#include <linux/sched/sysctl.h>
 
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
@@ -63,10 +63,27 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
 int heap_stack_gap = 0;
 
 atomic_long_t mmap_pages_allocated;
 
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+       return percpu_counter_read_positive(&vm_committed_as);
+}
+
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
 EXPORT_SYMBOL(mem_map);
 EXPORT_SYMBOL(num_physpages);
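
The new vm_memory_committed() helper and the comment above it explain why the
guest-wide commit figure is exported: a host-side policy engine (the Hyper-V
balloon is the case named in the comment) can use it as an input. As a hedged
illustration only, not part of this diff, a balloon-style module could sample
the metric roughly as follows; the module name and the KiB conversion are
assumptions of the sketch.

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/mman.h>

/* print the system-wide commit figure once at load time */
static int __init commit_sample_init(void)
{
	/* vm_memory_committed() returns pages; shift to KiB for readability */
	pr_info("committed_as: %lu kB\n",
		vm_memory_committed() << (PAGE_SHIFT - 10));
	return 0;
}
module_init(commit_sample_init);
MODULE_LICENSE("GPL");
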
 
@@ -125,10 +142,10 @@ unsigned int kobjsize(const void *objp)
        return PAGE_SIZE << compound_order(page);
 }
 
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-                    unsigned long start, int nr_pages, unsigned int foll_flags,
-                    struct page **pages, struct vm_area_struct **vmas,
-                    int *retry)
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                     unsigned long start, unsigned long nr_pages,
+                     unsigned int foll_flags, struct page **pages,
+                     struct vm_area_struct **vmas, int *nonblocking)
 {
        struct vm_area_struct *vma;
        unsigned long vm_flags;
@@ -175,9 +192,10 @@ finish_or_fault:
  *   slab page or a secondary page from a compound page
  * - don't permit access to VMAs that don't support it, such as I/O mappings
  */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-       unsigned long start, int nr_pages, int write, int force,
-       struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                   unsigned long start, unsigned long nr_pages,
+                   int write, int force, struct page **pages,
+                   struct vm_area_struct **vmas)
 {
        int flags = 0;
 
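
The two hunks above only widen the prototypes: nr_pages becomes unsigned long
and the return type long, while the old eight-argument calling convention with
an explicit task and mm is kept. A hedged sketch of pinning a single user page
through this interface (pin_one_page and uaddr are illustrative names; error
handling is kept minimal):

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/sched.h>

/* pin one page at an (assumed) user address, then drop it again */
static int pin_one_page(unsigned long uaddr)
{
	struct page *page;
	long got;

	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
			     1 /* nr_pages */, 1 /* write */, 0 /* force */,
			     &page, NULL);
	up_read(&current->mm->mmap_sem);

	if (got != 1)
		return got < 0 ? got : -EFAULT;

	/* ... touch the pinned page ... */
	put_page(page);		/* release the reference taken above */
	return 0;
}
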
@@ -212,8 +230,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL(follow_pfn);
 
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
+LIST_HEAD(vmap_area_list);
 
 void vfree(const void *addr)
 {
@@ -455,7 +472,7 @@ void  __attribute__((weak)) vmalloc_sync_all(void)
  *     between processes, it syncs the pagetable across all
  *     processes.
  */
-struct vm_struct *alloc_vm_area(size_t size)
+struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
 {
        BUG();
        return NULL;
@@ -697,9 +714,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
        if (vma->vm_file) {
                mapping = vma->vm_file->f_mapping;
 
+               mutex_lock(&mapping->i_mmap_mutex);
                flush_dcache_mmap_lock(mapping);
-               vma_prio_tree_insert(vma, &mapping->i_mmap);
+               vma_interval_tree_insert(vma, &mapping->i_mmap);
                flush_dcache_mmap_unlock(mapping);
+               mutex_unlock(&mapping->i_mmap_mutex);
        }
 
        /* add the VMA to the tree */
@@ -761,9 +780,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
        if (vma->vm_file) {
                mapping = vma->vm_file->f_mapping;
 
+               mutex_lock(&mapping->i_mmap_mutex);
                flush_dcache_mmap_lock(mapping);
-               vma_prio_tree_remove(vma, &mapping->i_mmap);
+               vma_interval_tree_remove(vma, &mapping->i_mmap);
                flush_dcache_mmap_unlock(mapping);
+               mutex_unlock(&mapping->i_mmap_mutex);
        }
 
        /* remove from the MM's tree and list */
@@ -776,8 +797,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 
        if (vma->vm_next)
                vma->vm_next->vm_prev = vma->vm_prev;
-
-       vma->vm_mm = NULL;
 }
 
 /*
@@ -788,11 +807,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
        kenter("%p", vma);
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
-       if (vma->vm_file) {
+       if (vma->vm_file)
                fput(vma->vm_file);
-               if (vma->vm_flags & VM_EXECUTABLE)
-                       removed_exe_file_vma(mm);
-       }
        put_nommu_region(vma->vm_region);
        kmem_cache_free(vm_area_cachep, vma);
 }
@@ -806,7 +822,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
        struct vm_area_struct *vma;
 
        /* check the cache first */
-       vma = mm->mmap_cache;
+       vma = ACCESS_ONCE(mm->mmap_cache);
        if (vma && vma->vm_start <= addr && vma->vm_end > addr)
                return vma;
 
@@ -888,7 +904,6 @@ static int validate_mmap_request(struct file *file,
                                 unsigned long *_capabilities)
 {
        unsigned long capabilities, rlen;
-       unsigned long reqprot = prot;
        int ret;
 
        /* do the simple checks first */
@@ -929,7 +944,7 @@ static int validate_mmap_request(struct file *file,
                 */
                mapping = file->f_mapping;
                if (!mapping)
-                       mapping = file->f_path.dentry->d_inode->i_mapping;
+                       mapping = file_inode(file)->i_mapping;
 
                capabilities = 0;
                if (mapping && mapping->backing_dev_info)
@@ -938,7 +953,7 @@ static int validate_mmap_request(struct file *file,
                if (!capabilities) {
                        /* no explicit capabilities set, so assume some
                         * defaults */
-                       switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
+                       switch (file_inode(file)->i_mode & S_IFMT) {
                        case S_IFREG:
                        case S_IFBLK:
                                capabilities = BDI_CAP_MAP_COPY;
@@ -973,11 +988,11 @@ static int validate_mmap_request(struct file *file,
                            !(file->f_mode & FMODE_WRITE))
                                return -EACCES;
 
-                       if (IS_APPEND(file->f_path.dentry->d_inode) &&
+                       if (IS_APPEND(file_inode(file)) &&
                            (file->f_mode & FMODE_WRITE))
                                return -EACCES;
 
-                       if (locks_verify_locked(file->f_path.dentry->d_inode))
+                       if (locks_verify_locked(file_inode(file)))
                                return -EAGAIN;
 
                        if (!(capabilities & BDI_CAP_MAP_DIRECT))
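
Several hunks in this diff replace open-coded file->f_path.dentry->d_inode
chains with the file_inode() accessor. For reference, the helper is a trivial
inline along these lines; in kernels of this vintage it returns the inode
pointer cached in struct file, while older versions simply chased the dentry.

/* sketch of the <linux/fs.h> accessor used above */
static inline struct inode *file_inode(const struct file *f)
{
	return f->f_inode;
}
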
@@ -1046,7 +1061,7 @@ static int validate_mmap_request(struct file *file,
        }
 
        /* allow the security API to have its say */
-       ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+       ret = security_mmap_addr(addr);
        if (ret < 0)
                return ret;
 
@@ -1087,7 +1102,7 @@ static unsigned long determine_vm_flags(struct file *file,
         * it's being traced - otherwise breakpoints set in it may interfere
         * with another untraced process
         */
-       if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+       if ((flags & MAP_PRIVATE) && current->ptrace)
                vm_flags &= ~VM_MAYSHARE;
 
        return vm_flags;
@@ -1237,7 +1252,8 @@ unsigned long do_mmap_pgoff(struct file *file,
                            unsigned long len,
                            unsigned long prot,
                            unsigned long flags,
-                           unsigned long pgoff)
+                           unsigned long pgoff,
+                           unsigned long *populate)
 {
        struct vm_area_struct *vma;
        struct vm_region *region;
@@ -1247,6 +1263,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 
        kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
 
+       *populate = 0;
+
        /* decide whether we should attempt the mapping, and if so what sort of
         * mapping */
        ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
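
The new *populate out-parameter lets do_mmap_pgoff() tell its caller how much
of the mapping should be faulted in after mmap_sem is dropped; on NOMMU it is
always reported as zero. For context, a hedged sketch of the generic caller's
shape (vm_mmap_pgoff() in mm/util.c), with the security hook and error paths
trimmed:

#include <linux/mm.h>
#include <linux/sched.h>

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
			    unsigned long len, unsigned long prot,
			    unsigned long flag, unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	unsigned long ret;

	down_write(&mm->mmap_sem);
	ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate);
	up_write(&mm->mmap_sem);
	if (populate)
		mm_populate(ret, populate);	/* stubbed out on NOMMU */
	return ret;
}
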
@@ -1282,14 +1300,8 @@ unsigned long do_mmap_pgoff(struct file *file,
        vma->vm_pgoff = pgoff;
 
        if (file) {
-               region->vm_file = file;
-               get_file(file);
-               vma->vm_file = file;
-               get_file(file);
-               if (vm_flags & VM_EXECUTABLE) {
-                       added_exe_file_vma(current->mm);
-                       vma->vm_mm = current->mm;
-               }
+               region->vm_file = get_file(file);
+               vma->vm_file = get_file(file);
        }
 
        down_write(&nommu_region_sem);
@@ -1316,8 +1328,8 @@ unsigned long do_mmap_pgoff(struct file *file,
                                continue;
 
                        /* search for overlapping mappings on the same file */
-                       if (pregion->vm_file->f_path.dentry->d_inode !=
-                           file->f_path.dentry->d_inode)
+                       if (file_inode(pregion->vm_file) !=
+                           file_inode(file))
                                continue;
 
                        if (pregion->vm_pgoff >= pgend)
@@ -1442,8 +1454,6 @@ error:
        kmem_cache_free(vm_region_jar, region);
        if (vma->vm_file)
                fput(vma->vm_file);
-       if (vma->vm_flags & VM_EXECUTABLE)
-               removed_exe_file_vma(vma->vm_mm);
        kmem_cache_free(vm_area_cachep, vma);
        kleave(" = %d", ret);
        return ret;
@@ -1469,7 +1479,6 @@ error_getting_region:
        show_free_areas(0);
        return -ENOMEM;
 }
-EXPORT_SYMBOL(do_mmap_pgoff);
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
@@ -1487,9 +1496,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-       down_write(&current->mm->mmap_sem);
-       retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-       up_write(&current->mm->mmap_sem);
+       retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 
        if (file)
                fput(file);
@@ -1708,16 +1715,22 @@ erase_whole_vma:
 }
 EXPORT_SYMBOL(do_munmap);
 
-SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+int vm_munmap(unsigned long addr, size_t len)
 {
-       int ret;
        struct mm_struct *mm = current->mm;
+       int ret;
 
        down_write(&mm->mmap_sem);
        ret = do_munmap(mm, addr, len);
        up_write(&mm->mmap_sem);
        return ret;
 }
+EXPORT_SYMBOL(vm_munmap);
+
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+       return vm_munmap(addr, len);
+}
 
 /*
  * release all the mappings made in a process's VM space
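
Moving the body of the munmap syscall into vm_munmap() gives in-kernel users
an exported entry point that takes mmap_sem itself. A hedged usage sketch,
pairing it with its vm_mmap() counterpart (map_and_unmap and its arguments are
illustrative only):

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>

/* map 'len' bytes of 'file', use the mapping, then tear it down again */
static int map_and_unmap(struct file *file, size_t len)
{
	unsigned long addr;

	addr = vm_mmap(file, 0, len, PROT_READ | PROT_WRITE, MAP_SHARED, 0);
	if (IS_ERR_VALUE(addr))
		return (int)addr;

	/* ... use the mapping ... */

	return vm_munmap(addr, len);	/* takes mmap_sem internally */
}
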
@@ -1743,7 +1756,7 @@ void exit_mmap(struct mm_struct *mm)
        kleave("");
 }
 
-unsigned long do_brk(unsigned long addr, unsigned long len)
+unsigned long vm_brk(unsigned long addr, unsigned long len)
 {
        return -ENOMEM;
 }
@@ -1758,7 +1771,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
  *
  * MREMAP_FIXED is not supported under NOMMU conditions
  */
-unsigned long do_mremap(unsigned long addr,
+static unsigned long do_mremap(unsigned long addr,
                        unsigned long old_len, unsigned long new_len,
                        unsigned long flags, unsigned long new_addr)
 {
@@ -1793,7 +1806,6 @@ unsigned long do_mremap(unsigned long addr,
        vma->vm_end = vma->vm_start + new_len;
        return vma->vm_start;
 }
-EXPORT_SYMBOL(do_mremap);
 
 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                unsigned long, new_len, unsigned long, flags,
@@ -1807,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        return ret;
 }
 
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
-                       unsigned int foll_flags)
+struct page *follow_page_mask(struct vm_area_struct *vma,
+                             unsigned long address, unsigned int flags,
+                             unsigned int *page_mask)
 {
+       *page_mask = 0;
        return NULL;
 }
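
follow_page() itself becomes a thin wrapper around follow_page_mask(); the
NOMMU stub above simply reports no page and a zero page mask. The wrapper
looks roughly like the inline below (per the include/linux/mm.h of this era):

static inline struct page *follow_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int foll_flags)
{
	unsigned int unused_page_mask;

	return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
}
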
 
@@ -1819,11 +1833,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
        if (addr != (pfn << PAGE_SHIFT))
                return -EINVAL;
 
-       vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
        return 0;
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+       unsigned long pfn = start >> PAGE_SHIFT;
+       unsigned long vm_len = vma->vm_end - vma->vm_start;
+
+       pfn += vma->vm_pgoff;
+       return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
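
vm_iomap_memory() is meant to be called from a driver's ->mmap handler with
the physical base and length of the device window; this NOMMU version skips
the sanity checking that the MMU implementation performs and simply remaps the
whole VMA. A hedged caller sketch (mydev_mmap, mydev_phys_base and
mydev_phys_len are assumed, device-specific names):

#include <linux/fs.h>
#include <linux/mm.h>

/* assumed device window, e.g. taken from a platform resource at probe time */
static phys_addr_t mydev_phys_base;
static unsigned long mydev_phys_len;

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* the helper applies vma->vm_pgoff and sets up the PFN mapping */
	return vm_iomap_memory(vma, mydev_phys_base, mydev_phys_len);
}
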
+
 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                        unsigned long pgoff)
 {
@@ -1874,7 +1898,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
  */
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
-       unsigned long free, allowed;
+       unsigned long free, allowed, reserve;
 
        vm_acct_memory(pages);
 
@@ -1885,10 +1909,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                return 0;
 
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
-               unsigned long n;
+               free = global_page_state(NR_FREE_PAGES);
+               free += global_page_state(NR_FILE_PAGES);
 
-               free = global_page_state(NR_FILE_PAGES);
-               free += nr_swap_pages;
+               /*
+                * shmem pages shouldn't be counted as free in this
+                * case, they can't be purged, only swapped out, and
+                * that won't affect the overall amount of available
+                * memory in the system.
+                */
+               free -= global_page_state(NR_SHMEM);
+
+               free += get_nr_swap_pages();
 
                /*
                 * Any slabs which are created with the
@@ -1898,35 +1930,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                 */
                free += global_page_state(NR_SLAB_RECLAIMABLE);
 
-               /*
-                * Leave the last 3% for root
-                */
-               if (!cap_sys_admin)
-                       free -= free / 32;
-
-               if (free > pages)
-                       return 0;
-
-               /*
-                * nr_free_pages() is very expensive on large systems,
-                * only call if we're about to fail.
-                */
-               n = nr_free_pages();
-
                /*
                 * Leave reserved pages. The pages are not for anonymous pages.
                 */
-               if (n <= totalreserve_pages)
+               if (free <= totalreserve_pages)
                        goto error;
                else
-                       n -= totalreserve_pages;
+                       free -= totalreserve_pages;
 
                /*
-                * Leave the last 3% for root
+                * Reserve some for root
                 */
                if (!cap_sys_admin)
-                       n -= n / 32;
-               free += n;
+                       free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
 
                if (free > pages)
                        return 0;
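
Both reserves are kept in kbytes and converted to pages with a shift of
PAGE_SHIFT - 10, since one page is 2^(PAGE_SHIFT - 10) kbytes. A stand-alone
arithmetic check of the defaults, assuming the common 4 KiB page size:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed: 4 KiB pages */

int main(void)
{
	unsigned long admin_kb = 1UL << 13;	/* 8 MB default */
	unsigned long user_kb  = 1UL << 17;	/* 128 MB default */

	/* kbytes -> pages: shift right by PAGE_SHIFT - 10 */
	printf("admin reserve: %lu kB = %lu pages\n",
	       admin_kb, admin_kb >> (PAGE_SHIFT - 10));	/* 8192 kB = 2048 pages */
	printf("user reserve:  %lu kB = %lu pages\n",
	       user_kb, user_kb >> (PAGE_SHIFT - 10));		/* 131072 kB = 32768 pages */
	return 0;
}
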
@@ -1936,16 +1952,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 
        allowed = totalram_pages * sysctl_overcommit_ratio / 100;
        /*
-        * Leave the last 3% for root
+        * Reserve some 3% for root
         */
        if (!cap_sys_admin)
-               allowed -= allowed / 32;
+               allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
        allowed += total_swap_pages;
 
-       /* Don't let a single process grow too big:
-          leave 3% of the size of this process for other processes */
-       if (mm)
-               allowed -= mm->total_vm / 32;
+       /*
+        * Don't let a single process grow so big a user can't recover
+        */
+       if (mm) {
+               reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+               allowed -= min(mm->total_vm / 32, reserve);
+       }
 
        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
@@ -1968,6 +1987,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+                            unsigned long size, pgoff_t pgoff)
+{
+       BUG();
+       return 0;
+}
+EXPORT_SYMBOL(generic_file_remap_pages);
+
 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long addr, void *buf, int len, int write)
 {
@@ -2052,7 +2079,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
                                size_t newsize)
 {
        struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
        struct vm_region *region;
        pgoff_t low, high;
        size_t r_size, r_top;
@@ -2061,13 +2087,14 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
        high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
        down_write(&nommu_region_sem);
+       mutex_lock(&inode->i_mapping->i_mmap_mutex);
 
        /* search for VMAs that fall within the dead zone */
-       vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-                             low, high) {
+       vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
                /* found one - only interested if it's shared out of the page
                 * cache */
                if (vma->vm_flags & VM_SHARED) {
+                       mutex_unlock(&inode->i_mapping->i_mmap_mutex);
                        up_write(&nommu_region_sem);
                        return -ETXTBSY; /* not quite true, but near enough */
                }
@@ -2079,8 +2106,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
         * we don't check for any regions that start beyond the EOF as there
         * shouldn't be any
         */
-       vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-                             0, ULONG_MAX) {
+       vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
+                                 0, ULONG_MAX) {
                if (!(vma->vm_flags & VM_SHARED))
                        continue;
 
@@ -2095,6 +2122,49 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
                }
        }
 
+       mutex_unlock(&inode->i_mapping->i_mmap_mutex);
        up_write(&nommu_region_sem);
        return 0;
 }
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int __meminit init_user_reserve(void)
+{
+       unsigned long free_kbytes;
+
+       free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+       sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+       return 0;
+}
+module_init(init_user_reserve)
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int __meminit init_admin_reserve(void)
+{
+       unsigned long free_kbytes;
+
+       free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+       sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+       return 0;
+}
+module_init(init_admin_reserve)
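
Both init hooks clamp the reserve to roughly 3% of the pages that are free at
boot (free_kbytes / 32 is about 3.1%). With around 256 MB free, 3% is about
8 MB, so both sysctls come out at or below their ceilings; with 16 GB free,
the min() pins them at exactly 128 MB and 8 MB. A hedged user-space sketch of
the same clamping rule:

#include <stdio.h>

/* mirror of the min(3% of free memory, ceiling) rule used above */
static unsigned long reserve_kbytes(unsigned long free_kbytes,
				    unsigned long ceiling_kbytes)
{
	unsigned long three_percent = free_kbytes / 32;

	return three_percent < ceiling_kbytes ? three_percent : ceiling_kbytes;
}

int main(void)
{
	unsigned long small = 256UL * 1024;		/* 256 MB free */
	unsigned long big = 16UL * 1024 * 1024;		/* 16 GB free */

	printf("user reserve:  %lu kB (small) / %lu kB (big)\n",
	       reserve_kbytes(small, 1UL << 17), reserve_kbytes(big, 1UL << 17));
	printf("admin reserve: %lu kB (small) / %lu kB (big)\n",
	       reserve_kbytes(small, 1UL << 13), reserve_kbytes(big, 1UL << 13));
	return 0;
}
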