* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
+#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
#include <linux/sched/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
/*
* Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently.
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ unsigned long free, allowed, reserve;
vm_acct_memory(pages);
free -= totalreserve_pages;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- free -= free / 32;
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
allowed = (totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- allowed -= allowed / 32;
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages;
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- if (mm)
- allowed -= mm->total_vm / 32;
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min(mm->total_vm / 32, reserve);
+ }
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
return 0;
}
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long nr_pages = 0;
+ struct vm_area_struct *vma;
+
+ /* Find first overlaping mapping */
+ vma = find_vma_intersection(mm, addr, end);
+ if (!vma)
+ return 0;
+
+ nr_pages = (min(end, vma->vm_end) -
+ max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+ /* Iterate over the rest of the overlaps */
+ for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+ unsigned long overlap_len;
+
+ if (vma->vm_start > end)
+ break;
+
+ overlap_len = min(end, vma->vm_end) - vma->vm_start;
+ nr_pages += overlap_len >> PAGE_SHIFT;
+ }
+
+ return nr_pages;
+}
+
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- mpol_put(vma_policy(next));
+ vma_set_policy(vma, vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags)
+ struct file *file, unsigned long vm_flags,
+ const char __user *anon_name)
{
if (vma->vm_flags ^ vm_flags)
return 0;
return 0;
if (vma->vm_ops && vma->vm_ops->close)
return 0;
+ if (vma_get_anon_name(vma) != anon_name)
+ return 0;
return 1;
}
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy)
+ pgoff_t pgoff, struct mempolicy *policy,
+ const char __user *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
*/
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
- can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff)) {
+ can_vma_merge_after(prev, vm_flags, anon_vma,
+ file, pgoff, anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen) &&
+ can_vma_merge_before(next, vm_flags, anon_vma,
+ file, pgoff+pglen, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen)) {
+ can_vma_merge_before(next, vm_flags, anon_vma,
+ file, pgoff+pglen, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
file = fget(fd);
if (!file)
goto out;
+ if (is_file_hugepages(file))
+ len = ALIGN(len, huge_page_size(hstate_file(file)));
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
+ struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) &
+ SHM_HUGE_MASK);
+
+ if (!hs)
+ return -EINVAL;
+
+ len = ALIGN(len, huge_page_size(hs));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
unsigned long charged = 0;
struct inode *inode = file ? file_inode(file) : NULL;
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ if (!(vm_flags & MAP_FIXED))
+ return -ENOMEM;
+
+ nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+ if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
/* Clear old maps */
error = -ENOMEM;
munmap_back:
goto munmap_back;
}
- /* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
-
/*
* Private writable mapping: check memory availability
*/
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+ NULL, NULL);
if (vma)
goto out;
{
struct vm_area_struct *vma = NULL;
- if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
- return NULL;
-
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
vma = ACCESS_ONCE(mm->mmap_cache);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : 0);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
}
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL);
+ NULL, NULL, pgoff, NULL, NULL);
if (vma)
goto out;
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
/*
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
ret = percpu_counter_init(&vm_committed_as, 0);
VM_BUG_ON(ret);
}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+module_init(init_user_reserve)
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+module_init(init_admin_reserve)
+
+/*
+ * Reinititalise user and admin reserves if memory is added or removed.
+ *
+ * The default user reserve max is 128MB, and the default max for the
+ * admin reserve is 8MB. These are usually, but not always, enough to
+ * enable recovery from a memory hogging process using login/sshd, a shell,
+ * and tools like top. It may make sense to increase or even disable the
+ * reserve depending on the existence of swap or variations in the recovery
+ * tools. So, the admin may have changed them.
+ *
+ * If memory is added and the reserves have been eliminated or increased above
+ * the default max, then we'll trust the admin.
+ *
+ * If memory is removed and there isn't enough free memory, then we
+ * need to reset the reserves.
+ *
+ * Otherwise keep the reserve set by the admin.
+ */
+static int reserve_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long tmp, free_kbytes;
+
+ switch (action) {
+ case MEM_ONLINE:
+ /* Default max is 128MB. Leave alone if modified by operator. */
+ tmp = sysctl_user_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 17))
+ init_user_reserve();
+
+ /* Default max is 8MB. Leave alone if modified by operator. */
+ tmp = sysctl_admin_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 13))
+ init_admin_reserve();
+
+ break;
+ case MEM_OFFLINE:
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ if (sysctl_user_reserve_kbytes > free_kbytes) {
+ init_user_reserve();
+ pr_info("vm.user_reserve_kbytes reset to %lu\n",
+ sysctl_user_reserve_kbytes);
+ }
+
+ if (sysctl_admin_reserve_kbytes > free_kbytes) {
+ init_admin_reserve();
+ pr_info("vm.admin_reserve_kbytes reset to %lu\n",
+ sysctl_admin_reserve_kbytes);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block reserve_mem_nb = {
+ .notifier_call = reserve_mem_notifier,
+};
+
+static int __meminit init_reserve_notifier(void)
+{
+ if (register_hotmemory_notifier(&reserve_mem_nb))
+ printk("Failed registering memory add/remove notifier for admin reserve");
+
+ return 0;
+}
+module_init(init_reserve_notifier)