mm: defer flush of writable TLB entries
[firefly-linux-kernel-4.4.55.git] mm/huge_memory.c

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c107094f79bae9ee895bd6bf30976d900f16c141..279a818a39b13d76e574bf8f330c7c925b8e3a67 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -23,6 +23,7 @@
 #include <linux/pagemap.h>
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
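
linux/userfaultfd_k.h is the kernel-internal header that declares the three helpers this diff starts using: handle_userfault(), userfaultfd_missing() and userfaultfd_armed(). The latter two are plain vm_flags tests; paraphrased from that header (not part of this diff):

    static inline bool userfaultfd_missing(struct vm_area_struct *vma)
    {
            return vma->vm_flags & VM_UFFD_MISSING;
    }

    static inline bool userfaultfd_armed(struct vm_area_struct *vma)
    {
            return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
    }

UFFDIO_REGISTER sets those flags on the vma, which keeps both tests cheap enough for the fault and khugepaged paths below.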
@@ -716,21 +717,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
-                                       unsigned long haddr, pmd_t *pmd,
-                                       struct page *page, gfp_t gfp)
+                                       unsigned long address, pmd_t *pmd,
+                                       struct page *page, gfp_t gfp,
+                                       unsigned int flags)
 {
        struct mem_cgroup *memcg;
        pgtable_t pgtable;
        spinlock_t *ptl;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
-               return VM_FAULT_OOM;
+       if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+               put_page(page);
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
 
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable)) {
                mem_cgroup_cancel_charge(page, memcg);
+               put_page(page);
                return VM_FAULT_OOM;
        }
 
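Two things change here besides the error unwinding: the callee now takes the raw faulting address plus the fault flags (both consumed by handle_userfault() in the next hunk) and derives the huge-page-aligned haddr itself, and the memcg-charge failure path absorbs the put_page() and THP_FAULT_FALLBACK accounting that the caller used to do. The alignment is plain mask arithmetic; a sketch assuming x86-64's 2MB huge pages:

    /* Illustrative values for x86-64 with 2MB transparent huge pages. */
    #define HPAGE_PMD_SHIFT 21
    #define HPAGE_PMD_SIZE  (1UL << HPAGE_PMD_SHIFT)
    #define HPAGE_PMD_MASK  (~(HPAGE_PMD_SIZE - 1))

    /*
     * address = 0x7f1234567890
     * haddr   = address & HPAGE_PMD_MASK = 0x7f1234400000,
     * i.e. the fault address rounded down to the 2MB boundary of
     * the pmd that will map the huge page.
     */
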
@@ -750,6 +757,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                pte_free(mm, pgtable);
        } else {
                pmd_t entry;
+
+               /* Deliver the page fault to userland */
+               if (userfaultfd_missing(vma)) {
+                       int ret;
+
+                       spin_unlock(ptl);
+                       mem_cgroup_cancel_charge(page, memcg);
+                       put_page(page);
+                       pte_free(mm, pgtable);
+                       ret = handle_userfault(vma, address, flags,
+                                              VM_UFFD_MISSING);
+                       VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+                       return ret;
+               }
+
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +782,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
                atomic_long_inc(&mm->nr_ptes);
                spin_unlock(ptl);
+               count_vm_event(THP_FAULT_ALLOC);
        }
 
        return 0;
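
This is the kernel half of the userfaultfd protocol: instead of filling the pmd, the fault is parked in handle_userfault() until a monitor process resolves it. A minimal userland sketch of the other half, using only the uapi names (error handling omitted; in real use the read/copy loop runs in a separate monitor thread, since the faulting thread is blocked):

    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static void resolve_one_fault(void *area, size_t len, void *src_page)
    {
            unsigned long page_size = sysconf(_SC_PAGESIZE);
            int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);

            struct uffdio_api api = { .api = UFFD_API };
            ioctl(uffd, UFFDIO_API, &api);

            /* Arm [area, area + len) for MISSING faults. */
            struct uffdio_register reg = {
                    .range = { .start = (unsigned long)area, .len = len },
                    .mode  = UFFDIO_REGISTER_MODE_MISSING,
            };
            ioctl(uffd, UFFDIO_REGISTER, &reg);

            /* Anything touching that range now sleeps in handle_userfault()
             * until we feed the kernel a page for the faulting address. */
            struct uffd_msg msg;
            read(uffd, &msg, sizeof(msg));
            if (msg.event == UFFD_EVENT_PAGEFAULT) {
                    struct uffdio_copy copy = {
                            .dst = msg.arg.pagefault.address
                                   & ~(page_size - 1),
                            .src = (unsigned long)src_page,
                            .len = page_size,
                    };
                    ioctl(uffd, UFFDIO_COPY, &copy); /* copies and wakes */
            }
    }
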
@@ -771,19 +794,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 }
 
 /* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
                struct page *zero_page)
 {
        pmd_t entry;
-       if (!pmd_none(*pmd))
-               return false;
        entry = mk_pmd(zero_page, vma->vm_page_prot);
        entry = pmd_mkhuge(entry);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
        atomic_long_inc(&mm->nr_ptes);
-       return true;
 }
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
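
With the pmd_none() check hoisted into the caller, set_huge_zero_page() becomes unconditional and loses its bool return. The reason is visible in the next hunk: the caller has to decide, under the same pmd_lock(), between three outcomes (install the zero page, hand the fault to userland, or back off because another thread already populated the pmd), and a did-it-install boolean can no longer express that.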
@@ -806,6 +826,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pgtable_t pgtable;
                struct page *zero_page;
                bool set;
+               int ret;
                pgtable = pte_alloc_one(mm, haddr);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
@@ -816,14 +837,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        return VM_FAULT_FALLBACK;
                }
                ptl = pmd_lock(mm, pmd);
-               set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-                               zero_page);
-               spin_unlock(ptl);
+               ret = 0;
+               set = false;
+               if (pmd_none(*pmd)) {
+                       if (userfaultfd_missing(vma)) {
+                               spin_unlock(ptl);
+                               ret = handle_userfault(vma, address, flags,
+                                                      VM_UFFD_MISSING);
+                               VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+                       } else {
+                               set_huge_zero_page(pgtable, mm, vma,
+                                                  haddr, pmd,
+                                                  zero_page);
+                               spin_unlock(ptl);
+                               set = true;
+                       }
+               } else
+                       spin_unlock(ptl);
                if (!set) {
                        pte_free(mm, pgtable);
                        put_huge_zero_page();
                }
-               return 0;
+               return ret;
        }
        gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
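
Note the lock discipline in the new zero-page path: handle_userfault() can sleep waiting for the monitor, so the ptl is dropped before calling it, and every exit that did not install the entry, the userfault one included, releases both the preallocated pgtable and the zero-page reference through the existing !set cleanup.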
@@ -831,14 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
-       if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
-               put_page(page);
-               count_vm_event(THP_FAULT_FALLBACK);
-               return VM_FAULT_FALLBACK;
-       }
-
-       count_vm_event(THP_FAULT_ALLOC);
-       return 0;
+       return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
+                                           flags);
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
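
The THP_FAULT_FALLBACK / THP_FAULT_ALLOC bookkeeping moves into __do_huge_pmd_anonymous_page() together with the error unwinding (see the first hunks), so a fault that is rerouted to userland counts as neither a fallback nor a successful THP allocation.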
@@ -873,16 +902,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         */
        if (is_huge_zero_pmd(pmd)) {
                struct page *zero_page;
-               bool set;
                /*
                 * get_huge_zero_page() will never allocate a new page here,
                 * since we already have a zero page to copy. It just takes a
                 * reference.
                 */
                zero_page = get_huge_zero_page();
-               set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+               set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
                                zero_page);
-               BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
                ret = 0;
                goto out_unlock;
        }
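
In copy_huge_pmd() the set/BUG_ON pair is dropped for the same reason: on the fork path the destination pmd is known to be empty (the removed comment itself called !pmd_none(dst_pmd) unexpected), so the old boolean only ever confirmed what the BUG_ON already assumed.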
@@ -1676,12 +1703,7 @@ static void __split_huge_page_refcount(struct page *page,
                /* after clearing PageTail the gup refcount can be released */
                smp_mb__after_atomic();
 
-               /*
-                * retain hwpoison flag of the poisoned tail page:
-                *   fix for the unsuitable process killed on Guest Machine(KVM)
-                *   by the memory-failure.
-                */
-               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
                page_tail->flags |= (page->flags &
                                     ((1L << PG_referenced) |
                                      (1L << PG_swapbacked) |
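
The removed construct leaned on C precedence: ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON parses as (~A) | B, a clear-mask with the HWPoison bit still set, which is how a poisoned tail page kept its flag across the split. The replacement clears it with the rest of the prep flags, which only makes sense if hwpoison on tail pages is handled separately elsewhere in this tree. A toy check of the mask arithmetic, with hypothetical bit values:

    #include <assert.h>

    int main(void)
    {
            unsigned long prep  = 0x7UL; /* stand-in for PAGE_FLAGS_CHECK_AT_PREP */
            unsigned long pois  = 0x1UL; /* stand-in for __PG_HWPOISON, inside prep */
            unsigned long flags = 0x5UL; /* poison bit plus one other prep bit */

            /* old: (~prep) | pois leaves the poison bit set in the clear-mask */
            assert((flags & (~prep | pois)) == 0x1UL); /* poison retained */
            /* new: plain ~prep clears it along with the rest */
            assert((flags & ~prep) == 0x0UL);          /* poison cleared */
            return 0;
    }
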
@@ -2138,7 +2160,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-                       if (++none_or_zero <= khugepaged_max_ptes_none)
+                       if (!userfaultfd_armed(vma) &&
+                           ++none_or_zero <= khugepaged_max_ptes_none)
                                continue;
                        else
                                goto out;
@@ -2591,7 +2614,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-                       if (++none_or_zero <= khugepaged_max_ptes_none)
+                       if (!userfaultfd_armed(vma) &&
+                           ++none_or_zero <= khugepaged_max_ptes_none)
                                continue;
                        else
                                goto out_unmap;
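
Finally, both khugepaged passes (__collapse_huge_page_isolate() above and khugepaged_scan_pmd() here) stop counting empty or zero-page ptes toward khugepaged_max_ptes_none once a userfaultfd is armed on the vma. Collapsing a range with holes into a huge page would materialize memory in those holes behind userland's back, and the MISSING faults the monitor registered for would never be delivered.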