mm, THP: don't hold mmap_sem in khugepaged when allocating THP
[firefly-linux-kernel-4.4.55.git] / mm / huge_memory.c
index d9a21d06b8623571cabe5f73532d02412a94ae9c..55ab569c31b4f5b9a96c0d2e43e574d1b6699dfb 100644 (file)
@@ -1795,14 +1795,17 @@ static int __split_huge_page_map(struct page *page,
                for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
                        pte_t *pte, entry;
                        BUG_ON(PageCompound(page+i));
+                       /*
+                        * Note that pmd_numa is deliberately not transferred
+                        * to avoid any possibility that pte_numa leaks to
+                        * a PROT_NONE VMA by accident.
+                        */
                        entry = mk_pte(page + i, vma->vm_page_prot);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (!pmd_write(*pmd))
                                entry = pte_wrprotect(entry);
                        if (!pmd_young(*pmd))
                                entry = pte_mkold(entry);
-                       if (pmd_numa(*pmd))
-                               entry = pte_mknuma(entry);
                        pte = pte_offset_map(&_pmd, haddr);
                        BUG_ON(!pte_none(*pte));
                        set_pte_at(mm, haddr, pte, entry);
@@ -2319,23 +2322,17 @@ static struct page
                       int node)
 {
        VM_BUG_ON_PAGE(*hpage, *hpage);
+
        /*
-        * Allocate the page while the vma is still valid and under
-        * the mmap_sem read mode so there is no memory allocation
-        * later when we take the mmap_sem in write mode. This is more
-        * friendly behavior (OTOH it may actually hide bugs) to
-        * filesystems in userland with daemons allocating memory in
-        * the userland I/O paths.  Allocating memory with the
-        * mmap_sem in read mode is good idea also to allow greater
-        * scalability.
+        * Before allocating the hugepage, release the mmap_sem read lock.
+        * The allocation can potentially take a long time if it involves
+        * sync compaction, and we do not need to hold the mmap_sem during
+        * that. We will recheck the vma after taking it again in write mode.
         */
+       up_read(&mm->mmap_sem);
+
        *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
                khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
-       /*
-        * After allocating the hugepage, release the mmap_sem read lock in
-        * preparation for taking it in write mode.
-        */
-       up_read(&mm->mmap_sem);
        if (unlikely(!*hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *hpage = ERR_PTR(-ENOMEM);