[PATCH] core remove PageReserved
author Nick Piggin <nickpiggin@yahoo.com.au>
Sun, 30 Oct 2005 01:16:12 +0000 (18:16 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Sun, 30 Oct 2005 04:40:39 +0000 (21:40 -0700)
Remove PageReserved() calls from core code by tightening VM_RESERVED
handling in mm/ to cover PageReserved functionality.

PageReserved special casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now flagged
in the page_alloc checks to help ensure we don't introduce any refcount
based freeing of Reserved pages.

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being
deprecated.  We never completely handled it correctly anyway, and it can be
reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c, and all
arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can
be trivially removed.

Last real user of PageReserved is swsusp, which uses PageReserved to
determine whether a struct page points to valid memory or not.  This still
needs to be addressed (a generic page_is_ram() should work).

A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and
thus mapcounted and counted towards shared rss).  These writes to the struct
page could cause excessive cacheline bouncing on big systems.  There are a
number of ways this could be addressed if it is an issue.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Refcount bug fix for filemap_xip.c

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
22 files changed:
arch/ppc64/kernel/vdso.c
arch/sparc/mm/generic.c
arch/sparc64/mm/generic.c
drivers/scsi/sg.c
drivers/scsi/st.c
fs/direct-io.c
include/linux/mm.h
kernel/power/swsusp.c
mm/bootmem.c
mm/filemap_xip.c
mm/fremap.c
mm/madvise.c
mm/memory.c
mm/mempolicy.c
mm/mmap.c
mm/mprotect.c
mm/msync.c
mm/page_alloc.c
mm/rmap.c
mm/shmem.c
mm/swap.c
sound/core/pcm_native.c

index efa985f05aca2ebfe45be32ff4cd1e6b339ce1d8..4aacf521e3e445466e4133955bfe2b5accbf09b3 100644 (file)
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
                return NOPAGE_SIGBUS;
 
        /*
-        * Last page is systemcfg, special handling here, no get_page() a
-        * this is a reserved page
+        * Last page is systemcfg.
         */
        if ((vma->vm_end - address) <= PAGE_SIZE)
-               return virt_to_page(systemcfg);
+               pg = virt_to_page(systemcfg);
+       else
+               pg = virt_to_page(vbase + offset);
 
-       pg = virt_to_page(vbase + offset);
        get_page(pg);
        DBG(" ->page count: %d\n", page_count(pg));
 
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
         * gettimeofday will be totally dead. It's fine to use that for setting
         * breakpoints in the vDSO code pages though
         */
-       vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+       vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
        vma->vm_flags |= mm->def_flags;
        vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
        vma->vm_ops = &vdso_vmops;
@@ -603,6 +603,8 @@ void __init vdso_init(void)
                ClearPageReserved(pg);
                get_page(pg);
        }
+
+       get_page(virt_to_page(systemcfg));
 }
 
 int in_gate_area_no_task(unsigned long addr)
index 20ccb957fb7795118b5f06b21d7c5649423e72b4..659c9a71f867fd959b9475df284e4d4e78bbf7e6 100644 (file)
@@ -73,6 +73,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
        int space = GET_IOSPACE(pfn);
        unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+       /* See comment in mm/memory.c remap_pfn_range */
+       vma->vm_flags |= VM_IO | VM_RESERVED;
+
        prot = __pgprot(pg_iobits);
        offset -= from;
        dir = pgd_offset(mm, from);
index c954d91f01d0463aa75f607c393f1c7f4fd7dfcb..afc01cec701f56f7cb539b03a9a9f0e829c35256 100644 (file)
@@ -127,6 +127,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
        int space = GET_IOSPACE(pfn);
        unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+       /* See comment in mm/memory.c remap_pfn_range */
+       vma->vm_flags |= VM_IO | VM_RESERVED;
+
        prot = __pgprot(pg_iobits);
        offset -= from;
        dir = pgd_offset(mm, from);
index 861e51375d70c541609105548e25d507df7f00fc..2d30b46806bf30a3d65f9ba1b05672b8143e5c20 100644 (file)
@@ -1886,13 +1886,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
        int i;
 
        for (i=0; i < nr_pages; i++) {
-               if (dirtied && !PageReserved(sgl[i].page))
-                       SetPageDirty(sgl[i].page);
-               /* unlock_page(sgl[i].page); */
+               struct page *page = sgl[i].page;
+
+               /* XXX: just for debug. Remove when PageReserved is removed */
+               BUG_ON(PageReserved(page));
+               if (dirtied)
+                       SetPageDirty(page);
+               /* unlock_page(page); */
                /* FIXME: cache flush missing for rw==READ
                 * FIXME: call the correct reference counting function
                 */
-               page_cache_release(sgl[i].page);
+               page_cache_release(page);
        }
 
        return 0;
index 5eb54d8019b46c3afb6dd9e264676a3e8a03d376..da9766283bd7d0b9125d6cc1146f74860766c40b 100644 (file)
@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
        int i;
 
        for (i=0; i < nr_pages; i++) {
-               if (dirtied && !PageReserved(sgl[i].page))
-                       SetPageDirty(sgl[i].page);
+               struct page *page = sgl[i].page;
+
+               /* XXX: just for debug. Remove when PageReserved is removed */
+               BUG_ON(PageReserved(page));
+               if (dirtied)
+                       SetPageDirty(page);
                /* FIXME: cache flush missing for rw==READ
                 * FIXME: call the correct reference counting function
                 */
-               page_cache_release(sgl[i].page);
+               page_cache_release(page);
        }
 
        return 0;
index 0d06097bc995f93cd0074c424cbf24d221b80490..3931e7f1e6bf34a53caad0ced825942b14c18e1b 100644 (file)
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
        up_read(&current->mm->mmap_sem);
 
        if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+               struct page *page = ZERO_PAGE(dio->curr_user_address);
                /*
                 * A memory fault, but the filesystem has some outstanding
                 * mapped blocks.  We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
                 */
                if (dio->page_errors == 0)
                        dio->page_errors = ret;
-               dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+               page_cache_get(page);
+               dio->pages[0] = page;
                dio->head = 0;
                dio->tail = 1;
                ret = 0;
index 0c64484d8ae0ca61da9a07c7b46d6787c8294d6b..da42093250c3927f5fbeb0eb3391f928b2a0e0d9 100644 (file)
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY    0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND  0x00040000      /* Cannot expand with mremap() */
-#define VM_RESERVED    0x00080000      /* Don't unmap it from swap_out */
+#define VM_RESERVED    0x00080000      /* Pages managed in a special way */
 #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
 #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
 #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
@@ -338,7 +338,7 @@ static inline void get_page(struct page *page)
 
 static inline void put_page(struct page *page)
 {
-       if (!PageReserved(page) && put_page_testzero(page))
+       if (put_page_testzero(page))
                __page_cache_release(page);
 }
 
@@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
                int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
index 10bc5ec496d72a25e70b42785cfaee781312dddb..016504ccfccf4e6e37d97015d0f4026a51d52299 100644 (file)
@@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone)
                        continue;
                page = pfn_to_page(pfn);
                /*
-                * This condition results from rvmalloc() sans vmalloc_32()
-                * and architectural memory reservations. This should be
-                * corrected eventually when the cases giving rise to this
-                * are better understood.
+                * PageReserved results from rvmalloc() sans vmalloc_32()
+                * and architectural memory reservations.
+                *
+                * rvmalloc should not cause this, because all implementations
+                * appear to always be using vmalloc_32 on architectures with
+                * highmem. This is a good thing, because we would like to save
+                * rvmalloc pages.
+                *
+                * It appears to be triggered by pages which do not point to
+                * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
+                * which sets PageReserved if the page does not point to valid
+                * RAM.
+                *
+                * XXX: must remove usage of PageReserved!
                 */
-               if (PageReserved(page)) {
-                       printk("highmem reserved page?!\n");
+               if (PageReserved(page))
                        continue;
-               }
                BUG_ON(PageNosave(page));
                if (PageNosaveFree(page))
                        continue;
@@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
                return 0;
 
        page = pfn_to_page(pfn);
-       BUG_ON(PageReserved(page) && PageNosave(page));
        if (PageNosave(page))
                return 0;
-       if (PageReserved(page) && pfn_is_nosave(pfn)) {
+       if (pfn_is_nosave(pfn)) {
                pr_debug("[nosave pfn 0x%lx]", pfn);
                return 0;
        }
index a58699b6579e1fc6364aaea564f2a26e78a0b700..e8c567177dcf83e0b8d7352c581c0565c400844c 100644 (file)
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
                                if (j + 16 < BITS_PER_LONG)
                                        prefetchw(page + j + 16);
                                __ClearPageReserved(page + j);
+                               set_page_count(page + j, 0);
                        }
                        __free_pages(page, order);
                        i += BITS_PER_LONG;
index 8c199f537732088310c0e490cebe92c7414076bb..9354ee279b1345051bf686a336b58447b4810686 100644 (file)
@@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping,
        unsigned long address;
        pte_t *pte;
        pte_t pteval;
+       struct page *page = ZERO_PAGE(address);
 
        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -185,15 +186,17 @@ __xip_unmap (struct address_space * mapping,
                 * We need the page_table_lock to protect us from page faults,
                 * munmap, fork, etc...
                 */
-               pte = page_check_address(ZERO_PAGE(address), mm,
-                                        address);
+               pte = page_check_address(page, mm, address);
                if (!IS_ERR(pte)) {
                        /* Nuke the page table entry. */
                        flush_cache_page(vma, address, pte_pfn(*pte));
                        pteval = ptep_clear_flush(vma, address, pte);
+                       page_remove_rmap(page);
+                       dec_mm_counter(mm, file_rss);
                        BUG_ON(pte_dirty(pteval));
                        pte_unmap(pte);
                        spin_unlock(&mm->page_table_lock);
+                       page_cache_release(page);
                }
        }
        spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +231,7 @@ xip_file_nopage(struct vm_area_struct * area,
 
        page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
        if (!IS_ERR(page)) {
-               return page;
+               goto out;
        }
        if (PTR_ERR(page) != -ENODATA)
                return NULL;
@@ -249,6 +252,8 @@ xip_file_nopage(struct vm_area_struct * area,
                page = ZERO_PAGE(address);
        }
 
+out:
+       page_cache_get(page);
        return page;
 }
 
index fd7f2a17ff3e49bada9aa1d7c18febe56a9cda03..224cc1598b354bd4d12659f31c682ec09ecf9aa9 100644 (file)
@@ -29,19 +29,20 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                return;
        if (pte_present(pte)) {
                unsigned long pfn = pte_pfn(pte);
+               struct page *page;
 
                flush_cache_page(vma, addr, pfn);
                pte = ptep_clear_flush(vma, addr, ptep);
-               if (pfn_valid(pfn)) {
-                       struct page *page = pfn_to_page(pfn);
-                       if (!PageReserved(page)) {
-                               if (pte_dirty(pte))
-                                       set_page_dirty(page);
-                               page_remove_rmap(page);
-                               page_cache_release(page);
-                               dec_mm_counter(mm, file_rss);
-                       }
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, pte, addr);
+                       return;
                }
+               page = pfn_to_page(pfn);
+               if (pte_dirty(pte))
+                       set_page_dirty(page);
+               page_remove_rmap(page);
+               page_cache_release(page);
+               dec_mm_counter(mm, file_rss);
        } else {
                if (!pte_file(pte))
                        free_swap_and_cache(pte_to_swp_entry(pte));
@@ -65,6 +66,8 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
        pgd_t *pgd;
        pte_t pte_val;
 
+       BUG_ON(vma->vm_flags & VM_RESERVED);
+
        pgd = pgd_offset(mm, addr);
        spin_lock(&mm->page_table_lock);
        
@@ -125,6 +128,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
        pgd_t *pgd;
        pte_t pte_val;
 
+       BUG_ON(vma->vm_flags & VM_RESERVED);
+
        pgd = pgd_offset(mm, addr);
        spin_lock(&mm->page_table_lock);
        
index 20e075d1c64c9c64674e5fc418c6643418931661..17aaf3e1644983a5a4fac14bef8f009f71fe866c 100644 (file)
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
                             unsigned long start, unsigned long end)
 {
        *prev = vma;
-       if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+       if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
                return -EINVAL;
 
        if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
index da642b5528fab0ccd8d44611d6b0fcd27cd65474..e83f9440bb66b2b8ed7bffaddfaa97d55265c623 100644 (file)
@@ -342,6 +342,23 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 
 #define NO_RSS 2       /* Increment neither file_rss nor anon_rss */
 
+/*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error.
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+       printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+                       "vm_flags = %lx, vaddr = %lx\n",
+               (long long)pte_val(pte),
+               (vma->vm_mm == current->mm ? current->comm : "???"),
+               vma->vm_flags, vaddr);
+       dump_stack();
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -353,9 +370,10 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 
 static inline int
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-               pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
+               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
                unsigned long addr)
 {
+       unsigned long vm_flags = vma->vm_flags;
        pte_t pte = *src_pte;
        struct page *page;
        unsigned long pfn;
@@ -375,18 +393,22 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                goto out_set_pte;
        }
 
+       /* If the region is VM_RESERVED, the mapping is not
+        * mapped via rmap - duplicate the pte as is.
+        */
+       if (vm_flags & VM_RESERVED)
+               goto out_set_pte;
+
        pfn = pte_pfn(pte);
-       /* the pte points outside of valid memory, the
-        * mapping is assumed to be good, meaningful
-        * and not mapped via rmap - duplicate the
-        * mapping as is.
+       /* If the pte points outside of valid memory but
+        * the region is not VM_RESERVED, we have a problem.
         */
-       page = NULL;
-       if (pfn_valid(pfn))
-               page = pfn_to_page(pfn);
+       if (unlikely(!pfn_valid(pfn))) {
+               print_bad_pte(vma, pte, addr);
+               goto out_set_pte; /* try to do something sane */
+       }
 
-       if (!page || PageReserved(page))
-               goto out_set_pte;
+       page = pfn_to_page(pfn);
 
        /*
         * If it's a COW mapping, write protect it both
@@ -418,7 +440,6 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                unsigned long addr, unsigned long end)
 {
        pte_t *src_pte, *dst_pte;
-       unsigned long vm_flags = vma->vm_flags;
        int progress = 0;
        int rss[NO_RSS+1], anon;
 
@@ -446,8 +467,7 @@ again:
                        progress++;
                        continue;
                }
-               anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
-                                                       vm_flags, addr);
+               anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr);
                rss[anon]++;
                progress += 8;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -541,10 +561,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        return 0;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
 {
+       struct mm_struct *mm = tlb->mm;
        pte_t *pte;
        int file_rss = 0;
        int anon_rss = 0;
@@ -556,11 +578,12 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                        continue;
                if (pte_present(ptent)) {
                        struct page *page = NULL;
-                       unsigned long pfn = pte_pfn(ptent);
-                       if (pfn_valid(pfn)) {
-                               page = pfn_to_page(pfn);
-                               if (PageReserved(page))
-                                       page = NULL;
+                       if (!(vma->vm_flags & VM_RESERVED)) {
+                               unsigned long pfn = pte_pfn(ptent);
+                               if (unlikely(!pfn_valid(pfn)))
+                                       print_bad_pte(vma, ptent, addr);
+                               else
+                                       page = pfn_to_page(pfn);
                        }
                        if (unlikely(details) && page) {
                                /*
@@ -580,7 +603,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                                     page->index > details->last_index))
                                        continue;
                        }
-                       ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+                       ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        if (unlikely(!page))
@@ -588,7 +611,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                        if (unlikely(details) && details->nonlinear_vma
                            && linear_page_index(details->nonlinear_vma,
                                                addr) != page->index)
-                               set_pte_at(tlb->mm, addr, pte,
+                               set_pte_at(mm, addr, pte,
                                           pgoff_to_pte(page->index));
                        if (PageAnon(page))
                                anon_rss++;
@@ -611,14 +634,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                        continue;
                if (!pte_file(ptent))
                        free_swap_and_cache(pte_to_swp_entry(ptent));
-               pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
+               pte_clear_full(mm, addr, pte, tlb->fullmm);
        } while (pte++, addr += PAGE_SIZE, addr != end);
 
-       add_mm_rss(tlb->mm, -file_rss, -anon_rss);
+       add_mm_rss(mm, -file_rss, -anon_rss);
        pte_unmap(pte - 1);
 }
 
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
 {
@@ -630,11 +654,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               zap_pte_range(tlb, pmd, addr, next, details);
+               zap_pte_range(tlb, vma, pmd, addr, next, details);
        } while (pmd++, addr = next, addr != end);
 }
 
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
 {
@@ -646,7 +671,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               zap_pmd_range(tlb, pud, addr, next, details);
+               zap_pmd_range(tlb, vma, pud, addr, next, details);
        } while (pud++, addr = next, addr != end);
 }
 
@@ -667,7 +692,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               zap_pud_range(tlb, pgd, addr, next, details);
+               zap_pud_range(tlb, vma, pgd, addr, next, details);
        } while (pgd++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
 }
@@ -967,7 +992,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        continue;
                }
 
-               if (!vma || (vma->vm_flags & VM_IO)
+               if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
                                || !(flags & vma->vm_flags))
                        return i ? : -EFAULT;
 
@@ -1027,8 +1052,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        if (pages) {
                                pages[i] = page;
                                flush_dcache_page(page);
-                               if (!PageReserved(page))
-                                       page_cache_get(page);
+                               page_cache_get(page);
                        }
                        if (vmas)
                                vmas[i] = vma;
@@ -1051,7 +1075,11 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
        if (!pte)
                return -ENOMEM;
        do {
-               pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+               struct page *page = ZERO_PAGE(addr);
+               pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+               page_cache_get(page);
+               page_add_file_rmap(page);
+               inc_mm_counter(mm, file_rss);
                BUG_ON(!pte_none(*pte));
                set_pte_at(mm, addr, pte, zero_pte);
        } while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1132,8 +1160,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                return -ENOMEM;
        do {
                BUG_ON(!pte_none(*pte));
-               if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
-                       set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+               set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap(pte - 1);
@@ -1195,8 +1222,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
         * rest of the world about it:
         *   VM_IO tells people not to look at these pages
         *      (accesses can have side effects).
-        *   VM_RESERVED tells swapout not to try to touch
-        *      this region.
+        *   VM_RESERVED tells the core MM not to "manage" these pages
+         *     (e.g. refcount, mapcount, try to swap them out).
         */
        vma->vm_flags |= VM_IO | VM_RESERVED;
 
@@ -1256,11 +1283,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t entry;
        int ret = VM_FAULT_MINOR;
 
+       BUG_ON(vma->vm_flags & VM_RESERVED);
+
        if (unlikely(!pfn_valid(pfn))) {
                /*
                 * Page table corrupted: show pte and kill process.
                 */
-               pte_ERROR(orig_pte);
+               print_bad_pte(vma, orig_pte, address);
                ret = VM_FAULT_OOM;
                goto unlock;
        }
@@ -1284,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        /*
         * Ok, we need to copy. Oh, well..
         */
-       if (!PageReserved(old_page))
-               page_cache_get(old_page);
+       page_cache_get(old_page);
        pte_unmap(page_table);
        spin_unlock(&mm->page_table_lock);
 
@@ -1308,14 +1336,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        spin_lock(&mm->page_table_lock);
        page_table = pte_offset_map(pmd, address);
        if (likely(pte_same(*page_table, orig_pte))) {
-               if (PageReserved(old_page))
+               page_remove_rmap(old_page);
+               if (!PageAnon(old_page)) {
                        inc_mm_counter(mm, anon_rss);
-               else {
-                       page_remove_rmap(old_page);
-                       if (!PageAnon(old_page)) {
-                               inc_mm_counter(mm, anon_rss);
-                               dec_mm_counter(mm, file_rss);
-                       }
+                       dec_mm_counter(mm, file_rss);
                }
                flush_cache_page(vma, address, pfn);
                entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1769,14 +1793,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
                int write_access)
 {
+       struct page *page = ZERO_PAGE(addr);
        pte_t entry;
 
        /* Mapping of ZERO_PAGE - vm_page_prot is readonly */
-       entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot);
+       entry = mk_pte(page, vma->vm_page_prot);
 
        if (write_access) {
-               struct page *page;
-
                /* Allocate our own private page. */
                pte_unmap(page_table);
                spin_unlock(&mm->page_table_lock);
@@ -1800,6 +1823,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                lru_cache_add_active(page);
                SetPageReferenced(page);
                page_add_anon_rmap(page, vma, address);
+       } else {
+               inc_mm_counter(mm, file_rss);
+               page_add_file_rmap(page);
+               page_cache_get(page);
        }
 
        set_pte_at(mm, address, page_table, entry);
@@ -1916,7 +1943,7 @@ retry:
                        inc_mm_counter(mm, anon_rss);
                        lru_cache_add_active(new_page);
                        page_add_anon_rmap(new_page, vma, address);
-               } else if (!PageReserved(new_page)) {
+               } else if (!(vma->vm_flags & VM_RESERVED)) {
                        inc_mm_counter(mm, file_rss);
                        page_add_file_rmap(new_page);
                }
@@ -1957,7 +1984,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
                /*
                 * Page table corrupted: show pte and kill process.
                 */
-               pte_ERROR(orig_pte);
+               print_bad_pte(vma, orig_pte, address);
                return VM_FAULT_OOM;
        }
        /* We can then assume vm->vm_ops && vma->vm_ops->populate */
@@ -2232,7 +2259,7 @@ static int __init gate_vma_init(void)
        gate_vma.vm_start = FIXADDR_USER_START;
        gate_vma.vm_end = FIXADDR_USER_END;
        gate_vma.vm_page_prot = PAGE_READONLY;
-       gate_vma.vm_flags = 0;
+       gate_vma.vm_flags = VM_RESERVED;
        return 0;
 }
 __initcall(gate_vma_init);
index 43b1199af591303c74fbb56e82b43b6d8fca2d6e..11d824f282f10fa04743642b75f30270316be250 100644 (file)
@@ -223,13 +223,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 }
 
 /* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
        pte_t *orig_pte;
        pte_t *pte;
 
-       spin_lock(&mm->page_table_lock);
+       spin_lock(&vma->vm_mm->page_table_lock);
        orig_pte = pte = pte_offset_map(pmd, addr);
        do {
                unsigned long pfn;
@@ -238,18 +238,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
                if (!pte_present(*pte))
                        continue;
                pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (!pfn_valid(pfn)) {
+                       print_bad_pte(vma, *pte, addr);
                        continue;
+               }
                nid = pfn_to_nid(pfn);
                if (!node_isset(nid, *nodes))
                        break;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap(orig_pte);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(&vma->vm_mm->page_table_lock);
        return addr != end;
 }
 
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
        pmd_t *pmd;
@@ -260,13 +262,13 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               if (check_pte_range(mm, pmd, addr, next, nodes))
+               if (check_pte_range(vma, pmd, addr, next, nodes))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
 }
 
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
        pud_t *pud;
@@ -277,24 +279,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               if (check_pmd_range(mm, pud, addr, next, nodes))
+               if (check_pmd_range(vma, pud, addr, next, nodes))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
 }
 
-static inline int check_pgd_range(struct mm_struct *mm,
+static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
        pgd_t *pgd;
        unsigned long next;
 
-       pgd = pgd_offset(mm, addr);
+       pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               if (check_pud_range(mm, pgd, addr, next, nodes))
+               if (check_pud_range(vma, pgd, addr, next, nodes))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
@@ -311,6 +313,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
+       if (first->vm_flags & VM_RESERVED)
+               return ERR_PTR(-EACCES);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
@@ -323,8 +327,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
-                       err = check_pgd_range(vma->vm_mm,
-                                          start, endvma, nodes);
+                       err = check_pgd_range(vma, start, endvma, nodes);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
index 459b9f068ad7b13f8241f024fe779e2b26cfcff7..8a111792b8db6b22f19d8b1de8b006a48927221b 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1088,6 +1088,17 @@ munmap_back:
                error = file->f_op->mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;
+               if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+                                               == (VM_WRITE | VM_RESERVED)) {
+                       printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+                               "PROT_WRITE mmap of VM_RESERVED memory, which "
+                               "is deprecated. Please report this to "
+                               "linux-kernel@vger.kernel.org\n",current->comm);
+                       if (vma->vm_ops && vma->vm_ops->close)
+                               vma->vm_ops->close(vma);
+                       error = -EACCES;
+                       goto unmap_and_free_vma;
+               }
        } else if (vm_flags & VM_SHARED) {
                error = shmem_zero_setup(vma);
                if (error)
index b426f01c5e9cdb0329657076d8a2e8908e3e015a..672a76fddd5e1bc3a1c9f5f599c0ed49ffea30af 100644 (file)
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -125,6 +125,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
         * a MAP_NORESERVE private mapping to writable will now reserve.
         */
        if (newflags & VM_WRITE) {
+               if (oldflags & VM_RESERVED) {
+                       BUG_ON(oldflags & VM_WRITE);
+                       printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+                               "PROT_WRITE mprotect of VM_RESERVED memory, "
+                               "which is deprecated. Please report this to "
+                               "linux-kernel@vger.kernel.org\n",current->comm);
+                       return -EACCES;
+               }
                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
                        charged = nrpages;
                        if (security_vm_enough_memory(charged))
index 3b5f1c521d4b580896504d31778441c10d25531a..8603954860601b24807c69a5d538a7114ee6932b 100644 (file)
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -25,6 +25,7 @@
 static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end)
 {
+       struct mm_struct *mm = vma->vm_mm;
        pte_t *pte;
        int progress = 0;
 
@@ -37,7 +38,7 @@ again:
                if (progress >= 64) {
                        progress = 0;
                        if (need_resched() ||
-                           need_lockbreak(&vma->vm_mm->page_table_lock))
+                           need_lockbreak(&mm->page_table_lock))
                                break;
                }
                progress++;
@@ -46,11 +47,11 @@ again:
                if (!pte_maybe_dirty(*pte))
                        continue;
                pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, *pte, addr);
                        continue;
+               }
                page = pfn_to_page(pfn);
-               if (PageReserved(page))
-                       continue;
 
                if (ptep_clear_flush_dirty(vma, addr, pte) ||
                    page_test_and_clear_dirty(page))
@@ -58,7 +59,7 @@ again:
                progress += 3;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap(pte - 1);
-       cond_resched_lock(&vma->vm_mm->page_table_lock);
+       cond_resched_lock(&mm->page_table_lock);
        if (addr != end)
                goto again;
 }
@@ -102,8 +103,10 @@ static void msync_page_range(struct vm_area_struct *vma,
 
        /* For hugepages we can't go walking the page table normally,
         * but that's ok, hugetlbfs is memory based, so we don't need
-        * to do anything more on an msync() */
-       if (is_vm_hugetlb_page(vma))
+        * to do anything more on an msync().
+        * Can't do anything with VM_RESERVED regions either.
+        */
+       if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
                return;
 
        BUG_ON(addr >= end);
index 60663232fbb23a34874f482f49a52f760f4d3ea6..0541288ebf4b1a898388a82a0b65cbb27dd5bde6 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,7 +114,8 @@ static void bad_page(const char *function, struct page *page)
                        1 << PG_reclaim |
                        1 << PG_slab    |
                        1 << PG_swapcache |
-                       1 << PG_writeback);
+                       1 << PG_writeback |
+                       1 << PG_reserved );
        set_page_count(page, 0);
        reset_page_mapcount(page);
        page->mapping = NULL;
@@ -244,7 +245,6 @@ static inline int page_is_buddy(struct page *page, int order)
 {
        if (PagePrivate(page)           &&
            (page_order(page) == order) &&
-           !PageReserved(page)         &&
             page_count(page) == 0)
                return 1;
        return 0;
@@ -327,7 +327,8 @@ static inline void free_pages_check(const char *function, struct page *page)
                        1 << PG_reclaim |
                        1 << PG_slab    |
                        1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                bad_page(function, page);
        if (PageDirty(page))
                __ClearPageDirty(page);
@@ -455,7 +456,8 @@ static void prep_new_page(struct page *page, int order)
                        1 << PG_reclaim |
                        1 << PG_slab    |
                        1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                bad_page(__FUNCTION__, page);
 
        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
@@ -1016,7 +1018,7 @@ void __pagevec_free(struct pagevec *pvec)
 
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
-       if (!PageReserved(page) && put_page_testzero(page)) {
+       if (put_page_testzero(page)) {
                if (order == 0)
                        free_hot_page(page);
                else
@@ -1674,7 +1676,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                        continue;
                page = pfn_to_page(pfn);
                set_page_links(page, zone, nid, pfn);
-               set_page_count(page, 0);
+               set_page_count(page, 1);
                reset_page_mapcount(page);
                SetPageReserved(page);
                INIT_LIST_HEAD(&page->lru);
index 504757624cce720f2cd424c13d6448111c86e6ca..f69d5342ce7ff06f3180a6b2b93af00e455a34e5 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -443,8 +443,6 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
 void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
 {
-       BUG_ON(PageReserved(page));
-
        if (atomic_inc_and_test(&page->_mapcount)) {
                struct anon_vma *anon_vma = vma->anon_vma;
 
@@ -468,8 +466,7 @@ void page_add_anon_rmap(struct page *page,
 void page_add_file_rmap(struct page *page)
 {
        BUG_ON(PageAnon(page));
-       if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
-               return;
+       BUG_ON(!pfn_valid(page_to_pfn(page)));
 
        if (atomic_inc_and_test(&page->_mapcount))
                inc_page_state(nr_mapped);
@@ -483,8 +480,6 @@ void page_add_file_rmap(struct page *page)
  */
 void page_remove_rmap(struct page *page)
 {
-       BUG_ON(PageReserved(page));
-
        if (atomic_add_negative(-1, &page->_mapcount)) {
                BUG_ON(page_mapcount(page) < 0);
                /*
@@ -640,13 +635,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
                        continue;
 
                pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, *pte, address);
                        continue;
+               }
 
                page = pfn_to_page(pfn);
                BUG_ON(PageAnon(page));
-               if (PageReserved(page))
-                       continue;
 
                if (ptep_clear_flush_young(vma, address, pte))
                        continue;
@@ -808,7 +803,6 @@ int try_to_unmap(struct page *page)
 {
        int ret;
 
-       BUG_ON(PageReserved(page));
        BUG_ON(!PageLocked(page));
 
        if (PageAnon(page))
index 6796311a23ef5a20d4241f4ec2d5ca188679d7c9..37777f4c11f86074ce9d5cd81bd9cafb0dd6e233 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1506,8 +1506,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
                         */
                        if (!offset)
                                mark_page_accessed(page);
-               } else
+               } else {
                        page = ZERO_PAGE(0);
+                       page_cache_get(page);
+               }
 
                /*
                 * Ok, we have the page, and it's up-to-date, so
index 7771d2803f62a40402409733cadaebf1a72ead45..21d15f99805c96517c21927817512d749017c244 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -48,7 +48,7 @@ void put_page(struct page *page)
                }
                return;
        }
-       if (!PageReserved(page) && put_page_testzero(page))
+       if (put_page_testzero(page))
                __page_cache_release(page);
 }
 EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
                struct page *page = pages[i];
                struct zone *pagezone;
 
-               if (PageReserved(page) || !put_page_testzero(page))
+               if (!put_page_testzero(page))
                        continue;
 
                pagezone = page_zone(page);
index 67abebabf83e4afb1d9da5e69215adf7d3eee8ca..e97b2d162cc72f3140792ec9725bf71078011183 100644 (file)
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns
                return NOPAGE_OOM;
        runtime = substream->runtime;
        page = virt_to_page(runtime->status);
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
        if (type)
                *type = VM_FAULT_MINOR;
        return page;
@@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un
                return NOPAGE_OOM;
        runtime = substream->runtime;
        page = virt_to_page(runtime->control);
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
        if (type)
                *type = VM_FAULT_MINOR;
        return page;
@@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign
                vaddr = runtime->dma_area + offset;
                page = virt_to_page(vaddr);
        }
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
        if (type)
                *type = VM_FAULT_MINOR;
        return page;