return follow_page_pte(vma, address, pmd, flags);
}
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
- return stack_guard_page_start(vma, addr) ||
- stack_guard_page_end(vma, addr+PAGE_SIZE);
-}
-
static int get_gate_page(struct mm_struct *mm, unsigned long address,
unsigned int gup_flags, struct vm_area_struct **vma,
struct page **page)
return ret;
}
+static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+ unsigned long address, unsigned int *flags, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned int fault_flags = 0;
+ int ret;
+
+ /* For mlock, just skip the stack guard page. */
+ if ((*flags & FOLL_MLOCK) &&
+ (stack_guard_page_start(vma, address) ||
+ stack_guard_page_end(vma, address + PAGE_SIZE)))
+ return -ENOENT;
+ if (*flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (*flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+
+ if (ret & VM_FAULT_RETRY) {
+ if (nonblocking)
+ *nonblocking = 0;
+ return -EBUSY;
+ }
+
+ /*
+ * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
+ * necessary, even if maybe_mkwrite decided not to set pte_write. We
+ * can thus safely do subsequent page lookups as if they were reads.
+ * But only do so when looping for pte_write is futile: in some cases
+ * userspace may also be wanting to write to the gotten user page,
+ * which a read fault here might prevent (a readonly page might get
+ * reCOWed by userspace write).
+ */
+ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
+ *flags &= ~FOLL_WRITE;
+ return 0;
+}
+
+static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
+{
+ vm_flags_t vm_flags = vma->vm_flags;
+
+ if (vm_flags & (VM_IO | VM_PFNMAP))
+ return -EFAULT;
+
+ if (gup_flags & FOLL_WRITE) {
+ if (!(vm_flags & VM_WRITE)) {
+ if (!(gup_flags & FOLL_FORCE))
+ return -EFAULT;
+ /*
+ * We used to let the write,force case do COW in a
+ * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
+ * set a breakpoint in a read-only mapping of an
+ * executable, without corrupting the file (yet only
+ * when that file had been opened for writing!).
+ * Anon pages in shared mappings are surprising: now
+ * just reject it.
+ */
+ if (!is_cow_mapping(vm_flags)) {
+ WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+ return -EFAULT;
+ }
+ }
+ } else if (!(vm_flags & VM_READ)) {
+ if (!(gup_flags & FOLL_FORCE))
+ return -EFAULT;
+ /*
+ * Is there actually any vma we can reach here which does not
+ * have VM_MAYREAD set?
+ */
+ if (!(vm_flags & VM_MAYREAD))
+ return -EFAULT;
+ }
+ return 0;
+}
+
/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
- long i;
- unsigned long vm_flags;
+ long i = 0;
unsigned int page_mask;
+ struct vm_area_struct *vma = NULL;
if (!nr_pages)
return 0;
if (!(gup_flags & FOLL_FORCE))
gup_flags |= FOLL_NUMA;
- i = 0;
-
do {
- struct vm_area_struct *vma;
-
- vma = find_extend_vma(mm, start);
- if (!vma && in_gate_area(mm, start)) {
- int ret;
- ret = get_gate_page(mm, start & PAGE_MASK, gup_flags,
- &vma, pages ? &pages[i] : NULL);
- if (ret)
- goto efault;
- page_mask = 0;
- goto next_page;
- }
+ struct page *page;
+ unsigned int foll_flags = gup_flags;
+ unsigned int page_increm;
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+ vma = find_extend_vma(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ int ret;
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+ pages ? &pages[i] : NULL);
+ if (ret)
+ return i ? : ret;
+ page_mask = 0;
+ goto next_page;
+ }
- if (!vma)
- goto efault;
- vm_flags = vma->vm_flags;
- if (vm_flags & (VM_IO | VM_PFNMAP))
- goto efault;
-
- if (gup_flags & FOLL_WRITE) {
- if (!(vm_flags & VM_WRITE)) {
- if (!(gup_flags & FOLL_FORCE))
- goto efault;
- /*
- * We used to let the write,force case do COW
- * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so
- * ptrace could set a breakpoint in a read-only
- * mapping of an executable, without corrupting
- * the file (yet only when that file had been
- * opened for writing!). Anon pages in shared
- * mappings are surprising: now just reject it.
- */
- if (!is_cow_mapping(vm_flags)) {
- WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
- goto efault;
- }
+ if (!vma || check_vma_flags(vma, gup_flags))
+ return i ? : -EFAULT;
+ if (is_vm_hugetlb_page(vma)) {
+ i = follow_hugetlb_page(mm, vma, pages, vmas,
+ &start, &nr_pages, i,
+ gup_flags);
+ continue;
}
- } else {
- if (!(vm_flags & VM_READ)) {
- if (!(gup_flags & FOLL_FORCE))
- goto efault;
- /*
- * Is there actually any vma we can reach here
- * which does not have VM_MAYREAD set?
- */
- if (!(vm_flags & VM_MAYREAD))
- goto efault;
+ }
+retry:
+ /*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current)))
+ return i ? i : -ERESTARTSYS;
+ cond_resched();
+ page = follow_page_mask(vma, start, foll_flags, &page_mask);
+ if (!page) {
+ int ret;
+ ret = faultin_page(tsk, vma, start, &foll_flags,
+ nonblocking);
+ switch (ret) {
+ case 0:
+ goto retry;
+ case -EFAULT:
+ case -ENOMEM:
+ case -EHWPOISON:
+ return i ? i : ret;
+ case -EBUSY:
+ return i;
+ case -ENOENT:
+ goto next_page;
}
+ BUG();
}
-
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i, gup_flags);
- continue;
+ if (IS_ERR(page))
+ return i ? i : PTR_ERR(page);
+ if (pages) {
+ pages[i] = page;
+ flush_anon_page(vma, page, start);
+ flush_dcache_page(page);
+ page_mask = 0;
}
-
- do {
- struct page *page;
- unsigned int foll_flags = gup_flags;
- unsigned int page_increm;
-
- /*
- * If we have a pending SIGKILL, don't keep faulting
- * pages and potentially allocating memory.
- */
- if (unlikely(fatal_signal_pending(current)))
- return i ? i : -ERESTARTSYS;
-
- cond_resched();
- while (!(page = follow_page_mask(vma, start,
- foll_flags, &page_mask))) {
- int ret;
- unsigned int fault_flags = 0;
-
- /* For mlock, just skip the stack guard page. */
- if (foll_flags & FOLL_MLOCK) {
- if (stack_guard_page(vma, start))
- goto next_page;
- }
- if (foll_flags & FOLL_WRITE)
- fault_flags |= FAULT_FLAG_WRITE;
- if (nonblocking)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
- if (foll_flags & FOLL_NOWAIT)
- fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
-
- ret = handle_mm_fault(mm, vma, start,
- fault_flags);
-
- if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return i ? i : -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON |
- VM_FAULT_HWPOISON_LARGE)) {
- if (i)
- return i;
- else if (gup_flags & FOLL_HWPOISON)
- return -EHWPOISON;
- else
- return -EFAULT;
- }
- if (ret & VM_FAULT_SIGBUS)
- goto efault;
- BUG();
- }
-
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
-
- if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
- *nonblocking = 0;
- return i;
- }
-
- /*
- * The VM_FAULT_WRITE bit tells us that
- * do_wp_page has broken COW when necessary,
- * even if maybe_mkwrite decided not to set
- * pte_write. We can thus safely do subsequent
- * page lookups as if they were reads. But only
- * do so when looping for pte_write is futile:
- * in some cases userspace may also be wanting
- * to write to the gotten user page, which a
- * read fault here might prevent (a readonly
- * page might get reCOWed by userspace write).
- */
- if ((ret & VM_FAULT_WRITE) &&
- !(vma->vm_flags & VM_WRITE))
- foll_flags &= ~FOLL_WRITE;
-
- cond_resched();
- }
- if (IS_ERR(page))
- return i ? i : PTR_ERR(page);
- if (pages) {
- pages[i] = page;
-
- flush_anon_page(vma, page, start);
- flush_dcache_page(page);
- page_mask = 0;
- }
next_page:
- if (vmas) {
- vmas[i] = vma;
- page_mask = 0;
- }
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
- if (page_increm > nr_pages)
- page_increm = nr_pages;
- i += page_increm;
- start += page_increm * PAGE_SIZE;
- nr_pages -= page_increm;
- } while (nr_pages && start < vma->vm_end);
+ if (vmas) {
+ vmas[i] = vma;
+ page_mask = 0;
+ }
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ if (page_increm > nr_pages)
+ page_increm = nr_pages;
+ i += page_increm;
+ start += page_increm * PAGE_SIZE;
+ nr_pages -= page_increm;
} while (nr_pages);
return i;
-efault:
- return i ? : -EFAULT;
}
EXPORT_SYMBOL(__get_user_pages);