#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
#include "internal.h"
/*
/*
* Lock ordering:
*
- * ->i_mmap_lock (truncate_pagecache)
+ * ->i_mmap_mutex (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
* ->i_mutex
- * ->i_mmap_lock (truncate->unmap_mapping_range)
+ * ->i_mmap_mutex (truncate->unmap_mapping_range)
*
* ->mmap_sem
- * ->i_mmap_lock
+ * ->i_mmap_mutex
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
* ->i_mutex
* ->i_alloc_sem (various)
*
- * ->inode_lock
- * ->sb_lock (fs/fs-writeback.c)
+ * inode_wb_list_lock
+ * sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
- * ->i_mmap_lock
+ * ->i_mmap_mutex
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (zap_pte_range->set_page_dirty)
+ * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ * ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->task->proc_lock
- * ->dcache_lock (proc_pid_lookup)
- *
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
- * ->i_mmap_lock
+ * ->i_mmap_mutex
*/
/*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the mapping's tree_lock.
*/
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ /*
+ * if we're uptodate, flush out into the cleancache, otherwise
+ * invalidate any existing cleancache entries. We can't leave
+ * stale data around in the cleancache once our page is gone
+ */
+ if (PageUptodate(page) && PageMappedToDisk(page))
+ cleancache_put_page(page);
+ else
+ cleancache_flush_page(mapping, page);
+
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
}
}
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
+ freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
- __remove_from_page_cache(page);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
+ page_cache_release(page);
}
-EXPORT_SYMBOL(remove_from_page_cache);
+EXPORT_SYMBOL(delete_from_page_cache);
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
{
- struct address_space *mapping;
- struct page *page;
-
- page = container_of((unsigned long *)word, struct page, flags);
-
- /*
- * page_mapping() is being called without PG_locked held.
- * Some knowledge of the state and use of the page is used to
- * reduce the requirements down to a memory barrier.
- * The danger here is of a stale page_mapping() return value
- * indicating a struct address_space different from the one it's
- * associated with when it is associated with one.
- * After smp_mb(), it's either the correct page_mapping() for
- * the page, or an old page_mapping() and the page's own
- * page_mapping() has gone NULL.
- * The ->sync_page() address_space operation must tolerate
- * page_mapping() going NULL. By an amazing coincidence,
- * this comes about because none of the users of the page
- * in the ->sync_page() methods make essential use of the
- * page_mapping(), merely passing the page down to the backing
- * device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page_private(page) is
- * of interest. When page_mapping() does go NULL, the entire
- * call stack gracefully ignores the page and returns.
- * -- wli
- */
- smp_mb();
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
- mapping->a_ops->sync_page(page);
io_schedule();
return 0;
}
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
{
- sync_page(word);
+ sleep_on_page(word);
return fatal_signal_pending(current) ? -EINTR : 0;
}
continue;
wait_on_page_writeback(page);
- if (PageError(page))
+ if (TestClearPageError(page))
ret = -EIO;
}
pagevec_release(&pvec);
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
+/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old: page to be replaced
+ * @new: page to replace with
+ * @gfp_mask: allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+ int error;
+
+ VM_BUG_ON(!PageLocked(old));
+ VM_BUG_ON(!PageLocked(new));
+ VM_BUG_ON(new->mapping);
+
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (!error) {
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *);
+
+ pgoff_t offset = old->index;
+ freepage = mapping->a_ops->freepage;
+
+ page_cache_get(new);
+ new->mapping = mapping;
+ new->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(old);
+ error = radix_tree_insert(&mapping->page_tree, offset, new);
+ BUG_ON(error);
+ mapping->nrpages++;
+ __inc_zone_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(new))
+ __inc_zone_page_state(new, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ /* mem_cgroup codes must not be called under tree_lock */
+ mem_cgroup_replace_page_cache(old, new);
+ radix_tree_preload_end();
+ if (freepage)
+ freepage(old);
+ page_cache_release(old);
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
/**
* add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
struct page *page;
if (cpuset_do_page_mem_spread()) {
- get_mems_allowed();
- n = cpuset_mem_spread_node();
- page = alloc_pages_exact_node(n, gfp, 0);
- put_mems_allowed();
+ unsigned int cpuset_mems_cookie;
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
return page;
}
return alloc_pages(gfp, 0);
EXPORT_SYMBOL(__page_cache_alloc);
#endif
-static int __sleep_on_page_lock(void *word)
-{
- io_schedule();
- return 0;
-}
-
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+ if (!test_bit(bit_nr, &page->flags))
+ return 0;
+
+ return __wait_on_bit(page_waitqueue(page), &wait,
+ sleep_on_page_killable, TASK_KILLABLE);
+}
+
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
* @page: Page defining the wait queue of interest
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
*/
void __lock_page(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page), &wait,
- sync_page_killable, TASK_KILLABLE);
+ sleep_on_page_killable, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ unsigned int flags)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
- TASK_UNINTERRUPTIBLE);
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ /*
+ * CAUTION! In this case, mmap_sem is not released
+ * even though return 0.
+ */
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
+
+ up_read(&mm->mmap_sem);
+ if (flags & FAULT_FLAG_KILLABLE)
+ wait_on_page_locked_killable(page);
+ else
+ wait_on_page_locked(page);
+ return 0;
+ } else {
+ if (flags & FAULT_FLAG_KILLABLE) {
+ int ret;
+
+ ret = __lock_page_killable(page);
+ if (ret) {
+ up_read(&mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
+ }
}
/**
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page)) {
- if (ret)
- start = pages[ret-1]->index;
+ WARN_ON(start | i);
goto restart;
}
pages[ret] = page;
ret++;
}
+
+ /*
+ * If all entries were removed before we could secure them,
+ * try again, because callers stop trying once 0 is returned.
+ */
+ if (unlikely(!ret && nr_found))
+ goto restart;
rcu_read_unlock();
return ret;
}
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page))
goto restart;
- if (page->mapping == NULL || page->index != index)
- break;
-
if (!page_cache_get_speculative(page))
goto repeat;
goto repeat;
}
+ /*
+ * must check mapping and index after taking the ref.
+ * otherwise we can get both false positives and false
+ * negatives, which is just confusing to the caller.
+ */
+ if (page->mapping == NULL || page->index != index) {
+ page_cache_release(page);
+ break;
+ }
+
pages[ret] = page;
ret++;
index++;
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page))
goto restart;
pages[ret] = page;
ret++;
}
+
+ /*
+ * If all entries were removed before we could secure them,
+ * try again, because callers stop trying once 0 is returned.
+ */
+ if (unlikely(!ret && nr_found))
+ goto restart;
rcu_read_unlock();
if (ret)
retval = filemap_write_and_wait_range(mapping, pos,
pos + iov_length(iov, nr_segs) - 1);
if (!retval) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
+ blk_finish_plug(&plug);
}
if (retval > 0) {
*ppos = pos + retval;
/* If we don't want any read-ahead, don't bother */
if (VM_RandomReadHint(vma))
return;
+ if (!ra->ra_pages)
+ return;
- if (VM_SequentialReadHint(vma) ||
- offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
+ if (VM_SequentialReadHint(vma)) {
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
return;
}
- if (ra->mmap_miss < INT_MAX)
+ /* Avoid banging the cache line if not needed */
+ if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
ra->mmap_miss++;
/*
* mmap read-around
*/
ra_pages = max_sane_readahead(ra->ra_pages);
- if (ra_pages) {
- ra->start = max_t(long, 0, offset - ra_pages/2);
- ra->size = ra_pages;
- ra->async_size = 0;
- ra_submit(ra, mapping, file);
- }
+ ra->start = max_t(long, 0, offset - ra_pages / 2);
+ ra->size = ra_pages;
+ ra->async_size = ra_pages / 4;
+ ra_submit(ra, mapping, file);
}
/*
* waiting for the lock.
*/
do_async_mmap_readahead(vma, ra, file, page, offset);
- lock_page(page);
-
- /* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto no_cached_page;
- }
} else {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
- page = find_lock_page(mapping, offset);
+ page = find_get_page(mapping, offset);
if (!page)
goto no_cached_page;
}
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ page_cache_release(page);
+ return ret | VM_FAULT_RETRY;
+ }
+
+ /* Did it get truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON(page->index != offset);
+
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
return VM_FAULT_SIGBUS;
}
- ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
page = __page_cache_alloc(gfp | __GFP_COLD);
if (!page)
return ERR_PTR(-ENOMEM);
- err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+ err = add_to_page_cache_lru(page, mapping, index, gfp);
if (unlikely(err)) {
page_cache_release(page);
if (err == -EEXIST)
* @gfp: the page allocator flags to use if allocating
*
* This is the same as "read_mapping_page(mapping, index, NULL)", but with
- * any new page allocations done using the specified allocation flags. Note
- * that the Radix tree operations will still use GFP_KERNEL, so you can't
- * expect to do this atomically or anything like that - but you can pass in
- * other page requirements.
+ * any new page allocations done using the specified allocation flags.
*
* If the page does not get brought uptodate, return -EIO.
*/
int file_remove_suid(struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
- int killsuid = should_remove_suid(dentry);
- int killpriv = security_inode_need_killpriv(dentry);
+ struct inode *inode = dentry->d_inode;
+ int killsuid;
+ int killpriv;
int error = 0;
+ /* Fast path for nothing security related */
+ if (IS_NOSEC(inode))
+ return 0;
+
+ killsuid = should_remove_suid(dentry);
+ killpriv = security_inode_need_killpriv(dentry);
+
if (killpriv < 0)
return killpriv;
if (killpriv)
error = security_inode_killpriv(dentry);
if (!error && killsuid)
error = __remove_suid(dentry, killsuid);
+ if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+ inode->i_flags |= S_NOSEC;
return error;
}
}
if (written > 0) {
- loff_t end = pos + written;
- if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
- i_size_write(inode, end);
+ pos += written;
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
mark_inode_dirty(inode);
}
- *ppos = end;
+ *ppos = pos;
}
out:
return written;
gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);
- if (likely(page))
- return page;
+ if (page)
+ goto found;
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
if (!page)
goto repeat;
return NULL;
}
+found:
+ wait_on_page_writeback(page);
return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
+ blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (err < 0 && ret > 0)
ret = err;
}
+ blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(generic_file_aio_write);