Revert "netfilter: xt_qtaguid: Allow tracking loopback"

[firefly-linux-kernel-4.4.55.git] / mm / filemap.c
diff --git a/mm/filemap.c b/mm/filemap.c

index 9701a501f7696b065b75c645945b386630009bc5..10481ebd96c990b3802392e1d1760a5ef657b0f0 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
  #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
  #include <linux/memcontrol.h>
  #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
  #include "internal.h"
  
  /*
@@ -58,16 +59,16 @@
  /*
   * Lock ordering:
   *
- *  ->i_mmap_lock              (truncate_pagecache)
+ *  ->i_mmap_mutex             (truncate_pagecache)
   *    ->private_lock           (__free_pte->__set_page_dirty_buffers)
   *      ->swap_lock            (exclusive_swap_page, others)
   *        ->mapping->tree_lock
   *
   *  ->i_mutex
- *    ->i_mmap_lock            (truncate->unmap_mapping_range)
+ *    ->i_mmap_mutex           (truncate->unmap_mapping_range)
   *
   *  ->mmap_sem
- *    ->i_mmap_lock
+ *    ->i_mmap_mutex
   *      ->page_table_lock or pte_lock  (various, mainly in memory.c)
   *        ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
   *
@@ -80,11 +81,11 @@
   *  ->i_mutex
   *    ->i_alloc_sem             (various)
   *
- *  ->inode_lock
- *    ->sb_lock                        (fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock                  (fs/fs-writeback.c)
   *    ->mapping->tree_lock     (__sync_single_inode)
   *
- *  ->i_mmap_lock
+ *  ->i_mmap_mutex
   *    ->anon_vma.lock          (vma_adjust)
   *
   *  ->anon_vma.lock
@@ -98,27 +99,36 @@
   *    ->zone.lru_lock          (check_pte_range->isolate_lru_page)
   *    ->private_lock           (page_remove_rmap->set_page_dirty)
   *    ->tree_lock              (page_remove_rmap->set_page_dirty)
- *    ->inode_lock             (page_remove_rmap->set_page_dirty)
- *    ->inode_lock             (zap_pte_range->set_page_dirty)
+ *    inode_wb_list_lock       (page_remove_rmap->set_page_dirty)
+ *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
+ *    inode_wb_list_lock       (zap_pte_range->set_page_dirty)
+ *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
   *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
   *
- *  ->task->proc_lock
- *    ->dcache_lock            (proc_pid_lookup)
- *
   *  (code doesn't rely on that order, so you could switch it around)
   *  ->tasklist_lock             (memory_failure, collect_procs_ao)
- *    ->i_mmap_lock
+ *    ->i_mmap_mutex
   */
  
  /*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
   * sure the page is locked and that nobody else uses it - or that usage
   * is safe.  The caller must hold the mapping's tree_lock.
   */
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
  {
         struct address_space *mapping = page->mapping;
  
+       /*
+        * If the page is uptodate and mapped to disk, put it into the
+        * cleancache; otherwise invalidate any cleancache entry for it.
+        * We can't leave stale data in the cleancache once our page is gone.
+        */
+       if (PageUptodate(page) && PageMappedToDisk(page))
+               cleancache_put_page(page);
+       else
+               cleancache_flush_page(mapping, page);
+
         radix_tree_delete(&mapping->page_tree, page->index);
         page->mapping = NULL;
         mapping->nrpages--;
@@ -140,58 +150,42 @@ void __remove_from_page_cache(struct page *page)
         }
  }
  
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked.  It will never put the page into the free list, because
+ * the caller holds a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
  {
         struct address_space *mapping = page->mapping;
+       void (*freepage)(struct page *);
  
         BUG_ON(!PageLocked(page));
  
+       freepage = mapping->a_ops->freepage;
         spin_lock_irq(&mapping->tree_lock);
-       __remove_from_page_cache(page);
+       __delete_from_page_cache(page);
         spin_unlock_irq(&mapping->tree_lock);
         mem_cgroup_uncharge_cache_page(page);
+
+       if (freepage)
+               freepage(page);
+       page_cache_release(page);
  }
-EXPORT_SYMBOL(remove_from_page_cache);
+EXPORT_SYMBOL(delete_from_page_cache);
  
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
  {
-       struct address_space *mapping;
-       struct page *page;
-
-       page = container_of((unsigned long *)word, struct page, flags);
-
-       /*
-        * page_mapping() is being called without PG_locked held.
-        * Some knowledge of the state and use of the page is used to
-        * reduce the requirements down to a memory barrier.
-        * The danger here is of a stale page_mapping() return value
-        * indicating a struct address_space different from the one it's
-        * associated with when it is associated with one.
-        * After smp_mb(), it's either the correct page_mapping() for
-        * the page, or an old page_mapping() and the page's own
-        * page_mapping() has gone NULL.
-        * The ->sync_page() address_space operation must tolerate
-        * page_mapping() going NULL. By an amazing coincidence,
-        * this comes about because none of the users of the page
-        * in the ->sync_page() methods make essential use of the
-        * page_mapping(), merely passing the page down to the backing
-        * device's unplug functions when it's non-NULL, which in turn
-        * ignore it for all cases but swap, where only page_private(page) is
-        * of interest. When page_mapping() does go NULL, the entire
-        * call stack gracefully ignores the page and returns.
-        * -- wli
-        */
-       smp_mb();
-       mapping = page_mapping(page);
-       if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-               mapping->a_ops->sync_page(page);
         io_schedule();
         return 0;
  }
  
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
  {
-       sync_page(word);
+       sleep_on_page(word);
         return fatal_signal_pending(current) ? -EINTR : 0;
  }
  
@@ -296,7 +290,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                                 continue;
  
                         wait_on_page_writeback(page);
-                       if (PageError(page))
+                       if (TestClearPageError(page))
                                 ret = -EIO;
                 }
                 pagevec_release(&pvec);
@@ -384,6 +378,62 @@ int filemap_write_and_wait_range(struct address_space *mapping,
  }
  EXPORT_SYMBOL(filemap_write_and_wait_range);
  
+/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old:       page to be replaced
+ * @new:       page to replace with
+ * @gfp_mask:  allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one.  On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page.  Both the old and new pages must be
+ * locked.  This function does not add the new page to the LRU; the
+ * caller must do that.
+ *
+ * The remove + add is atomic.  Returns 0 on success; the only way this
+ * function can fail is memory allocation failure in radix_tree_preload().
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+       int error;
+
+       VM_BUG_ON(!PageLocked(old));
+       VM_BUG_ON(!PageLocked(new));
+       VM_BUG_ON(new->mapping);
+
+       error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+       if (!error) {
+               struct address_space *mapping = old->mapping;
+               void (*freepage)(struct page *);
+
+               pgoff_t offset = old->index;
+               freepage = mapping->a_ops->freepage;
+
+               page_cache_get(new);
+               new->mapping = mapping;
+               new->index = offset;
+
+               spin_lock_irq(&mapping->tree_lock);
+               __delete_from_page_cache(old);
+               error = radix_tree_insert(&mapping->page_tree, offset, new);
+               BUG_ON(error);
+               mapping->nrpages++;
+               __inc_zone_page_state(new, NR_FILE_PAGES);
+               if (PageSwapBacked(new))
+                       __inc_zone_page_state(new, NR_SHMEM);
+               spin_unlock_irq(&mapping->tree_lock);
+               /* mem_cgroup code must not be called under tree_lock */
+               mem_cgroup_replace_page_cache(old, new);
+               radix_tree_preload_end();
+               if (freepage)
+                       freepage(old);
+               page_cache_release(old);
+       }
+
+       return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
  /**
   * add_to_page_cache_locked - add a locked page to the pagecache
   * @page:      page to add
@@ -466,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
         struct page *page;
  
         if (cpuset_do_page_mem_spread()) {
-               get_mems_allowed();
-               n = cpuset_mem_spread_node();
-               page = alloc_pages_exact_node(n, gfp, 0);
-               put_mems_allowed();
+               unsigned int cpuset_mems_cookie;
+               do {
+                       cpuset_mems_cookie = get_mems_allowed();
+                       n = cpuset_mem_spread_node();
+                       page = alloc_pages_exact_node(n, gfp, 0);
+               } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
                 return page;
         }
         return alloc_pages(gfp, 0);
@@ -477,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
  EXPORT_SYMBOL(__page_cache_alloc);
  #endif
  
-static int __sleep_on_page_lock(void *word)
-{
-       io_schedule();
-       return 0;
-}
-
  /*
   * In order to wait for pages to become available there must be
   * waitqueues associated with pages. By using a hash table of
@@ -510,11 +557,22 @@ void wait_on_page_bit(struct page *page, int bit_nr)
         DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
  
         if (test_bit(bit_nr, &page->flags))
-               __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+               __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
                                                         TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(wait_on_page_bit);
  
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+       DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+       if (!test_bit(bit_nr, &page->flags))
+               return 0;
+
+       return __wait_on_bit(page_waitqueue(page), &wait,
+                            sleep_on_page_killable, TASK_KILLABLE);
+}
+
  /**
   * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
   * @page: Page defining the wait queue of interest
@@ -574,17 +632,12 @@ EXPORT_SYMBOL(end_page_writeback);
  /**
   * __lock_page - get a lock on the page, assuming we need to sleep to get it
   * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
   */
  void __lock_page(struct page *page)
  {
         DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
  
-       __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+       __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
                                                         TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(__lock_page);
@@ -594,22 +647,40 @@ int __lock_page_killable(struct page *page)
         DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
  
         return __wait_on_bit_lock(page_waitqueue(page), &wait,
-                                       sync_page_killable, TASK_KILLABLE);
+                                       sleep_on_page_killable, TASK_KILLABLE);
  }
  EXPORT_SYMBOL_GPL(__lock_page_killable);
  
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+                        unsigned int flags)
  {
-       DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-       __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-                                                       TASK_UNINTERRUPTIBLE);
+       if (flags & FAULT_FLAG_ALLOW_RETRY) {
+               /*
+                * CAUTION! In this case mmap_sem is not released,
+                * even though we return 0.
+                */
+               if (flags & FAULT_FLAG_RETRY_NOWAIT)
+                       return 0;
+
+               up_read(&mm->mmap_sem);
+               if (flags & FAULT_FLAG_KILLABLE)
+                       wait_on_page_locked_killable(page);
+               else
+                       wait_on_page_locked(page);
+               return 0;
+       } else {
+               if (flags & FAULT_FLAG_KILLABLE) {
+                       int ret;
+
+                       ret = __lock_page_killable(page);
+                       if (ret) {
+                               up_read(&mm->mmap_sem);
+                               return 0;
+                       }
+               } else
+                       __lock_page(page);
+               return 1;
+       }
  }
  
  /**
@@ -767,9 +838,13 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
+
+               /*
+                * This can only trigger when the entry at index 0 moves out
+                * of or back to the root: none yet gotten, safe to restart.
+                */
                 if (radix_tree_deref_retry(page)) {
-                       if (ret)
-                               start = pages[ret-1]->index;
+                       WARN_ON(start | i);
                         goto restart;
                 }
  
@@ -785,6 +860,13 @@ repeat:
                 pages[ret] = page;
                 ret++;
         }
+
+       /*
+        * If all entries were removed before we could secure them,
+        * try again, because callers stop trying once 0 is returned.
+        */
+       if (unlikely(!ret && nr_found))
+               goto restart;
         rcu_read_unlock();
         return ret;
  }
@@ -819,12 +901,14 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
+
+               /*
+                * This can only trigger when the entry at index 0 moves out
+                * of or back to the root: none yet gotten, safe to restart.
+                */
                 if (radix_tree_deref_retry(page))
                         goto restart;
  
-               if (page->mapping == NULL || page->index != index)
-                       break;
-
                 if (!page_cache_get_speculative(page))
                         goto repeat;
  
@@ -834,6 +918,16 @@ repeat:
                         goto repeat;
                 }
  
+               /*
+                * Must check mapping and index after taking the ref;
+                * otherwise we can get both false positives and false
+                * negatives, which is just confusing to the caller.
+                */
+               if (page->mapping == NULL || page->index != index) {
+                       page_cache_release(page);
+                       break;
+               }
+
                 pages[ret] = page;
                 ret++;
                 index++;
@@ -872,6 +966,11 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
+
+               /*
+                * This can only trigger when the entry at index 0 moves out
+                * of or back to the root: none yet gotten, safe to restart.
+                */
                 if (radix_tree_deref_retry(page))
                         goto restart;
  
@@ -887,6 +986,13 @@ repeat:
                 pages[ret] = page;
                 ret++;
         }
+
+       /*
+        * If all entries were removed before we could secure them,
+        * try again, because callers stop trying once 0 is returned.
+        */
+       if (unlikely(!ret && nr_found))
+               goto restart;
         rcu_read_unlock();
  
         if (ret)
@@ -1297,8 +1403,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                         retval = filemap_write_and_wait_range(mapping, pos,
                                         pos + iov_length(iov, nr_segs) - 1);
                         if (!retval) {
+                               struct blk_plug plug;
+
+                               blk_start_plug(&plug);
                                 retval = mapping->a_ops->direct_IO(READ, iocb,
                                                         iov, pos, nr_segs);
+                               blk_finish_plug(&plug);
                         }
                         if (retval > 0) {
                                 *ppos = pos + retval;
@@ -1446,15 +1556,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
         /* If we don't want any read-ahead, don't bother */
         if (VM_RandomReadHint(vma))
                 return;
+       if (!ra->ra_pages)
+               return;
  
-       if (VM_SequentialReadHint(vma) ||
-                       offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
+       if (VM_SequentialReadHint(vma)) {
                 page_cache_sync_readahead(mapping, ra, file, offset,
                                           ra->ra_pages);
                 return;
         }
  
-       if (ra->mmap_miss < INT_MAX)
+       /* Avoid banging the cache line if not needed */
+       if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
                 ra->mmap_miss++;
  
         /*
@@ -1468,12 +1580,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
          * mmap read-around
          */
         ra_pages = max_sane_readahead(ra->ra_pages);
-       if (ra_pages) {
-               ra->start = max_t(long, 0, offset - ra_pages/2);
-               ra->size = ra_pages;
-               ra->async_size = 0;
-               ra_submit(ra, mapping, file);
-       }
+       ra->start = max_t(long, 0, offset - ra_pages / 2);
+       ra->size = ra_pages;
+       ra->async_size = ra_pages / 4;
+       ra_submit(ra, mapping, file);
  }
  
  /*
@@ -1536,25 +1646,31 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                  * waiting for the lock.
                  */
                 do_async_mmap_readahead(vma, ra, file, page, offset);
-               lock_page(page);
-
-               /* Did it get truncated? */
-               if (unlikely(page->mapping != mapping)) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto no_cached_page;
-               }
         } else {
                 /* No page in the page cache at all */
                 do_sync_mmap_readahead(vma, ra, file, offset);
                 count_vm_event(PGMAJFAULT);
+               mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                 ret = VM_FAULT_MAJOR;
  retry_find:
-               page = find_lock_page(mapping, offset);
+               page = find_get_page(mapping, offset);
                 if (!page)
                         goto no_cached_page;
         }
  
+       if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+               page_cache_release(page);
+               return ret | VM_FAULT_RETRY;
+       }
+
+       /* Did it get truncated? */
+       if (unlikely(page->mapping != mapping)) {
+               unlock_page(page);
+               put_page(page);
+               goto retry_find;
+       }
+       VM_BUG_ON(page->index != offset);
+
         /*
          * We have a locked page in the page cache, now we need to check
          * that it's up-to-date. If not, it is going to be due to an error.
@@ -1573,7 +1689,6 @@ retry_find:
                 return VM_FAULT_SIGBUS;
         }
  
-       ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
         vmf->page = page;
         return ret | VM_FAULT_LOCKED;
  
@@ -1681,7 +1796,7 @@ repeat:
                 page = __page_cache_alloc(gfp | __GFP_COLD);
                 if (!page)
                         return ERR_PTR(-ENOMEM);
-               err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+               err = add_to_page_cache_lru(page, mapping, index, gfp);
                 if (unlikely(err)) {
                         page_cache_release(page);
                         if (err == -EEXIST)
@@ -1778,10 +1893,7 @@ static struct page *wait_on_page_read(struct page *page)
   * @gfp:       the page allocator flags to use if allocating
   *
   * This is the same as "read_mapping_page(mapping, index, NULL)", but with
- * any new page allocations done using the specified allocation flags. Note
- * that the Radix tree operations will still use GFP_KERNEL, so you can't
- * expect to do this atomically or anything like that - but you can pass in
- * other page requirements.
+ * any new page allocations done using the specified allocation flags.
   *
   * If the page does not get brought uptodate, return -EIO.
   */
@@ -1856,16 +1968,26 @@ static int __remove_suid(struct dentry *dentry, int kill)
  int file_remove_suid(struct file *file)
  {
         struct dentry *dentry = file->f_path.dentry;
-       int killsuid = should_remove_suid(dentry);
-       int killpriv = security_inode_need_killpriv(dentry);
+       struct inode *inode = dentry->d_inode;
+       int killsuid;
+       int killpriv;
         int error = 0;
  
+       /* Fast path: nothing security related to do */
+       if (IS_NOSEC(inode))
+               return 0;
+
+       killsuid = should_remove_suid(dentry);
+       killpriv = security_inode_need_killpriv(dentry);
+
         if (killpriv < 0)
                 return killpriv;
         if (killpriv)
                 error = security_inode_killpriv(dentry);
         if (!error && killsuid)
                 error = __remove_suid(dentry, killsuid);
+       if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+               inode->i_flags |= S_NOSEC;
  
         return error;
  }
@@ -2174,12 +2296,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
         }
  
         if (written > 0) {
-               loff_t end = pos + written;
-               if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
-                       i_size_write(inode,  end);
+               pos += written;
+               if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+                       i_size_write(inode, pos);
                         mark_inode_dirty(inode);
                 }
-               *ppos = end;
+               *ppos = pos;
         }
  out:
         return written;
@@ -2200,8 +2322,8 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
                 gfp_notmask = __GFP_FS;
  repeat:
         page = find_lock_page(mapping, index);
-       if (likely(page))
-               return page;
+       if (page)
+               goto found;
  
         page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
         if (!page)
@@ -2214,6 +2336,8 @@ repeat:
                         goto repeat;
                 return NULL;
         }
+found:
+       wait_on_page_writeback(page);
         return page;
  }
  EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2460,11 +2584,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
  {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
+       struct blk_plug plug;
         ssize_t ret;
  
         BUG_ON(iocb->ki_pos != pos);
  
         mutex_lock(&inode->i_mutex);
+       blk_start_plug(&plug);
         ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
         mutex_unlock(&inode->i_mutex);
  
@@ -2475,6 +2601,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                 if (err < 0 && ret > 0)
                         ret = err;
         }
+       blk_finish_plug(&plug);
         return ret;
  }
  EXPORT_SYMBOL(generic_file_aio_write);