Merge tag 'xfs-for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 12 Nov 2015 04:18:48 +0000 (20:18 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 12 Nov 2015 04:18:48 +0000 (20:18 -0800)
Pull xfs updates from Dave Chinner:
 "There is nothing really major here - the only significant addition is
  the per-mount operation statistics infrastructure.  Otherwise there's
  various ACL, xattr, DAX, AIO and logging fixes, and a smattering of
  small cleanups and fixes elsewhere.

  Summary:

   - per-mount operational statistics in sysfs (usage sketch below)
   - fixes for concurrent aio append write submission
   - various logging fixes
   - detection of zeroed logs and invalid log sequence numbers on v5 filesystems
   - memory allocation failure message improvements
   - a bunch of xattr/ACL fixes
   - fdatasync optimisation (see the sketch after the commit list)
   - miscellaneous other fixes and cleanups"
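
The first summary item adds per-filesystem counters under sysfs. As a minimal,
hedged usage sketch (the device name "sda1" and the stats/stats file layout are
assumptions for illustration, not something this log spells out), a user-space
reader might look like this:

/*
 * Dump the per-mount XFS statistics exposed by this series. The sysfs path
 * is an assumption; substitute the block device backing your mount.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/xfs/sda1/stats/stats", "r");
	char line[512];

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* one counter group and its values per line */
	fclose(f);
	return 0;
}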

* tag 'xfs-for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (39 commits)
  xfs: give all workqueues rescuer threads
  xfs: fix log recovery op header validation assert
  xfs: Fix error path in xfs_get_acl
  xfs: optimise away log forces on timestamp updates for fdatasync
  xfs: don't leak uuid table on rmmod
  xfs: invalidate cached acl if set via ioctl
  xfs: Plug memory leak in xfs_attrmulti_attr_set
  xfs: Validate the length of on-disk ACLs
  xfs: invalidate cached acl if set directly via xattr
  xfs: xfs_filemap_pmd_fault treats read faults as write faults
  xfs: add ->pfn_mkwrite support for DAX
  xfs: DAX does not use IO completion callbacks
  xfs: Don't use unwritten extents for DAX
  xfs: introduce BMAPI_ZERO for allocating zeroed extents
  xfs: fix inode size update overflow in xfs_map_direct()
  xfs: clear PF_NOFREEZE for xfsaild kthread
  xfs: fix an error code in xfs_fs_fill_super()
  xfs: stats are no longer dependent on CONFIG_PROC_FS
  xfs: simplify /proc teardown & error handling
  xfs: per-filesystem stats counter implementation
  ...
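
To illustrate the "fdatasync optimisation" noted in the summary (the commit
"xfs: optimise away log forces on timestamp updates for fdatasync" above), here
is a hedged user-space sketch of the workload that benefits: repeated in-place
overwrites where the only inode metadata dirtied is the timestamps, so
fdatasync() can skip the log force that fsync() would still issue. File name,
sizes and loop count are arbitrary; this is demo code, not part of the series.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int i, fd = open("scratchfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;

	memset(buf, 'x', sizeof(buf));
	/* Allocate the block once so the writes below are pure overwrites. */
	if (pwrite(fd, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf))
		return 1;
	fsync(fd);

	for (i = 0; i < 1000; i++) {
		pwrite(fd, buf, sizeof(buf), 0);	/* data + timestamp update only */
		fdatasync(fd);				/* may avoid a log force here */
	}

	close(fd);
	return 0;
}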

fs/dax.c
fs/xfs/xfs_file.c
fs/xfs/xfs_qm.c

diff --combined fs/dax.c
index a86d3cc2b38941b0e39f23be84e4986d8852ae42,74033ad1bc9291e540fe17d4a367818b9b557aa0..131fd35ae39d53f8a9ef7bdd1aaa61677beda5f0
+++ b/fs/dax.c
  #include <linux/uio.h>
  #include <linux/vmstat.h>
  
+ /*
+  * dax_clear_blocks() is called from within transaction context from XFS,
+  * and hence the stack from this point must follow GFP_NOFS
+  * semantics for all operations.
+  */
  int dax_clear_blocks(struct inode *inode, sector_t block, long size)
  {
        struct block_device *bdev = inode->i_sb->s_bdev;
@@@ -285,7 -290,6 +290,7 @@@ static int copy_user_bh(struct page *to
  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
  {
 +      struct address_space *mapping = inode->i_mapping;
        sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
        void __pmem *addr;
        pgoff_t size;
        int error;
  
 +      i_mmap_lock_read(mapping);
 +
        /*
         * Check truncate didn't happen while we were allocating a block.
         * If it did, this block may or may not be still allocated to the
        error = vm_insert_mixed(vma, vaddr, pfn);
  
   out:
 +      i_mmap_unlock_read(mapping);
 +
        return error;
  }
  
@@@ -387,15 -387,17 +392,15 @@@ int __dax_fault(struct vm_area_struct *
                         * from a read fault and we've raced with a truncate
                         */
                        error = -EIO;
 -                      goto unlock;
 +                      goto unlock_page;
                }
 -      } else {
 -              i_mmap_lock_write(mapping);
        }
  
        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;           /* fs corruption? */
        if (error)
 -              goto unlock;
 +              goto unlock_page;
  
        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        if (!error && (bh.b_size < PAGE_SIZE))
                                error = -EIO;
                        if (error)
 -                              goto unlock;
 +                              goto unlock_page;
                } else {
 -                      i_mmap_unlock_write(mapping);
                        return dax_load_hole(mapping, page, vmf);
                }
        }
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
 -                      goto unlock;
 +                      goto unlock_page;
                vmf->page = page;
                if (!page) {
 +                      i_mmap_lock_read(mapping);
                        /* Check we didn't race with truncate */
                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
                                                                PAGE_SHIFT;
                        if (vmf->pgoff >= size) {
 +                              i_mmap_unlock_read(mapping);
                                error = -EIO;
 -                              goto unlock;
 +                              goto out;
                        }
                }
                return VM_FAULT_LOCKED;
                        WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
        }
  
 -      if (!page)
 -              i_mmap_unlock_write(mapping);
   out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;
  
 - unlock:
 + unlock_page:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
 -      } else {
 -              i_mmap_unlock_write(mapping);
        }
 -
        goto out;
  }
  EXPORT_SYMBOL(__dax_fault);
@@@ -555,10 -561,10 +560,10 @@@ int __dax_pmd_fault(struct vm_area_stru
        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
  
        bh.b_size = PMD_SIZE;
 -      i_mmap_lock_write(mapping);
        length = get_block(inode, block, &bh, write);
        if (length)
                return VM_FAULT_SIGBUS;
 +      i_mmap_lock_read(mapping);
  
        /*
         * If the filesystem isn't willing to tell us the length of a hole,
        if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
                goto fallback;
  
 -      if (buffer_unwritten(&bh) || buffer_new(&bh)) {
 -              int i;
 -              for (i = 0; i < PTRS_PER_PMD; i++)
 -                      clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
 -              wmb_pmem();
 -              count_vm_event(PGMAJFAULT);
 -              mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 -              result |= VM_FAULT_MAJOR;
 -      }
 -
        /*
         * If we allocated new storage, make sure no process has any
         * zero pages covering this hole
         */
        if (buffer_new(&bh)) {
 -              i_mmap_unlock_write(mapping);
 +              i_mmap_unlock_read(mapping);
                unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
 -              i_mmap_lock_write(mapping);
 +              i_mmap_lock_read(mapping);
        }
  
        /*
                if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
                        goto fallback;
  
 +              if (buffer_unwritten(&bh) || buffer_new(&bh)) {
 +                      int i;
 +                      for (i = 0; i < PTRS_PER_PMD; i++)
 +                              clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
 +                      wmb_pmem();
 +                      count_vm_event(PGMAJFAULT);
 +                      mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 +                      result |= VM_FAULT_MAJOR;
 +              }
 +
                result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
        }
  
   out:
 +      i_mmap_unlock_read(mapping);
 +
        if (buffer_unwritten(&bh))
                complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
  
 -      i_mmap_unlock_write(mapping);
 -
        return result;
  
   fallback:
diff --combined fs/xfs/xfs_file.c
index f80e90f95ad8d766e34890326bf33f77a03ba125,39743efae79501f3d590b05722ebd0c30deb50f4..f5392ab2def1ab806aa075bcc19643cbac6ca8f9
@@@ -242,19 -242,30 +242,30 @@@ xfs_file_fsync
        }
  
        /*
-        * All metadata updates are logged, which means that we just have
-        * to flush the log up to the latest LSN that touched the inode.
+        * All metadata updates are logged, which means that we just have to
+        * flush the log up to the latest LSN that touched the inode. If we have
+        * concurrent fsync/fdatasync() calls, we need them to all block on the
+        * log force before we clear the ili_fsync_fields field. This ensures
+        * that we don't get a racing sync operation that does not wait for the
+        * metadata to hit the journal before returning. If we race with
+        * clearing the ili_fsync_fields, then all that will happen is the log
+        * force will do nothing as the lsn will already be on disk. We can't
+        * race with setting ili_fsync_fields because that is done under
+        * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+        * until after the ili_fsync_fields is cleared.
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        if (xfs_ipincount(ip)) {
                if (!datasync ||
-                   (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+                   (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
                        lsn = ip->i_itemp->ili_last_lsn;
        }
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
  
-       if (lsn)
+       if (lsn) {
                error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+               ip->i_itemp->ili_fsync_fields = 0;
+       }
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
  
        /*
         * If we only have a single device, and the log force above was
@@@ -287,7 -298,7 +298,7 @@@ xfs_file_read_iter
        xfs_fsize_t             n;
        loff_t                  pos = iocb->ki_pos;
  
-       XFS_STATS_INC(xs_read_calls);
+       XFS_STATS_INC(mp, xs_read_calls);
  
        if (unlikely(iocb->ki_flags & IOCB_DIRECT))
                ioflags |= XFS_IO_ISDIRECT;
  
        ret = generic_file_read_iter(iocb, to);
        if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(mp, xs_read_bytes, ret);
  
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
@@@ -383,7 -394,7 +394,7 @@@ xfs_file_splice_read
        int                     ioflags = 0;
        ssize_t                 ret;
  
-       XFS_STATS_INC(xs_read_calls);
+       XFS_STATS_INC(ip->i_mount, xs_read_calls);
  
        if (infilp->f_mode & FMODE_NOCMTIME)
                ioflags |= XFS_IO_INVIS;
        else
                ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
        if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
  
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
@@@ -482,6 -493,8 +493,8 @@@ xfs_zero_eof
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(offset > isize);
  
+       trace_xfs_zero_eof(ip, isize, offset - isize);
        /*
         * First handle zeroing the block on which isize resides.
         *
@@@ -574,6 -587,7 +587,7 @@@ xfs_file_aio_write_checks
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 error = 0;
        size_t                  count = iov_iter_count(from);
+       bool                    drained_dio = false;
  
  restart:
        error = generic_write_checks(iocb, from);
                bool    zero = false;
  
                spin_unlock(&ip->i_flags_lock);
-               if (*iolock == XFS_IOLOCK_SHARED) {
-                       xfs_rw_iunlock(ip, *iolock);
-                       *iolock = XFS_IOLOCK_EXCL;
-                       xfs_rw_ilock(ip, *iolock);
-                       iov_iter_reexpand(from, count);
+               if (!drained_dio) {
+                       if (*iolock == XFS_IOLOCK_SHARED) {
+                               xfs_rw_iunlock(ip, *iolock);
+                               *iolock = XFS_IOLOCK_EXCL;
+                               xfs_rw_ilock(ip, *iolock);
+                               iov_iter_reexpand(from, count);
+                       }
                        /*
                         * We now have an IO submission barrier in place, but
                         * AIO can do EOF updates during IO completion and hence
                         * no-op.
                         */
                        inode_dio_wait(inode);
+                       drained_dio = true;
                        goto restart;
                }
                error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@@ -867,7 -883,7 +883,7 @@@ xfs_file_write_iter
        ssize_t                 ret;
        size_t                  ocount = iov_iter_count(from);
  
-       XFS_STATS_INC(xs_write_calls);
+       XFS_STATS_INC(ip->i_mount, xs_write_calls);
  
        if (ocount == 0)
                return 0;
        if (ret > 0) {
                ssize_t err;
  
-               XFS_STATS_ADD(xs_write_bytes, ret);
+               XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
  
                /* Handle various SYNC-type writes */
                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@@ -1477,7 -1493,7 +1493,7 @@@ xfs_file_llseek
   *
   * mmap_sem (MM)
   *   sb_start_pagefault(vfs, freeze)
-  *     i_mmap_lock (XFS - truncate serialisation)
+  *     i_mmaplock (XFS - truncate serialisation)
   *       page_lock (MM)
   *         i_lock (XFS - extent map serialisation)
   */
@@@ -1503,10 -1519,9 +1519,9 @@@ xfs_filemap_page_mkwrite
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (IS_DAX(inode)) {
-               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
        } else {
 -              ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
 +              ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
                ret = block_page_mkwrite_return(ret);
        }
  
@@@ -1538,7 -1553,7 +1553,7 @@@ xfs_filemap_fault
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
-               ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
        return ret;
  }
  
+ /*
+  * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+  * both read and write faults. Hence we need to handle both cases. There is no
+  * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+  * handle both cases. @flags carries the information on the type of fault
+  * occurring.
+  */
  STATIC int
  xfs_filemap_pmd_fault(
        struct vm_area_struct   *vma,
  
        trace_xfs_filemap_pmd_fault(ip);
  
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vma->vm_file);
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+       }
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+                             NULL);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       sb_end_pagefault(inode->i_sb);
  
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(inode->i_sb);
+       return ret;
+ }
+ 
+ /*
+  * pfn_mkwrite was originally intended to ensure we capture timestamp
+  * updates on write faults. In reality, it's needed to serialise against
+  * truncate, similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+  * here and cycle the XFS_MMAPLOCK_SHARED to ensure the fault serialisation
+  * barrier is in place.
+  */
+ static int
+ xfs_filemap_pfn_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+ {
+       struct inode            *inode = file_inode(vma->vm_file);
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     ret = VM_FAULT_NOPAGE;
+       loff_t                  size;
+ 
+       trace_xfs_filemap_pfn_mkwrite(ip);
+ 
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+ 
+       /* check if the faulting page hasn't raced with truncate */
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
+ 
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
        return ret;
  }
  
  static const struct vm_operations_struct xfs_file_vm_ops = {
        .pmd_fault      = xfs_filemap_pmd_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = xfs_filemap_page_mkwrite,
+       .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
  };
  
  STATIC int
diff --combined fs/xfs/xfs_qm.c
index 587174fd4f2c216637eda70b9986848a0980fd09,7af7648c06c63bd63ec21b150cc2713914f28c08..532ab79d38fe376c14a5463a97195b59a61d8f84
@@@ -184,7 -184,7 +184,7 @@@ xfs_qm_dqpurge
         */
        ASSERT(!list_empty(&dqp->q_lru));
        list_lru_del(&qi->qi_lru, &dqp->q_lru);
-       XFS_STATS_DEC(xs_qm_dquot_unused);
+       XFS_STATS_DEC(mp, xs_qm_dquot_unused);
  
        xfs_qm_dqdestroy(dqp);
        return 0;
@@@ -448,11 -448,11 +448,11 @@@ xfs_qm_dquot_isolate
         */
        if (dqp->q_nrefs) {
                xfs_dqunlock(dqp);
-               XFS_STATS_INC(xs_qm_dqwants);
+               XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
  
                trace_xfs_dqreclaim_want(dqp);
                list_lru_isolate(lru, &dqp->q_lru);
-               XFS_STATS_DEC(xs_qm_dquot_unused);
+               XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
                return LRU_REMOVED;
        }
  
  
        ASSERT(dqp->q_nrefs == 0);
        list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
-       XFS_STATS_DEC(xs_qm_dquot_unused);
+       XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
        trace_xfs_dqreclaim_done(dqp);
-       XFS_STATS_INC(xs_qm_dqreclaims);
+       XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
        return LRU_REMOVED;
  
  out_miss_busy:
        trace_xfs_dqreclaim_busy(dqp);
-       XFS_STATS_INC(xs_qm_dqreclaim_misses);
+       XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
        return LRU_SKIP;
  
  out_unlock_dirty:
        trace_xfs_dqreclaim_busy(dqp);
-       XFS_STATS_INC(xs_qm_dqreclaim_misses);
+       XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
        xfs_dqunlock(dqp);
        spin_lock(lru_lock);
        return LRU_RETRY;
@@@ -525,7 -525,7 +525,7 @@@ xfs_qm_shrink_scan
        unsigned long           freed;
        int                     error;
  
 -      if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
 +      if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
                return 0;
  
        INIT_LIST_HEAD(&isol.buffers);