Merge tag 'xfs-for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 12 Nov 2015 04:18:48 +0000 (20:18 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 12 Nov 2015 04:18:48 +0000 (20:18 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Nov 2015 04:18:48 +0000 (20:18 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Nov 2015 04:18:48 +0000 (20:18 -0800)
diff --combined fs/dax.c

index a86d3cc2b38941b0e39f23be84e4986d8852ae42,74033ad1bc9291e540fe17d4a367818b9b557aa0..131fd35ae39d53f8a9ef7bdd1aaa61677beda5f0
--- 1/fs/dax.c
--- 2/fs/dax.c
+++ b/fs/dax.c
@@@ -29,6 -29,11 +29,11 @@@
   #include <linux/uio.h>
   #include <linux/vmstat.h>
   
+ /*
+  * dax_clear_blocks() is called from within transaction context from XFS,
+  * and hence this means the stack from this point must follow GFP_NOFS
+  * semantics for all operations.
+  */
   int dax_clear_blocks(struct inode *inode, sector_t block, long size)
   {
         struct block_device *bdev = inode->i_sb->s_bdev;
@@@ -285,7 -290,6 +290,7 @@@ static int copy_user_bh(struct page *to
   static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                         struct vm_area_struct *vma, struct vm_fault *vmf)
   {
+ +      struct address_space *mapping = inode->i_mapping;
         sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
         unsigned long vaddr = (unsigned long)vmf->virtual_address;
         void __pmem *addr;
@@@ -293,8 -297,6 +298,8 @@@
         pgoff_t size;
         int error;
   
+ +      i_mmap_lock_read(mapping);
+ +
         /*
          * Check truncate didn't happen while we were allocating a block.
          * If it did, this block may or may not be still allocated to the
@@@ -324,8 -326,6 +329,8 @@@
         error = vm_insert_mixed(vma, vaddr, pfn);
   
    out:
+ +      i_mmap_unlock_read(mapping);
+ +
         return error;
   }
   
@@@ -387,15 -387,17 +392,15 @@@ int __dax_fault(struct vm_area_struct *
                          * from a read fault and we've raced with a truncate
                          */
                         error = -EIO;
- -                      goto unlock;
+ +                      goto unlock_page;
                 }
- -      } else {
- -              i_mmap_lock_write(mapping);
         }
   
         error = get_block(inode, block, &bh, 0);
         if (!error && (bh.b_size < PAGE_SIZE))
                 error = -EIO;           /* fs corruption? */
         if (error)
- -              goto unlock;
+ +              goto unlock_page;
   
         if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                 if (vmf->flags & FAULT_FLAG_WRITE) {
@@@ -406,8 -408,9 +411,8 @@@
                         if (!error && (bh.b_size < PAGE_SIZE))
                                 error = -EIO;
                         if (error)
- -                              goto unlock;
+ +                              goto unlock_page;
                 } else {
- -                      i_mmap_unlock_write(mapping);
                         return dax_load_hole(mapping, page, vmf);
                 }
         }
@@@ -419,17 -422,15 +424,17 @@@
                 else
                         clear_user_highpage(new_page, vaddr);
                 if (error)
- -                      goto unlock;
+ +                      goto unlock_page;
                 vmf->page = page;
                 if (!page) {
+ +                      i_mmap_lock_read(mapping);
                         /* Check we didn't race with truncate */
                         size = (i_size_read(inode) + PAGE_SIZE - 1) >>
                                                                 PAGE_SHIFT;
                         if (vmf->pgoff >= size) {
+ +                              i_mmap_unlock_read(mapping);
                                 error = -EIO;
- -                              goto unlock;
+ +                              goto out;
                         }
                 }
                 return VM_FAULT_LOCKED;
@@@ -465,6 -466,8 +470,6 @@@
                         WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
         }
   
- -      if (!page)
- -              i_mmap_unlock_write(mapping);
    out:
         if (error == -ENOMEM)
                 return VM_FAULT_OOM | major;
@@@ -473,11 -476,14 +478,11 @@@
                 return VM_FAULT_SIGBUS | major;
         return VM_FAULT_NOPAGE | major;
   
- - unlock:
+ + unlock_page:
         if (page) {
                 unlock_page(page);
                 page_cache_release(page);
- -      } else {
- -              i_mmap_unlock_write(mapping);
         }
- -
         goto out;
   }
   EXPORT_SYMBOL(__dax_fault);
@@@ -555,10 -561,10 +560,10 @@@ int __dax_pmd_fault(struct vm_area_stru
         block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
   
         bh.b_size = PMD_SIZE;
- -      i_mmap_lock_write(mapping);
         length = get_block(inode, block, &bh, write);
         if (length)
                 return VM_FAULT_SIGBUS;
+ +      i_mmap_lock_read(mapping);
   
         /*
          * If the filesystem isn't willing to tell us the length of a hole,
@@@ -568,14 -574,24 +573,14 @@@
         if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
                 goto fallback;
   
- -      if (buffer_unwritten(&bh) || buffer_new(&bh)) {
- -              int i;
- -              for (i = 0; i < PTRS_PER_PMD; i++)
- -                      clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
- -              wmb_pmem();
- -              count_vm_event(PGMAJFAULT);
- -              mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- -              result |= VM_FAULT_MAJOR;
- -      }
- -
         /*
          * If we allocated new storage, make sure no process has any
          * zero pages covering this hole
          */
         if (buffer_new(&bh)) {
- -              i_mmap_unlock_write(mapping);
+ +              i_mmap_unlock_read(mapping);
                 unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
- -              i_mmap_lock_write(mapping);
+ +              i_mmap_lock_read(mapping);
         }
   
         /*
@@@ -622,25 -638,15 +627,25 @@@
                 if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
                         goto fallback;
   
+ +              if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ +                      int i;
+ +                      for (i = 0; i < PTRS_PER_PMD; i++)
+ +                              clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+ +                      wmb_pmem();
+ +                      count_vm_event(PGMAJFAULT);
+ +                      mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ +                      result |= VM_FAULT_MAJOR;
+ +              }
+ +
                 result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
         }
   
    out:
+ +      i_mmap_unlock_read(mapping);
+ +
         if (buffer_unwritten(&bh))
                 complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
   
- -      i_mmap_unlock_write(mapping);
- -
         return result;
   
    fallback:
diff --combined fs/xfs/xfs_file.c

index f80e90f95ad8d766e34890326bf33f77a03ba125,39743efae79501f3d590b05722ebd0c30deb50f4..f5392ab2def1ab806aa075bcc19643cbac6ca8f9
--- 1/fs/xfs/xfs_file.c
--- 2/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@@ -242,19 -242,30 +242,30 @@@ xfs_file_fsync
         }
   
         /*
-        * All metadata updates are logged, which means that we just have
-        * to flush the log up to the latest LSN that touched the inode.
+        * All metadata updates are logged, which means that we just have to
+        * flush the log up to the latest LSN that touched the inode. If we have
+        * concurrent fsync/fdatasync() calls, we need them to all block on the
+        * log force before we clear the ili_fsync_fields field. This ensures
+        * that we don't get a racing sync operation that does not wait for the
+        * metadata to hit the journal before returning. If we race with
+        * clearing the ili_fsync_fields, then all that will happen is the log
+        * force will do nothing as the lsn will already be on disk. We can't
+        * race with setting ili_fsync_fields because that is done under
+        * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+        * until after the ili_fsync_fields is cleared.
          */
         xfs_ilock(ip, XFS_ILOCK_SHARED);
         if (xfs_ipincount(ip)) {
                 if (!datasync ||
-                   (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+                   (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
                         lsn = ip->i_itemp->ili_last_lsn;
         }
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
   
-       if (lsn)
+       if (lsn) {
                 error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+               ip->i_itemp->ili_fsync_fields = 0;
+       }
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
   
         /*
         * If we only have a single device, and the log force above was
@@@ -287,7 -298,7 +298,7 @@@ xfs_file_read_iter
         xfs_fsize_t             n;
         loff_t                  pos = iocb->ki_pos;
   
-       XFS_STATS_INC(xs_read_calls);
+       XFS_STATS_INC(mp, xs_read_calls);
   
         if (unlikely(iocb->ki_flags & IOCB_DIRECT))
                 ioflags |= XFS_IO_ISDIRECT;
@@@ -365,7 -376,7 +376,7 @@@
   
         ret = generic_file_read_iter(iocb, to);
         if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(mp, xs_read_bytes, ret);
   
         xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
         return ret;
@@@ -383,7 -394,7 +394,7 @@@ xfs_file_splice_read
         int                     ioflags = 0;
         ssize_t                 ret;
   
-       XFS_STATS_INC(xs_read_calls);
+       XFS_STATS_INC(ip->i_mount, xs_read_calls);
   
         if (infilp->f_mode & FMODE_NOCMTIME)
                 ioflags |= XFS_IO_INVIS;
@@@ -401,7 -412,7 +412,7 @@@
         else
                 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
         if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
   
         xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
         return ret;
@@@ -482,6 -493,8 +493,8 @@@ xfs_zero_eof
         ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
         ASSERT(offset > isize);
   
+       trace_xfs_zero_eof(ip, isize, offset - isize);
+ 
         /*
          * First handle zeroing the block on which isize resides.
          *
@@@ -574,6 -587,7 +587,7 @@@ xfs_file_aio_write_checks
         struct xfs_inode        *ip = XFS_I(inode);
         ssize_t                 error = 0;
         size_t                  count = iov_iter_count(from);
+       bool                    drained_dio = false;
   
   restart:
         error = generic_write_checks(iocb, from);
@@@ -611,12 -625,13 +625,13 @@@
                 bool    zero = false;
   
                 spin_unlock(&ip->i_flags_lock);
-               if (*iolock == XFS_IOLOCK_SHARED) {
-                       xfs_rw_iunlock(ip, *iolock);
-                       *iolock = XFS_IOLOCK_EXCL;
-                       xfs_rw_ilock(ip, *iolock);
-                       iov_iter_reexpand(from, count);
- 
+               if (!drained_dio) {
+                       if (*iolock == XFS_IOLOCK_SHARED) {
+                               xfs_rw_iunlock(ip, *iolock);
+                               *iolock = XFS_IOLOCK_EXCL;
+                               xfs_rw_ilock(ip, *iolock);
+                               iov_iter_reexpand(from, count);
+                       }
                         /*
                          * We now have an IO submission barrier in place, but
                          * AIO can do EOF updates during IO completion and hence
@@@ -626,6 -641,7 +641,7 @@@
                          * no-op.
                          */
                         inode_dio_wait(inode);
+                       drained_dio = true;
                         goto restart;
                 }
                 error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@@ -867,7 -883,7 +883,7 @@@ xfs_file_write_iter
         ssize_t                 ret;
         size_t                  ocount = iov_iter_count(from);
   
-       XFS_STATS_INC(xs_write_calls);
+       XFS_STATS_INC(ip->i_mount, xs_write_calls);
   
         if (ocount == 0)
                 return 0;
@@@ -883,7 -899,7 +899,7 @@@
         if (ret > 0) {
                 ssize_t err;
   
-               XFS_STATS_ADD(xs_write_bytes, ret);
+               XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
   
                 /* Handle various SYNC-type writes */
                 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@@ -1477,7 -1493,7 +1493,7 @@@ xfs_file_llseek
    *
    * mmap_sem (MM)
    *   sb_start_pagefault(vfs, freeze)
-  *     i_mmap_lock (XFS - truncate serialisation)
+  *     i_mmaplock (XFS - truncate serialisation)
    *       page_lock (MM)
    *         i_lock (XFS - extent map serialisation)
    */
@@@ -1503,10 -1519,9 +1519,9 @@@ xfs_filemap_page_mkwrite
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
   
         if (IS_DAX(inode)) {
-               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
         } else {
- -              ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ +              ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
                 ret = block_page_mkwrite_return(ret);
         }
   
@@@ -1538,7 -1553,7 +1553,7 @@@ xfs_filemap_fault
                  * changes to xfs_get_blocks_direct() to map unwritten extent
                  * ioend for conversion on read-only mappings.
                  */
-               ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
         } else
                 ret = filemap_fault(vma, vmf);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@@ -1546,6 -1561,13 +1561,13 @@@
         return ret;
   }
   
+ /*
+  * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+  * both read and write faults. Hence we need to handle both cases. There is no
+  * ->pmd_mkwrite callout for huge pages, so we have a single function to
+  * handle both cases here. @flags carries the information on the type of fault
+  * occurring.
+  */
   STATIC int
   xfs_filemap_pmd_fault(
         struct vm_area_struct   *vma,
@@@ -1562,15 -1584,54 +1584,54 @@@
   
         trace_xfs_filemap_pmd_fault(ip);
   
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vma->vm_file);
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+       }
+ 
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+                             NULL);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       sb_end_pagefault(inode->i_sb);
   
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(inode->i_sb);
+ 
+       return ret;
+ }
+ 
+ /*
+  * pfn_mkwrite was originally intended to ensure we capture time stamp
+  * updates on write faults. In reality, it's needed to serialise against
+  * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+  * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
+  * barrier in place.
+  */
+ static int
+ xfs_filemap_pfn_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+ {
+ 
+       struct inode            *inode = file_inode(vma->vm_file);
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     ret = VM_FAULT_NOPAGE;
+       loff_t                  size;
+ 
+       trace_xfs_filemap_pfn_mkwrite(ip);
+ 
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+ 
+       /* check if the faulting page hasn't raced with truncate */
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
         return ret;
+ 
   }
   
   static const struct vm_operations_struct xfs_file_vm_ops = {
@@@ -1578,6 -1639,7 +1639,7 @@@
         .pmd_fault      = xfs_filemap_pmd_fault,
         .map_pages      = filemap_map_pages,
         .page_mkwrite   = xfs_filemap_page_mkwrite,
+       .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
   };
   
   STATIC int
diff --combined fs/xfs/xfs_qm.c

index 587174fd4f2c216637eda70b9986848a0980fd09,7af7648c06c63bd63ec21b150cc2713914f28c08..532ab79d38fe376c14a5463a97195b59a61d8f84
--- 1/fs/xfs/xfs_qm.c
--- 2/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@@ -184,7 -184,7 +184,7 @@@ xfs_qm_dqpurge
          */
         ASSERT(!list_empty(&dqp->q_lru));
         list_lru_del(&qi->qi_lru, &dqp->q_lru);
-       XFS_STATS_DEC(xs_qm_dquot_unused);
+       XFS_STATS_DEC(mp, xs_qm_dquot_unused);
   
         xfs_qm_dqdestroy(dqp);
         return 0;
@@@ -448,11 -448,11 +448,11 @@@ xfs_qm_dquot_isolate
          */
         if (dqp->q_nrefs) {
                 xfs_dqunlock(dqp);
-               XFS_STATS_INC(xs_qm_dqwants);
+               XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
   
                 trace_xfs_dqreclaim_want(dqp);
                 list_lru_isolate(lru, &dqp->q_lru);
-               XFS_STATS_DEC(xs_qm_dquot_unused);
+               XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
                 return LRU_REMOVED;
         }
   
@@@ -496,19 -496,19 +496,19 @@@
   
         ASSERT(dqp->q_nrefs == 0);
         list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
-       XFS_STATS_DEC(xs_qm_dquot_unused);
+       XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
         trace_xfs_dqreclaim_done(dqp);
-       XFS_STATS_INC(xs_qm_dqreclaims);
+       XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
         return LRU_REMOVED;
   
   out_miss_busy:
         trace_xfs_dqreclaim_busy(dqp);
-       XFS_STATS_INC(xs_qm_dqreclaim_misses);
+       XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
         return LRU_SKIP;
   
   out_unlock_dirty:
         trace_xfs_dqreclaim_busy(dqp);
-       XFS_STATS_INC(xs_qm_dqreclaim_misses);
+       XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
         xfs_dqunlock(dqp);
         spin_lock(lru_lock);
         return LRU_RETRY;
@@@ -525,7 -525,7 +525,7 @@@ xfs_qm_shrink_scan
         unsigned long           freed;
         int                     error;
   
- -      if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ +      if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
                 return 0;
   
         INIT_LIST_HEAD(&isol.buffers);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 12 Nov 2015 04:18:48 +0000 (20:18 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 12 Nov 2015 04:18:48 +0000 (20:18 -0800)
		1	2
fs/dax.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_qm.c	patch \|	diff1 \|	diff2 \|	blob \| history