ext4: punch_hole should wait for DIO writers
author Dmitry Monakhov <dmonakhov@openvz.org>
Mon, 1 Oct 2012 03:03:42 +0000 (23:03 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Mon, 1 Oct 2012 03:03:42 +0000 (23:03 -0400)
punch_hole is the place where we have to wait for all existing writers
(writeback, aio, dio), but currently we simply flush pending end_io requests,
which is not sufficient. Another issue is that punch_hole is performed without
i_mutex held, which obviously results in dangerous data corruption due to
write-after-free.

This patch makes the following changes:
- Guard punch_hole with i_mutex
- Recheck inode flags under i_mutex
- Block all new dio readers in order to prevent an information leak caused by
  a read-after-free pattern.
- punch_hole now waits for all writers in flight
  NOTE: XXX a write-after-free race is still possible because new dirty pages
  may appear due to mmap(), and currently there is no easy way to stop
  writeback while punch_hole is in progress.

[ Fixed error return from ext4_ext_punch_hole() to make sure that we
  release i_mutex before returning EPERM or ETXTBSY -- Ted ]

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/extents.c

index 232077439aa85499502d4cd9e1cc5cff8c3236e3..5920e75fc05fe07a221672353d95c3d6a035431b 100644 (file)
@@ -4794,9 +4794,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
        loff_t first_page_offset, last_page_offset;
        int credits, err = 0;
 
+       /*
+        * Write out all dirty pages to avoid race conditions
+        * Then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               err = filemap_write_and_wait_range(mapping,
+                       offset, offset + length - 1);
+
+               if (err)
+                       return err;
+       }
+
+       mutex_lock(&inode->i_mutex);
+       /* It's not possible punch hole on append only file */
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+               err = -EPERM;
+               goto out_mutex;
+       }
+       if (IS_SWAPFILE(inode)) {
+               err = -ETXTBSY;
+               goto out_mutex;
+       }
+
        /* No need to punch hole beyond i_size */
        if (offset >= inode->i_size)
-               return 0;
+               goto out_mutex;
 
        /*
         * If the hole extends beyond i_size, set the hole
@@ -4814,33 +4837,25 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
        first_page_offset = first_page << PAGE_CACHE_SHIFT;
        last_page_offset = last_page << PAGE_CACHE_SHIFT;
 
-       /*
-        * Write out all dirty pages to avoid race conditions
-        * Then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               err = filemap_write_and_wait_range(mapping,
-                       offset, offset + length - 1);
-
-               if (err)
-                       return err;
-       }
-
        /* Now release the pages */
        if (last_page_offset > first_page_offset) {
                truncate_pagecache_range(inode, first_page_offset,
                                         last_page_offset - 1);
        }
 
-       /* finish any pending end_io work */
+       /* Wait all existing dio workers, newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
        err = ext4_flush_completed_IO(inode);
        if (err)
-               return err;
+               goto out_dio;
 
        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
+       if (IS_ERR(handle)) {
+               err = PTR_ERR(handle);
+               goto out_dio;
+       }
 
 
        /*
@@ -4930,6 +4945,10 @@ out:
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
        return err;
 }
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,