Btrfs: Tree logging fixes
author: Chris Mason <chris.mason@oracle.com>
Mon, 8 Sep 2008 15:18:08 +0000 (11:18 -0400)
committer: Chris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:07 +0000 (11:04 -0400)
* Pin down data blocks to prevent them from being reallocated like so:

trans 1: allocate file extent
trans 2: free file extent
trans 3: free file extent during old snapshot deletion
trans 3: allocate file extent to new file
trans 3: fsync new file

Before the tree logging code, this was legal because the fsync
would commit the transaction that did the final data extent free
and the transaction that allocated the extent to the new file
at the same time.

With the tree logging code, the tree log subtransaction can commit
before the transaction that freed the extent.  If we crash,
we're left with two different files using the extent.

* Don't wait in start_transaction if log replay is going on.  This
avoids deadlocks from iput while we're cleaning up link counts in the
replay code.

* Don't deadlock in replay_one_name by trying to read an inode off
the disk while holding paths for the directory.

* Hold the buffer lock while we mark a buffer as written.  This
closes a race where someone is changing a buffer while we write it.
They are supposed to mark it dirty again after they change it, but
this violates the cow rules.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/transaction.c
fs/btrfs/tree-log.c

index a4373db5967a7d61cfd18f07bc02146b11764f97..42bf99168056ad70562a1c293fea45215d40f9ef 100644 (file)
@@ -307,9 +307,7 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
                goto err;
        }
        found_level = btrfs_header_level(eb);
-       spin_lock(&root->fs_info->hash_lock);
-       btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-       spin_unlock(&root->fs_info->hash_lock);
+
        csum_tree_block(root, eb, 0);
 err:
        free_extent_buffer(eb);
@@ -1998,7 +1996,36 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
        return ret;
 }
 
+int btree_lock_page_hook(struct page *page)
+{
+       struct inode *inode = page->mapping->host;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       struct extent_buffer *eb;
+       unsigned long len;
+       u64 bytenr = page_offset(page);
+
+       if (page->private == EXTENT_PAGE_PRIVATE)
+               goto out;
+
+       len = page->private >> 2;
+       eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+       if (!eb)
+               goto out;
+
+       btrfs_tree_lock(eb);
+       spin_lock(&root->fs_info->hash_lock);
+       btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+       spin_unlock(&root->fs_info->hash_lock);
+       btrfs_tree_unlock(eb);
+       free_extent_buffer(eb);
+out:
+       lock_page(page);
+       return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
+       .write_cache_pages_lock_hook = btree_lock_page_hook,
        .writepage_io_hook = btree_writepage_io_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
        .submit_bio_hook = btree_submit_bio_hook,
index 6b6fdc697f31a9e7d404f4b375e1ee5dd2423ac4..f84f5058dbbb892435802c52ad9efd430d7f7900 100644 (file)
@@ -80,4 +80,5 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info);
+int btree_lock_page_hook(struct page *page);
 #endif
index 646b9148ca21ea88b66509ef7141e090199c4c03..3181759da1cfab72ed02f1de5253dd922e678d99 100644 (file)
@@ -1590,13 +1590,17 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 }
 
 static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
-                         int pending)
+                         int is_data, int pending)
 {
        int err = 0;
 
        WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
        if (!pending) {
                struct extent_buffer *buf;
+
+               if (is_data)
+                       goto pinit;
+
                buf = btrfs_find_tree_block(root, bytenr, num_bytes);
                if (buf) {
                        /* we can reuse a block if it hasn't been written
@@ -1624,6 +1628,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
                        }
                        free_extent_buffer(buf);
                }
+pinit:
                btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
        } else {
                set_extent_bits(&root->fs_info->pending_del,
@@ -1744,7 +1749,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 #endif
 
                if (pin) {
-                       ret = pin_down_bytes(root, bytenr, num_bytes, 0);
+                       ret = pin_down_bytes(root, bytenr, num_bytes,
+                            owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 0);
                        if (ret > 0)
                                mark_free = 1;
                        BUG_ON(ret < 0);
@@ -1862,9 +1868,17 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                ref_generation = 0;
 
        if (root == extent_root) {
-               pin_down_bytes(root, bytenr, num_bytes, 1);
+               pin_down_bytes(root, bytenr, num_bytes, 0, 1);
                return 0;
        }
+       /* if metadata always pin */
+       if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+               pin = 1;
+
+       /* if data pin when any transaction has committed this */
+       if (ref_generation != trans->transid)
+               pin = 1;
+
        ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
                            ref_generation, owner_objectid, owner_offset,
                            pin, pin == 0);
index 239e7c908abf884dc65c6ab1aa2c471210691848..319a0c7a4a58db186c8f07cc28ee679380558cb8 100644 (file)
@@ -29,7 +29,10 @@ static struct kmem_cache *extent_buffer_cache;
 
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
+
+#ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
+#endif
 
 #define BUFFER_LRU_MAX 64
 
@@ -106,7 +109,9 @@ EXPORT_SYMBOL(extent_io_tree_init);
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
        struct extent_state *state;
+#ifdef LEAK_DEBUG
        unsigned long flags;
+#endif
 
        state = kmem_cache_alloc(extent_state_cache, mask);
        if (!state)
@@ -114,10 +119,11 @@ struct extent_state *alloc_extent_state(gfp_t mask)
        state->state = 0;
        state->private = 0;
        state->tree = NULL;
+#ifdef LEAK_DEBUG
        spin_lock_irqsave(&leak_lock, flags);
        list_add(&state->leak_list, &states);
        spin_unlock_irqrestore(&leak_lock, flags);
-
+#endif
        atomic_set(&state->refs, 1);
        init_waitqueue_head(&state->wq);
        return state;
@@ -129,11 +135,15 @@ void free_extent_state(struct extent_state *state)
        if (!state)
                return;
        if (atomic_dec_and_test(&state->refs)) {
+#ifdef LEAK_DEBUG
                unsigned long flags;
+#endif
                WARN_ON(state->tree);
+#ifdef LEAK_DEBUG
                spin_lock_irqsave(&leak_lock, flags);
                list_del(&state->leak_list);
                spin_unlock_irqrestore(&leak_lock, flags);
+#endif
                kmem_cache_free(extent_state_cache, state);
        }
 }
@@ -2070,13 +2080,13 @@ done:
 }
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-/* Taken directly from 2.6.23 for 2.6.18 back port */
+/* Taken directly from 2.6.23 with a mod for a lockpage hook */
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                 void *data);
+#endif
 
 /**
- * write_cache_pages - walk the list of dirty pages of the given address space
- * and write all of them.
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
  * @writepage: function called for each page
@@ -2090,9 +2100,10 @@ typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
  */
-static int write_cache_pages(struct address_space *mapping,
-                     struct writeback_control *wbc, writepage_t writepage,
-                     void *data)
+int extent_write_cache_pages(struct extent_io_tree *tree,
+                            struct address_space *mapping,
+                            struct writeback_control *wbc,
+                            writepage_t writepage, void *data)
 {
        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ret = 0;
@@ -2138,7 +2149,10 @@ retry:
                         * swizzled back from swapper_space to tmpfs file
                         * mapping
                         */
-                       lock_page(page);
+                       if (tree->ops && tree->ops->write_cache_pages_lock_hook)
+                               tree->ops->write_cache_pages_lock_hook(page);
+                       else
+                               lock_page(page);
 
                        if (unlikely(page->mapping != mapping)) {
                                unlock_page(page);
@@ -2187,9 +2201,12 @@ retry:
        }
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                mapping->writeback_index = index;
+
+       if (wbc->range_cont)
+               wbc->range_start = index << PAGE_CACHE_SHIFT;
        return ret;
 }
-#endif
+EXPORT_SYMBOL(extent_write_cache_pages);
 
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent,
@@ -2214,7 +2231,8 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
        ret = __extent_writepage(page, wbc, &epd);
 
-       write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
+       extent_write_cache_pages(tree, mapping, &wbc_writepages,
+                                __extent_writepage, &epd);
        if (epd.bio) {
                submit_one_bio(WRITE, epd.bio, 0);
        }
@@ -2235,7 +2253,8 @@ int extent_writepages(struct extent_io_tree *tree,
                .get_extent = get_extent,
        };
 
-       ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
+       ret = extent_write_cache_pages(tree, mapping, wbc,
+                                      __extent_writepage, &epd);
        if (epd.bio) {
                submit_one_bio(WRITE, epd.bio, 0);
        }
@@ -2567,15 +2586,19 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
                                                   gfp_t mask)
 {
        struct extent_buffer *eb = NULL;
+#ifdef LEAK_DEBUG
        unsigned long flags;
+#endif
 
        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
        eb->start = start;
        eb->len = len;
        mutex_init(&eb->mutex);
+#ifdef LEAK_DEBUG
        spin_lock_irqsave(&leak_lock, flags);
        list_add(&eb->leak_list, &buffers);
        spin_unlock_irqrestore(&leak_lock, flags);
+#endif
        atomic_set(&eb->refs, 1);
 
        return eb;
@@ -2583,10 +2606,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
+#ifdef LEAK_DEBUG
        unsigned long flags;
        spin_lock_irqsave(&leak_lock, flags);
        list_del(&eb->leak_list);
        spin_unlock_irqrestore(&leak_lock, flags);
+#endif
        kmem_cache_free(extent_buffer_cache, eb);
 }
 
index 315cfceae3128f72c59e4562b955763baa6dffde..3cb411a5f4d376beae2cc76de68cffa01b497606 100644 (file)
@@ -50,6 +50,7 @@ struct extent_io_ops {
                            unsigned long old, unsigned long bits);
        int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
                            unsigned long old, unsigned long bits);
+       int (*write_cache_pages_lock_hook)(struct page *page);
 };
 
 struct extent_io_tree {
index 49c4f5b40ed6657505ef29e58c891e02ea633a76..61a377bcb2fbb4ef74fc863b43df034fa554a857 100644 (file)
@@ -161,7 +161,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
        int ret;
 
        mutex_lock(&root->fs_info->trans_mutex);
-       if ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)
+       if (!root->fs_info->log_root_recovering &&
+           ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
                wait_current_trans(root);
        ret = join_transaction(root);
        BUG_ON(ret);
@@ -328,9 +329,17 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 
                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
-                       page = find_lock_page(btree_inode->i_mapping, index);
+                       page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;
+
+                       btree_lock_page_hook(page);
+                       if (!page->mapping) {
+                               unlock_page(page);
+                               page_cache_release(page);
+                               continue;
+                       }
+
                        if (PageWriteback(page)) {
                                if (PageDirty(page))
                                        wait_on_page_writeback(page);
@@ -360,7 +369,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                        if (!page)
                                continue;
                        if (PageDirty(page)) {
-                               lock_page(page);
+                               btree_lock_page_hook(page);
+                               wait_on_page_writeback(page);
                                err = write_one_page(page, 0);
                                if (err)
                                        werr = err;
index d1ce8314b9485cecf0c7483a7f73b7b052170c51..13d7ee8e0c52a6d41ce7b300598b1919e097102b 100644 (file)
@@ -1176,8 +1176,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
        struct btrfs_key found_key;
        struct btrfs_key log_key;
        struct inode *dir;
-       struct inode *inode;
        u8 log_type;
+       int exists;
        int ret;
 
        dir = read_one_inode(root, key->objectid);
@@ -1190,6 +1190,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
                   name_len);
 
        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+       exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+       if (exists == 0)
+               exists = 1;
+       else
+               exists = 0;
+       btrfs_release_path(root, path);
+
        if (key->type == BTRFS_DIR_ITEM_KEY) {
                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
                                       name, name_len, 1);
@@ -1224,11 +1231,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
         * don't drop the conflicting directory entry if the inode
         * for the new entry doesn't exist
         */
-       inode = read_one_inode(root, log_key.objectid);
-       if (!inode)
+       if (!exists)
                goto out;
 
-       iput(inode);
        ret = drop_one_dir_item(trans, root, path, dir, dst_di);
        BUG_ON(ret);