Btrfs: release delalloc reservations on extent item insertion
author Josef Bacik <josef@redhat.com>
Thu, 8 Oct 2009 17:34:05 +0000 (13:34 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 8 Oct 2009 19:21:10 +0000 (15:21 -0400)
This patch fixes an issue with the delalloc metadata space reservation
code.  We used to free the reservation as soon as we allocated the
delalloc region.  The problem with this is that, unless we are inserting
an inline extent, we don't actually insert the extent item until after
the ordered extent is written out.  This patch does 3 things:

1) It moves the reservation clearing stuff into the ordered code, so when
we remove the ordered extent we remove the reservation.
2) It adds an EXTENT_DO_ACCOUNTING flag that is passed when clearing
delalloc bits in the cases where the metadata reservation should be
released together with the delalloc extent, i.e. when we insert an inline
extent or invalidate the page.
3) It adds another waitqueue to the space info so that when we start a fs
wide delalloc flush, anybody else who also hits that area will simply wait
for the flush to finish and then try to make their allocation.

This has been tested thoroughly to make sure we did not regress on
performance.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/btrfs_inode.h
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c

index a54d354cefcb71894a89975fae23ff94a9b939ad..c71abec0ab9021c9e56269e7fce81267e4ab9100 100644 (file)
@@ -128,12 +128,14 @@ struct btrfs_inode {
        u64 last_unlink_trans;
 
        /*
-        * These two counters are for delalloc metadata reservations.  We keep
-        * track of how many extents we've accounted for vs how many extents we
-        * have.
+        * Counters to keep track of the number of extent items we may use due
+        * to delalloc and such.  outstanding_extents is the number of extent
+        * items we think we'll end up using, and reserved_extents is the number
+        * of extent items we've reserved metadata for.
         */
-       int delalloc_reserved_extents;
-       int delalloc_extents;
+       spinlock_t accounting_lock;
+       int reserved_extents;
+       int outstanding_extents;
 
        /*
         * ordered_data_close is set by truncate when a file that used
index 1b920ffc6a598047df4f9fffa7b59aad235cd836..dbdada569507fefa4b65acb99ed52daa5316fb68 100644 (file)
@@ -699,6 +699,9 @@ struct btrfs_space_info {
 
        int allocating_chunk;
        wait_queue_head_t wait;
+
+       int flushing;
+       wait_queue_head_t flush_wait;
 };
 
 /*
index 2f82fabd70113b4d8eb26074fc621cf80461043e..3d1be0b77f8fd82662ddff1a3baa48a588d62e89 100644 (file)
@@ -2823,14 +2823,17 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
                                           num_items);
 
        spin_lock(&meta_sinfo->lock);
-       if (BTRFS_I(inode)->delalloc_reserved_extents <=
-           BTRFS_I(inode)->delalloc_extents) {
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
+       if (BTRFS_I(inode)->reserved_extents <=
+           BTRFS_I(inode)->outstanding_extents) {
+               spin_unlock(&BTRFS_I(inode)->accounting_lock);
                spin_unlock(&meta_sinfo->lock);
                return 0;
        }
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
-       BTRFS_I(inode)->delalloc_reserved_extents--;
-       BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+       BTRFS_I(inode)->reserved_extents--;
+       BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
 
        if (meta_sinfo->bytes_delalloc < num_bytes) {
                bug = true;
@@ -2863,6 +2866,37 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
                meta_sinfo->force_delalloc = 0;
 }
 
+static void flush_delalloc(struct btrfs_root *root,
+                                struct btrfs_space_info *info)
+{ /* Run a delalloc flush for root; if one is already in flight on info, wait for it instead. */
+       bool wait = false; /* set when another task already owns the flush */
+
+       spin_lock(&info->lock); /* info->flushing is tested and set under info->lock */
+
+       if (!info->flushing) {
+               info->flushing = 1; /* claim the flush; later callers take the else branch */
+               init_waitqueue_head(&info->flush_wait); /* NOTE(review): re-inits the queue head on every flush start - confirm no waiter can still be queued */
+       } else {
+               wait = true;
+       }
+
+       spin_unlock(&info->lock);
+
+       if (wait) {
+               wait_event(info->flush_wait,
+                          !info->flushing); /* sleep until the owner clears flushing */
+               return; /* the other task's flush stands in for ours */
+       }
+
+       btrfs_start_delalloc_inodes(root); /* we own the flush; both helpers are defined outside this hunk */
+       btrfs_wait_ordered_extents(root, 0);
+
+       spin_lock(&info->lock);
+       info->flushing = 0; /* flush finished; cleared under info->lock */
+       spin_unlock(&info->lock);
+       wake_up(&info->flush_wait); /* release tasks blocked in wait_event() above */
+}
+
 static int maybe_allocate_chunk(struct btrfs_root *root,
                                 struct btrfs_space_info *info)
 {
@@ -2980,21 +3014,20 @@ again:
                        filemap_flush(inode->i_mapping);
                        goto again;
                } else if (flushed == 3) {
-                       btrfs_start_delalloc_inodes(root);
-                       btrfs_wait_ordered_extents(root, 0);
+                       flush_delalloc(root, meta_sinfo);
                        goto again;
                }
                spin_lock(&meta_sinfo->lock);
                meta_sinfo->bytes_delalloc -= num_bytes;
                spin_unlock(&meta_sinfo->lock);
                printk(KERN_ERR "enospc, has %d, reserved %d\n",
-                      BTRFS_I(inode)->delalloc_extents,
-                      BTRFS_I(inode)->delalloc_reserved_extents);
+                      BTRFS_I(inode)->outstanding_extents,
+                      BTRFS_I(inode)->reserved_extents);
                dump_space_info(meta_sinfo, 0, 0);
                return -ENOSPC;
        }
 
-       BTRFS_I(inode)->delalloc_reserved_extents++;
+       BTRFS_I(inode)->reserved_extents++;
        check_force_delalloc(meta_sinfo);
        spin_unlock(&meta_sinfo->lock);
 
@@ -3093,8 +3126,7 @@ again:
                }
 
                if (retries == 2) {
-                       btrfs_start_delalloc_inodes(root);
-                       btrfs_wait_ordered_extents(root, 0);
+                       flush_delalloc(root, meta_sinfo);
                        goto again;
                }
                spin_lock(&meta_sinfo->lock);
index f9708bd01669e1bf4b86d800bca59a22204bf9c4..96577e8bf9fdb62819ab2dbd5f9da91200624596 100644 (file)
@@ -460,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
                            struct extent_state *state, int bits, int wake,
                            int delete)
 {
-       int ret = state->state & bits;
+       int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+       int ret = state->state & bits_to_clear;
 
        if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
@@ -468,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
                tree->dirty_bytes -= range;
        }
        clear_state_cb(tree, state, bits);
-       state->state &= ~bits;
+       state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
        if (delete || state->state == 0) {
@@ -956,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask)
 {
        return clear_extent_bit(tree, start, end,
-                               EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+                               EXTENT_DIRTY | EXTENT_DELALLOC |
+                               EXTENT_DO_ACCOUNTING, 0, 0,
                                NULL, mask);
 }
 
@@ -1419,9 +1421,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
        if (op & EXTENT_CLEAR_DELALLOC)
                clear_bits |= EXTENT_DELALLOC;
 
+       if (op & EXTENT_CLEAR_ACCOUNTING)
+               clear_bits |= EXTENT_DO_ACCOUNTING;
+
        clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
-       if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK |
-                   EXTENT_END_WRITEBACK | EXTENT_SET_PRIVATE2)))
+       if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+                   EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
+                   EXTENT_SET_PRIVATE2)))
                return 0;
 
        while (nr_pages > 0) {
@@ -2709,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
        lock_extent(tree, start, end, GFP_NOFS);
        wait_on_page_writeback(page);
        clear_extent_bit(tree, start, end,
-                        EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+                        EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+                        EXTENT_DO_ACCOUNTING,
                         1, 1, NULL, GFP_NOFS);
        return 0;
 }
index 41d2a47ecf3887296b5447a5d9b7a75c7db9cacb..36de250a7b2bce5ef6f36d35ded4f1f88a2dfc58 100644 (file)
@@ -15,6 +15,7 @@
 #define EXTENT_BUFFER_FILLED (1 << 8)
 #define EXTENT_BOUNDARY (1 << 9)
 #define EXTENT_NODATASUM (1 << 10)
+#define EXTENT_DO_ACCOUNTING (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /* flags for bio submission */
@@ -33,6 +34,7 @@
 #define EXTENT_SET_WRITEBACK    0x10
 #define EXTENT_END_WRITEBACK    0x20
 #define EXTENT_SET_PRIVATE2     0x40
+#define EXTENT_CLEAR_ACCOUNTING  0x80
 
 /*
  * page->private values.  Every page that is controlled by the extent
index f155179877a6ff81cb741a8a47b13a6aebc992a4..53fb1c997f0e96c76913def6c57d0b7c95216b98 100644 (file)
@@ -878,7 +878,8 @@ again:
                        btrfs_put_ordered_extent(ordered);
 
                clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
-                                 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+                                 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+                                 EXTENT_DO_ACCOUNTING,
                                  GFP_NOFS);
                unlock_extent(&BTRFS_I(inode)->io_tree,
                              start_pos, last_pos - 1, GFP_NOFS);
index 401dfb2a94e811a858f21d8d90677a47af506fb3..ccc4f1121210a255182ab73280f1e200b7b50999 100644 (file)
@@ -428,6 +428,7 @@ again:
                             start, end, NULL,
                             EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
                             EXTENT_CLEAR_DELALLOC |
+                            EXTENT_CLEAR_ACCOUNTING |
                             EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
                        ret = 0;
                        goto free_pages_out;
@@ -722,6 +723,7 @@ static noinline int cow_file_range(struct inode *inode,
                                     EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_UNLOCK |
                                     EXTENT_CLEAR_DELALLOC |
+                                    EXTENT_CLEAR_ACCOUNTING |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);
@@ -1195,15 +1197,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
                                        root->fs_info->max_extent);
 
                /*
-                * if we break a large extent up then leave delalloc_extents be,
-                * since we've already accounted for the large extent.
+                * if we break a large extent up then leave outstanding_extents
+                * be, since we've already accounted for the large extent.
                 */
                if (div64_u64(new_size + root->fs_info->max_extent - 1,
                              root->fs_info->max_extent) < num_extents)
                        return 0;
        }
 
-       BTRFS_I(inode)->delalloc_extents++;
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
+       BTRFS_I(inode)->outstanding_extents++;
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
        return 0;
 }
@@ -1234,7 +1238,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 
        /* we're not bigger than the max, unreserve the space and go */
        if (new_size <= root->fs_info->max_extent) {
-               BTRFS_I(inode)->delalloc_extents--;
+               spin_lock(&BTRFS_I(inode)->accounting_lock);
+               BTRFS_I(inode)->outstanding_extents--;
+               spin_unlock(&BTRFS_I(inode)->accounting_lock);
                return 0;
        }
 
@@ -1248,7 +1254,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
                      root->fs_info->max_extent) > num_extents)
                return 0;
 
-       BTRFS_I(inode)->delalloc_extents--;
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
+       BTRFS_I(inode)->outstanding_extents--;
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
        return 0;
 }
@@ -1270,7 +1278,9 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
        if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
 
-               BTRFS_I(inode)->delalloc_extents++;
+               spin_lock(&BTRFS_I(inode)->accounting_lock);
+               BTRFS_I(inode)->outstanding_extents++;
+               spin_unlock(&BTRFS_I(inode)->accounting_lock);
                btrfs_delalloc_reserve_space(root, inode, end - start + 1);
                spin_lock(&root->fs_info->delalloc_lock);
                BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1298,8 +1308,12 @@ static int btrfs_clear_bit_hook(struct inode *inode,
        if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
 
-               BTRFS_I(inode)->delalloc_extents--;
-               btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+               if (bits & EXTENT_DO_ACCOUNTING) {
+                       spin_lock(&BTRFS_I(inode)->accounting_lock);
+                       BTRFS_I(inode)->outstanding_extents--;
+                       spin_unlock(&BTRFS_I(inode)->accounting_lock);
+                       btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+               }
 
                spin_lock(&root->fs_info->delalloc_lock);
                if (state->end - state->start + 1 >
@@ -4825,7 +4839,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                 */
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_DIRTY | EXTENT_DELALLOC |
-                                EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+                                EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
+                                NULL, GFP_NOFS);
                /*
                 * whoever cleared the private bit is responsible
                 * for the finish_ordered_io
@@ -4838,8 +4853,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                lock_extent(tree, page_start, page_end, GFP_NOFS);
        }
        clear_extent_bit(tree, page_start, page_end,
-                EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
-                1, 1, NULL, GFP_NOFS);
+                EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+                EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
        __btrfs_releasepage(page, GFP_NOFS);
 
        ClearPageChecked(page);
@@ -4934,7 +4949,8 @@ again:
         * prepare_pages in the normal write path.
         */
        clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
-                         EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
+                         EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+                         GFP_NOFS);
 
        ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
        if (ret) {
@@ -5082,8 +5098,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
                return NULL;
        ei->last_trans = 0;
        ei->logged_trans = 0;
-       ei->delalloc_extents = 0;
-       ei->delalloc_reserved_extents = 0;
+       ei->outstanding_extents = 0;
+       ei->reserved_extents = 0;
+       spin_lock_init(&ei->accounting_lock);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->i_orphan);
        INIT_LIST_HEAD(&ei->ordered_operations);
index 4a9c8c4cec2552de9c10a25708d320e9c6b930d8..ab21c29f2247ef4319352b1ce36b1e3db529a7f0 100644 (file)
@@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
        tree->last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
+       BTRFS_I(inode)->outstanding_extents--;
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
+       btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
+                                             inode, 1);
+
        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);