Btrfs: update global block_rsv when creating a new block group

[firefly-linux-kernel-4.4.55.git] / fs / btrfs / extent-tree.c
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index f9711a82fc541143ddc88301e126963cd6987ae0..bf30f670cda96e5e6c081ec69d54a12d7f95e07d 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
  #include <linux/rcupdate.h>
  #include <linux/kthread.h>
  #include <linux/slab.h>
+#include <linux/ratelimit.h>
  #include "compat.h"
  #include "hash.h"
  #include "ctree.h"
@@ -466,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                              struct btrfs_root *root,
                              int load_cache_only)
  {
+       DEFINE_WAIT(wait);
         struct btrfs_fs_info *fs_info = cache->fs_info;
         struct btrfs_caching_control *caching_ctl;
         int ret = 0;
  
-       smp_mb();
-       if (cache->cached != BTRFS_CACHE_NO)
+       caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+       BUG_ON(!caching_ctl);
+
+       INIT_LIST_HEAD(&caching_ctl->list);
+       mutex_init(&caching_ctl->mutex);
+       init_waitqueue_head(&caching_ctl->wait);
+       caching_ctl->block_group = cache;
+       caching_ctl->progress = cache->key.objectid;
+       atomic_set(&caching_ctl->count, 1);
+       caching_ctl->work.func = caching_thread;
+
+       spin_lock(&cache->lock);
+       /*
+        * This should be a rare occasion, but this could happen I think in the
+        * case where one thread starts to load the space cache info, and then
+        * some other thread starts a transaction commit which tries to do an
+        * allocation while the other thread is still loading the space cache
+        * info.  The previous loop should have kept us from choosing this block
+        * group, but if we've moved to the state where we will wait on caching
+        * block groups we need to first check if we're doing a fast load here,
+        * so we can wait for it to finish, otherwise we could end up allocating
+        * from a block group whose cache gets evicted for one reason or
+        * another.
+        */
+       while (cache->cached == BTRFS_CACHE_FAST) {
+               struct btrfs_caching_control *ctl;
+
+               ctl = cache->caching_ctl;
+               atomic_inc(&ctl->count);
+               prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+               spin_unlock(&cache->lock);
+
+               schedule();
+
+               finish_wait(&ctl->wait, &wait);
+               put_caching_control(ctl);
+               spin_lock(&cache->lock);
+       }
+
+       if (cache->cached != BTRFS_CACHE_NO) {
+               spin_unlock(&cache->lock);
+               kfree(caching_ctl);
                 return 0;
+       }
+       WARN_ON(cache->caching_ctl);
+       cache->caching_ctl = caching_ctl;
+       cache->cached = BTRFS_CACHE_FAST;
+       spin_unlock(&cache->lock);
  
         /*
          * We can't do the read from on-disk cache during a commit since we need
@@ -483,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
         if (trans && (!trans->transaction->in_commit) &&
             (root && root != root->fs_info->tree_root) &&
             btrfs_test_opt(root, SPACE_CACHE)) {
-               spin_lock(&cache->lock);
-               if (cache->cached != BTRFS_CACHE_NO) {
-                       spin_unlock(&cache->lock);
-                       return 0;
-               }
-               cache->cached = BTRFS_CACHE_STARTED;
-               spin_unlock(&cache->lock);
-
                 ret = load_free_space_cache(fs_info, cache);
  
                 spin_lock(&cache->lock);
                 if (ret == 1) {
+                       cache->caching_ctl = NULL;
                         cache->cached = BTRFS_CACHE_FINISHED;
                         cache->last_byte_to_unpin = (u64)-1;
                 } else {
-                       cache->cached = BTRFS_CACHE_NO;
+                       if (load_cache_only) {
+                               cache->caching_ctl = NULL;
+                               cache->cached = BTRFS_CACHE_NO;
+                       } else {
+                               cache->cached = BTRFS_CACHE_STARTED;
+                       }
                 }
                 spin_unlock(&cache->lock);
+               wake_up(&caching_ctl->wait);
                 if (ret == 1) {
+                       put_caching_control(caching_ctl);
                         free_excluded_extents(fs_info->extent_root, cache);
                         return 0;
                 }
+       } else {
+               /*
+                * We are not going to do the fast caching, set cached to the
+                * appropriate value and wakeup any waiters.
+                */
+               spin_lock(&cache->lock);
+               if (load_cache_only) {
+                       cache->caching_ctl = NULL;
+                       cache->cached = BTRFS_CACHE_NO;
+               } else {
+                       cache->cached = BTRFS_CACHE_STARTED;
+               }
+               spin_unlock(&cache->lock);
+               wake_up(&caching_ctl->wait);
         }
  
-       if (load_cache_only)
-               return 0;
-
-       caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-       BUG_ON(!caching_ctl);
-
-       INIT_LIST_HEAD(&caching_ctl->list);
-       mutex_init(&caching_ctl->mutex);
-       init_waitqueue_head(&caching_ctl->wait);
-       caching_ctl->block_group = cache;
-       caching_ctl->progress = cache->key.objectid;
-       /* one for caching kthread, one for caching block group list */
-       atomic_set(&caching_ctl->count, 2);
-       caching_ctl->work.func = caching_thread;
-
-       spin_lock(&cache->lock);
-       if (cache->cached != BTRFS_CACHE_NO) {
-               spin_unlock(&cache->lock);
-               kfree(caching_ctl);
+       if (load_cache_only) {
+               put_caching_control(caching_ctl);
                 return 0;
         }
-       cache->caching_ctl = caching_ctl;
-       cache->cached = BTRFS_CACHE_STARTED;
-       spin_unlock(&cache->lock);
  
         down_write(&fs_info->extent_commit_sem);
+       atomic_inc(&caching_ctl->count);
         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
         up_write(&fs_info->extent_commit_sem);
  
@@ -1787,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
  {
         int ret;
         u64 discarded_bytes = 0;
-       struct btrfs_multi_bio *multi = NULL;
+       struct btrfs_bio *bbio = NULL;
  
  
         /* Tell the block device(s) that the sectors can be discarded */
         ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
-                             bytenr, &num_bytes, &multi, 0);
+                             bytenr, &num_bytes, &bbio, 0);
         if (!ret) {
-               struct btrfs_bio_stripe *stripe = multi->stripes;
+               struct btrfs_bio_stripe *stripe = bbio->stripes;
                 int i;
  
  
-               for (i = 0; i < multi->num_stripes; i++, stripe++) {
+               for (i = 0; i < bbio->num_stripes; i++, stripe++) {
                         if (!stripe->dev->can_discard)
                                 continue;
  
@@ -1817,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                          */
                         ret = 0;
                 }
-               kfree(multi);
+               kfree(bbio);
         }
  
         if (actual_bytes)
@@ -2717,6 +2759,13 @@ again:
                 goto again;
         }
  
+       /* We've already setup this transaction, go ahead and exit */
+       if (block_group->cache_generation == trans->transid &&
+           i_size_read(inode)) {
+               dcs = BTRFS_DC_SETUP;
+               goto out_put;
+       }
+
         /*
          * We want to set the generation to 0, that way if anything goes wrong
          * from here on out we know not to trust this cache when we load up next
@@ -2756,19 +2805,16 @@ again:
         num_pages *= 16;
         num_pages *= PAGE_CACHE_SIZE;
  
-       ret = btrfs_delalloc_reserve_space(inode, num_pages);
+       ret = btrfs_check_data_free_space(inode, num_pages);
         if (ret)
                 goto out_put;
  
         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                               num_pages, num_pages,
                                               &alloc_hint);
-       if (!ret) {
+       if (!ret)
                 dcs = BTRFS_DC_SETUP;
-               btrfs_free_reserved_data_space(inode, num_pages);
-       } else {
-               btrfs_delalloc_release_space(inode, num_pages);
-       }
+       btrfs_free_reserved_data_space(inode, num_pages);
  
  out_put:
         iput(inode);
@@ -2776,6 +2822,8 @@ out_free:
         btrfs_release_path(path);
  out:
         spin_lock(&block_group->lock);
+       if (!ret && dcs == BTRFS_DC_SETUP)
+               block_group->cache_generation = trans->transid;
         block_group->disk_cache_state = dcs;
         spin_unlock(&block_group->lock);
  
@@ -3202,7 +3250,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
          * about 1% of the FS size.
          */
         if (force == CHUNK_ALLOC_LIMITED) {
-               thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+               thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
                 thresh = max_t(u64, 64 * 1024 * 1024,
                                div_factor_fine(thresh, 1));
  
@@ -3224,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
         if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
                 return 0;
  
-       thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+       thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
  
         /* 256MB or 5% of the FS */
         thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
@@ -3327,19 +3375,21 @@ out:
  /*
   * shrink metadata reservation for delalloc
   */
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, u64 to_reclaim, int sync)
+static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
+                          bool wait_ordered)
  {
         struct btrfs_block_rsv *block_rsv;
         struct btrfs_space_info *space_info;
+       struct btrfs_trans_handle *trans;
         u64 reserved;
         u64 max_reclaim;
         u64 reclaimed = 0;
         long time_left;
-       int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+       unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
         int loops = 0;
         unsigned long progress;
  
+       trans = (struct btrfs_trans_handle *)current->journal_info;
         block_rsv = &root->fs_info->delalloc_block_rsv;
         space_info = block_rsv->space_info;
  
@@ -3359,7 +3409,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
         }
  
         max_reclaim = min(reserved, to_reclaim);
-
+       nr_pages = max_t(unsigned long, nr_pages,
+                        max_reclaim >> PAGE_CACHE_SHIFT);
         while (loops < 1024) {
                 /* have the flusher threads jump in and do some IO */
                 smp_mb();
@@ -3381,11 +3432,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
                 if (trans && trans->transaction->blocked)
                         return -EAGAIN;
  
-               time_left = schedule_timeout_interruptible(1);
+               if (wait_ordered && !trans) {
+                       btrfs_wait_ordered_extents(root, 0, 0);
+               } else {
+                       time_left = schedule_timeout_interruptible(1);
  
-               /* We were interrupted, exit */
-               if (time_left)
-                       break;
+                       /* We were interrupted, exit */
+                       if (time_left)
+                               break;
+               }
  
                 /* we've kicked the IO a few times, if anything has been freed,
                  * exit.  There is no sense in looping here for a long time
@@ -3400,18 +3455,70 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
                 }
  
         }
-       if (reclaimed >= to_reclaim && !trans)
-               btrfs_wait_ordered_extents(root, 0, 0);
+
         return reclaimed >= to_reclaim;
  }
  
+/**
+ * may_commit_transaction - possibly commit the transaction if it's ok to
+ * @root - the root we're allocating for
+ * @space_info - the space info whose pinned bytes are checked
+ * @bytes - the number of bytes we want to reserve
+ * @force - skip the space checks and go straight to the commit
+ *
+ * Commits if @force is set, or if pinned bytes or the delayed rsv size cover
+ * @bytes; otherwise -ENOSPC.  -EAGAIN if current->journal_info is already set.
+ */
+static int may_commit_transaction(struct btrfs_root *root,
+                                 struct btrfs_space_info *space_info,
+                                 u64 bytes, int force)
+{
+       struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+       struct btrfs_trans_handle *trans;
+
+       trans = (struct btrfs_trans_handle *)current->journal_info;
+       if (trans)
+               return -EAGAIN;
+
+       if (force)
+               goto commit;
+
+       /* See if there is enough pinned space to make this reservation */
+       spin_lock(&space_info->lock);
+       if (space_info->bytes_pinned >= bytes) {
+               spin_unlock(&space_info->lock);
+               goto commit;
+       }
+       spin_unlock(&space_info->lock);
+
+       /*
+        * See if there is some space in the delayed insertion reservation for
+        * this reservation.
+        */
+       if (space_info != delayed_rsv->space_info)
+               return -ENOSPC;
+
+       spin_lock(&delayed_rsv->lock);
+       if (delayed_rsv->size < bytes) {
+               spin_unlock(&delayed_rsv->lock);
+               return -ENOSPC;
+       }
+       spin_unlock(&delayed_rsv->lock);
+
+commit:
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans))
+               return -ENOSPC;
+
+       return btrfs_commit_transaction(trans, root);
+}
+
  /**
   * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
   * @root - the root we're allocating for
   * @block_rsv - the block_rsv we're allocating for
   * @orig_bytes - the number of bytes we want
   * @flush - wether or not we can flush to make our reservation
- * @check - wether this is just to check if we have enough space or not
   *
   * This will reserve orgi_bytes number of bytes from the space info associated
   * with the block_rsv.  If there is not enough space it will make an attempt to
@@ -3422,18 +3529,17 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
   */
  static int reserve_metadata_bytes(struct btrfs_root *root,
                                   struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes, int flush, int check)
+                                 u64 orig_bytes, int flush)
  {
         struct btrfs_space_info *space_info = block_rsv->space_info;
-       struct btrfs_trans_handle *trans;
         u64 used;
         u64 num_bytes = orig_bytes;
         int retries = 0;
         int ret = 0;
         bool committed = false;
         bool flushing = false;
+       bool wait_ordered = false;
  
-       trans = (struct btrfs_trans_handle *)current->journal_info;
  again:
         ret = 0;
         spin_lock(&space_info->lock);
@@ -3449,7 +3555,7 @@ again:
                  * deadlock since we are waiting for the flusher to finish, but
                  * hold the current transaction open.
                  */
-               if (trans)
+               if (current->journal_info)
                         return -EAGAIN;
                 ret = wait_event_interruptible(space_info->wait,
                                                !space_info->flush);
@@ -3490,14 +3596,33 @@ again:
                  * amount plus the amount of bytes that we need for this
                  * reservation.
                  */
+               wait_ordered = true;
                 num_bytes = used - space_info->total_bytes +
                         (orig_bytes * (retries + 1));
         }
  
-       if (ret && !check) {
+       if (ret) {
                 u64 profile = btrfs_get_alloc_profile(root, 0);
                 u64 avail;
  
+               /*
+                * If we have a lot of space that's pinned, don't bother doing
+                * the overcommit dance yet and just commit the transaction.
+                */
+               avail = (space_info->total_bytes - space_info->bytes_used) * 8;
+               do_div(avail, 10);
+               if (space_info->bytes_pinned >= avail && flush && !committed) {
+                       space_info->flush = 1;
+                       flushing = true;
+                       spin_unlock(&space_info->lock);
+                       ret = may_commit_transaction(root, space_info,
+                                                    orig_bytes, 1);
+                       if (ret)
+                               goto out;
+                       committed = true;
+                       goto again;
+               }
+
                 spin_lock(&root->fs_info->free_chunk_lock);
                 avail = root->fs_info->free_chunk_space;
  
@@ -3521,9 +3646,11 @@ again:
                         avail >>= 1;
                  spin_unlock(&root->fs_info->free_chunk_lock);
  
-               if (used + orig_bytes < space_info->total_bytes + avail) {
+               if (used + num_bytes < space_info->total_bytes + avail) {
                         space_info->bytes_may_use += orig_bytes;
                         ret = 0;
+               } else {
+                       wait_ordered = true;
                 }
         }
  
@@ -3546,7 +3673,7 @@ again:
          * We do synchronous shrinking since we don't actually unreserve
          * metadata until after the IO is completed.
          */
-       ret = shrink_delalloc(trans, root, num_bytes, 1);
+       ret = shrink_delalloc(root, num_bytes, wait_ordered);
         if (ret < 0)
                 goto out;
  
@@ -3558,35 +3685,17 @@ again:
          * so go back around and try again.
          */
         if (retries < 2) {
+               wait_ordered = true;
                 retries++;
                 goto again;
         }
  
-       /*
-        * Not enough space to be reclaimed, don't bother committing the
-        * transaction.
-        */
-       spin_lock(&space_info->lock);
-       if (space_info->bytes_pinned < orig_bytes)
-               ret = -ENOSPC;
-       spin_unlock(&space_info->lock);
-       if (ret)
-               goto out;
-
-       ret = -EAGAIN;
-       if (trans)
-               goto out;
-
         ret = -ENOSPC;
         if (committed)
                 goto out;
  
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans))
-               goto out;
-       ret = btrfs_commit_transaction(trans, root);
+       ret = may_commit_transaction(root, space_info, orig_bytes, 0);
         if (!ret) {
-               trans = NULL;
                 committed = true;
                 goto again;
         }
@@ -3728,16 +3837,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
         kfree(rsv);
  }
  
-int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+                                 struct btrfs_block_rsv *block_rsv,
+                                 u64 num_bytes, int flush)
  {
         int ret;
  
         if (num_bytes == 0)
                 return 0;
  
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0);
+       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
         if (!ret) {
                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
                 return 0;
@@ -3746,9 +3855,22 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
         return ret;
  }
  
+int btrfs_block_rsv_add(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv,
+                       u64 num_bytes)
+{
+       return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+                               struct btrfs_block_rsv *block_rsv,
+                               u64 num_bytes)
+{
+       return __block_rsv_add(root, block_rsv, num_bytes, 0);
+}
+
  int btrfs_block_rsv_check(struct btrfs_root *root,
-                         struct btrfs_block_rsv *block_rsv,
-                         u64 min_reserved, int min_factor, int flush)
+                         struct btrfs_block_rsv *block_rsv, int min_factor)
  {
         u64 num_bytes = 0;
         int ret = -ENOSPC;
@@ -3757,11 +3879,26 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
                 return 0;
  
         spin_lock(&block_rsv->lock);
-       if (min_factor > 0)
-               num_bytes = div_factor(block_rsv->size, min_factor);
-       if (min_reserved > num_bytes)
-               num_bytes = min_reserved;
+       num_bytes = div_factor(block_rsv->size, min_factor);
+       if (block_rsv->reserved >= num_bytes)
+               ret = 0;
+       spin_unlock(&block_rsv->lock);
+
+       return ret;
+}
  
+static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
+                                          struct btrfs_block_rsv *block_rsv,
+                                          u64 min_reserved, int flush)
+{
+       u64 num_bytes = 0;
+       int ret = -ENOSPC;
+
+       if (!block_rsv)
+               return 0;
+
+       spin_lock(&block_rsv->lock);
+       num_bytes = min_reserved;
         if (block_rsv->reserved >= num_bytes)
                 ret = 0;
         else
@@ -3771,7 +3908,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
         if (!ret)
                 return 0;
  
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush);
+       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
         if (!ret) {
                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
                 return 0;
@@ -3780,6 +3917,20 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
         return ret;
  }
  
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv,
+                          u64 min_reserved)
+{
+       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
+}
+
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+                                  struct btrfs_block_rsv *block_rsv,
+                                  u64 min_reserved)
+{
+       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
+}
+
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                             struct btrfs_block_rsv *dst_rsv,
                             u64 num_bytes)
@@ -3809,7 +3960,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
         u64 num_bytes;
         u64 meta_used;
         u64 data_used;
-       int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+       int csum_size = btrfs_super_csum_size(fs_info->super_copy);
  
         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
         spin_lock(&sinfo->lock);
@@ -3880,6 +4031,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
         fs_info->delalloc_block_rsv.space_info = space_info;
         fs_info->trans_block_rsv.space_info = space_info;
         fs_info->empty_block_rsv.space_info = space_info;
+       fs_info->delayed_block_rsv.space_info = space_info;
  
         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3899,18 +4051,17 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
         WARN_ON(fs_info->chunk_block_rsv.size > 0);
         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+       WARN_ON(fs_info->delayed_block_rsv.size > 0);
+       WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
  }
  
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
  {
-       struct btrfs_block_rsv *block_rsv;
-
         if (!trans->bytes_reserved)
                 return;
  
-       block_rsv = &root->fs_info->trans_block_rsv;
-       btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved);
+       btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
         trans->bytes_reserved = 0;
  }
  
@@ -3963,23 +4114,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
   */
  static unsigned drop_outstanding_extent(struct inode *inode)
  {
+       unsigned drop_inode_space = 0;
         unsigned dropped_extents = 0;
  
         BUG_ON(!BTRFS_I(inode)->outstanding_extents);
         BTRFS_I(inode)->outstanding_extents--;
  
+       if (BTRFS_I(inode)->outstanding_extents == 0 &&
+           BTRFS_I(inode)->delalloc_meta_reserved) {
+               drop_inode_space = 1;
+               BTRFS_I(inode)->delalloc_meta_reserved = 0;
+       }
+
         /*
          * If we have more or the same amount of outsanding extents than we have
          * reserved then we need to leave the reserved extents count alone.
          */
         if (BTRFS_I(inode)->outstanding_extents >=
             BTRFS_I(inode)->reserved_extents)
-               return 0;
+               return drop_inode_space;
  
         dropped_extents = BTRFS_I(inode)->reserved_extents -
                 BTRFS_I(inode)->outstanding_extents;
         BTRFS_I(inode)->reserved_extents -= dropped_extents;
-       return dropped_extents;
+       return dropped_extents + drop_inode_space;
  }
  
  /**
@@ -4045,12 +4203,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
         u64 to_reserve = 0;
+       u64 csum_bytes;
         unsigned nr_extents = 0;
+       int extra_reserve = 0;
         int flush = 1;
         int ret;
  
+       /* Need to be holding the i_mutex here if we aren't free space cache */
         if (btrfs_is_free_space_inode(root, inode))
                 flush = 0;
+       else
+               WARN_ON(!mutex_is_locked(&inode->i_mutex));
  
         if (flush && btrfs_transaction_in_commit(root->fs_info))
                 schedule_timeout(1);
@@ -4061,39 +4224,60 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
         BTRFS_I(inode)->outstanding_extents++;
  
         if (BTRFS_I(inode)->outstanding_extents >
-           BTRFS_I(inode)->reserved_extents) {
+           BTRFS_I(inode)->reserved_extents)
                 nr_extents = BTRFS_I(inode)->outstanding_extents -
                         BTRFS_I(inode)->reserved_extents;
-               BTRFS_I(inode)->reserved_extents += nr_extents;
  
-               to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+       /*
+        * Add an item to reserve for updating the inode when we complete the
+        * delalloc io.
+        */
+       if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+               nr_extents++;
+               extra_reserve = 1;
         }
+
+       to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
+       csum_bytes = BTRFS_I(inode)->csum_bytes;
         spin_unlock(&BTRFS_I(inode)->lock);
  
-       ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0);
+       ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
         if (ret) {
                 u64 to_free = 0;
                 unsigned dropped;
  
                 spin_lock(&BTRFS_I(inode)->lock);
                 dropped = drop_outstanding_extent(inode);
-               to_free = calc_csum_metadata_size(inode, num_bytes, 0);
-               spin_unlock(&BTRFS_I(inode)->lock);
-               to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
                 /*
-                * Somebody could have come in and twiddled with the
-                * reservation, so if we have to free more than we would have
-                * reserved from this reservation go ahead and release those
-                * bytes.
+                * If the inode's csum_bytes is the same as the original
+                * csum_bytes then we know we haven't raced with any free()ers
+                * so we can just reduce our inode's csum bytes and carry on.
+                * Otherwise we have to do the normal free thing to account for
+                * the case that the free side didn't free up its reserve
+                * because of this outstanding reservation.
                  */
-               to_free -= to_reserve;
+               if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+                       calc_csum_metadata_size(inode, num_bytes, 0);
+               else
+                       to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+               spin_unlock(&BTRFS_I(inode)->lock);
+               if (dropped)
+                       to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
                 if (to_free)
                         btrfs_block_rsv_release(root, block_rsv, to_free);
                 return ret;
         }
  
+       spin_lock(&BTRFS_I(inode)->lock);
+       if (extra_reserve) {
+               BTRFS_I(inode)->delalloc_meta_reserved = 1;
+               nr_extents--;
+       }
+       BTRFS_I(inode)->reserved_extents += nr_extents;
+       spin_unlock(&BTRFS_I(inode)->lock);
+
         block_rsv_add_bytes(block_rsv, to_reserve, 1);
  
         return 0;
@@ -4191,12 +4375,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
  
         /* block accounting for super block */
         spin_lock(&info->delalloc_lock);
-       old_val = btrfs_super_bytes_used(&info->super_copy);
+       old_val = btrfs_super_bytes_used(info->super_copy);
         if (alloc)
                 old_val += num_bytes;
         else
                 old_val -= num_bytes;
-       btrfs_set_super_bytes_used(&info->super_copy, old_val);
+       btrfs_set_super_bytes_used(info->super_copy, old_val);
         spin_unlock(&info->delalloc_lock);
  
         while (total) {
@@ -4313,6 +4497,34 @@ int btrfs_pin_extent(struct btrfs_root *root,
         return 0;
  }
  
+/*
+ * Pin an extent for log replay (within a transaction); always returns 0.
+ */
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   u64 bytenr, u64 num_bytes)
+{
+       struct btrfs_block_group_cache *cache;
+
+       cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+       BUG_ON(!cache);
+
+       /*
+        * Pull in the free space cache (if any) so that our pin removes
+        * the free space from the cache.  load_cache_only is set to one
+        * because the slow code to read in the free extents does check
+        * the pinned extents.
+        */
+       cache_block_group(cache, trans, root, 1);
+
+       pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+       /* remove us from the free space cache (if we're there at all) */
+       btrfs_remove_free_space(cache, bytenr, num_bytes);
+       btrfs_put_block_group(cache);
+       return 0;
+}
+
  /**
   * btrfs_update_reserved_bytes - update the block_group and space info counters
   * @cache:     The cache we are manipulating
@@ -4911,11 +5123,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
         struct btrfs_root *root = orig_root->fs_info->extent_root;
         struct btrfs_free_cluster *last_ptr = NULL;
         struct btrfs_block_group_cache *block_group = NULL;
+       struct btrfs_block_group_cache *used_block_group;
         int empty_cluster = 2 * 1024 * 1024;
         int allowed_chunk_alloc = 0;
         int done_chunk_alloc = 0;
         struct btrfs_space_info *space_info;
-       int last_ptr_loop = 0;
         int loop = 0;
         int index = 0;
         int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
@@ -4924,6 +5136,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
         bool failed_cluster_refill = false;
         bool failed_alloc = false;
         bool use_cluster = true;
+       bool have_caching_bg = false;
         u64 ideal_cache_percent = 0;
         u64 ideal_cache_offset = 0;
  
@@ -4976,6 +5189,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
  ideal_cache:
                 block_group = btrfs_lookup_block_group(root->fs_info,
                                                        search_start);
+               used_block_group = block_group;
                 /*
                  * we don't want to use the block group if it doesn't match our
                  * allocation bits, or if its not cached.
@@ -5006,12 +5220,14 @@ ideal_cache:
                 }
         }
  search:
+       have_caching_bg = false;
         down_read(&space_info->groups_sem);
         list_for_each_entry(block_group, &space_info->block_groups[index],
                             list) {
                 u64 offset;
                 int cached;
  
+               used_block_group = block_group;
                 btrfs_get_block_group(block_group);
                 search_start = block_group->key.objectid;
  
@@ -5035,13 +5251,15 @@ search:
                 }
  
  have_block_group:
-               if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+               cached = block_group_cache_done(block_group);
+               if (unlikely(!cached)) {
                         u64 free_percent;
  
+                       found_uncached_bg = true;
                         ret = cache_block_group(block_group, trans,
                                                 orig_root, 1);
                         if (block_group->cached == BTRFS_CACHE_FINISHED)
-                               goto have_block_group;
+                               goto alloc;
  
                         free_percent = btrfs_block_group_used(&block_group->item);
                         free_percent *= 100;
@@ -5063,7 +5281,6 @@ have_block_group:
                                                         orig_root, 0);
                                 BUG_ON(ret);
                         }
-                       found_uncached_bg = true;
  
                         /*
                          * If loop is set for cached only, try the next block
@@ -5073,94 +5290,80 @@ have_block_group:
                                 goto loop;
                 }
  
-               cached = block_group_cache_done(block_group);
-               if (unlikely(!cached))
-                       found_uncached_bg = true;
-
+alloc:
                 if (unlikely(block_group->ro))
                         goto loop;
  
                 spin_lock(&block_group->free_space_ctl->tree_lock);
                 if (cached &&
                     block_group->free_space_ctl->free_space <
-                   num_bytes + empty_size) {
+                   num_bytes + empty_cluster + empty_size) {
                         spin_unlock(&block_group->free_space_ctl->tree_lock);
                         goto loop;
                 }
                 spin_unlock(&block_group->free_space_ctl->tree_lock);
  
                 /*
-                * Ok we want to try and use the cluster allocator, so lets look
-                * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
-                * have tried the cluster allocator plenty of times at this
-                * point and not have found anything, so we are likely way too
-                * fragmented for the clustering stuff to find anything, so lets
-                * just skip it and let the allocator find whatever block it can
-                * find
+                * Ok we want to try and use the cluster allocator, so
+                * lets look there
                  */
-               if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
+               if (last_ptr) {
                         /*
                          * the refill lock keeps out other
                          * people trying to start a new cluster
                          */
                         spin_lock(&last_ptr->refill_lock);
-                       if (last_ptr->block_group &&
-                           (last_ptr->block_group->ro ||
-                           !block_group_bits(last_ptr->block_group, data))) {
-                               offset = 0;
+                       used_block_group = last_ptr->block_group;
+                       if (used_block_group != block_group &&
+                           (!used_block_group ||
+                            used_block_group->ro ||
+                            !block_group_bits(used_block_group, data))) {
+                               used_block_group = block_group;
                                 goto refill_cluster;
                         }
  
-                       offset = btrfs_alloc_from_cluster(block_group, last_ptr,
-                                                num_bytes, search_start);
+                       if (used_block_group != block_group)
+                               btrfs_get_block_group(used_block_group);
+
+                       offset = btrfs_alloc_from_cluster(used_block_group,
+                         last_ptr, num_bytes, used_block_group->key.objectid);
                         if (offset) {
                                 /* we have a block, we're done */
                                 spin_unlock(&last_ptr->refill_lock);
                                 goto checks;
                         }
  
-                       spin_lock(&last_ptr->lock);
-                       /*
-                        * whoops, this cluster doesn't actually point to
-                        * this block group.  Get a ref on the block
-                        * group is does point to and try again
-                        */
-                       if (!last_ptr_loop && last_ptr->block_group &&
-                           last_ptr->block_group != block_group &&
-                           index <=
-                                get_block_group_index(last_ptr->block_group)) {
-
-                               btrfs_put_block_group(block_group);
-                               block_group = last_ptr->block_group;
-                               btrfs_get_block_group(block_group);
-                               spin_unlock(&last_ptr->lock);
-                               spin_unlock(&last_ptr->refill_lock);
-
-                               last_ptr_loop = 1;
-                               search_start = block_group->key.objectid;
-                               /*
-                                * we know this block group is properly
-                                * in the list because
-                                * btrfs_remove_block_group, drops the
-                                * cluster before it removes the block
-                                * group from the list
-                                */
-                               goto have_block_group;
+                       WARN_ON(last_ptr->block_group != used_block_group);
+                       if (used_block_group != block_group) {
+                               btrfs_put_block_group(used_block_group);
+                               used_block_group = block_group;
                         }
-                       spin_unlock(&last_ptr->lock);
  refill_cluster:
+                       BUG_ON(used_block_group != block_group);
+                       /* If we are on LOOP_NO_EMPTY_SIZE, we can't
+                        * set up a new cluster, so let's just skip it
+                        * and let the allocator find whatever block
+                        * it can find.  If we reach this point, we
+                        * will have tried the cluster allocator
+                        * plenty of times and not have found
+                        * anything, so we are likely way too
+                        * fragmented for the clustering stuff to find
+                        * anything.  */
+                       if (loop >= LOOP_NO_EMPTY_SIZE) {
+                               spin_unlock(&last_ptr->refill_lock);
+                               goto unclustered_alloc;
+                       }
+
                         /*
                          * this cluster didn't work out, free it and
                          * start over
                          */
                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
  
-                       last_ptr_loop = 0;
-
                         /* allocate a cluster in this block group */
                         ret = btrfs_find_space_cluster(trans, root,
                                                block_group, last_ptr,
-                                              offset, num_bytes,
+                                              search_start, num_bytes,
                                                empty_cluster + empty_size);
                         if (ret == 0) {
                                 /*
@@ -5196,6 +5399,7 @@ refill_cluster:
                         goto loop;
                 }
  
+unclustered_alloc:
                 offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                     num_bytes, empty_size);
                 /*
@@ -5214,20 +5418,22 @@ refill_cluster:
                         failed_alloc = true;
                         goto have_block_group;
                 } else if (!offset) {
+                       if (!cached)
+                               have_caching_bg = true;
                         goto loop;
                 }
  checks:
                 search_start = stripe_align(root, offset);
                 /* move on to the next group */
                 if (search_start + num_bytes >= search_end) {
-                       btrfs_add_free_space(block_group, offset, num_bytes);
+                       btrfs_add_free_space(used_block_group, offset, num_bytes);
                         goto loop;
                 }
  
                 /* move on to the next group */
                 if (search_start + num_bytes >
-                   block_group->key.objectid + block_group->key.offset) {
-                       btrfs_add_free_space(block_group, offset, num_bytes);
+                   used_block_group->key.objectid + used_block_group->key.offset) {
+                       btrfs_add_free_space(used_block_group, offset, num_bytes);
                         goto loop;
                 }
  
@@ -5235,14 +5441,14 @@ checks:
                 ins->offset = num_bytes;
  
                 if (offset < search_start)
-                       btrfs_add_free_space(block_group, offset,
+                       btrfs_add_free_space(used_block_group, offset,
                                              search_start - offset);
                 BUG_ON(offset > search_start);
  
-               ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+               ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
                                                   alloc_type);
                 if (ret == -EAGAIN) {
-                       btrfs_add_free_space(block_group, offset, num_bytes);
+                       btrfs_add_free_space(used_block_group, offset, num_bytes);
                         goto loop;
                 }
  
@@ -5251,19 +5457,26 @@ checks:
                 ins->offset = num_bytes;
  
                 if (offset < search_start)
-                       btrfs_add_free_space(block_group, offset,
+                       btrfs_add_free_space(used_block_group, offset,
                                              search_start - offset);
                 BUG_ON(offset > search_start);
+               if (used_block_group != block_group)
+                       btrfs_put_block_group(used_block_group);
                 btrfs_put_block_group(block_group);
                 break;
  loop:
                 failed_cluster_refill = false;
                 failed_alloc = false;
                 BUG_ON(index != get_block_group_index(block_group));
+               if (used_block_group != block_group)
+                       btrfs_put_block_group(used_block_group);
                 btrfs_put_block_group(block_group);
         }
         up_read(&space_info->groups_sem);
  
+       if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+               goto search;
+
         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
                 goto search;
  
@@ -5449,7 +5662,8 @@ again:
         return ret;
  }
  
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+                                       u64 start, u64 len, int pin)
  {
         struct btrfs_block_group_cache *cache;
         int ret = 0;
@@ -5464,8 +5678,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
         if (btrfs_test_opt(root, DISCARD))
                 ret = btrfs_discard_extent(root, start, len, NULL);
  
-       btrfs_add_free_space(cache, start, len);
-       btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+       if (pin)
+               pin_down_extent(root, cache, start, len, 1);
+       else {
+               btrfs_add_free_space(cache, start, len);
+               btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+       }
         btrfs_put_block_group(cache);
  
         trace_btrfs_reserved_extent_free(root, start, len);
@@ -5473,6 +5691,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
         return ret;
  }
  
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+                                       u64 start, u64 len)
+{      /* pin == 0: helper re-adds the range to free space, no pinning */
+       return __btrfs_free_reserved_extent(root, start, len, 0);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+                                      u64 start, u64 len)
+{      /* pin == 1: helper pins the range via pin_down_extent() */
+       return __btrfs_free_reserved_extent(root, start, len, 1);
+}
+
  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       u64 parent, u64 root_objectid,
@@ -5726,7 +5956,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
         block_rsv = get_block_rsv(trans, root);
  
         if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
                 /*
                  * If we couldn't reserve metadata bytes try and use some from
                  * the global reserve.
@@ -5746,8 +5976,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
         if (!ret)
                 return block_rsv;
         if (ret) {
-               WARN_ON(1);
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
+               static DEFINE_RATELIMIT_STATE(_rs,
+                               DEFAULT_RATELIMIT_INTERVAL,
+                               /*DEFAULT_RATELIMIT_BURST*/ 2);
+               if (__ratelimit(&_rs)) {
+                       printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
+                       WARN_ON(1);
+               }
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
                 if (!ret) {
                         return block_rsv;
                 } else if (ret && block_rsv != global_rsv) {
@@ -6848,7 +7084,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                  * space to fit our block group in.
                  */
                 if (device->total_bytes > device->bytes_used + min_free) {
-                       ret = find_free_dev_extent(NULL, device, min_free,
+                       ret = find_free_dev_extent(device, min_free,
                                                    &dev_offset, NULL);
                         if (!ret)
                                 dev_nr++;
@@ -7038,9 +7274,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                 return -ENOMEM;
         path->reada = 1;
  
-       cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+       cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
         if (btrfs_test_opt(root, SPACE_CACHE) &&
-           btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+           btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
                 need_clear = 1;
         if (btrfs_test_opt(root, CLEAR_CACHE))
                 need_clear = 1;
@@ -7210,6 +7446,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                 &cache->space_info);
         BUG_ON(ret);
+       update_global_block_rsv(root->fs_info);
  
         spin_lock(&cache->space_info->lock);
         cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7282,7 +7519,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                 goto out;
         }
  
-       inode = lookup_free_space_inode(root, block_group, path);
+       inode = lookup_free_space_inode(tree_root, block_group, path);
         if (!IS_ERR(inode)) {
                 ret = btrfs_orphan_add(trans, inode);
                 BUG_ON(ret);
@@ -7369,7 +7606,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
         int mixed = 0;
         int ret;
  
-       disk_super = &fs_info->super_copy;
+       disk_super = fs_info->super_copy;
         if (!btrfs_super_root(disk_super))
                 return 1;