arm64: dts: rk3399-sapphire: enable isp0 isp1
[firefly-linux-kernel-4.4.55.git] / fs / btrfs / ctree.c
index 02fae7f7e42cb417a14fe0243805bcad4270b592..0f2b7c622ce3e5fb6a8450b8c8d57255a94a9886 100644 (file)
@@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot);
-static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                                 struct extent_buffer *eb);
-static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -81,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 {
        int i;
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       /* lockdep really cares that we take all of these spinlocks
-        * in the right order.  If any of the locks in the path are not
-        * currently blocking, it is going to complain.  So, make really
-        * really sure by forcing the path to blocking before we clear
-        * the path blocking.
-        */
        if (held) {
                btrfs_set_lock_blocking_rw(held, held_rw);
                if (held_rw == BTRFS_WRITE_LOCK)
@@ -96,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
                        held_rw = BTRFS_READ_LOCK_BLOCKING;
        }
        btrfs_set_path_blocking(p);
-#endif
 
        for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
                if (p->nodes[i] && p->locks[i]) {
@@ -108,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
                }
        }
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
        if (held)
                btrfs_clear_lock_blocking_rw(held, held_rw);
-#endif
 }
 
 /* this also releases the path */
@@ -224,10 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
  */
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
+       if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+           !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+               return;
+
        spin_lock(&root->fs_info->trans_lock);
-       if (root->track_dirty && list_empty(&root->dirty_list)) {
-               list_add(&root->dirty_list,
-                        &root->fs_info->dirty_cowonly_roots);
+       if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+               /* Want the extent tree to be the last on the list */
+               if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+                       list_move_tail(&root->dirty_list,
+                                      &root->fs_info->dirty_cowonly_roots);
+               else
+                       list_move(&root->dirty_list,
+                                 &root->fs_info->dirty_cowonly_roots);
        }
        spin_unlock(&root->fs_info->trans_lock);
 }
@@ -247,9 +245,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        int level;
        struct btrfs_disk_key disk_key;
 
-       WARN_ON(root->ref_cows && trans->transid !=
-               root->fs_info->running_transaction->transid);
-       WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+       WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+               trans->transid != root->fs_info->running_transaction->transid);
+       WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+               trans->transid != root->last_trans);
 
        level = btrfs_header_level(buf);
        if (level == 0)
@@ -257,9 +256,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        else
                btrfs_node_key(buf, &disk_key, 0);
 
-       cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
-                                    new_root_objectid, &disk_key, level,
-                                    buf->start, 0);
+       cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
+                       &disk_key, level, buf->start, 0);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -274,15 +272,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        else
                btrfs_set_header_owner(cow, new_root_objectid);
 
-       write_extent_buffer(cow, root->fs_info->fsid,
-                           (unsigned long)btrfs_header_fsid(cow),
+       write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);
 
        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+               ret = btrfs_inc_ref(trans, root, cow, 1);
        else
-               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+               ret = btrfs_inc_ref(trans, root, cow, 0);
 
        if (ret)
                return ret;
@@ -356,43 +353,13 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 }
 
 /*
- * Increment the upper half of tree_mod_seq, set lower half zero.
- *
- * Must be called with fs_info->tree_mod_seq_lock held.
+ * Pull a new tree mod seq number for our operation.
  */
-static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
-{
-       u64 seq = atomic64_read(&fs_info->tree_mod_seq);
-       seq &= 0xffffffff00000000ull;
-       seq += 1ull << 32;
-       atomic64_set(&fs_info->tree_mod_seq, seq);
-       return seq;
-}
-
-/*
- * Increment the lower half of tree_mod_seq.
- *
- * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
- * are generated should not technically require a spin lock here. (Rationale:
- * incrementing the minor while incrementing the major seq number is between its
- * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
- * just returns a unique sequence number as usual.) We have decided to leave
- * that requirement in here and rethink it once we notice it really imposes a
- * problem on some workload.
- */
-static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 {
        return atomic64_inc_return(&fs_info->tree_mod_seq);
 }
 
-/*
- * return the last minor in the previous major tree_mod_seq number
- */
-u64 btrfs_tree_mod_seq_prev(u64 seq)
-{
-       return (seq & 0xffffffff00000000ull) - 1ull;
-}
-
 /*
  * This adds a new blocker to the tree mod log's blocker list if the @elem
  * passed does not already have a sequence number set. So when a caller expects
@@ -404,19 +371,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)
 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct seq_list *elem)
 {
-       u64 seq;
-
        tree_mod_log_write_lock(fs_info);
        spin_lock(&fs_info->tree_mod_seq_lock);
        if (!elem->seq) {
-               elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+               elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        }
-       seq = btrfs_inc_tree_mod_seq_minor(fs_info);
        spin_unlock(&fs_info->tree_mod_seq_lock);
        tree_mod_log_write_unlock(fs_info);
 
-       return seq;
+       return elem->seq;
 }
 
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -476,6 +440,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
  * the index is the shifted logical of the *new* root node for root replace
  * operations, or the shifted logical of the affected block for all other
  * operations.
+ *
+ * Note: must be called with write lock (tree_mod_log_write_lock).
  */
 static noinline int
 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -485,7 +451,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;
 
-       BUG_ON(!tm || !tm->seq);
+       BUG_ON(!tm);
+
+       tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 
        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
@@ -500,10 +468,8 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
                        new = &((*new)->rb_left);
                else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
-               else {
-                       kfree(tm);
+               else
                        return -EEXIST;
-               }
        }
 
        rb_link_node(&tm->node, parent, new);
@@ -526,11 +492,7 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
                return 1;
 
        tree_mod_log_write_lock(fs_info);
-       if (list_empty(&fs_info->tree_mod_seq_list)) {
-               /*
-                * someone emptied the list while we were waiting for the lock.
-                * we must not add to the list when no blocker exists.
-                */
+       if (list_empty(&(fs_info)->tree_mod_seq_list)) {
                tree_mod_log_write_unlock(fs_info);
                return 1;
        }
@@ -538,43 +500,28 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
-/*
- * This allocates memory and gets a tree modification sequence number.
- *
- * Returns <0 on error.
- * Returns >0 (the added sequence number) on success.
- */
-static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
-                                struct tree_mod_elem **tm_ret)
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+                                   struct extent_buffer *eb)
 {
-       struct tree_mod_elem *tm;
-
-       /*
-        * once we switch from spin locks to something different, we should
-        * honor the flags parameter here.
-        */
-       tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
-       if (!tm)
-               return -ENOMEM;
-
-       spin_lock(&fs_info->tree_mod_seq_lock);
-       tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
-       spin_unlock(&fs_info->tree_mod_seq_lock);
+       smp_mb();
+       if (list_empty(&(fs_info)->tree_mod_seq_list))
+               return 0;
+       if (eb && btrfs_header_level(eb) == 0)
+               return 0;
 
-       return tm->seq;
+       return 1;
 }
 
-static inline int
-__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
-                         struct extent_buffer *eb, int slot,
-                         enum mod_log_op op, gfp_t flags)
+static struct tree_mod_elem *
+alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
+                   enum mod_log_op op, gfp_t flags)
 {
-       int ret;
        struct tree_mod_elem *tm;
 
-       ret = tree_mod_alloc(fs_info, flags, &tm);
-       if (ret < 0)
-               return ret;
+       tm = kzalloc(sizeof(*tm), flags);
+       if (!tm)
+               return NULL;
 
        tm->index = eb->start >> PAGE_CACHE_SHIFT;
        if (op != MOD_LOG_KEY_ADD) {
@@ -584,39 +531,37 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
        tm->op = op;
        tm->slot = slot;
        tm->generation = btrfs_node_ptr_generation(eb, slot);
+       RB_CLEAR_NODE(&tm->node);
 
-       return __tree_mod_log_insert(fs_info, tm);
+       return tm;
 }
 
 static noinline int
-tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
-                            struct extent_buffer *eb, int slot,
-                            enum mod_log_op op, gfp_t flags)
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+                       struct extent_buffer *eb, int slot,
+                       enum mod_log_op op, gfp_t flags)
 {
+       struct tree_mod_elem *tm;
        int ret;
 
-       if (tree_mod_dont_log(fs_info, eb))
+       if (!tree_mod_need_log(fs_info, eb))
                return 0;
 
-       ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
+       tm = alloc_tree_mod_elem(eb, slot, op, flags);
+       if (!tm)
+               return -ENOMEM;
 
-       tree_mod_log_write_unlock(fs_info);
-       return ret;
-}
+       if (tree_mod_dont_log(fs_info, eb)) {
+               kfree(tm);
+               return 0;
+       }
 
-static noinline int
-tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
-                       int slot, enum mod_log_op op)
-{
-       return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
-}
+       ret = __tree_mod_log_insert(fs_info, tm);
+       tree_mod_log_write_unlock(fs_info);
+       if (ret)
+               kfree(tm);
 
-static noinline int
-tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
-                            struct extent_buffer *eb, int slot,
-                            enum mod_log_op op)
-{
-       return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
+       return ret;
 }
 
 static noinline int
@@ -624,27 +569,24 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
                         struct extent_buffer *eb, int dst_slot, int src_slot,
                         int nr_items, gfp_t flags)
 {
-       struct tree_mod_elem *tm;
-       int ret;
+       struct tree_mod_elem *tm = NULL;
+       struct tree_mod_elem **tm_list = NULL;
+       int ret = 0;
        int i;
+       int locked = 0;
 
-       if (tree_mod_dont_log(fs_info, eb))
+       if (!tree_mod_need_log(fs_info, eb))
                return 0;
 
-       /*
-        * When we override something during the move, we log these removals.
-        * This can only happen when we move towards the beginning of the
-        * buffer, i.e. dst_slot < src_slot.
-        */
-       for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
-               ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
-                                             MOD_LOG_KEY_REMOVE_WHILE_MOVING);
-               BUG_ON(ret < 0);
-       }
+       tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
+       if (!tm_list)
+               return -ENOMEM;
 
-       ret = tree_mod_alloc(fs_info, flags, &tm);
-       if (ret < 0)
-               goto out;
+       tm = kzalloc(sizeof(*tm), flags);
+       if (!tm) {
+               ret = -ENOMEM;
+               goto free_tms;
+       }
 
        tm->index = eb->start >> PAGE_CACHE_SHIFT;
        tm->slot = src_slot;
@@ -652,28 +594,70 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
        tm->move.nr_items = nr_items;
        tm->op = MOD_LOG_MOVE_KEYS;
 
+       for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+               tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+                   MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+               if (!tm_list[i]) {
+                       ret = -ENOMEM;
+                       goto free_tms;
+               }
+       }
+
+       if (tree_mod_dont_log(fs_info, eb))
+               goto free_tms;
+       locked = 1;
+
+       /*
+        * When we override something during the move, we log these removals.
+        * This can only happen when we move towards the beginning of the
+        * buffer, i.e. dst_slot < src_slot.
+        */
+       for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+               ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+               if (ret)
+                       goto free_tms;
+       }
+
        ret = __tree_mod_log_insert(fs_info, tm);
-out:
+       if (ret)
+               goto free_tms;
        tree_mod_log_write_unlock(fs_info);
+       kfree(tm_list);
+
+       return 0;
+free_tms:
+       for (i = 0; i < nr_items; i++) {
+               if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+                       rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+               kfree(tm_list[i]);
+       }
+       if (locked)
+               tree_mod_log_write_unlock(fs_info);
+       kfree(tm_list);
+       kfree(tm);
+
        return ret;
 }
 
-static inline void
-__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+static inline int
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+                      struct tree_mod_elem **tm_list,
+                      int nritems)
 {
-       int i;
-       u32 nritems;
+       int i, j;
        int ret;
 
-       if (btrfs_header_level(eb) == 0)
-               return;
-
-       nritems = btrfs_header_nritems(eb);
        for (i = nritems - 1; i >= 0; i--) {
-               ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
-                                             MOD_LOG_KEY_REMOVE_WHILE_FREEING);
-               BUG_ON(ret < 0);
+               ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+               if (ret) {
+                       for (j = nritems - 1; j > i; j--)
+                               rb_erase(&tm_list[j]->node,
+                                        &fs_info->tree_mod_log);
+                       return ret;
+               }
        }
+
+       return 0;
 }
 
 static noinline int
@@ -682,18 +666,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
                         struct extent_buffer *new_root, gfp_t flags,
                         int log_removal)
 {
-       struct tree_mod_elem *tm;
-       int ret;
+       struct tree_mod_elem *tm = NULL;
+       struct tree_mod_elem **tm_list = NULL;
+       int nritems = 0;
+       int ret = 0;
+       int i;
 
-       if (tree_mod_dont_log(fs_info, NULL))
+       if (!tree_mod_need_log(fs_info, NULL))
                return 0;
 
-       if (log_removal)
-               __tree_mod_log_free_eb(fs_info, old_root);
+       if (log_removal && btrfs_header_level(old_root) > 0) {
+               nritems = btrfs_header_nritems(old_root);
+               tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
+                                 flags);
+               if (!tm_list) {
+                       ret = -ENOMEM;
+                       goto free_tms;
+               }
+               for (i = 0; i < nritems; i++) {
+                       tm_list[i] = alloc_tree_mod_elem(old_root, i,
+                           MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+                       if (!tm_list[i]) {
+                               ret = -ENOMEM;
+                               goto free_tms;
+                       }
+               }
+       }
 
-       ret = tree_mod_alloc(fs_info, flags, &tm);
-       if (ret < 0)
-               goto out;
+       tm = kzalloc(sizeof(*tm), flags);
+       if (!tm) {
+               ret = -ENOMEM;
+               goto free_tms;
+       }
 
        tm->index = new_root->start >> PAGE_CACHE_SHIFT;
        tm->old_root.logical = old_root->start;
@@ -701,9 +705,29 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
        tm->generation = btrfs_header_generation(old_root);
        tm->op = MOD_LOG_ROOT_REPLACE;
 
-       ret = __tree_mod_log_insert(fs_info, tm);
-out:
+       if (tree_mod_dont_log(fs_info, NULL))
+               goto free_tms;
+
+       if (tm_list)
+               ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+       if (!ret)
+               ret = __tree_mod_log_insert(fs_info, tm);
+
        tree_mod_log_write_unlock(fs_info);
+       if (ret)
+               goto free_tms;
+       kfree(tm_list);
+
+       return ret;
+
+free_tms:
+       if (tm_list) {
+               for (i = 0; i < nritems; i++)
+                       kfree(tm_list[i]);
+               kfree(tm_list);
+       }
+       kfree(tm);
+
        return ret;
 }
 
@@ -773,34 +797,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
        return __tree_mod_log_search(fs_info, start, min_seq, 0);
 }
 
-static noinline void
+static noinline int
 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
                     struct extent_buffer *src, unsigned long dst_offset,
                     unsigned long src_offset, int nr_items)
 {
-       int ret;
+       int ret = 0;
+       struct tree_mod_elem **tm_list = NULL;
+       struct tree_mod_elem **tm_list_add, **tm_list_rem;
        int i;
+       int locked = 0;
 
-       if (tree_mod_dont_log(fs_info, NULL))
-               return;
+       if (!tree_mod_need_log(fs_info, NULL))
+               return 0;
 
-       if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
-               tree_mod_log_write_unlock(fs_info);
-               return;
+       if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+               return 0;
+
+       tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
+                         GFP_NOFS);
+       if (!tm_list)
+               return -ENOMEM;
+
+       tm_list_add = tm_list;
+       tm_list_rem = tm_list + nr_items;
+       for (i = 0; i < nr_items; i++) {
+               tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+                   MOD_LOG_KEY_REMOVE, GFP_NOFS);
+               if (!tm_list_rem[i]) {
+                       ret = -ENOMEM;
+                       goto free_tms;
+               }
+
+               tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+                   MOD_LOG_KEY_ADD, GFP_NOFS);
+               if (!tm_list_add[i]) {
+                       ret = -ENOMEM;
+                       goto free_tms;
+               }
        }
 
+       if (tree_mod_dont_log(fs_info, NULL))
+               goto free_tms;
+       locked = 1;
+
        for (i = 0; i < nr_items; i++) {
-               ret = tree_mod_log_insert_key_locked(fs_info, src,
-                                               i + src_offset,
-                                               MOD_LOG_KEY_REMOVE);
-               BUG_ON(ret < 0);
-               ret = tree_mod_log_insert_key_locked(fs_info, dst,
-                                                    i + dst_offset,
-                                                    MOD_LOG_KEY_ADD);
-               BUG_ON(ret < 0);
+               ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
+               if (ret)
+                       goto free_tms;
+               ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
+               if (ret)
+                       goto free_tms;
        }
 
        tree_mod_log_write_unlock(fs_info);
+       kfree(tm_list);
+
+       return 0;
+
+free_tms:
+       for (i = 0; i < nr_items * 2; i++) {
+               if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+                       rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+               kfree(tm_list[i]);
+       }
+       if (locked)
+               tree_mod_log_write_unlock(fs_info);
+       kfree(tm_list);
+
+       return ret;
 }
 
 static inline void
@@ -819,21 +884,57 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 {
        int ret;
 
-       ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
-                                          MOD_LOG_KEY_REPLACE,
-                                          atomic ? GFP_ATOMIC : GFP_NOFS);
+       ret = tree_mod_log_insert_key(fs_info, eb, slot,
+                                       MOD_LOG_KEY_REPLACE,
+                                       atomic ? GFP_ATOMIC : GFP_NOFS);
        BUG_ON(ret < 0);
 }
 
-static noinline void
+static noinline int
 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 {
-       if (tree_mod_dont_log(fs_info, eb))
-               return;
+       struct tree_mod_elem **tm_list = NULL;
+       int nritems = 0;
+       int i;
+       int ret = 0;
+
+       if (btrfs_header_level(eb) == 0)
+               return 0;
+
+       if (!tree_mod_need_log(fs_info, NULL))
+               return 0;
+
+       nritems = btrfs_header_nritems(eb);
+       tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
+       if (!tm_list)
+               return -ENOMEM;
+
+       for (i = 0; i < nritems; i++) {
+               tm_list[i] = alloc_tree_mod_elem(eb, i,
+                   MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+               if (!tm_list[i]) {
+                       ret = -ENOMEM;
+                       goto free_tms;
+               }
+       }
 
-       __tree_mod_log_free_eb(fs_info, eb);
+       if (tree_mod_dont_log(fs_info, eb))
+               goto free_tms;
 
+       ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
        tree_mod_log_write_unlock(fs_info);
+       if (ret)
+               goto free_tms;
+       kfree(tm_list);
+
+       return 0;
+
+free_tms:
+       for (i = 0; i < nritems; i++)
+               kfree(tm_list[i]);
+       kfree(tm_list);
+
+       return ret;
 }
 
 static noinline void
@@ -859,14 +960,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
         * snapshot and the block was not allocated by tree relocation,
         * we know the block is not shared.
         */
-       if (root->ref_cows &&
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
            buf != root->node && buf != root->commit_root &&
            (btrfs_header_generation(buf) <=
             btrfs_root_last_snapshot(&root->root_item) ||
             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
                return 1;
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-       if (root->ref_cows &&
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
            btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                return 1;
 #endif
@@ -910,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                        return ret;
                if (refs == 0) {
                        ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret);
+                       btrfs_std_error(root->fs_info, ret, NULL);
                        return ret;
                }
        } else {
@@ -930,14 +1031,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
-                       ret = btrfs_inc_ref(trans, root, buf, 1, 1);
+                       ret = btrfs_inc_ref(trans, root, buf, 1);
                        BUG_ON(ret); /* -ENOMEM */
 
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
-                               ret = btrfs_dec_ref(trans, root, buf, 0, 1);
+                               ret = btrfs_dec_ref(trans, root, buf, 0);
                                BUG_ON(ret); /* -ENOMEM */
-                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1);
                                BUG_ON(ret); /* -ENOMEM */
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -945,9 +1046,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
-                               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 0);
                        BUG_ON(ret); /* -ENOMEM */
                }
                if (new_flags != 0) {
@@ -964,14 +1065,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
-                               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 0);
                        BUG_ON(ret); /* -ENOMEM */
-                       ret = btrfs_dec_ref(trans, root, buf, 1, 1);
+                       ret = btrfs_dec_ref(trans, root, buf, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
-               clean_tree_block(trans, root, buf);
+               clean_tree_block(trans, root->fs_info, buf);
                *last_ref = 1;
        }
        return 0;
@@ -1008,9 +1109,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
        btrfs_assert_tree_locked(buf);
 
-       WARN_ON(root->ref_cows && trans->transid !=
-               root->fs_info->running_transaction->transid);
-       WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+       WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+               trans->transid != root->fs_info->running_transaction->transid);
+       WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+               trans->transid != root->last_trans);
 
        level = btrfs_header_level(buf);
 
@@ -1027,9 +1129,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        } else
                parent_start = 0;
 
-       cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
-                                    root->root_key.objectid, &disk_key,
-                                    level, search_start, empty_size);
+       cow = btrfs_alloc_tree_block(trans, root, parent_start,
+                       root->root_key.objectid, &disk_key, level,
+                       search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -1046,8 +1148,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);
 
-       write_extent_buffer(cow, root->fs_info->fsid,
-                           (unsigned long)btrfs_header_fsid(cow),
+       write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);
 
        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
@@ -1056,8 +1157,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       if (root->ref_cows)
-               btrfs_reloc_cow_block(trans, root, buf, cow);
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+               ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       return ret;
+               }
+       }
 
        if (buf == root->node) {
                WARN_ON(parent && parent != buf);
@@ -1083,13 +1189,19 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
                WARN_ON(trans->transid != btrfs_header_generation(parent));
                tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
-                                       MOD_LOG_KEY_REPLACE);
+                                       MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
-               tree_mod_log_free_eb(root->fs_info, buf);
+               if (last_ref) {
+                       ret = tree_mod_log_free_eb(root->fs_info, buf);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, root, ret);
+                               return ret;
+                       }
+               }
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
@@ -1115,7 +1227,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
        int looped = 0;
 
        if (!time_seq)
-               return 0;
+               return NULL;
 
        /*
         * the very last operation that's logged for a root is the replacement
@@ -1126,7 +1238,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
                tm = tree_mod_log_search_oldest(fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
-                       return 0;
+                       return NULL;
                /*
                 * if there are no tree operation for the oldest root, we simply
                 * return it. this should only happen if that (old) root is at
@@ -1161,8 +1273,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
  * time_seq).
  */
 static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
-                     struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+                     u64 time_seq, struct tree_mod_elem *first_tm)
 {
        u32 n;
        struct rb_node *next;
@@ -1172,6 +1284,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
        unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
        n = btrfs_header_nritems(eb);
+       tree_mod_log_read_lock(fs_info);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
@@ -1226,6 +1339,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
                if (tm->index != first_tm->index)
                        break;
        }
+       tree_mod_log_read_unlock(fs_info);
        btrfs_set_header_nritems(eb, n);
 }
 
@@ -1237,8 +1351,8 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
  * is freed (its refcount is decremented).
  */
 static struct extent_buffer *
-tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
-                   u64 time_seq)
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+                   struct extent_buffer *eb, u64 time_seq)
 {
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;
@@ -1253,11 +1367,17 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
        if (!tm)
                return eb;
 
+       btrfs_set_path_blocking(path);
+       btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
-               eb_rewin = alloc_dummy_extent_buffer(eb->start,
-                                               fs_info->tree_root->nodesize);
-               BUG_ON(!eb_rewin);
+               eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+               if (!eb_rewin) {
+                       btrfs_tree_read_unlock_blocking(eb);
+                       free_extent_buffer(eb);
+                       return NULL;
+               }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
@@ -1265,16 +1385,20 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                eb_rewin = btrfs_clone_extent_buffer(eb);
-               BUG_ON(!eb_rewin);
+               if (!eb_rewin) {
+                       btrfs_tree_read_unlock_blocking(eb);
+                       free_extent_buffer(eb);
+                       return NULL;
+               }
        }
 
-       extent_buffer_get(eb_rewin);
-       btrfs_tree_read_unlock(eb);
+       btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
+       btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);
 
        extent_buffer_get(eb_rewin);
        btrfs_tree_read_lock(eb_rewin);
-       __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+       __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
@@ -1298,7 +1422,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
-       u32 blocksize;
 
        eb_root = btrfs_read_lock_root_node(root);
        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1317,13 +1440,12 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
-               blocksize = btrfs_level_size(root, old_root->level);
-               old = read_tree_block(root, logical, blocksize, 0);
-               if (!old || !extent_buffer_uptodate(old)) {
-                       free_extent_buffer(old);
-                       pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
-                               logical);
-                       WARN_ON(1);
+               old = read_tree_block(root, logical, 0);
+               if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+                       if (!IS_ERR(old))
+                               free_extent_buffer(old);
+                       btrfs_warn(root->fs_info,
+                               "failed to read tree block %llu from get_old_root", logical);
                } else {
                        eb = btrfs_clone_extent_buffer(old);
                        free_extent_buffer(old);
@@ -1331,10 +1453,11 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
        } else if (old_root) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
-               eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+               eb = alloc_dummy_extent_buffer(root->fs_info, logical);
        } else {
+               btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
                eb = btrfs_clone_extent_buffer(eb_root);
-               btrfs_tree_read_unlock(eb_root);
+               btrfs_tree_read_unlock_blocking(eb_root);
                free_extent_buffer(eb_root);
        }
 
@@ -1350,7 +1473,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
                btrfs_set_header_generation(eb, old_generation);
        }
        if (tm)
-               __tree_mod_log_rewind(eb, time_seq, tm);
+               __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -1379,6 +1502,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf)
 {
+       if (btrfs_test_is_dummy_root(root))
+               return 0;
+
        /* ensure we can see the force_cow */
        smp_rmb();
 
@@ -1397,7 +1523,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
-           !root->force_cow)
+           !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
                return 0;
        return 1;
 }
@@ -1417,16 +1543,15 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 
        if (trans->transaction != root->fs_info->running_transaction)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
-                      (unsigned long long)trans->transid,
-                      (unsigned long long)
+                      trans->transid,
                       root->fs_info->running_transaction->transid);
 
        if (trans->transid != root->fs_info->generation)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
-                      (unsigned long long)trans->transid,
-                      (unsigned long long)root->fs_info->generation);
+                      trans->transid, root->fs_info->generation);
 
        if (!should_cow_block(trans, root, buf)) {
+               trans->dirty = true;
                *cow_ret = buf;
                return 0;
        }
@@ -1522,15 +1647,15 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
        WARN_ON(trans->transid != root->fs_info->generation);
 
        parent_nritems = btrfs_header_nritems(parent);
-       blocksize = btrfs_level_size(root, parent_level - 1);
-       end_slot = parent_nritems;
+       blocksize = root->nodesize;
+       end_slot = parent_nritems - 1;
 
-       if (parent_nritems == 1)
+       if (parent_nritems <= 1)
                return 0;
 
        btrfs_set_lock_blocking(parent);
 
-       for (i = start_slot; i < end_slot; i++) {
+       for (i = start_slot; i <= end_slot; i++) {
                int close = 1;
 
                btrfs_node_key(parent, &disk_key, i);
@@ -1547,7 +1672,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                        other = btrfs_node_blockptr(parent, i - 1);
                        close = close_blocks(blocknr, other, blocksize);
                }
-               if (!close && i < end_slot - 2) {
+               if (!close && i < end_slot) {
                        other = btrfs_node_blockptr(parent, i + 1);
                        close = close_blocks(blocknr, other, blocksize);
                }
@@ -1556,16 +1681,17 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                        continue;
                }
 
-               cur = btrfs_find_tree_block(root, blocknr, blocksize);
+               cur = btrfs_find_tree_block(root->fs_info, blocknr);
                if (cur)
                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
                else
                        uptodate = 0;
                if (!cur || !uptodate) {
                        if (!cur) {
-                               cur = read_tree_block(root, blocknr,
-                                                        blocksize, gen);
-                               if (!cur || !extent_buffer_uptodate(cur)) {
+                               cur = read_tree_block(root, blocknr, gen);
+                               if (IS_ERR(cur)) {
+                                       return PTR_ERR(cur);
+                               } else if (!extent_buffer_uptodate(cur)) {
                                        free_extent_buffer(cur);
                                        return -EIO;
                                }
@@ -1743,10 +1869,10 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
        BUG_ON(level == 0);
 
        eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
-                            btrfs_level_size(root, level - 1),
                             btrfs_node_ptr_generation(parent, slot));
-       if (eb && !extent_buffer_uptodate(eb)) {
-               free_extent_buffer(eb);
+       if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
+               if (!IS_ERR(eb))
+                       free_extent_buffer(eb);
                eb = NULL;
        }
 
@@ -1802,7 +1928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                child = read_node_slot(root, mid, 0);
                if (!child) {
                        ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret);
+                       btrfs_std_error(root->fs_info, ret, NULL);
                        goto enospc;
                }
 
@@ -1823,7 +1949,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
                path->locks[level] = 0;
                path->nodes[level] = NULL;
-               clean_tree_block(trans, root, mid);
+               clean_tree_block(trans, root->fs_info, mid);
                btrfs_tree_unlock(mid);
                /* once for the path */
                free_extent_buffer(mid);
@@ -1877,7 +2003,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                if (wret < 0 && wret != -ENOSPC)
                        ret = wret;
                if (btrfs_header_nritems(right) == 0) {
-                       clean_tree_block(trans, root, right);
+                       clean_tree_block(trans, root->fs_info, right);
                        btrfs_tree_unlock(right);
                        del_ptr(root, path, level + 1, pslot + 1);
                        root_sub_used(root, right->len);
@@ -1905,7 +2031,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                 */
                if (!left) {
                        ret = -EROFS;
-                       btrfs_std_error(root->fs_info, ret);
+                       btrfs_std_error(root->fs_info, ret, NULL);
                        goto enospc;
                }
                wret = balance_node_right(trans, root, mid, left);
@@ -1921,7 +2047,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                BUG_ON(wret == 1);
        }
        if (btrfs_header_nritems(mid) == 0) {
-               clean_tree_block(trans, root, mid);
+               clean_tree_block(trans, root->fs_info, mid);
                btrfs_tree_unlock(mid);
                del_ptr(root, path, level + 1, pslot);
                root_sub_used(root, mid->len);
@@ -2138,8 +2264,8 @@ static void reada_for_search(struct btrfs_root *root,
        node = path->nodes[level];
 
        search = btrfs_node_blockptr(node, slot);
-       blocksize = btrfs_level_size(root, level - 1);
-       eb = btrfs_find_tree_block(root, search, blocksize);
+       blocksize = root->nodesize;
+       eb = btrfs_find_tree_block(root->fs_info, search);
        if (eb) {
                free_extent_buffer(eb);
                return;
@@ -2169,7 +2295,7 @@ static void reada_for_search(struct btrfs_root *root,
                if ((search <= target && target - search <= 65536) ||
                    (search > target && search - target <= 65536)) {
                        gen = btrfs_node_ptr_generation(node, nr);
-                       readahead_tree_block(root, search, blocksize, gen);
+                       readahead_tree_block(root, search);
                        nread += blocksize;
                }
                nscan++;
@@ -2178,12 +2304,8 @@ static void reada_for_search(struct btrfs_root *root,
        }
 }
 
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
-                                     struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+                                      struct btrfs_path *path, int level)
 {
        int slot;
        int nritems;
@@ -2192,21 +2314,18 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        u64 gen;
        u64 block1 = 0;
        u64 block2 = 0;
-       int ret = 0;
-       int blocksize;
 
        parent = path->nodes[level + 1];
        if (!parent)
-               return 0;
+               return;
 
        nritems = btrfs_header_nritems(parent);
        slot = path->slots[level + 1];
-       blocksize = btrfs_level_size(root, level);
 
        if (slot > 0) {
                block1 = btrfs_node_blockptr(parent, slot - 1);
                gen = btrfs_node_ptr_generation(parent, slot - 1);
-               eb = btrfs_find_tree_block(root, block1, blocksize);
+               eb = btrfs_find_tree_block(root->fs_info, block1);
                /*
                 * if we get -eagain from btrfs_buffer_uptodate, we
                 * don't want to return eagain here.  That will loop
@@ -2219,33 +2338,16 @@ static noinline int reada_for_balance(struct btrfs_root *root,
        if (slot + 1 < nritems) {
                block2 = btrfs_node_blockptr(parent, slot + 1);
                gen = btrfs_node_ptr_generation(parent, slot + 1);
-               eb = btrfs_find_tree_block(root, block2, blocksize);
+               eb = btrfs_find_tree_block(root->fs_info, block2);
                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
                        block2 = 0;
                free_extent_buffer(eb);
        }
-       if (block1 || block2) {
-               ret = -EAGAIN;
 
-               /* release the whole path */
-               btrfs_release_path(path);
-
-               /* read the blocks */
-               if (block1)
-                       readahead_tree_block(root, block1, blocksize, 0);
-               if (block2)
-                       readahead_tree_block(root, block2, blocksize, 0);
-
-               if (block1) {
-                       eb = read_tree_block(root, block1, blocksize, 0);
-                       free_extent_buffer(eb);
-               }
-               if (block2) {
-                       eb = read_tree_block(root, block2, blocksize, 0);
-                       free_extent_buffer(eb);
-               }
-       }
-       return ret;
+       if (block1)
+               readahead_tree_block(root, block1);
+       if (block2)
+               readahead_tree_block(root, block2);
 }
 
 
@@ -2347,47 +2449,38 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 {
        u64 blocknr;
        u64 gen;
-       u32 blocksize;
        struct extent_buffer *b = *eb_ret;
        struct extent_buffer *tmp;
        int ret;
 
        blocknr = btrfs_node_blockptr(b, slot);
        gen = btrfs_node_ptr_generation(b, slot);
-       blocksize = btrfs_level_size(root, level - 1);
 
-       tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+       tmp = btrfs_find_tree_block(root->fs_info, blocknr);
        if (tmp) {
                /* first we do an atomic uptodate check */
-               if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
-                       if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-                               /*
-                                * we found an up to date block without
-                                * sleeping, return
-                                * right away
-                                */
-                               *eb_ret = tmp;
-                               return 0;
-                       }
-                       /* the pages were up to date, but we failed
-                        * the generation number check.  Do a full
-                        * read for the generation number that is correct.
-                        * We must do this without dropping locks so
-                        * we can trust our generation number
-                        */
-                       free_extent_buffer(tmp);
-                       btrfs_set_path_blocking(p);
+               if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+                       *eb_ret = tmp;
+                       return 0;
+               }
 
-                       /* now we're allowed to do a blocking uptodate check */
-                       tmp = read_tree_block(root, blocknr, blocksize, gen);
-                       if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
-                               *eb_ret = tmp;
-                               return 0;
-                       }
-                       free_extent_buffer(tmp);
-                       btrfs_release_path(p);
-                       return -EIO;
+               /* the pages were up to date, but we failed
+                * the generation number check.  Do a full
+                * read for the generation number that is correct.
+                * We must do this without dropping locks so
+                * we can trust our generation number
+                */
+               btrfs_set_path_blocking(p);
+
+               /* now we're allowed to do a blocking uptodate check */
+               ret = btrfs_read_buffer(tmp, gen);
+               if (!ret) {
+                       *eb_ret = tmp;
+                       return 0;
                }
+               free_extent_buffer(tmp);
+               btrfs_release_path(p);
+               return -EIO;
        }
 
        /*
@@ -2407,8 +2500,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        btrfs_release_path(p);
 
        ret = -EAGAIN;
-       tmp = read_tree_block(root, blocknr, blocksize, 0);
-       if (tmp) {
+       tmp = read_tree_block(root, blocknr, 0);
+       if (!IS_ERR(tmp)) {
                /*
                 * If the read above didn't mark this buffer up to date,
                 * it will never end up being up to date.  Set ret to EIO now
@@ -2448,11 +2541,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
                        goto again;
                }
 
-               sret = reada_for_balance(root, p, level);
-               if (sret)
-                       goto again;
-
                btrfs_set_path_blocking(p);
+               reada_for_balance(root, p, level);
                sret = split_node(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2472,11 +2562,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
                        goto again;
                }
 
-               sret = reada_for_balance(root, p, level);
-               if (sret)
-                       goto again;
-
                btrfs_set_path_blocking(p);
+               reada_for_balance(root, p, level);
                sret = balance_level(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2499,6 +2586,75 @@ done:
        return ret;
 }
 
+static void key_search_validate(struct extent_buffer *b,
+                               struct btrfs_key *key,
+                               int level)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+       struct btrfs_disk_key disk_key;
+
+       btrfs_cpu_key_to_disk(&disk_key, key);
+
+       if (level == 0)
+               ASSERT(!memcmp_extent_buffer(b, &disk_key,
+                   offsetof(struct btrfs_leaf, items[0].key),
+                   sizeof(disk_key)));
+       else
+               ASSERT(!memcmp_extent_buffer(b, &disk_key,
+                   offsetof(struct btrfs_node, ptrs[0].key),
+                   sizeof(disk_key)));
+#endif
+}
+
+static int key_search(struct extent_buffer *b, struct btrfs_key *key,
+                     int level, int *prev_cmp, int *slot)
+{
+       if (*prev_cmp != 0) {
+               *prev_cmp = bin_search(b, key, level, slot);
+               return *prev_cmp;
+       }
+
+       key_search_validate(b, key, level);
+       *slot = 0;
+
+       return 0;
+}
+
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+               u64 iobjectid, u64 ioff, u8 key_type,
+               struct btrfs_key *found_key)
+{
+       int ret;
+       struct btrfs_key key;
+       struct extent_buffer *eb;
+
+       ASSERT(path);
+       ASSERT(found_key);
+
+       key.type = key_type;
+       key.objectid = iobjectid;
+       key.offset = ioff;
+
+       ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+       if (ret < 0)
+               return ret;
+
+       eb = path->nodes[0];
+       if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
+               ret = btrfs_next_leaf(fs_root, path);
+               if (ret)
+                       return ret;
+               eb = path->nodes[0];
+       }
+
+       btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
+       if (found_key->type != key.type ||
+                       found_key->objectid != key.objectid)
+               return 1;
+
+       return 0;
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
@@ -2527,10 +2683,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        int write_lock_level = 0;
        u8 lowest_level = 0;
        int min_write_lock_level;
+       int prev_cmp;
 
        lowest_level = p->lowest_level;
        WARN_ON(lowest_level && ins_len > 0);
        WARN_ON(p->nodes[0] != NULL);
+       BUG_ON(!cow && ins_len);
 
        if (ins_len < 0) {
                lowest_unlock = 2;
@@ -2557,6 +2715,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        min_write_lock_level = write_lock_level;
 
 again:
+       prev_cmp = -1;
        /*
         * we try very hard to do read locks on the root
         */
@@ -2567,9 +2726,13 @@ again:
                 * the commit roots are read only
                 * so we always do read locks
                 */
+               if (p->need_commit_sem)
+                       down_read(&root->fs_info->commit_root_sem);
                b = root->commit_root;
                extent_buffer_get(b);
                level = btrfs_header_level(b);
+               if (p->need_commit_sem)
+                       up_read(&root->fs_info->commit_root_sem);
                if (!p->skip_locking)
                        btrfs_tree_read_lock(b);
        } else {
@@ -2611,10 +2774,10 @@ again:
                         * then we don't want to set the path blocking,
                         * so we test it here
                         */
-                       if (!should_cow_block(trans, root, b))
+                       if (!should_cow_block(trans, root, b)) {
+                               trans->dirty = true;
                                goto cow_done;
-
-                       btrfs_set_path_blocking(p);
+                       }
 
                        /*
                         * must have write locks on this node and the
@@ -2629,6 +2792,7 @@ again:
                                goto again;
                        }
 
+                       btrfs_set_path_blocking(p);
                        err = btrfs_cow_block(trans, root, b,
                                              p->nodes[level + 1],
                                              p->slots[level + 1], &b);
@@ -2638,8 +2802,6 @@ again:
                        }
                }
 cow_done:
-               BUG_ON(!cow && ins_len);
-
                p->nodes[level] = b;
                btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2649,15 +2811,21 @@ cow_done:
                 * It is safe to drop the lock on our parent before we
                 * go through the expensive btree search on b.
                 *
-                * If cow is true, then we might be changing slot zero,
-                * which may require changing the parent.  So, we can't
-                * drop the lock until after we know which slot we're
-                * operating on.
+                * If we're inserting or deleting (ins_len != 0), then we might
+                * be changing slot zero, which may require changing the parent.
+                * So, we can't drop the lock until after we know which slot
+                * we're operating on.
                 */
-               if (!cow)
-                       btrfs_unlock_up_safe(p, level + 1);
+               if (!ins_len && !p->keep_locks) {
+                       int u = level + 1;
+
+                       if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
+                               btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
+                               p->locks[u] = 0;
+                       }
+               }
 
-               ret = bin_search(b, key, level, &slot);
+               ret = key_search(b, key, level, &prev_cmp, &slot);
 
                if (level != 0) {
                        int dec = 0;
@@ -2683,7 +2851,7 @@ cow_done:
                         * which means we must have a write lock
                         * on the parent
                         */
-                       if (slot == 0 && cow &&
+                       if (slot == 0 && ins_len &&
                            write_lock_level < level + 1) {
                                write_lock_level = level + 1;
                                btrfs_release_path(p);
@@ -2720,7 +2888,7 @@ cow_done:
                                        }
                                        p->locks[level] = BTRFS_WRITE_LOCK;
                                } else {
-                                       err = btrfs_try_tree_read_lock(b);
+                                       err = btrfs_tree_read_lock_atomic(b);
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_read_lock(b);
@@ -2766,7 +2934,7 @@ done:
         */
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
-       if (ret < 0)
+       if (ret < 0 && !p->skip_release_on_error)
                btrfs_release_path(p);
        return ret;
 }
@@ -2792,6 +2960,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
        int level;
        int lowest_unlock = 1;
        u8 lowest_level = 0;
+       int prev_cmp = -1;
 
        lowest_level = p->lowest_level;
        WARN_ON(p->nodes[0] != NULL);
@@ -2819,7 +2988,12 @@ again:
                 */
                btrfs_unlock_up_safe(p, level + 1);
 
-               ret = bin_search(b, key, level, &slot);
+               /*
+                * Since we can unwind eb's we want to do a real search every
+                * time.
+                */
+               prev_cmp = -1;
+               ret = key_search(b, key, level, &prev_cmp, &slot);
 
                if (level != 0) {
                        int dec = 0;
@@ -2846,14 +3020,18 @@ again:
                        }
 
                        level = btrfs_header_level(b);
-                       err = btrfs_try_tree_read_lock(b);
+                       err = btrfs_tree_read_lock_atomic(b);
                        if (!err) {
                                btrfs_set_path_blocking(p);
                                btrfs_tree_read_lock(b);
                                btrfs_clear_path_blocking(p, b,
                                                          BTRFS_READ_LOCK);
                        }
-                       b = tree_mod_log_rewind(root->fs_info, b, time_seq);
+                       b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
+                       if (!b) {
+                               ret = -ENOMEM;
+                               goto done;
+                       }
                        p->locks[level] = BTRFS_READ_LOCK;
                        p->nodes[level] = b;
                } else {
@@ -2926,7 +3104,9 @@ again:
                        if (ret < 0)
                                return ret;
                        if (!ret) {
-                               p->slots[0] = btrfs_header_nritems(leaf) - 1;
+                               leaf = p->nodes[0];
+                               if (p->slots[0] == btrfs_header_nritems(leaf))
+                                       p->slots[0]--;
                                return 0;
                        }
                        if (!return_any)
@@ -2954,7 +3134,8 @@ again:
  * higher levels
  *
  */
-static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_fs_info *fs_info,
+                          struct btrfs_path *path,
                           struct btrfs_disk_key *key, int level)
 {
        int i;
@@ -2965,7 +3146,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
                if (!path->nodes[i])
                        break;
                t = path->nodes[i];
-               tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
+               tree_mod_log_set_node_key(fs_info, t, tslot, 1);
                btrfs_set_node_key(t, key, tslot);
                btrfs_mark_buffer_dirty(path->nodes[i]);
                if (tslot != 0)
@@ -2979,7 +3160,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
  * This function isn't completely safe. It's the caller's responsibility
  * that the new key won't break the order
  */
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+                            struct btrfs_path *path,
                             struct btrfs_key *new_key)
 {
        struct btrfs_disk_key disk_key;
@@ -3001,7 +3183,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
        btrfs_set_item_key(eb, &disk_key, slot);
        btrfs_mark_buffer_dirty(eb);
        if (slot == 0)
-               fixup_low_keys(root, path, &disk_key, 1);
+               fixup_low_keys(fs_info, path, &disk_key, 1);
 }
 
 /*
@@ -3047,8 +3229,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
        } else
                push_items = min(src_nritems - 8, push_items);
 
-       tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
-                            push_items);
+       ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+                                  push_items);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               return ret;
+       }
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(dst_nritems),
                           btrfs_node_key_ptr_offset(0),
@@ -3118,8 +3304,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                                      (dst_nritems) *
                                      sizeof(struct btrfs_key_ptr));
 
-       tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
-                            src_nritems - push_items, push_items);
+       ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+                                  src_nritems - push_items, push_items);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               return ret;
+       }
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -3143,7 +3333,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
  */
 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
-                          struct btrfs_path *path, int level, int log_removal)
+                          struct btrfs_path *path, int level)
 {
        u64 lower_gen;
        struct extent_buffer *lower;
@@ -3160,9 +3350,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        else
                btrfs_node_key(lower, &lower_key, 0);
 
-       c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
-                                  root->root_key.objectid, &lower_key,
-                                  level, root->node->start, 0);
+       c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+                                  &lower_key, level, root->node->start, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);
 
@@ -3176,13 +3365,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(c, root->root_key.objectid);
 
-       write_extent_buffer(c, root->fs_info->fsid,
-                           (unsigned long)btrfs_header_fsid(c),
+       write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);
 
        write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
-                           (unsigned long)btrfs_header_chunk_tree_uuid(c),
-                           BTRFS_UUID_SIZE);
+                           btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
 
        btrfs_set_node_key(c, &lower_key, 0);
        btrfs_set_node_blockptr(c, 0, lower->start);
@@ -3194,7 +3381,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(c);
 
        old = root->node;
-       tree_mod_log_set_root_pointer(root, c, log_removal);
+       tree_mod_log_set_root_pointer(root, c, 0);
        rcu_assign_pointer(root->node, c);
 
        /* the super has an extra ref to root->node */
@@ -3203,7 +3390,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        add_root_to_dirty_list(root);
        extent_buffer_get(c);
        path->nodes[level] = c;
-       path->locks[level] = BTRFS_WRITE_LOCK;
+       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
        path->slots[level] = 0;
        return 0;
 }
@@ -3241,7 +3428,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
        }
        if (level) {
                ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
-                                             MOD_LOG_KEY_ADD);
+                                             MOD_LOG_KEY_ADD, GFP_NOFS);
                BUG_ON(ret < 0);
        }
        btrfs_set_node_key(lower, key, slot);
@@ -3278,14 +3465,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
                /*
                 * trying to split the root, lets make a new one
                 *
-                * tree mod log: We pass 0 as log_removal parameter to
+                * tree mod log: We don't log_removal old root in
                 * insert_new_root, because that root buffer will be kept as a
                 * normal node. We are going to log removal of half of the
                 * elements below with tree_mod_log_eb_copy. We're holding a
                 * tree lock on the buffer, which is why we cannot race with
                 * other tree_mod_log users.
                 */
-               ret = insert_new_root(trans, root, path, level + 1, 0);
+               ret = insert_new_root(trans, root, path, level + 1);
                if (ret)
                        return ret;
        } else {
@@ -3302,9 +3489,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
        mid = (c_nritems + 1) / 2;
        btrfs_node_key(c, &disk_key, mid);
 
-       split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
-                                       root->root_key.objectid,
-                                       &disk_key, level, c->start, 0);
+       split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+                       &disk_key, level, c->start, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);
 
@@ -3317,13 +3503,17 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
        btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(split, root->root_key.objectid);
        write_extent_buffer(split, root->fs_info->fsid,
-                           (unsigned long)btrfs_header_fsid(split),
-                           BTRFS_FSID_SIZE);
+                           btrfs_header_fsid(), BTRFS_FSID_SIZE);
        write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
-                           (unsigned long)btrfs_header_chunk_tree_uuid(split),
+                           btrfs_header_chunk_tree_uuid(split),
                            BTRFS_UUID_SIZE);
 
-       tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
+       ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
+                                  mid, c_nritems - mid);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               return ret;
+       }
        copy_extent_buffer(split, c,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(mid),
@@ -3368,8 +3558,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
        if (!nr)
                return 0;
        btrfs_init_map_token(&token);
-       start_item = btrfs_item_nr(l, start);
-       end_item = btrfs_item_nr(l, end);
+       start_item = btrfs_item_nr(start);
+       end_item = btrfs_item_nr(end);
        data_len = btrfs_token_item_offset(l, start_item, &token) +
                btrfs_token_item_size(l, start_item, &token);
        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
@@ -3390,8 +3580,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
        int ret;
        ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
        if (ret < 0) {
-               printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
-                      "used %d nritems %d\n",
+               btrfs_crit(root->fs_info,
+                       "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
                       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
                       leaf_space_used(leaf, 0, nritems), nritems);
        }
@@ -3437,7 +3627,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        slot = path->slots[1];
        i = left_nritems - 1;
        while (i >= nr) {
-               item = btrfs_item_nr(left, i);
+               item = btrfs_item_nr(i);
 
                if (!empty && push_items > 0) {
                        if (path->slots[0] > i)
@@ -3501,7 +3691,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(root);
        for (i = 0; i < right_nritems; i++) {
-               item = btrfs_item_nr(right, i);
+               item = btrfs_item_nr(i);
                push_space -= btrfs_token_item_size(right, item, &token);
                btrfs_set_token_item_offset(right, item, push_space, &token);
        }
@@ -3512,7 +3702,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        if (left_nritems)
                btrfs_mark_buffer_dirty(left);
        else
-               clean_tree_block(trans, root, left);
+               clean_tree_block(trans, root->fs_info, left);
 
        btrfs_mark_buffer_dirty(right);
 
@@ -3524,7 +3714,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        if (path->slots[0] >= left_nritems) {
                path->slots[0] -= left_nritems;
                if (btrfs_header_nritems(path->nodes[0]) == 0)
-                       clean_tree_block(trans, root, path->nodes[0]);
+                       clean_tree_block(trans, root->fs_info, path->nodes[0]);
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
@@ -3599,6 +3789,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        if (left_nritems == 0)
                goto out_unlock;
 
+       if (path->slots[0] == left_nritems && !empty) {
+               /* Key greater than all keys in the leaf, right neighbor has
+                * enough room for it and we're not emptying our leaf to delete
+                * it, therefore use right neighbor to insert the new item and
+                * no need to touch/dirty our left leaft. */
+               btrfs_tree_unlock(left);
+               free_extent_buffer(left);
+               path->nodes[0] = right;
+               path->slots[0] = 0;
+               path->slots[1]++;
+               return 0;
+       }
+
        return __push_leaf_right(trans, root, path, min_data_size, empty,
                                right, free_space, left_nritems, min_slot);
 out_unlock:
@@ -3643,7 +3846,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
                nr = min(right_nritems - 1, max_slot);
 
        for (i = 0; i < nr; i++) {
-               item = btrfs_item_nr(right, i);
+               item = btrfs_item_nr(i);
 
                if (!empty && push_items > 0) {
                        if (path->slots[0] < i)
@@ -3670,8 +3873,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
                ret = 1;
                goto out;
        }
-       if (!empty && push_items == btrfs_header_nritems(right))
-               WARN_ON(1);
+       WARN_ON(!empty && push_items == btrfs_header_nritems(right));
 
        /* push data from right to left */
        copy_extent_buffer(left, right,
@@ -3694,7 +3896,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
                u32 ioff;
 
-               item = btrfs_item_nr(left, i);
+               item = btrfs_item_nr(i);
 
                ioff = btrfs_token_item_offset(left, item, &token);
                btrfs_set_token_item_offset(left, item,
@@ -3725,7 +3927,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(root);
        for (i = 0; i < right_nritems; i++) {
-               item = btrfs_item_nr(right, i);
+               item = btrfs_item_nr(i);
 
                push_space = push_space - btrfs_token_item_size(right,
                                                                item, &token);
@@ -3736,10 +3938,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        if (right_nritems)
                btrfs_mark_buffer_dirty(right);
        else
-               clean_tree_block(trans, root, right);
+               clean_tree_block(trans, root->fs_info, right);
 
        btrfs_item_key(right, &disk_key, 0);
-       fixup_low_keys(root, path, &disk_key, 1);
+       fixup_low_keys(root->fs_info, path, &disk_key, 1);
 
        /* then fixup the leaf pointer in the path */
        if (path->slots[0] < push_items) {
@@ -3866,7 +4068,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
                      btrfs_item_end_nr(l, mid);
 
        for (i = 0; i < nritems; i++) {
-               struct btrfs_item *item = btrfs_item_nr(right, i);
+               struct btrfs_item *item = btrfs_item_nr(i);
                u32 ioff;
 
                ioff = btrfs_token_item_offset(right, item, &token);
@@ -3916,14 +4118,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
        int progress = 0;
        int slot;
        u32 nritems;
+       int space_needed = data_size;
 
        slot = path->slots[0];
+       if (slot < btrfs_header_nritems(path->nodes[0]))
+               space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
 
        /*
         * try to push all the items after our slot into the
         * right leaf
         */
-       ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+       ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
        if (ret < 0)
                return ret;
 
@@ -3943,7 +4148,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
 
        /* try to push all the items before our slot into the next leaf */
        slot = path->slots[0];
-       ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+       ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
        if (ret < 0)
                return ret;
 
@@ -3973,6 +4178,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        int mid;
        int slot;
        struct extent_buffer *right;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;
        int wret;
        int split;
@@ -3986,14 +4192,19 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
                return -EOVERFLOW;
 
        /* first try to make some room by pushing left and right */
-       if (data_size) {
-               wret = push_leaf_right(trans, root, path, data_size,
-                                      data_size, 0, 0);
+       if (data_size && path->nodes[1]) {
+               int space_needed = data_size;
+
+               if (slot < btrfs_header_nritems(l))
+                       space_needed -= btrfs_leaf_free_space(root, l);
+
+               wret = push_leaf_right(trans, root, path, space_needed,
+                                      space_needed, 0, 0);
                if (wret < 0)
                        return wret;
                if (wret) {
-                       wret = push_leaf_left(trans, root, path, data_size,
-                                             data_size, 0, (u32)-1);
+                       wret = push_leaf_left(trans, root, path, space_needed,
+                                             space_needed, 0, (u32)-1);
                        if (wret < 0)
                                return wret;
                }
@@ -4005,7 +4216,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        }
 
        if (!path->nodes[1]) {
-               ret = insert_new_root(trans, root, path, 1, 1);
+               ret = insert_new_root(trans, root, path, 1);
                if (ret)
                        return ret;
        }
@@ -4047,7 +4258,7 @@ again:
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
                                        if (data_size && !tried_avoid_double)
                                                goto push_for_double;
-                                       split = 2 ;
+                                       split = 2;
                                }
                        }
                }
@@ -4058,13 +4269,12 @@ again:
        else
                btrfs_item_key(l, &disk_key, mid);
 
-       right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-                                       root->root_key.objectid,
-                                       &disk_key, 0, l->start, 0);
+       right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+                       &disk_key, 0, l->start, 0);
        if (IS_ERR(right))
                return PTR_ERR(right);
 
-       root_add_used(root, root->leafsize);
+       root_add_used(root, root->nodesize);
 
        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
        btrfs_set_header_bytenr(right, right->start);
@@ -4072,12 +4282,11 @@ again:
        btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(right, root->root_key.objectid);
        btrfs_set_header_level(right, 0);
-       write_extent_buffer(right, root->fs_info->fsid,
-                           (unsigned long)btrfs_header_fsid(right),
-                           BTRFS_FSID_SIZE);
+       write_extent_buffer(right, fs_info->fsid,
+                           btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
-       write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
-                           (unsigned long)btrfs_header_chunk_tree_uuid(right),
+       write_extent_buffer(right, fs_info->chunk_tree_uuid,
+                           btrfs_header_chunk_tree_uuid(right),
                            BTRFS_UUID_SIZE);
 
        if (split == 0) {
@@ -4099,7 +4308,7 @@ again:
                        path->nodes[0] = right;
                        path->slots[0] = 0;
                        if (path->slots[1] == 0)
-                               fixup_low_keys(root, path, &disk_key, 1);
+                               fixup_low_keys(fs_info, path, &disk_key, 1);
                }
                btrfs_mark_buffer_dirty(right);
                return ret;
@@ -4155,13 +4364,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
        path->search_for_split = 1;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        path->search_for_split = 0;
+       if (ret > 0)
+               ret = -EAGAIN;
        if (ret < 0)
                goto err;
 
        ret = -EAGAIN;
        leaf = path->nodes[0];
-       /* if our item isn't there or got smaller, return now */
-       if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+       /* if our item isn't there, return now */
+       if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
                goto err;
 
        /* the leaf has  changed, it now has room.  return now */
@@ -4209,7 +4420,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
 
        btrfs_set_path_blocking(path);
 
-       item = btrfs_item_nr(leaf, path->slots[0]);
+       item = btrfs_item_nr(path->slots[0]);
        orig_offset = btrfs_item_offset(leaf, item);
        item_size = btrfs_item_size(leaf, item);
 
@@ -4232,7 +4443,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(leaf, &disk_key, slot);
 
-       new_item = btrfs_item_nr(leaf, slot);
+       new_item = btrfs_item_nr(slot);
 
        btrfs_set_item_offset(leaf, new_item, orig_offset);
        btrfs_set_item_size(leaf, new_item, item_size - split_offset);
@@ -4371,7 +4582,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
-               item = btrfs_item_nr(leaf, i);
+               item = btrfs_item_nr(i);
 
                ioff = btrfs_token_item_offset(leaf, item, &token);
                btrfs_set_token_item_offset(leaf, item,
@@ -4403,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
                                ptr = btrfs_item_ptr_offset(leaf, slot);
                                memmove_extent_buffer(leaf, ptr,
                                      (unsigned long)fi,
-                                     offsetof(struct btrfs_file_extent_item,
-                                                disk_bytenr));
+                                     BTRFS_FILE_EXTENT_INLINE_DATA_START);
                        }
                }
 
@@ -4416,10 +4626,10 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
                btrfs_set_item_key(leaf, &disk_key, slot);
                if (slot == 0)
-                       fixup_low_keys(root, path, &disk_key, 1);
+                       fixup_low_keys(root->fs_info, path, &disk_key, 1);
        }
 
-       item = btrfs_item_nr(leaf, slot);
+       item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);
 
@@ -4430,7 +4640,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 }
 
 /*
- * make the item pointed to by the path bigger, data_size is the new size.
+ * make the item pointed to by the path bigger, data_size is the added size.
  */
 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
                       u32 data_size)
@@ -4462,7 +4672,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
        BUG_ON(slot < 0);
        if (slot >= nritems) {
                btrfs_print_leaf(root, leaf);
-               printk(KERN_CRIT "slot %d too large, nritems %d\n",
+               btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
                       slot, nritems);
                BUG_ON(1);
        }
@@ -4473,7 +4683,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
-               item = btrfs_item_nr(leaf, i);
+               item = btrfs_item_nr(i);
 
                ioff = btrfs_token_item_offset(leaf, item, &token);
                btrfs_set_token_item_offset(leaf, item,
@@ -4487,7 +4697,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 
        data_end = old_data;
        old_size = btrfs_item_size_nr(leaf, slot);
-       item = btrfs_item_nr(leaf, slot);
+       item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);
 
@@ -4515,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
        int slot;
        struct btrfs_map_token token;
 
+       if (path->slots[0] == 0) {
+               btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+               fixup_low_keys(root->fs_info, path, &disk_key, 1);
+       }
+       btrfs_unlock_up_safe(path, 1);
+
        btrfs_init_map_token(&token);
 
        leaf = path->nodes[0];
@@ -4525,7 +4741,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 
        if (btrfs_leaf_free_space(root, leaf) < total_size) {
                btrfs_print_leaf(root, leaf);
-               printk(KERN_CRIT "not enough freespace need %u have %d\n",
+               btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
                       total_size, btrfs_leaf_free_space(root, leaf));
                BUG();
        }
@@ -4535,7 +4751,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 
                if (old_data < data_end) {
                        btrfs_print_leaf(root, leaf);
-                       printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+                       btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
                               slot, old_data, data_end);
                        BUG_ON(1);
                }
@@ -4546,7 +4762,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
                for (i = slot; i < nritems; i++) {
                        u32 ioff;
 
-                       item = btrfs_item_nr(leaf, i);
+                       item = btrfs_item_nr( i);
                        ioff = btrfs_token_item_offset(leaf, item, &token);
                        btrfs_set_token_item_offset(leaf, item,
                                                    ioff - total_data, &token);
@@ -4567,7 +4783,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
        for (i = 0; i < nr; i++) {
                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
-               item = btrfs_item_nr(leaf, slot + i);
+               item = btrfs_item_nr(slot + i);
                btrfs_set_token_item_offset(leaf, item,
                                            data_end - data_size[i], &token);
                data_end -= data_size[i];
@@ -4575,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
        }
 
        btrfs_set_header_nritems(leaf, nritems + nr);
-
-       if (slot == 0) {
-               btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-               fixup_low_keys(root, path, &disk_key, 1);
-       }
-       btrfs_unlock_up_safe(path, 1);
        btrfs_mark_buffer_dirty(leaf);
 
        if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -4675,7 +4885,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                              (nritems - slot - 1));
        } else if (level) {
                ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
-                                             MOD_LOG_KEY_REMOVE);
+                                             MOD_LOG_KEY_REMOVE, GFP_NOFS);
                BUG_ON(ret < 0);
        }
 
@@ -4689,7 +4899,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                struct btrfs_disk_key disk_key;
 
                btrfs_node_key(parent, &disk_key, 0);
-               fixup_low_keys(root, path, &disk_key, level + 1);
+               fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
        }
        btrfs_mark_buffer_dirty(parent);
 }
@@ -4733,8 +4943,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
        struct extent_buffer *leaf;
        struct btrfs_item *item;
-       int last_off;
-       int dsize = 0;
+       u32 last_off;
+       u32 dsize = 0;
        int ret = 0;
        int wret;
        int i;
@@ -4762,7 +4972,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                for (i = slot + nr; i < nritems; i++) {
                        u32 ioff;
 
-                       item = btrfs_item_nr(leaf, i);
+                       item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(leaf, item, &token);
                        btrfs_set_token_item_offset(leaf, item,
                                                    ioff + dsize, &token);
@@ -4782,7 +4992,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                        btrfs_set_header_level(leaf, 0);
                } else {
                        btrfs_set_path_blocking(path);
-                       clean_tree_block(trans, root, leaf);
+                       clean_tree_block(trans, root->fs_info, leaf);
                        btrfs_del_leaf(trans, root, path, leaf);
                }
        } else {
@@ -4791,7 +5001,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                        struct btrfs_disk_key disk_key;
 
                        btrfs_item_key(leaf, &disk_key, 0);
-                       fixup_low_keys(root, path, &disk_key, 1);
+                       fixup_low_keys(root->fs_info, path, &disk_key, 1);
                }
 
                /* delete the leaf if it is mostly empty */
@@ -4855,14 +5065,18 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
 
-       if (key.offset > 0)
+       if (key.offset > 0) {
                key.offset--;
-       else if (key.type > 0)
+       } else if (key.type > 0) {
                key.type--;
-       else if (key.objectid > 0)
+               key.offset = (u64)-1;
+       } else if (key.objectid > 0) {
                key.objectid--;
-       else
+               key.type = (u8)-1;
+               key.offset = (u64)-1;
+       } else {
                return 1;
+       }
 
        btrfs_release_path(path);
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -4870,7 +5084,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
                return ret;
        btrfs_item_key(path->nodes[0], &found_key, 0);
        ret = comp_keys(&found_key, &key);
-       if (ret < 0)
+       /*
+        * We might have had an item with the previous key in the tree right
+        * before we released our path. And after we released our path, that
+        * item might have been pushed to the first slot (0) of the leaf we
+        * were holding due to a tree balance. Alternatively, an item with the
+        * previous key can exist as the only element of a leaf (big fat item).
+        * Therefore account for these 2 cases, so that our callers (like
+        * btrfs_previous_item) don't miss an existing item with a key matching
+        * the previous key we computed above.
+        */
+       if (ret <= 0)
                return 0;
        return 1;
 }
@@ -4898,7 +5122,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * was nothing in the tree that matched the search criteria.
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
-                        struct btrfs_key *max_key,
                         struct btrfs_path *path,
                         u64 min_trans)
 {
@@ -4909,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
        u32 nritems;
        int level;
        int ret = 1;
+       int keep_locks = path->keep_locks;
 
-       WARN_ON(!path->keep_locks);
+       path->keep_locks = 1;
 again:
        cur = btrfs_read_lock_root_node(root);
        level = btrfs_header_level(cur);
@@ -4943,10 +5167,8 @@ again:
                 * If it is too old, old, skip to the next one.
                 */
                while (slot < nritems) {
-                       u64 blockptr;
                        u64 gen;
 
-                       blockptr = btrfs_node_blockptr(cur, slot);
                        gen = btrfs_node_ptr_generation(cur, slot);
                        if (gen < min_trans) {
                                slot++;
@@ -4976,7 +5198,6 @@ find_next_key:
                path->slots[level] = slot;
                if (level == path->lowest_level) {
                        ret = 0;
-                       unlock_up(path, level, 1, 0, NULL);
                        goto out;
                }
                btrfs_set_path_blocking(path);
@@ -4991,9 +5212,12 @@ find_next_key:
                btrfs_clear_path_blocking(path, NULL, 0);
        }
 out:
-       if (ret == 0)
+       path->keep_locks = keep_locks;
+       if (ret == 0) {
+               btrfs_unlock_up_safe(path, path->lowest_level + 1);
+               btrfs_set_path_blocking(path);
                memcpy(min_key, &found_key, sizeof(found_key));
-       btrfs_set_path_blocking(path);
+       }
        return ret;
 }
 
@@ -5112,7 +5336,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 {
        int ret;
        int cmp;
-       struct btrfs_trans_handle *trans = NULL;
        struct btrfs_path *left_path = NULL;
        struct btrfs_path *right_path = NULL;
        struct btrfs_key left_key;
@@ -5128,9 +5351,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
        int advance_right;
        u64 left_blockptr;
        u64 right_blockptr;
-       u64 left_start_ctransid;
-       u64 right_start_ctransid;
-       u64 ctransid;
+       u64 left_gen;
+       u64 right_gen;
 
        left_path = btrfs_alloc_path();
        if (!left_path) {
@@ -5143,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
                goto out;
        }
 
-       tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+       tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
        if (!tmp_buf) {
                ret = -ENOMEM;
                goto out;
@@ -5154,21 +5376,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
        right_path->search_commit_root = 1;
        right_path->skip_locking = 1;
 
-       spin_lock(&left_root->root_item_lock);
-       left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
-       spin_unlock(&left_root->root_item_lock);
-
-       spin_lock(&right_root->root_item_lock);
-       right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
-       spin_unlock(&right_root->root_item_lock);
-
-       trans = btrfs_join_transaction(left_root);
-       if (IS_ERR(trans)) {
-               ret = PTR_ERR(trans);
-               trans = NULL;
-               goto out;
-       }
-
        /*
         * Strategy: Go to the first items of both trees. Then do
         *
@@ -5205,6 +5412,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
         *   the right if possible or go up and right.
         */
 
+       down_read(&left_root->fs_info->commit_root_sem);
        left_level = btrfs_header_level(left_root->commit_root);
        left_root_level = left_level;
        left_path->nodes[left_level] = left_root->commit_root;
@@ -5214,6 +5422,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
        right_root_level = right_level;
        right_path->nodes[right_level] = right_root->commit_root;
        extent_buffer_get(right_path->nodes[right_level]);
+       up_read(&left_root->fs_info->commit_root_sem);
 
        if (left_level == 0)
                btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -5232,67 +5441,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
        advance_left = advance_right = 0;
 
        while (1) {
-               /*
-                * We need to make sure the transaction does not get committed
-                * while we do anything on commit roots. This means, we need to
-                * join and leave transactions for every item that we process.
-                */
-               if (trans && btrfs_should_end_transaction(trans, left_root)) {
-                       btrfs_release_path(left_path);
-                       btrfs_release_path(right_path);
-
-                       ret = btrfs_end_transaction(trans, left_root);
-                       trans = NULL;
-                       if (ret < 0)
-                               goto out;
-               }
-               /* now rejoin the transaction */
-               if (!trans) {
-                       trans = btrfs_join_transaction(left_root);
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                               trans = NULL;
-                               goto out;
-                       }
-
-                       spin_lock(&left_root->root_item_lock);
-                       ctransid = btrfs_root_ctransid(&left_root->root_item);
-                       spin_unlock(&left_root->root_item_lock);
-                       if (ctransid != left_start_ctransid)
-                               left_start_ctransid = 0;
-
-                       spin_lock(&right_root->root_item_lock);
-                       ctransid = btrfs_root_ctransid(&right_root->root_item);
-                       spin_unlock(&right_root->root_item_lock);
-                       if (ctransid != right_start_ctransid)
-                               right_start_ctransid = 0;
-
-                       if (!left_start_ctransid || !right_start_ctransid) {
-                               WARN(1, KERN_WARNING
-                                       "btrfs: btrfs_compare_tree detected "
-                                       "a change in one of the trees while "
-                                       "iterating. This is probably a "
-                                       "bug.\n");
-                               ret = -EIO;
-                               goto out;
-                       }
-
-                       /*
-                        * the commit root may have changed, so start again
-                        * where we stopped
-                        */
-                       left_path->lowest_level = left_level;
-                       right_path->lowest_level = right_level;
-                       ret = btrfs_search_slot(NULL, left_root,
-                                       &left_key, left_path, 0, 0);
-                       if (ret < 0)
-                               goto out;
-                       ret = btrfs_search_slot(NULL, right_root,
-                                       &right_key, right_path, 0, 0);
-                       if (ret < 0)
-                               goto out;
-               }
-
                if (advance_left && !left_end_reached) {
                        ret = tree_advance(left_root, left_path, &left_level,
                                        left_root_level,
@@ -5362,19 +5510,20 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
                                        goto out;
                                advance_right = ADVANCE;
                        } else {
+                               enum btrfs_compare_tree_result result;
+
                                WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
                                ret = tree_compare_item(left_root, left_path,
                                                right_path, tmp_buf);
-                               if (ret) {
-                                       WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
-                                       ret = changed_cb(left_root, right_root,
-                                               left_path, right_path,
-                                               &left_key,
-                                               BTRFS_COMPARE_TREE_CHANGED,
-                                               ctx);
-                                       if (ret < 0)
-                                               goto out;
-                               }
+                               if (ret)
+                                       result = BTRFS_COMPARE_TREE_CHANGED;
+                               else
+                                       result = BTRFS_COMPARE_TREE_SAME;
+                               ret = changed_cb(left_root, right_root,
+                                                left_path, right_path,
+                                                &left_key, result, ctx);
+                               if (ret < 0)
+                                       goto out;
                                advance_left = ADVANCE;
                                advance_right = ADVANCE;
                        }
@@ -5391,7 +5540,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
                                right_blockptr = btrfs_node_blockptr(
                                                right_path->nodes[right_level],
                                                right_path->slots[right_level]);
-                               if (left_blockptr == right_blockptr) {
+                               left_gen = btrfs_node_ptr_generation(
+                                               left_path->nodes[left_level],
+                                               left_path->slots[left_level]);
+                               right_gen = btrfs_node_ptr_generation(
+                                               right_path->nodes[right_level],
+                                               right_path->slots[right_level]);
+                               if (left_blockptr == right_blockptr &&
+                                   left_gen == right_gen) {
                                        /*
                                         * As we're on a shared block, don't
                                         * allow to go deeper.
@@ -5414,14 +5570,6 @@ out:
        btrfs_free_path(left_path);
        btrfs_free_path(right_path);
        kfree(tmp_buf);
-
-       if (trans) {
-               if (!ret)
-                       ret = btrfs_end_transaction(trans, left_root);
-               else
-                       btrfs_end_transaction(trans, left_root);
-       }
-
        return ret;
 }
 
@@ -5560,6 +5708,24 @@ again:
                ret = 0;
                goto done;
        }
+       /*
+        * So the above check misses one case:
+        * - after releasing the path above, someone has removed the item that
+        *   used to be at the very end of the block, and balance between leafs
+        *   gets another one with bigger key.offset to replace it.
+        *
+        * This one should be returned as well, or we can get leaf corruption
+        * later(esp. in __btrfs_drop_extents()).
+        *
+        * And a bit more explanation about this check,
+        * with ret > 0, the key isn't found, the path points to the slot
+        * where it should be inserted, so the path->slots[0] item must be the
+        * bigger one.
+        */
+       if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+               ret = 0;
+               goto done;
+       }
 
        while (level < BTRFS_MAX_LEVEL) {
                if (!path->nodes[level]) {
@@ -5708,3 +5874,46 @@ int btrfs_previous_item(struct btrfs_root *root,
        }
        return 1;
 }
+
+/*
+ * search in extent tree to find a previous Metadata/Data extent item with
+ * min objecitd.
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_extent_item(struct btrfs_root *root,
+                       struct btrfs_path *path, u64 min_objectid)
+{
+       struct btrfs_key found_key;
+       struct extent_buffer *leaf;
+       u32 nritems;
+       int ret;
+
+       while (1) {
+               if (path->slots[0] == 0) {
+                       btrfs_set_path_blocking(path);
+                       ret = btrfs_prev_leaf(root, path);
+                       if (ret != 0)
+                               return ret;
+               } else {
+                       path->slots[0]--;
+               }
+               leaf = path->nodes[0];
+               nritems = btrfs_header_nritems(leaf);
+               if (nritems == 0)
+                       return 1;
+               if (path->slots[0] == nritems)
+                       path->slots[0]--;
+
+               btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+               if (found_key.objectid < min_objectid)
+                       break;
+               if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
+                   found_key.type == BTRFS_METADATA_ITEM_KEY)
+                       return 0;
+               if (found_key.objectid == min_objectid &&
+                   found_key.type < BTRFS_EXTENT_ITEM_KEY)
+                       break;
+       }
+       return 1;
+}