btrfs: fix leak of path in btrfs_find_item
[firefly-linux-kernel-4.4.55.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31                       *root, struct btrfs_key *ins_key,
32                       struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34                           struct btrfs_root *root, struct extent_buffer *dst,
35                           struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37                               struct btrfs_root *root,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41                     int level, int slot);
42 static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43                                  struct extent_buffer *eb);
44
45 struct btrfs_path *btrfs_alloc_path(void)
46 {
47         struct btrfs_path *path;
48         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
49         return path;
50 }
51
52 /*
53  * set all locked nodes in the path to blocking locks.  This should
54  * be done before scheduling
55  */
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		/* only convert levels that hold a node AND a lock */
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		/* record the new blocking state so a later clear can undo it */
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}
69
70 /*
71  * reset all the locked nodes in the patch to spinning locks.
72  *
73  * held is used to keep lockdep happy, when lockdep is enabled
74  * we set held to a blocking lock before we go around and
75  * retake all the spinlocks in the path.  You can safely use NULL
76  * for held
77  */
/*
 * reset all the locked nodes in the patch to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		/* make @held blocking first so lockdep sees a consistent order */
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	/* retake spinning locks bottom-up (leaf towards root) */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	/* finally return @held to a spinning lock as well */
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
105
106 /* this also releases the path */
107 void btrfs_free_path(struct btrfs_path *p)
108 {
109         if (!p)
110                 return;
111         btrfs_release_path(p);
112         kmem_cache_free(btrfs_path_cachep, p);
113 }
114
115 /*
116  * path release drops references on the extent buffers in the path
117  * and it drops any locks held by this path
118  *
119  * It is safe to call this on paths that no locks or extent buffers held.
120  */
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		/* unlock before dropping the reference on the buffer */
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
137
138 /*
139  * safely gets a reference on the root node of a tree.  A lock
140  * is not taken, so a concurrent writer may put a different node
141  * at the root of the tree.  See btrfs_lock_root_node for the
142  * looping required.
143  *
144  * The extent buffer returned by this has a reference taken, so
145  * it won't disappear.  It may stop being the root of the tree
146  * at any time because there are no locks held.
147  */
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
171
172 /* loop around taking references on and locking the root node of the
173  * tree until you end up with a lock on the root.  A locked buffer
174  * is returned, with a reference held.
175  */
176 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
177 {
178         struct extent_buffer *eb;
179
180         while (1) {
181                 eb = btrfs_root_node(root);
182                 btrfs_tree_lock(eb);
183                 if (eb == root->node)
184                         break;
185                 btrfs_tree_unlock(eb);
186                 free_extent_buffer(eb);
187         }
188         return eb;
189 }
190
191 /* loop around taking references on and locking the root node of the
192  * tree until you end up with a lock on the root.  A locked buffer
193  * is returned, with a reference held.
194  */
195 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
196 {
197         struct extent_buffer *eb;
198
199         while (1) {
200                 eb = btrfs_root_node(root);
201                 btrfs_tree_read_lock(eb);
202                 if (eb == root->node)
203                         break;
204                 btrfs_tree_read_unlock(eb);
205                 free_extent_buffer(eb);
206         }
207         return eb;
208 }
209
210 /* cowonly root (everything not a reference counted cow subvolume), just get
211  * put onto a simple dirty list.  transaction.c walks this to make sure they
212  * get properly updated on disk.
213  */
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	/* only track roots flagged for dirty tracking, and only add once */
	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
	    list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}
224
225 /*
226  * used by snapshot creation to make a copy of a root for a tree with
227  * a given objectid.  The buffer with the new root node is returned in
228  * cow_ret, and this func returns zero on success or a negative error code.
229  */
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference counted roots may only be copied inside their running
	 * transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* first key of the buffer seeds the new block's key */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the contents, then fix up the header of the copy */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* bump extent refs on everything the new root points at; relocation
	 * trees use full backrefs (last arg == 1) */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
283
/* operation types recorded in the tree modification log */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

/* parameters of a MOD_LOG_MOVE_KEYS operation */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

/* identifies the replaced root for MOD_LOG_ROOT_REPLACE */
struct tree_mod_root {
	u64 logical;
	u8 level;
};

/* one entry of the tree mod log, keyed by (index, seq) in an rb tree */
struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
326
/* thin wrappers around fs_info->tree_mod_log_lock (an rwlock) that
 * protect the tree mod log rb tree */
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}
346
347 /*
348  * Pull a new tree mod seq number for our operation.
349  */
/*
 * Pull a new tree mod seq number for our operation.
 * Atomically increments fs_info->tree_mod_seq and returns the new value.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
354
355 /*
356  * This adds a new blocker to the tree mod log's blocker list if the @elem
357  * passed does not already have a sequence number set. So when a caller expects
358  * to record tree modifications, it should ensure to set elem->seq to zero
359  * before calling btrfs_get_tree_mod_seq.
360  * Returns a fresh, unused tree log modification sequence number, even if no new
361  * blocker was added.
362  */
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/* write lock first, then the seq spinlock — same nesting as
	 * btrfs_put_tree_mod_seq uses when pruning */
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}
377
/*
 * Drop the blocker @elem from the tree mod seq list and, if it was the
 * oldest blocker, prune every log entry that no remaining blocker can
 * still need (everything with seq <= the new minimum).
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* elem was never registered via btrfs_get_tree_mod_seq */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* find the lowest seq among the remaining blockers */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* grab next before erasing the current node */
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
427
428 /*
429  * key order of the log:
430  *       index -> sequence
431  *
432  * the index is the shifted logical of the *new* root node for root replace
433  * operations, or the shifted logical of the affected block for all other
434  * operations.
435  *
436  * Note: must be called with write lock (tree_mod_log_write_lock).
437  */
/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	/* assign the entry's seq only now, under the write lock */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	/*
	 * NOTE: smaller keys descend LEFT here (inverted vs the usual rb
	 * idiom); __tree_mod_log_search walks with the same inverted
	 * comparison, so the two stay consistent.
	 */
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;	/* duplicate (index, seq) pair */
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
471
472 /*
473  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
474  * returns zero with the tree_mod_log_lock acquired. The caller must hold
475  * this until all tree mod log insertions are recorded in the rb tree and then
476  * call tree_mod_log_write_unlock() to release.
477  */
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	/* no blockers registered -> nobody can replay the log */
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	/* leaves are never logged, only node-level changes are */
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	/* re-check under the lock; a blocker may have gone away */
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}
494
495 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
496 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
497                                     struct extent_buffer *eb)
498 {
499         smp_mb();
500         if (list_empty(&(fs_info)->tree_mod_seq_list))
501                 return 0;
502         if (eb && btrfs_header_level(eb) == 0)
503                 return 0;
504
505         return 1;
506 }
507
/*
 * Allocate and fill a tree mod log entry of type @op for @slot of @eb.
 * Returns NULL on allocation failure.  The entry's seq is assigned later
 * by __tree_mod_log_insert().
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	/* for removals/replacements, remember what the slot used to hold */
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	/* mark as not yet inserted so error paths can tell */
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
530
/*
 * Log a single-key operation @op on @slot of @eb.  Returns 0 on success
 * (or when logging isn't needed), -ENOMEM on allocation failure, or the
 * error from __tree_mod_log_insert().
 */
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* cheap lockless check before allocating */
	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* re-check with the write lock; may have become unnecessary */
	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}
558
559 static noinline int
560 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
561                          struct extent_buffer *eb, int dst_slot, int src_slot,
562                          int nr_items, gfp_t flags)
563 {
564         struct tree_mod_elem *tm = NULL;
565         struct tree_mod_elem **tm_list = NULL;
566         int ret = 0;
567         int i;
568         int locked = 0;
569
570         if (!tree_mod_need_log(fs_info, eb))
571                 return 0;
572
573         tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
574         if (!tm_list)
575                 return -ENOMEM;
576
577         tm = kzalloc(sizeof(*tm), flags);
578         if (!tm) {
579                 ret = -ENOMEM;
580                 goto free_tms;
581         }
582
583         tm->index = eb->start >> PAGE_CACHE_SHIFT;
584         tm->slot = src_slot;
585         tm->move.dst_slot = dst_slot;
586         tm->move.nr_items = nr_items;
587         tm->op = MOD_LOG_MOVE_KEYS;
588
589         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
590                 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
591                     MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
592                 if (!tm_list[i]) {
593                         ret = -ENOMEM;
594                         goto free_tms;
595                 }
596         }
597
598         if (tree_mod_dont_log(fs_info, eb))
599                 goto free_tms;
600         locked = 1;
601
602         /*
603          * When we override something during the move, we log these removals.
604          * This can only happen when we move towards the beginning of the
605          * buffer, i.e. dst_slot < src_slot.
606          */
607         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
608                 ret = __tree_mod_log_insert(fs_info, tm_list[i]);
609                 if (ret)
610                         goto free_tms;
611         }
612
613         ret = __tree_mod_log_insert(fs_info, tm);
614         if (ret)
615                 goto free_tms;
616         tree_mod_log_write_unlock(fs_info);
617         kfree(tm_list);
618
619         return 0;
620 free_tms:
621         for (i = 0; i < nr_items; i++) {
622                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
623                         rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
624                 kfree(tm_list[i]);
625         }
626         if (locked)
627                 tree_mod_log_write_unlock(fs_info);
628         kfree(tm_list);
629         kfree(tm);
630
631         return ret;
632 }
633
/*
 * Insert the @nritems pre-allocated entries of @tm_list into the log,
 * highest slot first.  On failure, entries already inserted by this call
 * are erased again (but not freed).  Caller must hold the write lock.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* roll back the insertions done so far */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
654
655 static noinline int
656 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
657                          struct extent_buffer *old_root,
658                          struct extent_buffer *new_root, gfp_t flags,
659                          int log_removal)
660 {
661         struct tree_mod_elem *tm = NULL;
662         struct tree_mod_elem **tm_list = NULL;
663         int nritems = 0;
664         int ret = 0;
665         int i;
666
667         if (!tree_mod_need_log(fs_info, NULL))
668                 return 0;
669
670         if (log_removal && btrfs_header_level(old_root) > 0) {
671                 nritems = btrfs_header_nritems(old_root);
672                 tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
673                                   flags);
674                 if (!tm_list) {
675                         ret = -ENOMEM;
676                         goto free_tms;
677                 }
678                 for (i = 0; i < nritems; i++) {
679                         tm_list[i] = alloc_tree_mod_elem(old_root, i,
680                             MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
681                         if (!tm_list[i]) {
682                                 ret = -ENOMEM;
683                                 goto free_tms;
684                         }
685                 }
686         }
687
688         tm = kzalloc(sizeof(*tm), flags);
689         if (!tm) {
690                 ret = -ENOMEM;
691                 goto free_tms;
692         }
693
694         tm->index = new_root->start >> PAGE_CACHE_SHIFT;
695         tm->old_root.logical = old_root->start;
696         tm->old_root.level = btrfs_header_level(old_root);
697         tm->generation = btrfs_header_generation(old_root);
698         tm->op = MOD_LOG_ROOT_REPLACE;
699
700         if (tree_mod_dont_log(fs_info, NULL))
701                 goto free_tms;
702
703         if (tm_list)
704                 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
705         if (!ret)
706                 ret = __tree_mod_log_insert(fs_info, tm);
707
708         tree_mod_log_write_unlock(fs_info);
709         if (ret)
710                 goto free_tms;
711         kfree(tm_list);
712
713         return ret;
714
715 free_tms:
716         if (tm_list) {
717                 for (i = 0; i < nritems; i++)
718                         kfree(tm_list[i]);
719                 kfree(tm_list);
720         }
721         kfree(tm);
722
723         return ret;
724 }
725
/*
 * Look up a log entry for the block at @start with seq >= @min_seq.
 * With @smallest set, return the entry with the smallest such seq
 * (oldest); otherwise return the one with the largest seq (newest).
 * Returns NULL when no matching entry exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	/* descend with the same inverted ordering __tree_mod_log_insert uses */
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact seq == min_seq match, can't do better */
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}
768
769 /*
770  * this returns the element from the log with the smallest time sequence
771  * value that's in the log (the oldest log item). any element with a time
772  * sequence lower than min_seq will be ignored.
773  */
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
780
781 /*
782  * this returns the element from the log with the largest time sequence
783  * value that's in the log (the most recent log item). any element with
784  * a time sequence lower than min_seq will be ignored.
785  */
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
791
792 static noinline int
793 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
794                      struct extent_buffer *src, unsigned long dst_offset,
795                      unsigned long src_offset, int nr_items)
796 {
797         int ret = 0;
798         struct tree_mod_elem **tm_list = NULL;
799         struct tree_mod_elem **tm_list_add, **tm_list_rem;
800         int i;
801         int locked = 0;
802
803         if (!tree_mod_need_log(fs_info, NULL))
804                 return 0;
805
806         if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
807                 return 0;
808
809         tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
810                           GFP_NOFS);
811         if (!tm_list)
812                 return -ENOMEM;
813
814         tm_list_add = tm_list;
815         tm_list_rem = tm_list + nr_items;
816         for (i = 0; i < nr_items; i++) {
817                 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
818                     MOD_LOG_KEY_REMOVE, GFP_NOFS);
819                 if (!tm_list_rem[i]) {
820                         ret = -ENOMEM;
821                         goto free_tms;
822                 }
823
824                 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
825                     MOD_LOG_KEY_ADD, GFP_NOFS);
826                 if (!tm_list_add[i]) {
827                         ret = -ENOMEM;
828                         goto free_tms;
829                 }
830         }
831
832         if (tree_mod_dont_log(fs_info, NULL))
833                 goto free_tms;
834         locked = 1;
835
836         for (i = 0; i < nr_items; i++) {
837                 ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
838                 if (ret)
839                         goto free_tms;
840                 ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
841                 if (ret)
842                         goto free_tms;
843         }
844
845         tree_mod_log_write_unlock(fs_info);
846         kfree(tm_list);
847
848         return 0;
849
850 free_tms:
851         for (i = 0; i < nr_items * 2; i++) {
852                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
853                         rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
854                 kfree(tm_list[i]);
855         }
856         if (locked)
857                 tree_mod_log_write_unlock(fs_info);
858         kfree(tm_list);
859
860         return ret;
861 }
862
/*
 * Convenience wrapper around tree_mod_log_insert_move() for in-buffer
 * moves; treats any failure as fatal (BUG_ON).
 */
static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}
872
/*
 * Log a key replacement at @slot of @eb.  @atomic selects GFP_ATOMIC
 * allocation for callers in atomic context.  Failure is fatal (BUG_ON).
 */
static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key(fs_info, eb, slot,
					MOD_LOG_KEY_REPLACE,
					atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}
884
885 static noinline int
886 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
887 {
888         struct tree_mod_elem **tm_list = NULL;
889         int nritems = 0;
890         int i;
891         int ret = 0;
892
893         if (btrfs_header_level(eb) == 0)
894                 return 0;
895
896         if (!tree_mod_need_log(fs_info, NULL))
897                 return 0;
898
899         nritems = btrfs_header_nritems(eb);
900         tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
901                           GFP_NOFS);
902         if (!tm_list)
903                 return -ENOMEM;
904
905         for (i = 0; i < nritems; i++) {
906                 tm_list[i] = alloc_tree_mod_elem(eb, i,
907                     MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
908                 if (!tm_list[i]) {
909                         ret = -ENOMEM;
910                         goto free_tms;
911                 }
912         }
913
914         if (tree_mod_dont_log(fs_info, eb))
915                 goto free_tms;
916
917         ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
918         tree_mod_log_write_unlock(fs_info);
919         if (ret)
920                 goto free_tms;
921         kfree(tm_list);
922
923         return 0;
924
925 free_tms:
926         for (i = 0; i < nritems; i++)
927                 kfree(tm_list[i]);
928         kfree(tm_list);
929
930         return ret;
931 }
932
933 static noinline void
934 tree_mod_log_set_root_pointer(struct btrfs_root *root,
935                               struct extent_buffer *new_root_node,
936                               int log_removal)
937 {
938         int ret;
939         ret = tree_mod_log_insert_root(root->fs_info, root->node,
940                                        new_root_node, GFP_NOFS, log_removal);
941         BUG_ON(ret < 0);
942 }
943
944 /*
945  * check if the tree block can be shared by multiple trees
946  */
947 int btrfs_block_can_be_shared(struct btrfs_root *root,
948                               struct extent_buffer *buf)
949 {
950         /*
951          * Tree blocks not in refernece counted trees and tree roots
952          * are never shared. If a block was allocated after the last
953          * snapshot and the block was not allocated by tree relocation,
954          * we know the block is not shared.
955          */
956         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
957             buf != root->node && buf != root->commit_root &&
958             (btrfs_header_generation(buf) <=
959              btrfs_root_last_snapshot(&root->root_item) ||
960              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
961                 return 1;
962 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
963         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
964             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
965                 return 1;
966 #endif
967         return 0;
968 }
969
/*
 * update_ref_for_cow - update extent backrefs when COWing @buf into @cow
 *
 * Depending on whether @buf can still be shared with other trees, either
 * add references for both copies (marking the extent FULL_BACKREF when
 * required), or hand the references over to @cow and drop them from @buf.
 *
 * *last_ref is set to 1 when @buf held its last reference, telling the
 * caller that the old block may be freed.
 *
 * Returns 0 on success, a negative errno on failure (extent tree lookup
 * errors, or -EROFS when the extent record claims zero references).
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* possibly shared: read the real refcount and flags */
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/*
			 * A zero refcount on a live block means the extent
			 * tree is corrupted; force the fs read-only.
			 */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		/* not shareable: exactly one reference by construction */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* block stays shared: both @buf and @cow keep child refs */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			/* converting to full backrefs below */
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* persist the FULL_BACKREF flag in the extent tree */
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/*
		 * Last reference: move the child references over to @cow
		 * and release @buf's.
		 */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}
1073
1074 /*
1075  * does the dirty work in cow of a single block.  The parent block (if
1076  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1077  * dirty and returned locked.  If you modify the block it needs to be marked
1078  * dirty again.
1079  *
1080  * search_start -- an allocation hint for the new block
1081  *
1082  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1083  * bytes the allocator should try to find free next to the block it returns.
1084  * This is just a hint and may be ignored by the allocator.
1085  */
1086 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1087                              struct btrfs_root *root,
1088                              struct extent_buffer *buf,
1089                              struct extent_buffer *parent, int parent_slot,
1090                              struct extent_buffer **cow_ret,
1091                              u64 search_start, u64 empty_size)
1092 {
1093         struct btrfs_disk_key disk_key;
1094         struct extent_buffer *cow;
1095         int level, ret;
1096         int last_ref = 0;
1097         int unlock_orig = 0;
1098         u64 parent_start;
1099
1100         if (*cow_ret == buf)
1101                 unlock_orig = 1;
1102
1103         btrfs_assert_tree_locked(buf);
1104
1105         WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1106                 trans->transid != root->fs_info->running_transaction->transid);
1107         WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1108                 trans->transid != root->last_trans);
1109
1110         level = btrfs_header_level(buf);
1111
1112         if (level == 0)
1113                 btrfs_item_key(buf, &disk_key, 0);
1114         else
1115                 btrfs_node_key(buf, &disk_key, 0);
1116
1117         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1118                 if (parent)
1119                         parent_start = parent->start;
1120                 else
1121                         parent_start = 0;
1122         } else
1123                 parent_start = 0;
1124
1125         cow = btrfs_alloc_tree_block(trans, root, parent_start,
1126                         root->root_key.objectid, &disk_key, level,
1127                         search_start, empty_size);
1128         if (IS_ERR(cow))
1129                 return PTR_ERR(cow);
1130
1131         /* cow is set to blocking by btrfs_init_new_buffer */
1132
1133         copy_extent_buffer(cow, buf, 0, 0, cow->len);
1134         btrfs_set_header_bytenr(cow, cow->start);
1135         btrfs_set_header_generation(cow, trans->transid);
1136         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1137         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1138                                      BTRFS_HEADER_FLAG_RELOC);
1139         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1140                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1141         else
1142                 btrfs_set_header_owner(cow, root->root_key.objectid);
1143
1144         write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
1145                             BTRFS_FSID_SIZE);
1146
1147         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1148         if (ret) {
1149                 btrfs_abort_transaction(trans, root, ret);
1150                 return ret;
1151         }
1152
1153         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1154                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1155                 if (ret)
1156                         return ret;
1157         }
1158
1159         if (buf == root->node) {
1160                 WARN_ON(parent && parent != buf);
1161                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1162                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1163                         parent_start = buf->start;
1164                 else
1165                         parent_start = 0;
1166
1167                 extent_buffer_get(cow);
1168                 tree_mod_log_set_root_pointer(root, cow, 1);
1169                 rcu_assign_pointer(root->node, cow);
1170
1171                 btrfs_free_tree_block(trans, root, buf, parent_start,
1172                                       last_ref);
1173                 free_extent_buffer(buf);
1174                 add_root_to_dirty_list(root);
1175         } else {
1176                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1177                         parent_start = parent->start;
1178                 else
1179                         parent_start = 0;
1180
1181                 WARN_ON(trans->transid != btrfs_header_generation(parent));
1182                 tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1183                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1184                 btrfs_set_node_blockptr(parent, parent_slot,
1185                                         cow->start);
1186                 btrfs_set_node_ptr_generation(parent, parent_slot,
1187                                               trans->transid);
1188                 btrfs_mark_buffer_dirty(parent);
1189                 if (last_ref) {
1190                         ret = tree_mod_log_free_eb(root->fs_info, buf);
1191                         if (ret) {
1192                                 btrfs_abort_transaction(trans, root, ret);
1193                                 return ret;
1194                         }
1195                 }
1196                 btrfs_free_tree_block(trans, root, buf, parent_start,
1197                                       last_ref);
1198         }
1199         if (unlock_orig)
1200                 btrfs_tree_unlock(buf);
1201         free_extent_buffer_stale(buf);
1202         btrfs_mark_buffer_dirty(cow);
1203         *cow_ret = cow;
1204         return 0;
1205 }
1206
1207 /*
1208  * returns the logical address of the oldest predecessor of the given root.
1209  * entries older than time_seq are ignored.
1210  */
1211 static struct tree_mod_elem *
1212 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1213                            struct extent_buffer *eb_root, u64 time_seq)
1214 {
1215         struct tree_mod_elem *tm;
1216         struct tree_mod_elem *found = NULL;
1217         u64 root_logical = eb_root->start;
1218         int looped = 0;
1219
1220         if (!time_seq)
1221                 return NULL;
1222
1223         /*
1224          * the very last operation that's logged for a root is the replacement
1225          * operation (if it is replaced at all). this has the index of the *new*
1226          * root, making it the very first operation that's logged for this root.
1227          */
1228         while (1) {
1229                 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1230                                                 time_seq);
1231                 if (!looped && !tm)
1232                         return NULL;
1233                 /*
1234                  * if there are no tree operation for the oldest root, we simply
1235                  * return it. this should only happen if that (old) root is at
1236                  * level 0.
1237                  */
1238                 if (!tm)
1239                         break;
1240
1241                 /*
1242                  * if there's an operation that's not a root replacement, we
1243                  * found the oldest version of our root. normally, we'll find a
1244                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1245                  */
1246                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1247                         break;
1248
1249                 found = tm;
1250                 root_logical = tm->old_root.logical;
1251                 looped = 1;
1252         }
1253
1254         /* if there's no old root to return, return what we found instead */
1255         if (!found)
1256                 found = tm;
1257
1258         return found;
1259 }
1260
1261 /*
1262  * tm is a pointer to the first operation to rewind within eb. then, all
1263  * previous operations will be rewinded (until we reach something older than
1264  * time_seq).
1265  */
1266 static void
1267 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1268                       u64 time_seq, struct tree_mod_elem *first_tm)
1269 {
1270         u32 n;
1271         struct rb_node *next;
1272         struct tree_mod_elem *tm = first_tm;
1273         unsigned long o_dst;
1274         unsigned long o_src;
1275         unsigned long p_size = sizeof(struct btrfs_key_ptr);
1276
1277         n = btrfs_header_nritems(eb);
1278         tree_mod_log_read_lock(fs_info);
1279         while (tm && tm->seq >= time_seq) {
1280                 /*
1281                  * all the operations are recorded with the operator used for
1282                  * the modification. as we're going backwards, we do the
1283                  * opposite of each operation here.
1284                  */
1285                 switch (tm->op) {
1286                 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1287                         BUG_ON(tm->slot < n);
1288                         /* Fallthrough */
1289                 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1290                 case MOD_LOG_KEY_REMOVE:
1291                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1292                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1293                         btrfs_set_node_ptr_generation(eb, tm->slot,
1294                                                       tm->generation);
1295                         n++;
1296                         break;
1297                 case MOD_LOG_KEY_REPLACE:
1298                         BUG_ON(tm->slot >= n);
1299                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1300                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1301                         btrfs_set_node_ptr_generation(eb, tm->slot,
1302                                                       tm->generation);
1303                         break;
1304                 case MOD_LOG_KEY_ADD:
1305                         /* if a move operation is needed it's in the log */
1306                         n--;
1307                         break;
1308                 case MOD_LOG_MOVE_KEYS:
1309                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1310                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1311                         memmove_extent_buffer(eb, o_dst, o_src,
1312                                               tm->move.nr_items * p_size);
1313                         break;
1314                 case MOD_LOG_ROOT_REPLACE:
1315                         /*
1316                          * this operation is special. for roots, this must be
1317                          * handled explicitly before rewinding.
1318                          * for non-roots, this operation may exist if the node
1319                          * was a root: root A -> child B; then A gets empty and
1320                          * B is promoted to the new root. in the mod log, we'll
1321                          * have a root-replace operation for B, a tree block
1322                          * that is no root. we simply ignore that operation.
1323                          */
1324                         break;
1325                 }
1326                 next = rb_next(&tm->node);
1327                 if (!next)
1328                         break;
1329                 tm = container_of(next, struct tree_mod_elem, node);
1330                 if (tm->index != first_tm->index)
1331                         break;
1332         }
1333         tree_mod_log_read_unlock(fs_info);
1334         btrfs_set_header_nritems(eb, n);
1335 }
1336
1337 /*
1338  * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
1339  * is returned. If rewind operations happen, a fresh buffer is returned. The
1340  * returned buffer is always read-locked. If the returned buffer is not the
1341  * input buffer, the lock on the input buffer is released and the input buffer
1342  * is freed (its refcount is decremented).
1343  */
1344 static struct extent_buffer *
1345 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1346                     struct extent_buffer *eb, u64 time_seq)
1347 {
1348         struct extent_buffer *eb_rewin;
1349         struct tree_mod_elem *tm;
1350
1351         if (!time_seq)
1352                 return eb;
1353
1354         if (btrfs_header_level(eb) == 0)
1355                 return eb;
1356
1357         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1358         if (!tm)
1359                 return eb;
1360
1361         btrfs_set_path_blocking(path);
1362         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1363
1364         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1365                 BUG_ON(tm->slot != 0);
1366                 eb_rewin = alloc_dummy_extent_buffer(eb->start,
1367                                                 fs_info->tree_root->nodesize);
1368                 if (!eb_rewin) {
1369                         btrfs_tree_read_unlock_blocking(eb);
1370                         free_extent_buffer(eb);
1371                         return NULL;
1372                 }
1373                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1374                 btrfs_set_header_backref_rev(eb_rewin,
1375                                              btrfs_header_backref_rev(eb));
1376                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1377                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1378         } else {
1379                 eb_rewin = btrfs_clone_extent_buffer(eb);
1380                 if (!eb_rewin) {
1381                         btrfs_tree_read_unlock_blocking(eb);
1382                         free_extent_buffer(eb);
1383                         return NULL;
1384                 }
1385         }
1386
1387         btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
1388         btrfs_tree_read_unlock_blocking(eb);
1389         free_extent_buffer(eb);
1390
1391         extent_buffer_get(eb_rewin);
1392         btrfs_tree_read_lock(eb_rewin);
1393         __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1394         WARN_ON(btrfs_header_nritems(eb_rewin) >
1395                 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1396
1397         return eb_rewin;
1398 }
1399
1400 /*
1401  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1402  * value. If there are no changes, the current root->root_node is returned. If
1403  * anything changed in between, there's a fresh buffer allocated on which the
1404  * rewind operations are done. In any case, the returned buffer is read locked.
1405  * Returns NULL on error (with no locks held).
1406  */
1407 static inline struct extent_buffer *
1408 get_old_root(struct btrfs_root *root, u64 time_seq)
1409 {
1410         struct tree_mod_elem *tm;
1411         struct extent_buffer *eb = NULL;
1412         struct extent_buffer *eb_root;
1413         struct extent_buffer *old;
1414         struct tree_mod_root *old_root = NULL;
1415         u64 old_generation = 0;
1416         u64 logical;
1417
1418         eb_root = btrfs_read_lock_root_node(root);
1419         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1420         if (!tm)
1421                 return eb_root;
1422
1423         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1424                 old_root = &tm->old_root;
1425                 old_generation = tm->generation;
1426                 logical = old_root->logical;
1427         } else {
1428                 logical = eb_root->start;
1429         }
1430
1431         tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1432         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1433                 btrfs_tree_read_unlock(eb_root);
1434                 free_extent_buffer(eb_root);
1435                 old = read_tree_block(root, logical, 0);
1436                 if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1437                         free_extent_buffer(old);
1438                         btrfs_warn(root->fs_info,
1439                                 "failed to read tree block %llu from get_old_root", logical);
1440                 } else {
1441                         eb = btrfs_clone_extent_buffer(old);
1442                         free_extent_buffer(old);
1443                 }
1444         } else if (old_root) {
1445                 btrfs_tree_read_unlock(eb_root);
1446                 free_extent_buffer(eb_root);
1447                 eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1448         } else {
1449                 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1450                 eb = btrfs_clone_extent_buffer(eb_root);
1451                 btrfs_tree_read_unlock_blocking(eb_root);
1452                 free_extent_buffer(eb_root);
1453         }
1454
1455         if (!eb)
1456                 return NULL;
1457         extent_buffer_get(eb);
1458         btrfs_tree_read_lock(eb);
1459         if (old_root) {
1460                 btrfs_set_header_bytenr(eb, eb->start);
1461                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1462                 btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1463                 btrfs_set_header_level(eb, old_root->level);
1464                 btrfs_set_header_generation(eb, old_generation);
1465         }
1466         if (tm)
1467                 __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1468         else
1469                 WARN_ON(btrfs_header_level(eb) != 0);
1470         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1471
1472         return eb;
1473 }
1474
1475 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1476 {
1477         struct tree_mod_elem *tm;
1478         int level;
1479         struct extent_buffer *eb_root = btrfs_root_node(root);
1480
1481         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1482         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1483                 level = tm->old_root.level;
1484         } else {
1485                 level = btrfs_header_level(eb_root);
1486         }
1487         free_extent_buffer(eb_root);
1488
1489         return level;
1490 }
1491
1492 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1493                                    struct btrfs_root *root,
1494                                    struct extent_buffer *buf)
1495 {
1496         if (btrfs_test_is_dummy_root(root))
1497                 return 0;
1498
1499         /* ensure we can see the force_cow */
1500         smp_rmb();
1501
1502         /*
1503          * We do not need to cow a block if
1504          * 1) this block is not created or changed in this transaction;
1505          * 2) this block does not belong to TREE_RELOC tree;
1506          * 3) the root is not forced COW.
1507          *
1508          * What is forced COW:
1509          *    when we create snapshot during commiting the transaction,
1510          *    after we've finished coping src root, we must COW the shared
1511          *    block to ensure the metadata consistency.
1512          */
1513         if (btrfs_header_generation(buf) == trans->transid &&
1514             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1515             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1516               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1517             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1518                 return 0;
1519         return 1;
1520 }
1521
1522 /*
1523  * cows a single block, see __btrfs_cow_block for the real work.
1524  * This version of it has extra checks so that a block isn't cow'd more than
1525  * once per transaction, as long as it hasn't been written yet
1526  */
1527 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1528                     struct btrfs_root *root, struct extent_buffer *buf,
1529                     struct extent_buffer *parent, int parent_slot,
1530                     struct extent_buffer **cow_ret)
1531 {
1532         u64 search_start;
1533         int ret;
1534
1535         if (trans->transaction != root->fs_info->running_transaction)
1536                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1537                        trans->transid,
1538                        root->fs_info->running_transaction->transid);
1539
1540         if (trans->transid != root->fs_info->generation)
1541                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1542                        trans->transid, root->fs_info->generation);
1543
1544         if (!should_cow_block(trans, root, buf)) {
1545                 *cow_ret = buf;
1546                 return 0;
1547         }
1548
1549         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1550
1551         if (parent)
1552                 btrfs_set_lock_blocking(parent);
1553         btrfs_set_lock_blocking(buf);
1554
1555         ret = __btrfs_cow_block(trans, root, buf, parent,
1556                                  parent_slot, cow_ret, search_start, 0);
1557
1558         trace_btrfs_cow_block(root, buf, *cow_ret);
1559
1560         return ret;
1561 }
1562
1563 /*
1564  * helper function for defrag to decide if two blocks pointed to by a
1565  * node are actually close by
1566  */
1567 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1568 {
1569         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1570                 return 1;
1571         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1572                 return 1;
1573         return 0;
1574 }
1575
1576 /*
1577  * compare two keys in a memcmp fashion
1578  */
1579 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1580 {
1581         struct btrfs_key k1;
1582
1583         btrfs_disk_key_to_cpu(&k1, disk);
1584
1585         return btrfs_comp_cpu_keys(&k1, k2);
1586 }
1587
1588 /*
1589  * same as comp_keys only with two btrfs_key's
1590  */
1591 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1592 {
1593         if (k1->objectid > k2->objectid)
1594                 return 1;
1595         if (k1->objectid < k2->objectid)
1596                 return -1;
1597         if (k1->type > k2->type)
1598                 return 1;
1599         if (k1->type < k2->type)
1600                 return -1;
1601         if (k1->offset > k2->offset)
1602                 return 1;
1603         if (k1->offset < k2->offset)
1604                 return -1;
1605         return 0;
1606 }
1607
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @parent:     the node whose children are candidates for relocation
 * @start_slot: first slot of @parent to consider
 * @last_ret:   in/out hint for where to start searching for free space;
 *              updated to the start of the last block COWed
 * @progress:   children whose first key sorts before this are skipped
 *
 * Returns 0 on success or a negative errno from reading/COWing a block.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	/* must run inside the currently running transaction */
	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = root->nodesize;
	end_slot = parent_nritems;

	/* a single child can't be out of order with anything */
	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		/* skip children before the caller's progress cursor */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* leave the child alone if it is already near a neighbor */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		/*
		 * NOTE(review): i + 1 is a valid slot whenever
		 * i < end_slot - 1; the "- 2" here looks one stricter than
		 * necessary (the next-to-last child never gets compared
		 * against its right neighbor) — conservative, not unsafe;
		 * confirm against upstream history before changing.
		 */
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* make sure we have an uptodate copy of the child in memory */
		cur = btrfs_find_tree_block(root, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr, gen);
				if (!cur || !extent_buffer_uptodate(cur)) {
					/* free_extent_buffer handles NULL */
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/*
		 * COW the child near search_start; this is what actually
		 * relocates it.  The hint caps the reservation at 16
		 * blocks or what remains of the node, whichever is less.
		 */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		/* next child should land right after this one */
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1716
1717 /*
1718  * The leaf data grows from end-to-front in the node.
1719  * this returns the address of the start of the last item,
1720  * which is the stop of the leaf data stack
1721  */
1722 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1723                                          struct extent_buffer *leaf)
1724 {
1725         u32 nr = btrfs_header_nritems(leaf);
1726         if (nr == 0)
1727                 return BTRFS_LEAF_DATA_SIZE(root);
1728         return btrfs_item_offset_nr(leaf, nr - 1);
1729 }
1730
1731
1732 /*
1733  * search for key in the extent_buffer.  The items start at offset p,
1734  * and they are item_size apart.  There are 'max' items in p.
1735  *
1736  * the slot in the array is returned via slot, and it points to
1737  * the place where you would insert key if it is not found in
1738  * the array.
1739  *
1740  * slot may point to max if the key is bigger than all of the keys
1741  */
1742 static noinline int generic_bin_search(struct extent_buffer *eb,
1743                                        unsigned long p,
1744                                        int item_size, struct btrfs_key *key,
1745                                        int max, int *slot)
1746 {
1747         int low = 0;
1748         int high = max;
1749         int mid;
1750         int ret;
1751         struct btrfs_disk_key *tmp = NULL;
1752         struct btrfs_disk_key unaligned;
1753         unsigned long offset;
1754         char *kaddr = NULL;
1755         unsigned long map_start = 0;
1756         unsigned long map_len = 0;
1757         int err;
1758
1759         while (low < high) {
1760                 mid = (low + high) / 2;
1761                 offset = p + mid * item_size;
1762
1763                 if (!kaddr || offset < map_start ||
1764                     (offset + sizeof(struct btrfs_disk_key)) >
1765                     map_start + map_len) {
1766
1767                         err = map_private_extent_buffer(eb, offset,
1768                                                 sizeof(struct btrfs_disk_key),
1769                                                 &kaddr, &map_start, &map_len);
1770
1771                         if (!err) {
1772                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1773                                                         map_start);
1774                         } else {
1775                                 read_extent_buffer(eb, &unaligned,
1776                                                    offset, sizeof(unaligned));
1777                                 tmp = &unaligned;
1778                         }
1779
1780                 } else {
1781                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1782                                                         map_start);
1783                 }
1784                 ret = comp_keys(tmp, key);
1785
1786                 if (ret < 0)
1787                         low = mid + 1;
1788                 else if (ret > 0)
1789                         high = mid;
1790                 else {
1791                         *slot = mid;
1792                         return 0;
1793                 }
1794         }
1795         *slot = low;
1796         return 1;
1797 }
1798
1799 /*
1800  * simple bin_search frontend that does the right thing for
1801  * leaves vs nodes
1802  */
1803 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1804                       int level, int *slot)
1805 {
1806         if (level == 0)
1807                 return generic_bin_search(eb,
1808                                           offsetof(struct btrfs_leaf, items),
1809                                           sizeof(struct btrfs_item),
1810                                           key, btrfs_header_nritems(eb),
1811                                           slot);
1812         else
1813                 return generic_bin_search(eb,
1814                                           offsetof(struct btrfs_node, ptrs),
1815                                           sizeof(struct btrfs_key_ptr),
1816                                           key, btrfs_header_nritems(eb),
1817                                           slot);
1818 }
1819
/*
 * Public wrapper around bin_search() for callers outside this file.
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1825
1826 static void root_add_used(struct btrfs_root *root, u32 size)
1827 {
1828         spin_lock(&root->accounting_lock);
1829         btrfs_set_root_used(&root->root_item,
1830                             btrfs_root_used(&root->root_item) + size);
1831         spin_unlock(&root->accounting_lock);
1832 }
1833
1834 static void root_sub_used(struct btrfs_root *root, u32 size)
1835 {
1836         spin_lock(&root->accounting_lock);
1837         btrfs_set_root_used(&root->root_item,
1838                             btrfs_root_used(&root->root_item) - size);
1839         spin_unlock(&root->accounting_lock);
1840 }
1841
1842 /* given a node and slot number, this reads the blocks it points to.  The
1843  * extent buffer is returned with a reference taken (but unlocked).
1844  * NULL is returned on error.
1845  */
1846 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1847                                    struct extent_buffer *parent, int slot)
1848 {
1849         int level = btrfs_header_level(parent);
1850         struct extent_buffer *eb;
1851
1852         if (slot < 0)
1853                 return NULL;
1854         if (slot >= btrfs_header_nritems(parent))
1855                 return NULL;
1856
1857         BUG_ON(level == 0);
1858
1859         eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1860                              btrfs_node_ptr_generation(parent, slot));
1861         if (eb && !extent_buffer_uptodate(eb)) {
1862                 free_extent_buffer(eb);
1863                 eb = NULL;
1864         }
1865
1866         return eb;
1867 }
1868
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * On entry path->nodes[level] must be write-locked.  On success the
 * path may have been rewritten to point at a sibling; all buffers not
 * ending up in the path are unlocked and released before returning.
 * Returns 0 on success or a negative errno.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;	/* slot of mid in parent; only valid when parent != NULL */
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remembered so we can sanity-check the path at the end */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			/* a root pointing at a missing child is corruption */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* mid is full enough already, nothing to balance */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	/* COW both siblings up front so pushes can modify them */
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* track where the original slot lands if mid drains left */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is empty: delete it from the tree */
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right shrank: refresh its key in the parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			/* left had nothing spare; drain mid into left instead */
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid was drained completely: remove it */
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* the target item moved into left; repoint the path */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* drop whatever siblings didn't end up in the path */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2087
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if some room was made (and the path may now point at a
 * sibling), 1 if nothing could be pushed, or a negative errno.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;	/* slot of mid in parent; only valid when parent != NULL */
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* at the root there are no siblings to push into */
	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left is itself (nearly) full, don't bother */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* push succeeded; mid's first key changed */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* target slot moved into left */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right is itself (nearly) full, don't bother */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* push succeeded; right's first key changed */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* target slot moved into right */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2223
2224 /*
2225  * readahead one full node of leaves, finding things that are close
2226  * to the block in 'slot', and triggering ra on them.
2227  */
2228 static void reada_for_search(struct btrfs_root *root,
2229                              struct btrfs_path *path,
2230                              int level, int slot, u64 objectid)
2231 {
2232         struct extent_buffer *node;
2233         struct btrfs_disk_key disk_key;
2234         u32 nritems;
2235         u64 search;
2236         u64 target;
2237         u64 nread = 0;
2238         u64 gen;
2239         int direction = path->reada;
2240         struct extent_buffer *eb;
2241         u32 nr;
2242         u32 blocksize;
2243         u32 nscan = 0;
2244
2245         if (level != 1)
2246                 return;
2247
2248         if (!path->nodes[level])
2249                 return;
2250
2251         node = path->nodes[level];
2252
2253         search = btrfs_node_blockptr(node, slot);
2254         blocksize = root->nodesize;
2255         eb = btrfs_find_tree_block(root, search);
2256         if (eb) {
2257                 free_extent_buffer(eb);
2258                 return;
2259         }
2260
2261         target = search;
2262
2263         nritems = btrfs_header_nritems(node);
2264         nr = slot;
2265
2266         while (1) {
2267                 if (direction < 0) {
2268                         if (nr == 0)
2269                                 break;
2270                         nr--;
2271                 } else if (direction > 0) {
2272                         nr++;
2273                         if (nr >= nritems)
2274                                 break;
2275                 }
2276                 if (path->reada < 0 && objectid) {
2277                         btrfs_node_key(node, &disk_key, nr);
2278                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2279                                 break;
2280                 }
2281                 search = btrfs_node_blockptr(node, nr);
2282                 if ((search <= target && target - search <= 65536) ||
2283                     (search > target && search - target <= 65536)) {
2284                         gen = btrfs_node_ptr_generation(node, nr);
2285                         readahead_tree_block(root, search, blocksize);
2286                         nread += blocksize;
2287                 }
2288                 nscan++;
2289                 if ((nread > 65536 || nscan > 32))
2290                         break;
2291         }
2292 }
2293
2294 static noinline void reada_for_balance(struct btrfs_root *root,
2295                                        struct btrfs_path *path, int level)
2296 {
2297         int slot;
2298         int nritems;
2299         struct extent_buffer *parent;
2300         struct extent_buffer *eb;
2301         u64 gen;
2302         u64 block1 = 0;
2303         u64 block2 = 0;
2304         int blocksize;
2305
2306         parent = path->nodes[level + 1];
2307         if (!parent)
2308                 return;
2309
2310         nritems = btrfs_header_nritems(parent);
2311         slot = path->slots[level + 1];
2312         blocksize = root->nodesize;
2313
2314         if (slot > 0) {
2315                 block1 = btrfs_node_blockptr(parent, slot - 1);
2316                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2317                 eb = btrfs_find_tree_block(root, block1);
2318                 /*
2319                  * if we get -eagain from btrfs_buffer_uptodate, we
2320                  * don't want to return eagain here.  That will loop
2321                  * forever
2322                  */
2323                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2324                         block1 = 0;
2325                 free_extent_buffer(eb);
2326         }
2327         if (slot + 1 < nritems) {
2328                 block2 = btrfs_node_blockptr(parent, slot + 1);
2329                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2330                 eb = btrfs_find_tree_block(root, block2);
2331                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2332                         block2 = 0;
2333                 free_extent_buffer(eb);
2334         }
2335
2336         if (block1)
2337                 readahead_tree_block(root, block1, blocksize);
2338         if (block2)
2339                 readahead_tree_block(root, block2, blocksize);
2340 }
2341
2342
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* levels at or below skip_level must stay locked */
	int skip_level = level;
	/* once set, stop raising skip_level for slot-0 / last-slot cases */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key change here propagates upward, keep the lock */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: also hold the lock when on the last slot */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* record that write locks are no longer needed here */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2398
2399 /*
2400  * This releases any locks held in the path starting at level and
2401  * going all the way up to the root.
2402  *
2403  * btrfs_search_slot will keep the lock held on higher nodes in a few
2404  * corner cases, such as COW of the block at slot zero in the node.  This
2405  * ignores those rules, and it should only be called when there are no
2406  * more updates to be done higher up in the tree.
2407  */
2408 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2409 {
2410         int i;
2411
2412         if (path->keep_locks)
2413                 return;
2414
2415         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2416                 if (!path->nodes[i])
2417                         continue;
2418                 if (!path->locks[i])
2419                         continue;
2420                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2421                 path->locks[i] = 0;
2422         }
2423 }
2424
2425 /*
2426  * helper function for btrfs_search_slot.  The goal is to find a block
2427  * in cache without setting the path to blocking.  If we find the block
2428  * we return zero and the path is unchanged.
2429  *
2430  * If we can't find the block, we set the path blocking and do some
2431  * reada.  -EAGAIN is returned and the search must be repeated.
2432  */
2433 static int
2434 read_block_for_search(struct btrfs_trans_handle *trans,
2435                        struct btrfs_root *root, struct btrfs_path *p,
2436                        struct extent_buffer **eb_ret, int level, int slot,
2437                        struct btrfs_key *key, u64 time_seq)
2438 {
2439         u64 blocknr;
2440         u64 gen;
2441         struct extent_buffer *b = *eb_ret;
2442         struct extent_buffer *tmp;
2443         int ret;
2444
2445         blocknr = btrfs_node_blockptr(b, slot);
2446         gen = btrfs_node_ptr_generation(b, slot);
2447
2448         tmp = btrfs_find_tree_block(root, blocknr);
2449         if (tmp) {
2450                 /* first we do an atomic uptodate check */
2451                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2452                         *eb_ret = tmp;
2453                         return 0;
2454                 }
2455
2456                 /* the pages were up to date, but we failed
2457                  * the generation number check.  Do a full
2458                  * read for the generation number that is correct.
2459                  * We must do this without dropping locks so
2460                  * we can trust our generation number
2461                  */
2462                 btrfs_set_path_blocking(p);
2463
2464                 /* now we're allowed to do a blocking uptodate check */
2465                 ret = btrfs_read_buffer(tmp, gen);
2466                 if (!ret) {
2467                         *eb_ret = tmp;
2468                         return 0;
2469                 }
2470                 free_extent_buffer(tmp);
2471                 btrfs_release_path(p);
2472                 return -EIO;
2473         }
2474
2475         /*
2476          * reduce lock contention at high levels
2477          * of the btree by dropping locks before
2478          * we read.  Don't release the lock on the current
2479          * level because we need to walk this node to figure
2480          * out which blocks to read.
2481          */
2482         btrfs_unlock_up_safe(p, level + 1);
2483         btrfs_set_path_blocking(p);
2484
2485         free_extent_buffer(tmp);
2486         if (p->reada)
2487                 reada_for_search(root, p, level, slot, key->objectid);
2488
2489         btrfs_release_path(p);
2490
2491         ret = -EAGAIN;
2492         tmp = read_tree_block(root, blocknr, 0);
2493         if (tmp) {
2494                 /*
2495                  * If the read above didn't mark this buffer up to date,
2496                  * it will never end up being up to date.  Set ret to EIO now
2497                  * and give up so that our caller doesn't loop forever
2498                  * on our EAGAINs.
2499                  */
2500                 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2501                         ret = -EIO;
2502                 free_extent_buffer(tmp);
2503         }
2504         return ret;
2505 }
2506
2507 /*
2508  * helper function for btrfs_search_slot.  This does all of the checks
2509  * for node-level blocks and does any balancing required based on
2510  * the ins_len.
2511  *
2512  * If no extra work was required, zero is returned.  If we had to
2513  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2514  * start over
2515  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer *b, int level, int ins_len,
                       int *write_lock_level)
{
        int ret;
        /* node is nearly full: split it ahead of the insert */
        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
            BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
                int sret;

                /* splitting requires a write lock one level up */
                if (*write_lock_level < level + 1) {
                        *write_lock_level = level + 1;
                        btrfs_release_path(p);
                        goto again;
                }

                btrfs_set_path_blocking(p);
                reada_for_balance(root, p, level);
                sret = split_node(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL, 0);

                BUG_ON(sret > 0);
                if (sret) {
                        ret = sret;
                        goto done;
                }
                b = p->nodes[level];
        } else if (ins_len < 0 && btrfs_header_nritems(b) <
                   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
                /* node is less than half full: merge/balance for deletes */
                int sret;

                /* balancing also requires a write lock one level up */
                if (*write_lock_level < level + 1) {
                        *write_lock_level = level + 1;
                        btrfs_release_path(p);
                        goto again;
                }

                btrfs_set_path_blocking(p);
                reada_for_balance(root, p, level);
                sret = balance_level(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL, 0);

                if (sret) {
                        ret = sret;
                        goto done;
                }
                b = p->nodes[level];
                /* balance_level emptied this node; restart the search */
                if (!b) {
                        btrfs_release_path(p);
                        goto again;
                }
                BUG_ON(btrfs_header_nritems(b) == 1);
        }
        return 0;

again:
        ret = -EAGAIN;
done:
        return ret;
}
2577
/*
 * Debug check (CONFIG_BTRFS_ASSERT only): when a cached bin_search result of
 * "equal" is reused, verify that the first key in the buffer really matches
 * the key we are searching for.
 */
static void key_search_validate(struct extent_buffer *b,
                                struct btrfs_key *key,
                                int level)
{
#ifdef CONFIG_BTRFS_ASSERT
        struct btrfs_disk_key disk_key;
        unsigned long key_off;

        btrfs_cpu_key_to_disk(&disk_key, key);

        /* the first key lives at a different offset in leaves and nodes */
        if (level == 0)
                key_off = offsetof(struct btrfs_leaf, items[0].key);
        else
                key_off = offsetof(struct btrfs_node, ptrs[0].key);

        ASSERT(!memcmp_extent_buffer(b, &disk_key, key_off, sizeof(disk_key)));
#endif
}
2597
/*
 * Find the slot for 'key' in buffer 'b'.  If a previous level already
 * compared equal (*prev_cmp == 0), the key must be the first one in this
 * block, so skip the binary search and return slot 0 directly.
 */
static int key_search(struct extent_buffer *b, struct btrfs_key *key,
                      int level, int *prev_cmp, int *slot)
{
        if (*prev_cmp == 0) {
                key_search_validate(b, key, level);
                *slot = 0;
                return 0;
        }

        *prev_cmp = bin_search(b, key, level, slot);
        return *prev_cmp;
}
2611
/*
 * Search for an item with the given objectid/type (and offset as a starting
 * point).  The caller owns 'path' and is responsible for releasing it.
 * Returns 0 and fills 'found_key' when a matching item exists, 1 when not,
 * < 0 on error.
 */
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
                u64 iobjectid, u64 ioff, u8 key_type,
                struct btrfs_key *found_key)
{
        int ret;
        struct btrfs_key key;
        struct extent_buffer *eb;

        ASSERT(path);

        key.type = key_type;
        key.objectid = iobjectid;
        key.offset = ioff;

        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
        /* with a NULL found_key the caller only wants the raw search result */
        if ((ret < 0) || (found_key == NULL))
                return ret;

        eb = path->nodes[0];
        /* the slot may point one past the last item; move to the next leaf */
        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
                ret = btrfs_next_leaf(fs_root, path);
                if (ret)
                        return ret;
                eb = path->nodes[0];
        }

        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
        /* only objectid and type must match; the offset may differ */
        if (found_key->type != key.type ||
                        found_key->objectid != key.objectid)
                return 1;

        return 0;
}
2645
2646 /*
2647  * look for key in the tree.  path is filled in with nodes along the way
2648  * if key is found, we return zero and you can find the item in the leaf
2649  * level of the path (level 0)
2650  *
2651  * If the key isn't found, the path points to the slot where it should
2652  * be inserted, and 1 is returned.  If there are other errors during the
2653  * search a negative error number is returned.
2654  *
2655  * if ins_len > 0, nodes and leaves will be split as we walk down the
2656  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2657  * possible)
2658  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_key *key, struct btrfs_path *p, int
                      ins_len, int cow)
{
        struct extent_buffer *b;
        int slot;
        int ret;
        int err;
        int level;
        int lowest_unlock = 1;
        int root_lock;
        /* everything at write_lock_level or lower must be write locked */
        int write_lock_level = 0;
        u8 lowest_level = 0;
        int min_write_lock_level;
        int prev_cmp;

        lowest_level = p->lowest_level;
        WARN_ON(lowest_level && ins_len > 0);
        WARN_ON(p->nodes[0] != NULL);
        /* COW is mandatory for any insert or delete */
        BUG_ON(!cow && ins_len);

        if (ins_len < 0) {
                lowest_unlock = 2;

                /* when we are removing items, we might have to go up to level
                 * two as we update tree pointers.  Make sure we keep write
                 * for those levels as well
                 */
                write_lock_level = 2;
        } else if (ins_len > 0) {
                /*
                 * for inserting items, make sure we have a write lock on
                 * level 1 so we can update keys
                 */
                write_lock_level = 1;
        }

        if (!cow)
                write_lock_level = -1;

        if (cow && (p->keep_locks || p->lowest_level))
                write_lock_level = BTRFS_MAX_LEVEL;

        min_write_lock_level = write_lock_level;

again:
        prev_cmp = -1;
        /*
         * we try very hard to do read locks on the root
         */
        root_lock = BTRFS_READ_LOCK;
        level = 0;
        if (p->search_commit_root) {
                /*
                 * the commit roots are read only
                 * so we always do read locks
                 */
                if (p->need_commit_sem)
                        down_read(&root->fs_info->commit_root_sem);
                b = root->commit_root;
                extent_buffer_get(b);
                level = btrfs_header_level(b);
                if (p->need_commit_sem)
                        up_read(&root->fs_info->commit_root_sem);
                if (!p->skip_locking)
                        btrfs_tree_read_lock(b);
        } else {
                if (p->skip_locking) {
                        b = btrfs_root_node(root);
                        level = btrfs_header_level(b);
                } else {
                        /* we don't know the level of the root node
                         * until we actually have it read locked
                         */
                        b = btrfs_read_lock_root_node(root);
                        level = btrfs_header_level(b);
                        if (level <= write_lock_level) {
                                /* whoops, must trade for write lock */
                                btrfs_tree_read_unlock(b);
                                free_extent_buffer(b);
                                b = btrfs_lock_root_node(root);
                                root_lock = BTRFS_WRITE_LOCK;

                                /* the level might have changed, check again */
                                level = btrfs_header_level(b);
                        }
                }
        }
        p->nodes[level] = b;
        if (!p->skip_locking)
                p->locks[level] = root_lock;

        /* walk down the tree one level per iteration */
        while (b) {
                level = btrfs_header_level(b);

                /*
                 * setup the path here so we can release it under lock
                 * contention with the cow code
                 */
                if (cow) {
                        /*
                         * if we don't really need to cow this block
                         * then we don't want to set the path blocking,
                         * so we test it here
                         */
                        if (!should_cow_block(trans, root, b))
                                goto cow_done;

                        /*
                         * must have write locks on this node and the
                         * parent
                         */
                        if (level > write_lock_level ||
                            (level + 1 > write_lock_level &&
                            level + 1 < BTRFS_MAX_LEVEL &&
                            p->nodes[level + 1])) {
                                write_lock_level = level + 1;
                                btrfs_release_path(p);
                                goto again;
                        }

                        btrfs_set_path_blocking(p);
                        err = btrfs_cow_block(trans, root, b,
                                              p->nodes[level + 1],
                                              p->slots[level + 1], &b);
                        if (err) {
                                ret = err;
                                goto done;
                        }
                }
cow_done:
                p->nodes[level] = b;
                btrfs_clear_path_blocking(p, NULL, 0);

                /*
                 * we have a lock on b and as long as we aren't changing
                 * the tree, there is no way for the items in b to change.
                 * It is safe to drop the lock on our parent before we
                 * go through the expensive btree search on b.
                 *
                 * If we're inserting or deleting (ins_len != 0), then we might
                 * be changing slot zero, which may require changing the parent.
                 * So, we can't drop the lock until after we know which slot
                 * we're operating on.
                 */
                if (!ins_len && !p->keep_locks) {
                        int u = level + 1;

                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
                                p->locks[u] = 0;
                        }
                }

                ret = key_search(b, key, level, &prev_cmp, &slot);

                if (level != 0) {
                        /* interior node: descend through the matching slot */
                        int dec = 0;
                        if (ret && slot > 0) {
                                dec = 1;
                                slot -= 1;
                        }
                        p->slots[level] = slot;
                        err = setup_nodes_for_search(trans, root, p, b, level,
                                             ins_len, &write_lock_level);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }
                        b = p->nodes[level];
                        slot = p->slots[level];

                        /*
                         * slot 0 is special, if we change the key
                         * we have to update the parent pointer
                         * which means we must have a write lock
                         * on the parent
                         */
                        if (slot == 0 && ins_len &&
                            write_lock_level < level + 1) {
                                write_lock_level = level + 1;
                                btrfs_release_path(p);
                                goto again;
                        }

                        unlock_up(p, level, lowest_unlock,
                                  min_write_lock_level, &write_lock_level);

                        /* the caller only wants the path down to this level */
                        if (level == lowest_level) {
                                if (dec)
                                        p->slots[level]++;
                                goto done;
                        }

                        err = read_block_for_search(trans, root, p,
                                                    &b, level, slot, key, 0);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }

                        if (!p->skip_locking) {
                                level = btrfs_header_level(b);
                                if (level <= write_lock_level) {
                                        /* try the fast non-blocking lock first */
                                        err = btrfs_try_tree_write_lock(b);
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_lock(b);
                                                btrfs_clear_path_blocking(p, b,
                                                                  BTRFS_WRITE_LOCK);
                                        }
                                        p->locks[level] = BTRFS_WRITE_LOCK;
                                } else {
                                        err = btrfs_tree_read_lock_atomic(b);
                                        if (!err) {
                                                btrfs_set_path_blocking(p);
                                                btrfs_tree_read_lock(b);
                                                btrfs_clear_path_blocking(p, b,
                                                                  BTRFS_READ_LOCK);
                                        }
                                        p->locks[level] = BTRFS_READ_LOCK;
                                }
                                p->nodes[level] = b;
                        }
                } else {
                        /* leaf level: ensure room for the insert, then finish */
                        p->slots[level] = slot;
                        if (ins_len > 0 &&
                            btrfs_leaf_free_space(root, b) < ins_len) {
                                if (write_lock_level < 1) {
                                        write_lock_level = 1;
                                        btrfs_release_path(p);
                                        goto again;
                                }

                                btrfs_set_path_blocking(p);
                                err = split_leaf(trans, root, key,
                                                 p, ins_len, ret == 0);
                                btrfs_clear_path_blocking(p, NULL, 0);

                                BUG_ON(err > 0);
                                if (err) {
                                        ret = err;
                                        goto done;
                                }
                        }
                        if (!p->search_for_split)
                                unlock_up(p, level, lowest_unlock,
                                          min_write_lock_level, &write_lock_level);
                        goto done;
                }
        }
        ret = 1;
done:
        /*
         * we don't really know what they plan on doing with the path
         * from here on, so for now just mark it as blocking
         */
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0 && !p->skip_release_on_error)
                btrfs_release_path(p);
        return ret;
}
2927
2928 /*
2929  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2930  * current state of the tree together with the operations recorded in the tree
2931  * modification log to search for the key in a previous version of this tree, as
2932  * denoted by the time_seq parameter.
2933  *
2934  * Naturally, there is no support for insert, delete or cow operations.
2935  *
2936  * The resulting path and return value will be set up as if we called
2937  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2938  */
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
                          struct btrfs_path *p, u64 time_seq)
{
        struct extent_buffer *b;
        int slot;
        int ret;
        int err;
        int level;
        int lowest_unlock = 1;
        u8 lowest_level = 0;
        int prev_cmp = -1;

        lowest_level = p->lowest_level;
        WARN_ON(p->nodes[0] != NULL);

        /* commit roots are read only, so a plain search is sufficient */
        if (p->search_commit_root) {
                BUG_ON(time_seq);
                return btrfs_search_slot(NULL, root, key, p, 0, 0);
        }

again:
        b = get_old_root(root, time_seq);
        level = btrfs_header_level(b);
        p->locks[level] = BTRFS_READ_LOCK;

        /* walk down the (rewound) tree one level per iteration */
        while (b) {
                level = btrfs_header_level(b);
                p->nodes[level] = b;
                btrfs_clear_path_blocking(p, NULL, 0);

                /*
                 * we have a lock on b and as long as we aren't changing
                 * the tree, there is no way for the items in b to change.
                 * It is safe to drop the lock on our parent before we
                 * go through the expensive btree search on b.
                 */
                btrfs_unlock_up_safe(p, level + 1);

                /*
                 * Since we can unwind eb's we want to do a real search every
                 * time.
                 */
                prev_cmp = -1;
                ret = key_search(b, key, level, &prev_cmp, &slot);

                if (level != 0) {
                        int dec = 0;
                        if (ret && slot > 0) {
                                dec = 1;
                                slot -= 1;
                        }
                        p->slots[level] = slot;
                        unlock_up(p, level, lowest_unlock, 0, NULL);

                        /* the caller only wants the path down to this level */
                        if (level == lowest_level) {
                                if (dec)
                                        p->slots[level]++;
                                goto done;
                        }

                        err = read_block_for_search(NULL, root, p, &b, level,
                                                    slot, key, time_seq);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
                                ret = err;
                                goto done;
                        }

                        level = btrfs_header_level(b);
                        /* try the fast non-blocking read lock first */
                        err = btrfs_tree_read_lock_atomic(b);
                        if (!err) {
                                btrfs_set_path_blocking(p);
                                btrfs_tree_read_lock(b);
                                btrfs_clear_path_blocking(p, b,
                                                          BTRFS_READ_LOCK);
                        }
                        /* rewind the child to its state at time_seq */
                        b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
                        if (!b) {
                                ret = -ENOMEM;
                                goto done;
                        }
                        p->locks[level] = BTRFS_READ_LOCK;
                        p->nodes[level] = b;
                } else {
                        p->slots[level] = slot;
                        unlock_up(p, level, lowest_unlock, 0, NULL);
                        goto done;
                }
        }
        ret = 1;
done:
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0)
                btrfs_release_path(p);

        return ret;
}
3038
3039 /*
3040  * helper to use instead of search slot if no exact match is needed but
3041  * instead the next or previous item should be returned.
3042  * When find_higher is true, the next higher item is returned, the next lower
3043  * otherwise.
3044  * When return_any and find_higher are both true, and no higher item is found,
3045  * return the next lower instead.
3046  * When return_any is true and find_higher is false, and no lower item is found,
3047  * return the next higher instead.
3048  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3049  * < 0 on error
3050  */
int btrfs_search_slot_for_read(struct btrfs_root *root,
                               struct btrfs_key *key, struct btrfs_path *p,
                               int find_higher, int return_any)
{
        int ret;
        struct extent_buffer *leaf;

again:
        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
        /* exact match (0) or error (< 0): the path is already positioned */
        if (ret <= 0)
                return ret;
        /*
         * a return value of 1 means the path is at the position where the
         * item should be inserted. Normally this is the next bigger item,
         * but in case the previous item is the last in a leaf, path points
         * to the first free slot in the previous leaf, i.e. at an invalid
         * item.
         */
        leaf = p->nodes[0];

        if (find_higher) {
                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, p);
                        if (ret <= 0)
                                return ret;
                        if (!return_any)
                                return 1;
                        /*
                         * no higher item found, return the next
                         * lower instead
                         */
                        return_any = 0;
                        find_higher = 0;
                        btrfs_release_path(p);
                        goto again;
                }
        } else {
                if (p->slots[0] == 0) {
                        ret = btrfs_prev_leaf(root, p);
                        if (ret < 0)
                                return ret;
                        if (!ret) {
                                /* point at the last item of the previous leaf */
                                leaf = p->nodes[0];
                                if (p->slots[0] == btrfs_header_nritems(leaf))
                                        p->slots[0]--;
                                return 0;
                        }
                        if (!return_any)
                                return 1;
                        /*
                         * no lower item found, return the next
                         * higher instead
                         */
                        return_any = 0;
                        find_higher = 1;
                        btrfs_release_path(p);
                        goto again;
                } else {
                        /* step back to the next lower item in the same leaf */
                        --p->slots[0];
                }
        }
        return 0;
}
3114
3115 /*
3116  * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
3118  * This is used after shifting pointers to the left, so it stops
3119  * fixing up pointers when a given leaf/node is not in slot 0 of the
3120  * higher levels
3121  *
3122  */
3123 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
3124                            struct btrfs_disk_key *key, int level)
3125 {
3126         int i;
3127         struct extent_buffer *t;
3128
3129         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3130                 int tslot = path->slots[i];
3131                 if (!path->nodes[i])
3132                         break;
3133                 t = path->nodes[i];
3134                 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
3135                 btrfs_set_node_key(t, key, tslot);
3136                 btrfs_mark_buffer_dirty(path->nodes[i]);
3137                 if (tslot != 0)
3138                         break;
3139         }
3140 }
3141
3142 /*
3143  * update item key.
3144  *
3145  * This function isn't completely safe. It's the caller's responsibility
3146  * that the new key won't break the order
3147  */
void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
                             struct btrfs_key *new_key)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *eb;
        int slot;

        eb = path->nodes[0];
        slot = path->slots[0];
        /* sanity check: the new key must still sort after the previous item */
        if (slot > 0) {
                btrfs_item_key(eb, &disk_key, slot - 1);
                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
        }
        /* ... and before the next item */
        if (slot < btrfs_header_nritems(eb) - 1) {
                btrfs_item_key(eb, &disk_key, slot + 1);
                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
        }

        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(eb, &disk_key, slot);
        btrfs_mark_buffer_dirty(eb);
        /* slot 0 keys are duplicated in the parent nodes; fix them up too */
        if (slot == 0)
                fixup_low_keys(root, path, &disk_key, 1);
}
3172
3173 /*
3174  * try to push data from one node into the next node left in the
3175  * tree.
3176  *
3177  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3178  * error, and > 0 if there was no room in the left hand block.
3179  */
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct extent_buffer *dst,
                          struct extent_buffer *src, int empty)
{
        int push_items = 0;
        int src_nritems;
        int dst_nritems;
        int ret = 0;

        src_nritems = btrfs_header_nritems(src);
        dst_nritems = btrfs_header_nritems(dst);
        /* free pointer slots remaining in the destination node */
        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
        /* both buffers must already be COWed in this transaction */
        WARN_ON(btrfs_header_generation(src) != trans->transid);
        WARN_ON(btrfs_header_generation(dst) != trans->transid);

        /* unless emptying, don't bother with nearly-empty sources */
        if (!empty && src_nritems <= 8)
                return 1;

        if (push_items <= 0)
                return 1;

        if (empty) {
                push_items = min(src_nritems, push_items);
                if (push_items < src_nritems) {
                        /* leave at least 8 pointers in the node if
                         * we aren't going to empty it
                         */
                        if (src_nritems - push_items < 8) {
                                if (push_items <= 8)
                                        return 1;
                                push_items -= 8;
                        }
                }
        } else
                push_items = min(src_nritems - 8, push_items);

        /* log the removal of the pushed keys from src before copying */
        ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
                                   push_items);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
        }
        /* append the first push_items key ptrs of src to the end of dst */
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(dst_nritems),
                           btrfs_node_key_ptr_offset(0),
                           push_items * sizeof(struct btrfs_key_ptr));

        if (push_items < src_nritems) {
                /*
                 * don't call tree_mod_log_eb_move here, key removal was already
                 * fully logged by tree_mod_log_eb_copy above.
                 */
                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
                                      btrfs_node_key_ptr_offset(push_items),
                                      (src_nritems - push_items) *
                                      sizeof(struct btrfs_key_ptr));
        }
        btrfs_set_header_nritems(src, src_nritems - push_items);
        btrfs_set_header_nritems(dst, dst_nritems + push_items);
        btrfs_mark_buffer_dirty(src);
        btrfs_mark_buffer_dirty(dst);

        return ret;
}
3244
3245 /*
3246  * try to push data from one node into the next node right in the
3247  * tree.
3248  *
3249  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3250  * error, and > 0 if there was no room in the right hand block.
3251  *
3252  * this will  only push up to 1/2 the contents of the left node over
3253  */
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *dst,
                              struct extent_buffer *src)
{
        int push_items = 0;
        int max_push;
        int src_nritems;
        int dst_nritems;
        int ret = 0;

        /* both buffers must already be COWed in this transaction */
        WARN_ON(btrfs_header_generation(src) != trans->transid);
        WARN_ON(btrfs_header_generation(dst) != trans->transid);

        src_nritems = btrfs_header_nritems(src);
        dst_nritems = btrfs_header_nritems(dst);
        /* free pointer slots remaining in the destination node */
        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
        if (push_items <= 0)
                return 1;

        /* too few items to be worth rebalancing */
        if (src_nritems < 4)
                return 1;

        /* push at most half (+1) of the source items */
        max_push = src_nritems / 2 + 1;
        /* don't try to empty the node */
        if (max_push >= src_nritems)
                return 1;

        if (max_push < push_items)
                push_items = max_push;

        /* shift dst's existing ptrs right to open a gap at slot 0 */
        tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
                                      btrfs_node_key_ptr_offset(0),
                                      (dst_nritems) *
                                      sizeof(struct btrfs_key_ptr));

        /* log and copy the tail of src into the front of dst */
        ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
                                   src_nritems - push_items, push_items);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
        }
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(src_nritems - push_items),
                           push_items * sizeof(struct btrfs_key_ptr));

        btrfs_set_header_nritems(src, src_nritems - push_items);
        btrfs_set_header_nritems(dst, dst_nritems + push_items);

        btrfs_mark_buffer_dirty(src);
        btrfs_mark_buffer_dirty(dst);

        return ret;
}
3310
3311 /*
3312  * helper function to insert a new root level in the tree.
3313  * A new node is allocated, and a single item is inserted to
3314  * point to the existing root
3315  *
3316  * returns zero on success or < 0 on failure.
3317  */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_path *path, int level)
{
        u64 lower_gen;
        struct extent_buffer *lower;
        struct extent_buffer *c;
        struct extent_buffer *old;
        struct btrfs_disk_key lower_key;

        /* only valid when the path tops out exactly at the current root */
        BUG_ON(path->nodes[level]);
        BUG_ON(path->nodes[level-1] != root->node);

        /* the new root's single key is the first key of the old root */
        lower = path->nodes[level-1];
        if (level == 1)
                btrfs_item_key(lower, &lower_key, 0);
        else
                btrfs_node_key(lower, &lower_key, 0);

        c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
                                   &lower_key, level, root->node->start, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);

        root_add_used(root, root->nodesize);

        /* initialize the new root block's header */
        memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
        btrfs_set_header_nritems(c, 1);
        btrfs_set_header_level(c, level);
        btrfs_set_header_bytenr(c, c->start);
        btrfs_set_header_generation(c, trans->transid);
        btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(c, root->root_key.objectid);

        write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
                            BTRFS_FSID_SIZE);

        write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
                            btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);

        /* single pointer down to the old root */
        btrfs_set_node_key(c, &lower_key, 0);
        btrfs_set_node_blockptr(c, 0, lower->start);
        lower_gen = btrfs_header_generation(lower);
        WARN_ON(lower_gen != trans->transid);

        btrfs_set_node_ptr_generation(c, 0, lower_gen);

        btrfs_mark_buffer_dirty(c);

        /* swap in the new root; log the pointer change first */
        old = root->node;
        tree_mod_log_set_root_pointer(root, c, 0);
        rcu_assign_pointer(root->node, c);

        /* the super has an extra ref to root->node */
        free_extent_buffer(old);

        add_root_to_dirty_list(root);
        /* extra ref for the path, which now also holds the new root */
        extent_buffer_get(c);
        path->nodes[level] = c;
        path->locks[level] = BTRFS_WRITE_LOCK;
        path->slots[level] = 0;
        return 0;
}
3381
3382 /*
3383  * worker function to insert a single pointer in a node.
3384  * the node should have enough room for the pointer already
3385  *
3386  * slot and level indicate where you want the key to go, and
3387  * blocknr is the block the key points to.
3388  */
static void insert_ptr(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *path,
                       struct btrfs_disk_key *key, u64 bytenr,
                       int slot, int level)
{
        struct extent_buffer *lower;
        int nritems;
        int ret;

        BUG_ON(!path->nodes[level]);
        btrfs_assert_tree_locked(path->nodes[level]);
        lower = path->nodes[level];
        nritems = btrfs_header_nritems(lower);
        BUG_ON(slot > nritems);
        /* caller guarantees the node has room for one more pointer */
        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
        if (slot != nritems) {
                /* shift existing ptrs right to make room at 'slot' */
                if (level)
                        tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
                                             slot, nritems - slot);
                memmove_extent_buffer(lower,
                              btrfs_node_key_ptr_offset(slot + 1),
                              btrfs_node_key_ptr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
        }
        if (level) {
                ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
                                              MOD_LOG_KEY_ADD, GFP_NOFS);
                BUG_ON(ret < 0);
        }
        btrfs_set_node_key(lower, key, slot);
        btrfs_set_node_blockptr(lower, slot, bytenr);
        WARN_ON(trans->transid == 0);
        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
        btrfs_set_header_nritems(lower, nritems + 1);
        btrfs_mark_buffer_dirty(lower);
}
3425
3426 /*
3427  * split the node at the specified level in path in two.
3428  * The path is corrected to point to the appropriate node after the split
3429  *
3430  * Before splitting this tries to make some room in the node by pushing
3431  * left and right, if either one works, it returns right away.
3432  *
3433  * returns 0 on success and < 0 on failure
3434  */
static noinline int split_node(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path, int level)
{
        struct extent_buffer *c;
        struct extent_buffer *split;
        struct btrfs_disk_key disk_key;
        int mid;
        int ret;
        u32 c_nritems;

        c = path->nodes[level];
        WARN_ON(btrfs_header_generation(c) != trans->transid);
        if (c == root->node) {
                /*
                 * trying to split the root, lets make a new one
                 *
                 * tree mod log: We don't log_removal old root in
                 * insert_new_root, because that root buffer will be kept as a
                 * normal node. We are going to log removal of half of the
                 * elements below with tree_mod_log_eb_copy. We're holding a
                 * tree lock on the buffer, which is why we cannot race with
                 * other tree_mod_log users.
                 */
                ret = insert_new_root(trans, root, path, level + 1);
                if (ret)
                        return ret;
        } else {
                /* try pushing into neighbors first; may avoid the split */
                ret = push_nodes_for_insert(trans, root, path, level);
                c = path->nodes[level];
                if (!ret && btrfs_header_nritems(c) <
                    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
                        return 0;
                if (ret < 0)
                        return ret;
        }

        c_nritems = btrfs_header_nritems(c);
        mid = (c_nritems + 1) / 2;
        btrfs_node_key(c, &disk_key, mid);

        split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
                        &disk_key, level, c->start, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);

        root_add_used(root, root->nodesize);

        /* initialize the new (right) sibling's header */
        memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
        btrfs_set_header_level(split, btrfs_header_level(c));
        btrfs_set_header_bytenr(split, split->start);
        btrfs_set_header_generation(split, trans->transid);
        btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(split, root->root_key.objectid);
        write_extent_buffer(split, root->fs_info->fsid,
                            btrfs_header_fsid(), BTRFS_FSID_SIZE);
        write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
                            btrfs_header_chunk_tree_uuid(split),
                            BTRFS_UUID_SIZE);

        /* log then copy the upper half of c into the new sibling */
        ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
                                   mid, c_nritems - mid);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
        }
        copy_extent_buffer(split, c,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(mid),
                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
        btrfs_set_header_nritems(split, c_nritems - mid);
        btrfs_set_header_nritems(c, mid);
        ret = 0;

        btrfs_mark_buffer_dirty(c);
        btrfs_mark_buffer_dirty(split);

        /* link the new sibling into the parent, right after c */
        insert_ptr(trans, root, path, &disk_key, split->start,
                   path->slots[level + 1] + 1, level + 1);

        /* fix the path to point at whichever half holds our slot */
        if (path->slots[level] >= mid) {
                path->slots[level] -= mid;
                btrfs_tree_unlock(c);
                free_extent_buffer(c);
                path->nodes[level] = split;
                path->slots[level + 1] += 1;
        } else {
                btrfs_tree_unlock(split);
                free_extent_buffer(split);
        }
        return ret;
}
3527
3528 /*
3529  * how many bytes are required to store the items in a leaf.  start
3530  * and nr indicate which items in the leaf to check.  This totals up the
3531  * space used both by the item structs and the item data
3532  */
3533 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3534 {
3535         struct btrfs_item *start_item;
3536         struct btrfs_item *end_item;
3537         struct btrfs_map_token token;
3538         int data_len;
3539         int nritems = btrfs_header_nritems(l);
3540         int end = min(nritems, start + nr) - 1;
3541
3542         if (!nr)
3543                 return 0;
3544         btrfs_init_map_token(&token);
3545         start_item = btrfs_item_nr(start);
3546         end_item = btrfs_item_nr(end);
3547         data_len = btrfs_token_item_offset(l, start_item, &token) +
3548                 btrfs_token_item_size(l, start_item, &token);
3549         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3550         data_len += sizeof(struct btrfs_item) * nr;
3551         WARN_ON(data_len < 0);
3552         return data_len;
3553 }
3554
3555 /*
3556  * The space between the end of the leaf items and
3557  * the start of the leaf data.  IOW, how much room
3558  * the leaf has left for both items and data
3559  */
3560 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3561                                    struct extent_buffer *leaf)
3562 {
3563         int nritems = btrfs_header_nritems(leaf);
3564         int ret;
3565         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3566         if (ret < 0) {
3567                 btrfs_crit(root->fs_info,
3568                         "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3569                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3570                        leaf_space_used(leaf, 0, nritems), nritems);
3571         }
3572         return ret;
3573 }
3574
3575 /*
3576  * min slot controls the lowest index we're willing to push to the
3577  * right.  We'll push up to and including min_slot, but no lower
3578  */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      int data_size, int empty,
                                      struct extent_buffer *right,
                                      int free_space, u32 left_nritems,
                                      u32 min_slot)
{
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *upper = path->nodes[1];
        struct btrfs_map_token token;
        struct btrfs_disk_key disk_key;
        int slot;
        u32 i;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
        u32 nr;
        u32 right_nritems;
        u32 data_end;
        u32 this_item_size;

        btrfs_init_map_token(&token);

        /* lowest slot we're allowed to push; 0 only when emptying */
        if (empty)
                nr = 0;
        else
                nr = max_t(u32, 1, min_slot);

        /* reserve room for the pending insert if it lands past the end */
        if (path->slots[0] >= left_nritems)
                push_space += data_size;

        slot = path->slots[1];
        i = left_nritems - 1;
        /* walk items from the right edge, counting how many fit */
        while (i >= nr) {
                item = btrfs_item_nr(i);

                if (!empty && push_items > 0) {
                        if (path->slots[0] > i)
                                break;
                        if (path->slots[0] == i) {
                                int space = btrfs_leaf_free_space(root, left);
                                if (space + push_space * 2 > free_space)
                                        break;
                        }
                }

                /* account for the pending insert at this slot */
                if (path->slots[0] == i)
                        push_space += data_size;

                this_item_size = btrfs_item_size(left, item);
                if (this_item_size + sizeof(*item) + push_space > free_space)
                        break;

                push_items++;
                push_space += this_item_size + sizeof(*item);
                if (i == 0)
                        break;
                i--;
        }

        if (push_items == 0)
                goto out_unlock;

        WARN_ON(!empty && push_items == left_nritems);

        /* push left to right */
        right_nritems = btrfs_header_nritems(right);

        /* bytes of item data being moved from the left leaf */
        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
        push_space -= leaf_data_end(root, left);

        /* make room in the right data area */
        data_end = leaf_data_end(root, right);
        memmove_extent_buffer(right,
                              btrfs_leaf_data(right) + data_end - push_space,
                              btrfs_leaf_data(right) + data_end,
                              BTRFS_LEAF_DATA_SIZE(root) - data_end);

        /* copy from the left data area */
        copy_extent_buffer(right, left, btrfs_leaf_data(right) +
                     BTRFS_LEAF_DATA_SIZE(root) - push_space,
                     btrfs_leaf_data(left) + leaf_data_end(root, left),
                     push_space);

        /* shift right's item headers to open a gap at the front */
        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
                              btrfs_item_nr_offset(0),
                              right_nritems * sizeof(struct btrfs_item));

        /* copy the items from left to right */
        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
                   btrfs_item_nr_offset(left_nritems - push_items),
                   push_items * sizeof(struct btrfs_item));

        /* update the item pointers */
        right_nritems += push_items;
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(root);
        /* recompute every data offset in the right leaf, back to front */
        for (i = 0; i < right_nritems; i++) {
                item = btrfs_item_nr(i);
                push_space -= btrfs_token_item_size(right, item, &token);
                btrfs_set_token_item_offset(right, item, push_space, &token);
        }

        left_nritems -= push_items;
        btrfs_set_header_nritems(left, left_nritems);

        if (left_nritems)
                btrfs_mark_buffer_dirty(left);
        else
                clean_tree_block(trans, root, left);

        btrfs_mark_buffer_dirty(right);

        /* right leaf's first key changed; update the parent */
        btrfs_item_key(right, &disk_key, 0);
        btrfs_set_node_key(upper, &disk_key, slot + 1);
        btrfs_mark_buffer_dirty(upper);

        /* then fixup the leaf pointer in the path */
        if (path->slots[0] >= left_nritems) {
                path->slots[0] -= left_nritems;
                if (btrfs_header_nritems(path->nodes[0]) == 0)
                        clean_tree_block(trans, root, path->nodes[0]);
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
                path->slots[1] += 1;
        } else {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        return 0;

out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
        return 1;
}
3717
3718 /*
3719  * push some data in the path leaf to the right, trying to free up at
3720  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3721  *
3722  * returns 1 if the push failed because the other node didn't have enough
3723  * room, 0 if everything worked out and < 0 if there were major errors.
3724  *
3725  * this will push starting from min_slot to the end of the leaf.  It won't
3726  * push any slot lower than min_slot
3727  */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
                           *root, struct btrfs_path *path,
                           int min_data_size, int data_size,
                           int empty, u32 min_slot)
{
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *right;
        struct extent_buffer *upper;
        int slot;
        int free_space;
        u32 left_nritems;
        int ret;

        /* no parent means we're at the root leaf: no right neighbor */
        if (!path->nodes[1])
                return 1;

        slot = path->slots[1];
        upper = path->nodes[1];
        /* already the last child of the parent: no right neighbor */
        if (slot >= btrfs_header_nritems(upper) - 1)
                return 1;

        btrfs_assert_tree_locked(path->nodes[1]);

        right = read_node_slot(root, upper, slot + 1);
        if (right == NULL)
                return 1;

        btrfs_tree_lock(right);
        btrfs_set_lock_blocking(right);

        free_space = btrfs_leaf_free_space(root, right);
        if (free_space < data_size)
                goto out_unlock;

        /* cow and double check */
        ret = btrfs_cow_block(trans, root, right, upper,
                              slot + 1, &right);
        if (ret)
                goto out_unlock;

        /* COW may have changed free space; re-verify */
        free_space = btrfs_leaf_free_space(root, right);
        if (free_space < data_size)
                goto out_unlock;

        left_nritems = btrfs_header_nritems(left);
        if (left_nritems == 0)
                goto out_unlock;

        if (path->slots[0] == left_nritems && !empty) {
                /* Key greater than all keys in the leaf, right neighbor has
                 * enough room for it and we're not emptying our leaf to delete
                 * it, therefore use right neighbor to insert the new item and
                 * no need to touch/dirty our left leaft. */
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
                path->nodes[0] = right;
                path->slots[0] = 0;
                path->slots[1]++;
                return 0;
        }

        /* __push_leaf_right consumes our reference and lock on 'right' */
        return __push_leaf_right(trans, root, path, min_data_size, empty,
                                right, free_space, left_nritems, min_slot);
out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
        return 1;
}
3796
3797 /*
3798  * push some data in the path leaf to the left, trying to free up at
3799  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3800  *
3801  * max_slot can put a limit on how far into the leaf we'll push items.  The
3802  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3803  * items
3804  */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct btrfs_path *path, int data_size,
                                     int empty, struct extent_buffer *left,
                                     int free_space, u32 right_nritems,
                                     u32 max_slot)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
        int i;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
        u32 old_left_nritems;
        u32 nr;
        int ret = 0;
        u32 this_item_size;
        u32 old_left_item_size;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        /* how far into the right leaf we may push; keep one unless emptying */
        if (empty)
                nr = min(right_nritems, max_slot);
        else
                nr = min(right_nritems - 1, max_slot);

        /* walk items from the left edge, counting how many fit */
        for (i = 0; i < nr; i++) {
                item = btrfs_item_nr(i);

                if (!empty && push_items > 0) {
                        if (path->slots[0] < i)
                                break;
                        if (path->slots[0] == i) {
                                int space = btrfs_leaf_free_space(root, right);
                                if (space + push_space * 2 > free_space)
                                        break;
                        }
                }

                /* account for the pending insert at this slot */
                if (path->slots[0] == i)
                        push_space += data_size;

                this_item_size = btrfs_item_size(right, item);
                if (this_item_size + sizeof(*item) + push_space > free_space)
                        break;

                push_items++;
                push_space += this_item_size + sizeof(*item);
        }

        if (push_items == 0) {
                ret = 1;
                goto out;
        }
        WARN_ON(!empty && push_items == btrfs_header_nritems(right));

        /* push data from right to left */
        copy_extent_buffer(left, right,
                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
                           btrfs_item_nr_offset(0),
                           push_items * sizeof(struct btrfs_item));

        /* bytes of item data being moved from the right leaf */
        push_space = BTRFS_LEAF_DATA_SIZE(root) -
                     btrfs_item_offset_nr(right, push_items - 1);

        copy_extent_buffer(left, right, btrfs_leaf_data(left) +
                     leaf_data_end(root, left) - push_space,
                     btrfs_leaf_data(right) +
                     btrfs_item_offset_nr(right, push_items - 1),
                     push_space);
        old_left_nritems = btrfs_header_nritems(left);
        BUG_ON(old_left_nritems <= 0);

        /* rebase the copied items' data offsets into the left leaf */
        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
                u32 ioff;

                item = btrfs_item_nr(i);

                ioff = btrfs_token_item_offset(left, item, &token);
                btrfs_set_token_item_offset(left, item,
                      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
                      &token);
        }
        btrfs_set_header_nritems(left, old_left_nritems + push_items);

        /* fixup right node */
        if (push_items > right_nritems)
                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
                       right_nritems);

        if (push_items < right_nritems) {
                /* compact the remaining data and item headers to the front */
                push_space = btrfs_item_offset_nr(right, push_items - 1) -
                                                  leaf_data_end(root, right);
                memmove_extent_buffer(right, btrfs_leaf_data(right) +
                                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
                                      btrfs_leaf_data(right) +
                                      leaf_data_end(root, right), push_space);

                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
                              btrfs_item_nr_offset(push_items),
                             (btrfs_header_nritems(right) - push_items) *
                             sizeof(struct btrfs_item));
        }
        right_nritems -= push_items;
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(root);
        /* recompute every data offset in the right leaf, back to front */
        for (i = 0; i < right_nritems; i++) {
                item = btrfs_item_nr(i);

                push_space = push_space - btrfs_token_item_size(right,
                                                                item, &token);
                btrfs_set_token_item_offset(right, item, push_space, &token);
        }

        btrfs_mark_buffer_dirty(left);
        if (right_nritems)
                btrfs_mark_buffer_dirty(right);
        else
                clean_tree_block(trans, root, right);

        /* right leaf's first key changed; propagate it up the tree */
        btrfs_item_key(right, &disk_key, 0);
        fixup_low_keys(root, path, &disk_key, 1);

        /* then fixup the leaf pointer in the path */
        if (path->slots[0] < push_items) {
                path->slots[0] += old_left_nritems;
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = left;
                path->slots[1] -= 1;
        } else {
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
                path->slots[0] -= push_items;
        }
        BUG_ON(path->slots[0] < 0);
        return ret;
out:
        btrfs_tree_unlock(left);
        free_extent_buffer(left);
        return ret;
}
3949
3950 /*
3951  * push some data in the path leaf to the left, trying to free up at
3952  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3953  *
3954  * max_slot can put a limit on how far into the leaf we'll push items.  The
3955  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3956  * items
3957  */
3958 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3959                           *root, struct btrfs_path *path, int min_data_size,
3960                           int data_size, int empty, u32 max_slot)
3961 {
3962         struct extent_buffer *right = path->nodes[0];
3963         struct extent_buffer *left;
3964         int slot;
3965         int free_space;
3966         u32 right_nritems;
3967         int ret = 0;
3968
3969         slot = path->slots[1];
3970         if (slot == 0)
3971                 return 1;
3972         if (!path->nodes[1])
3973                 return 1;
3974
3975         right_nritems = btrfs_header_nritems(right);
3976         if (right_nritems == 0)
3977                 return 1;
3978
3979         btrfs_assert_tree_locked(path->nodes[1]);
3980
3981         left = read_node_slot(root, path->nodes[1], slot - 1);
3982         if (left == NULL)
3983                 return 1;
3984
3985         btrfs_tree_lock(left);
3986         btrfs_set_lock_blocking(left);
3987
3988         free_space = btrfs_leaf_free_space(root, left);
3989         if (free_space < data_size) {
3990                 ret = 1;
3991                 goto out;
3992         }
3993
3994         /* cow and double check */
3995         ret = btrfs_cow_block(trans, root, left,
3996                               path->nodes[1], slot - 1, &left);
3997         if (ret) {
3998                 /* we hit -ENOSPC, but it isn't fatal here */
3999                 if (ret == -ENOSPC)
4000                         ret = 1;
4001                 goto out;
4002         }
4003
4004         free_space = btrfs_leaf_free_space(root, left);
4005         if (free_space < data_size) {
4006                 ret = 1;
4007                 goto out;
4008         }
4009
4010         return __push_leaf_left(trans, root, path, min_data_size,
4011                                empty, left, free_space, right_nritems,
4012                                max_slot);
4013 out:
4014         btrfs_tree_unlock(left);
4015         free_extent_buffer(left);
4016         return ret;
4017 }
4018
/*
 * Helper for split_leaf(): move the items from slot 'mid' onward out of
 * leaf 'l' into the freshly allocated leaf 'right', insert a pointer to
 * 'right' into the parent, and fix up the path so it still points at
 * the original target slot.  'nritems' is l's item count before the
 * split.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    struct extent_buffer *l,
                                    struct extent_buffer *right,
                                    int slot, int mid, int nritems)
{
        int data_copy_size;
        int rt_data_off;
        int i;
        struct btrfs_disk_key disk_key;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        /* items mid..nritems-1 move to the right leaf */
        nritems = nritems - mid;
        btrfs_set_header_nritems(right, nritems);
        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

        /* copy the item headers to the front of the new leaf */
        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
                           btrfs_item_nr_offset(mid),
                           nritems * sizeof(struct btrfs_item));

        /* copy the item data, packed against the end of the new leaf */
        copy_extent_buffer(right, l,
                     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
                     data_copy_size, btrfs_leaf_data(l) +
                     leaf_data_end(root, l), data_copy_size);

        /*
         * the copied data offsets are still relative to l's layout;
         * shift them so they are relative to the end of the new leaf
         */
        rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
                      btrfs_item_end_nr(l, mid);

        for (i = 0; i < nritems; i++) {
                struct btrfs_item *item = btrfs_item_nr(i);
                u32 ioff;

                ioff = btrfs_token_item_offset(right, item, &token);
                btrfs_set_token_item_offset(right, item,
                                            ioff + rt_data_off, &token);
        }

        btrfs_set_header_nritems(l, mid);
        /* link the new leaf into the parent, right after l */
        btrfs_item_key(right, &disk_key, 0);
        insert_ptr(trans, root, path, &disk_key, right->start,
                   path->slots[1] + 1, 1);

        btrfs_mark_buffer_dirty(right);
        btrfs_mark_buffer_dirty(l);
        BUG_ON(path->slots[0] != slot);

        /* point the path at whichever leaf now holds the target slot */
        if (mid <= slot) {
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
                path->slots[0] -= mid;
                path->slots[1] += 1;
        } else {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }

        BUG_ON(path->slots[0] < 0);
}
4085
/*
 * double splits happen when we need to insert a big item in the middle
 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * We avoid this by trying to push the items on either side of our target
 * into the adjacent leaves.  If all goes well we can avoid the double split
 * completely.
 *
 * Returns 0 when a double split may no longer be needed (the slot ended
 * up at a leaf boundary, enough space was freed, or a push made
 * progress), 1 when neither push made progress, and < 0 on error.
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          int data_size)
{
        int ret;
        int progress = 0;
        int slot;
        u32 nritems;
        int space_needed = data_size;

        slot = path->slots[0];
        /* whatever is already free in this leaf reduces what we need */
        if (slot < btrfs_header_nritems(path->nodes[0]))
                space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);

        /*
         * try to push all the items after our slot into the
         * right leaf
         */
        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
        if (ret < 0)
                return ret;

        if (ret == 0)
                progress++;

        nritems = btrfs_header_nritems(path->nodes[0]);
        /*
         * our goal is to get our slot at the start or end of a leaf.  If
         * we've done so we're done
         */
        if (path->slots[0] == 0 || path->slots[0] == nritems)
                return 0;

        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
                return 0;

        /* try to push all the items before our slot into the left leaf */
        slot = path->slots[0];
        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
        if (ret < 0)
                return ret;

        if (ret == 0)
                progress++;

        if (progress)
                return 0;
        return 1;
}
4146
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_key *ins_key,
                               struct btrfs_path *path, int data_size,
                               int extend)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *l;
        u32 nritems;
        int mid;
        int slot;
        struct extent_buffer *right;
        int ret = 0;
        int wret;
        int split;              /* 0: new leaf starts empty, 1: split at mid, 2: double split needed */
        int num_doubles = 0;
        int tried_avoid_double = 0;

        l = path->nodes[0];
        slot = path->slots[0];
        /* an extended item can never grow past one full leaf */
        if (extend && data_size + btrfs_item_size_nr(l, slot) +
            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
                return -EOVERFLOW;

        /* first try to make some room by pushing left and right */
        if (data_size && path->nodes[1]) {
                int space_needed = data_size;

                if (slot < btrfs_header_nritems(l))
                        space_needed -= btrfs_leaf_free_space(root, l);

                wret = push_leaf_right(trans, root, path, space_needed,
                                       space_needed, 0, 0);
                if (wret < 0)
                        return wret;
                if (wret) {
                        wret = push_leaf_left(trans, root, path, space_needed,
                                              space_needed, 0, (u32)-1);
                        if (wret < 0)
                                return wret;
                }
                l = path->nodes[0];

                /* did the pushes work? */
                if (btrfs_leaf_free_space(root, l) >= data_size)
                        return 0;
        }

        /* splitting the root leaf requires a new root node above it */
        if (!path->nodes[1]) {
                ret = insert_new_root(trans, root, path, 1);
                if (ret)
                        return ret;
        }
again:
        split = 1;
        l = path->nodes[0];
        slot = path->slots[0];
        nritems = btrfs_header_nritems(l);
        mid = (nritems + 1) / 2;

        if (mid <= slot) {
                /* the target slot lands in the right half */
                if (nritems == 1 ||
                    leaf_space_used(l, mid, nritems - mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(root)) {
                        if (slot >= nritems) {
                                /* appending past the end: an empty new leaf is enough */
                                split = 0;
                        } else {
                                mid = slot;
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
                                        /* one split won't make room; try pushing first */
                                        if (data_size && !tried_avoid_double)
                                                goto push_for_double;
                                        split = 2;
                                }
                        }
                }
        } else {
                /* the target slot lands in the left half */
                if (leaf_space_used(l, 0, mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(root)) {
                        if (!extend && data_size && slot == 0) {
                                split = 0;
                        } else if ((extend || !data_size) && slot == 0) {
                                mid = 1;
                        } else {
                                mid = slot;
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
                                        if (data_size && !tried_avoid_double)
                                                goto push_for_double;
                                        split = 2;
                                }
                        }
                }
        }

        /* the new leaf's first key: the new item's key, or the key at mid */
        if (split == 0)
                btrfs_cpu_key_to_disk(&disk_key, ins_key);
        else
                btrfs_item_key(l, &disk_key, mid);

        right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
                        &disk_key, 0, l->start, 0);
        if (IS_ERR(right))
                return PTR_ERR(right);

        root_add_used(root, root->nodesize);

        /* initialize the new leaf's header */
        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
        btrfs_set_header_bytenr(right, right->start);
        btrfs_set_header_generation(right, trans->transid);
        btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(right, root->root_key.objectid);
        btrfs_set_header_level(right, 0);
        write_extent_buffer(right, root->fs_info->fsid,
                            btrfs_header_fsid(), BTRFS_FSID_SIZE);

        write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
                            btrfs_header_chunk_tree_uuid(right),
                            BTRFS_UUID_SIZE);

        if (split == 0) {
                /*
                 * no existing items move; just link the empty leaf into
                 * the parent and point the path at it
                 */
                if (mid <= slot) {
                        btrfs_set_header_nritems(right, 0);
                        insert_ptr(trans, root, path, &disk_key, right->start,
                                   path->slots[1] + 1, 1);
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
                        path->slots[0] = 0;
                        path->slots[1] += 1;
                } else {
                        btrfs_set_header_nritems(right, 0);
                        insert_ptr(trans, root, path, &disk_key, right->start,
                                          path->slots[1], 1);
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
                        path->slots[0] = 0;
                        if (path->slots[1] == 0)
                                fixup_low_keys(root, path, &disk_key, 1);
                }
                btrfs_mark_buffer_dirty(right);
                return ret;
        }

        copy_for_split(trans, root, path, l, right, slot, mid, nritems);

        if (split == 2) {
                /* second pass to split again around the target slot */
                BUG_ON(num_doubles != 0);
                num_doubles++;
                goto again;
        }

        return 0;

push_for_double:
        /*
         * try pushing the target slot's neighbors away so a single split
         * (or none at all) suffices; attempted at most once
         */
        push_for_double_split(trans, root, path, data_size);
        tried_avoid_double = 1;
        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
                return 0;
        goto again;
}
4317
4318 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4319                                          struct btrfs_root *root,
4320                                          struct btrfs_path *path, int ins_len)
4321 {
4322         struct btrfs_key key;
4323         struct extent_buffer *leaf;
4324         struct btrfs_file_extent_item *fi;
4325         u64 extent_len = 0;
4326         u32 item_size;
4327         int ret;
4328
4329         leaf = path->nodes[0];
4330         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4331
4332         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4333                key.type != BTRFS_EXTENT_CSUM_KEY);
4334
4335         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4336                 return 0;
4337
4338         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4339         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4340                 fi = btrfs_item_ptr(leaf, path->slots[0],
4341                                     struct btrfs_file_extent_item);
4342                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4343         }
4344         btrfs_release_path(path);
4345
4346         path->keep_locks = 1;
4347         path->search_for_split = 1;
4348         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4349         path->search_for_split = 0;
4350         if (ret < 0)
4351                 goto err;
4352
4353         ret = -EAGAIN;
4354         leaf = path->nodes[0];
4355         /* if our item isn't there or got smaller, return now */
4356         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4357                 goto err;
4358
4359         /* the leaf has  changed, it now has room.  return now */
4360         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4361                 goto err;
4362
4363         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4364                 fi = btrfs_item_ptr(leaf, path->slots[0],
4365                                     struct btrfs_file_extent_item);
4366                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4367                         goto err;
4368         }
4369
4370         btrfs_set_path_blocking(path);
4371         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4372         if (ret)
4373                 goto err;
4374
4375         path->keep_locks = 0;
4376         btrfs_unlock_up_safe(path, 1);
4377         return 0;
4378 err:
4379         path->keep_locks = 0;
4380         return ret;
4381 }
4382
/*
 * Worker for btrfs_split_item(): split the item at path->slots[0] in
 * two at 'split_offset' bytes from its start.  The tail becomes a new
 * item at the following slot, carrying 'new_key'; the original item
 * keeps the head.
 *
 * The leaf must already have room for one more struct btrfs_item
 * (guaranteed by setup_leaf_for_split()).  Returns 0 or -ENOMEM.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path,
                               struct btrfs_key *new_key,
                               unsigned long split_offset)
{
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        struct btrfs_item *new_item;
        int slot;
        char *buf;
        u32 nritems;
        u32 item_size;
        u32 orig_offset;
        struct btrfs_disk_key disk_key;

        leaf = path->nodes[0];
        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

        btrfs_set_path_blocking(path);

        item = btrfs_item_nr(path->slots[0]);
        orig_offset = btrfs_item_offset(leaf, item);
        item_size = btrfs_item_size(leaf, item);

        /* stash a copy of the item data; it is rewritten as two pieces below */
        buf = kmalloc(item_size, GFP_NOFS);
        if (!buf)
                return -ENOMEM;

        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
                            path->slots[0]), item_size);

        /* make room for the new item header right after the original */
        slot = path->slots[0] + 1;
        nritems = btrfs_header_nritems(leaf);
        if (slot != nritems) {
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
                                btrfs_item_nr_offset(slot),
                                (nritems - slot) * sizeof(struct btrfs_item));
        }

        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(leaf, &disk_key, slot);

        new_item = btrfs_item_nr(slot);

        /* the new item takes the tail: [split_offset, item_size) */
        btrfs_set_item_offset(leaf, new_item, orig_offset);
        btrfs_set_item_size(leaf, new_item, item_size - split_offset);

        /* the original item keeps the head: [0, split_offset) */
        btrfs_set_item_offset(leaf, item,
                              orig_offset + item_size - split_offset);
        btrfs_set_item_size(leaf, item, split_offset);

        btrfs_set_header_nritems(leaf, nritems + 1);

        /* write the data for the start of the original item */
        write_extent_buffer(leaf, buf,
                            btrfs_item_ptr_offset(leaf, path->slots[0]),
                            split_offset);

        /* write the data for the new item */
        write_extent_buffer(leaf, buf + split_offset,
                            btrfs_item_ptr_offset(leaf, slot),
                            item_size - split_offset);
        btrfs_mark_buffer_dirty(leaf);

        BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
        kfree(buf);
        return 0;
}
4453
4454 /*
4455  * This function splits a single item into two items,
4456  * giving 'new_key' to the new item and splitting the
4457  * old one at split_offset (from the start of the item).
4458  *
4459  * The path may be released by this operation.  After
4460  * the split, the path is pointing to the old item.  The
4461  * new item is going to be in the same node as the old one.
4462  *
4463  * Note, the item being split must be smaller enough to live alone on
4464  * a tree block with room for one extra struct btrfs_item
4465  *
4466  * This allows us to split the item in place, keeping a lock on the
4467  * leaf the entire time.
4468  */
4469 int btrfs_split_item(struct btrfs_trans_handle *trans,
4470                      struct btrfs_root *root,
4471                      struct btrfs_path *path,
4472                      struct btrfs_key *new_key,
4473                      unsigned long split_offset)
4474 {
4475         int ret;
4476         ret = setup_leaf_for_split(trans, root, path,
4477                                    sizeof(struct btrfs_item));
4478         if (ret)
4479                 return ret;
4480
4481         ret = split_item(trans, root, path, new_key, split_offset);
4482         return ret;
4483 }
4484
4485 /*
4486  * This function duplicate a item, giving 'new_key' to the new item.
4487  * It guarantees both items live in the same tree leaf and the new item
4488  * is contiguous with the original item.
4489  *
4490  * This allows us to split file extent in place, keeping a lock on the
4491  * leaf the entire time.
4492  */
4493 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4494                          struct btrfs_root *root,
4495                          struct btrfs_path *path,
4496                          struct btrfs_key *new_key)
4497 {
4498         struct extent_buffer *leaf;
4499         int ret;
4500         u32 item_size;
4501
4502         leaf = path->nodes[0];
4503         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4504         ret = setup_leaf_for_split(trans, root, path,
4505                                    item_size + sizeof(struct btrfs_item));
4506         if (ret)
4507                 return ret;
4508
4509         path->slots[0]++;
4510         setup_items_for_insert(root, path, new_key, &item_size,
4511                                item_size, item_size +
4512                                sizeof(struct btrfs_item), 1);
4513         leaf = path->nodes[0];
4514         memcpy_extent_buffer(leaf,
4515                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4516                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4517                              item_size);
4518         return 0;
4519 }
4520
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
                         u32 new_size, int from_end)
{
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data_start;
        unsigned int old_size;
        unsigned int size_diff;
        int i;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        leaf = path->nodes[0];
        slot = path->slots[0];

        old_size = btrfs_item_size_nr(leaf, slot);
        if (old_size == new_size)
                return;         /* nothing to do */

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        old_data_start = btrfs_item_offset_nr(leaf, slot);

        size_diff = old_size - new_size;

        BUG_ON(slot < 0);
        BUG_ON(slot >= nritems);

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(i);

                ioff = btrfs_token_item_offset(leaf, item, &token);
                btrfs_set_token_item_offset(leaf, item,
                                            ioff + size_diff, &token);
        }

        /* shift the data */
        if (from_end) {
                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + size_diff, btrfs_leaf_data(leaf) +
                              data_end, old_data_start + new_size - data_end);
        } else {
                struct btrfs_disk_key disk_key;
                u64 offset;

                btrfs_item_key(leaf, &disk_key, slot);

                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
                        unsigned long ptr;
                        struct btrfs_file_extent_item *fi;

                        fi = btrfs_item_ptr(leaf, slot,
                                            struct btrfs_file_extent_item);
                        fi = (struct btrfs_file_extent_item *)(
                             (unsigned long)fi - size_diff);

                        if (btrfs_file_extent_type(leaf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE) {
                                /*
                                 * inline extents keep a fixed header at the
                                 * start of the item; move it down to the new
                                 * (shifted) item start before the data move
                                 */
                                ptr = btrfs_item_ptr_offset(leaf, slot);
                                memmove_extent_buffer(leaf, ptr,
                                      (unsigned long)fi,
                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
                        }
                }

                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + size_diff, btrfs_leaf_data(leaf) +
                              data_end, old_data_start - data_end);

                /* chopping the front moves the key's offset forward too */
                offset = btrfs_disk_key_offset(&disk_key);
                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
                btrfs_set_item_key(leaf, &disk_key, slot);
                if (slot == 0)
                        fixup_low_keys(root, path, &disk_key, 1);
        }

        item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);

        /* a negative free space count means the leaf layout is corrupt */
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
}
4622
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 * The caller must have verified the leaf has at least data_size bytes free.
 */
void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
                       u32 data_size)
{
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data;
        unsigned int old_size;
        int i;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        leaf = path->nodes[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        /* growing past the free space would corrupt the leaf */
        if (btrfs_leaf_free_space(root, leaf) < data_size) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        slot = path->slots[0];
        old_data = btrfs_item_end_nr(leaf, slot);

        BUG_ON(slot < 0);
        if (slot >= nritems) {
                btrfs_print_leaf(root, leaf);
                btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
                       slot, nritems);
                BUG_ON(1);
        }

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(i);

                ioff = btrfs_token_item_offset(leaf, item, &token);
                btrfs_set_token_item_offset(leaf, item,
                                            ioff - data_size, &token);
        }

        /* shift the data */
        memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                      data_end - data_size, btrfs_leaf_data(leaf) +
                      data_end, old_data - data_end);

        data_end = old_data;
        old_size = btrfs_item_size_nr(leaf, slot);
        item = btrfs_item_nr(slot);
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);

        /* a negative free space count means the leaf layout is corrupt */
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
}
4690
4691 /*
4692  * this is a helper for btrfs_insert_empty_items, the main goal here is
4693  * to save stack depth by doing the bulk of the work in a function
4694  * that doesn't call btrfs_search_slot
4695  */
4696 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4697                             struct btrfs_key *cpu_key, u32 *data_size,
4698                             u32 total_data, u32 total_size, int nr)
4699 {
4700         struct btrfs_item *item;
4701         int i;
4702         u32 nritems;
4703         unsigned int data_end;
4704         struct btrfs_disk_key disk_key;
4705         struct extent_buffer *leaf;
4706         int slot;
4707         struct btrfs_map_token token;
4708
4709         if (path->slots[0] == 0) {
4710                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4711                 fixup_low_keys(root, path, &disk_key, 1);
4712         }
4713         btrfs_unlock_up_safe(path, 1);
4714
4715         btrfs_init_map_token(&token);
4716
4717         leaf = path->nodes[0];
4718         slot = path->slots[0];
4719
4720         nritems = btrfs_header_nritems(leaf);
4721         data_end = leaf_data_end(root, leaf);
4722
4723         if (btrfs_leaf_free_space(root, leaf) < total_size) {
4724                 btrfs_print_leaf(root, leaf);
4725                 btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4726                        total_size, btrfs_leaf_free_space(root, leaf));
4727                 BUG();
4728         }
4729
4730         if (slot != nritems) {
4731                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4732
4733                 if (old_data < data_end) {
4734                         btrfs_print_leaf(root, leaf);
4735                         btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4736                                slot, old_data, data_end);
4737                         BUG_ON(1);
4738                 }
4739                 /*
4740                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4741                  */
4742                 /* first correct the data pointers */
4743                 for (i = slot; i < nritems; i++) {
4744                         u32 ioff;
4745
4746                         item = btrfs_item_nr( i);
4747                         ioff = btrfs_token_item_offset(leaf, item, &token);
4748                         btrfs_set_token_item_offset(leaf, item,
4749                                                     ioff - total_data, &token);
4750                 }
4751                 /* shift the items */
4752                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4753                               btrfs_item_nr_offset(slot),
4754                               (nritems - slot) * sizeof(struct btrfs_item));
4755
4756                 /* shift the data */
4757                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4758                               data_end - total_data, btrfs_leaf_data(leaf) +
4759                               data_end, old_data - data_end);
4760                 data_end = old_data;
4761         }
4762
4763         /* setup the item for the new data */
4764         for (i = 0; i < nr; i++) {
4765                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4766                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4767                 item = btrfs_item_nr(slot + i);
4768                 btrfs_set_token_item_offset(leaf, item,
4769                                             data_end - data_size[i], &token);
4770                 data_end -= data_size[i];
4771                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4772         }
4773
4774         btrfs_set_header_nritems(leaf, nritems + nr);
4775         btrfs_mark_buffer_dirty(leaf);
4776
4777         if (btrfs_leaf_free_space(root, leaf) < 0) {
4778                 btrfs_print_leaf(root, leaf);
4779                 BUG();
4780         }
4781 }
4782
4783 /*
4784  * Given a key and some data, insert items into the tree.
4785  * This does all the path init required, making room in the tree if needed.
4786  */
4787 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4788                             struct btrfs_root *root,
4789                             struct btrfs_path *path,
4790                             struct btrfs_key *cpu_key, u32 *data_size,
4791                             int nr)
4792 {
4793         int ret = 0;
4794         int slot;
4795         int i;
4796         u32 total_size = 0;
4797         u32 total_data = 0;
4798
4799         for (i = 0; i < nr; i++)
4800                 total_data += data_size[i];
4801
4802         total_size = total_data + (nr * sizeof(struct btrfs_item));
4803         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4804         if (ret == 0)
4805                 return -EEXIST;
4806         if (ret < 0)
4807                 return ret;
4808
4809         slot = path->slots[0];
4810         BUG_ON(slot < 0);
4811
4812         setup_items_for_insert(root, path, cpu_key, data_size,
4813                                total_data, total_size, nr);
4814         return 0;
4815 }
4816
4817 /*
4818  * Given a key and some data, insert an item into the tree.
4819  * This does all the path init required, making room in the tree if needed.
4820  */
4821 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4822                       *root, struct btrfs_key *cpu_key, void *data, u32
4823                       data_size)
4824 {
4825         int ret = 0;
4826         struct btrfs_path *path;
4827         struct extent_buffer *leaf;
4828         unsigned long ptr;
4829
4830         path = btrfs_alloc_path();
4831         if (!path)
4832                 return -ENOMEM;
4833         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4834         if (!ret) {
4835                 leaf = path->nodes[0];
4836                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4837                 write_extent_buffer(leaf, data, ptr, data_size);
4838                 btrfs_mark_buffer_dirty(leaf);
4839         }
4840         btrfs_free_path(path);
4841         return ret;
4842 }
4843
4844 /*
4845  * delete the pointer from a given node.
4846  *
4847  * the tree should have been previously balanced so the deletion does not
4848  * empty a node.
4849  */
4850 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4851                     int level, int slot)
4852 {
4853         struct extent_buffer *parent = path->nodes[level];
4854         u32 nritems;
4855         int ret;
4856
4857         nritems = btrfs_header_nritems(parent);
4858         if (slot != nritems - 1) {
4859                 if (level)
4860                         tree_mod_log_eb_move(root->fs_info, parent, slot,
4861                                              slot + 1, nritems - slot - 1);
4862                 memmove_extent_buffer(parent,
4863                               btrfs_node_key_ptr_offset(slot),
4864                               btrfs_node_key_ptr_offset(slot + 1),
4865                               sizeof(struct btrfs_key_ptr) *
4866                               (nritems - slot - 1));
4867         } else if (level) {
4868                 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4869                                               MOD_LOG_KEY_REMOVE, GFP_NOFS);
4870                 BUG_ON(ret < 0);
4871         }
4872
4873         nritems--;
4874         btrfs_set_header_nritems(parent, nritems);
4875         if (nritems == 0 && parent == root->node) {
4876                 BUG_ON(btrfs_header_level(root->node) != 1);
4877                 /* just turn the root into a leaf and break */
4878                 btrfs_set_header_level(root->node, 0);
4879         } else if (slot == 0) {
4880                 struct btrfs_disk_key disk_key;
4881
4882                 btrfs_node_key(parent, &disk_key, 0);
4883                 fixup_low_keys(root, path, &disk_key, level + 1);
4884         }
4885         btrfs_mark_buffer_dirty(parent);
4886 }
4887
4888 /*
4889  * a helper function to delete the leaf pointed to by path->slots[1] and
4890  * path->nodes[1].
4891  *
4892  * This deletes the pointer in path->nodes[1] and frees the leaf
4893  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4894  *
4895  * The path must have already been setup for deleting the leaf, including
4896  * all the proper balancing.  path->nodes[1] must be locked.
4897  */
4898 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4899                                     struct btrfs_root *root,
4900                                     struct btrfs_path *path,
4901                                     struct extent_buffer *leaf)
4902 {
4903         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4904         del_ptr(root, path, 1, path->slots[1]);
4905
4906         /*
4907          * btrfs_free_extent is expensive, we want to make sure we
4908          * aren't holding any locks when we call it
4909          */
4910         btrfs_unlock_up_safe(path, 0);
4911
4912         root_sub_used(root, leaf->len);
4913
4914         extent_buffer_get(leaf);
4915         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4916         free_extent_buffer_stale(leaf);
4917 }
4918 /*
4919  * delete the item at the leaf level in path.  If that empties
4920  * the leaf, remove it from the tree
4921  */
4922 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4923                     struct btrfs_path *path, int slot, int nr)
4924 {
4925         struct extent_buffer *leaf;
4926         struct btrfs_item *item;
4927         int last_off;
4928         int dsize = 0;
4929         int ret = 0;
4930         int wret;
4931         int i;
4932         u32 nritems;
4933         struct btrfs_map_token token;
4934
4935         btrfs_init_map_token(&token);
4936
4937         leaf = path->nodes[0];
4938         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4939
4940         for (i = 0; i < nr; i++)
4941                 dsize += btrfs_item_size_nr(leaf, slot + i);
4942
4943         nritems = btrfs_header_nritems(leaf);
4944
4945         if (slot + nr != nritems) {
4946                 int data_end = leaf_data_end(root, leaf);
4947
4948                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4949                               data_end + dsize,
4950                               btrfs_leaf_data(leaf) + data_end,
4951                               last_off - data_end);
4952
4953                 for (i = slot + nr; i < nritems; i++) {
4954                         u32 ioff;
4955
4956                         item = btrfs_item_nr(i);
4957                         ioff = btrfs_token_item_offset(leaf, item, &token);
4958                         btrfs_set_token_item_offset(leaf, item,
4959                                                     ioff + dsize, &token);
4960                 }
4961
4962                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4963                               btrfs_item_nr_offset(slot + nr),
4964                               sizeof(struct btrfs_item) *
4965                               (nritems - slot - nr));
4966         }
4967         btrfs_set_header_nritems(leaf, nritems - nr);
4968         nritems -= nr;
4969
4970         /* delete the leaf if we've emptied it */
4971         if (nritems == 0) {
4972                 if (leaf == root->node) {
4973                         btrfs_set_header_level(leaf, 0);
4974                 } else {
4975                         btrfs_set_path_blocking(path);
4976                         clean_tree_block(trans, root, leaf);
4977                         btrfs_del_leaf(trans, root, path, leaf);
4978                 }
4979         } else {
4980                 int used = leaf_space_used(leaf, 0, nritems);
4981                 if (slot == 0) {
4982                         struct btrfs_disk_key disk_key;
4983
4984                         btrfs_item_key(leaf, &disk_key, 0);
4985                         fixup_low_keys(root, path, &disk_key, 1);
4986                 }
4987
4988                 /* delete the leaf if it is mostly empty */
4989                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4990                         /* push_leaf_left fixes the path.
4991                          * make sure the path still points to our leaf
4992                          * for possible call to del_ptr below
4993                          */
4994                         slot = path->slots[1];
4995                         extent_buffer_get(leaf);
4996
4997                         btrfs_set_path_blocking(path);
4998                         wret = push_leaf_left(trans, root, path, 1, 1,
4999                                               1, (u32)-1);
5000                         if (wret < 0 && wret != -ENOSPC)
5001                                 ret = wret;
5002
5003                         if (path->nodes[0] == leaf &&
5004                             btrfs_header_nritems(leaf)) {
5005                                 wret = push_leaf_right(trans, root, path, 1,
5006                                                        1, 1, 0);
5007                                 if (wret < 0 && wret != -ENOSPC)
5008                                         ret = wret;
5009                         }
5010
5011                         if (btrfs_header_nritems(leaf) == 0) {
5012                                 path->slots[1] = slot;
5013                                 btrfs_del_leaf(trans, root, path, leaf);
5014                                 free_extent_buffer(leaf);
5015                                 ret = 0;
5016                         } else {
5017                                 /* if we're still in the path, make sure
5018                                  * we're dirty.  Otherwise, one of the
5019                                  * push_leaf functions must have already
5020                                  * dirtied this buffer
5021                                  */
5022                                 if (path->nodes[0] == leaf)
5023                                         btrfs_mark_buffer_dirty(leaf);
5024                                 free_extent_buffer(leaf);
5025                         }
5026                 } else {
5027                         btrfs_mark_buffer_dirty(leaf);
5028                 }
5029         }
5030         return ret;
5031 }
5032
5033 /*
5034  * search the tree again to find a leaf with lesser keys
5035  * returns 0 if it found something or 1 if there are no lesser leaves.
5036  * returns < 0 on io errors.
5037  *
5038  * This may release the path, and so you may lose any locks held at the
5039  * time you call it.
5040  */
5041 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5042 {
5043         struct btrfs_key key;
5044         struct btrfs_disk_key found_key;
5045         int ret;
5046
5047         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5048
5049         if (key.offset > 0) {
5050                 key.offset--;
5051         } else if (key.type > 0) {
5052                 key.type--;
5053                 key.offset = (u64)-1;
5054         } else if (key.objectid > 0) {
5055                 key.objectid--;
5056                 key.type = (u8)-1;
5057                 key.offset = (u64)-1;
5058         } else {
5059                 return 1;
5060         }
5061
5062         btrfs_release_path(path);
5063         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5064         if (ret < 0)
5065                 return ret;
5066         btrfs_item_key(path->nodes[0], &found_key, 0);
5067         ret = comp_keys(&found_key, &key);
5068         /*
5069          * We might have had an item with the previous key in the tree right
5070          * before we released our path. And after we released our path, that
5071          * item might have been pushed to the first slot (0) of the leaf we
5072          * were holding due to a tree balance. Alternatively, an item with the
5073          * previous key can exist as the only element of a leaf (big fat item).
5074          * Therefore account for these 2 cases, so that our callers (like
5075          * btrfs_previous_item) don't miss an existing item with a key matching
5076          * the previous key we computed above.
5077          */
5078         if (ret <= 0)
5079                 return 0;
5080         return 1;
5081 }
5082
5083 /*
5084  * A helper function to walk down the tree starting at min_key, and looking
5085  * for nodes or leaves that are have a minimum transaction id.
5086  * This is used by the btree defrag code, and tree logging
5087  *
5088  * This does not cow, but it does stuff the starting key it finds back
5089  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5090  * key and get a writable path.
5091  *
5092  * This does lock as it descends, and path->keep_locks should be set
5093  * to 1 by the caller.
5094  *
5095  * This honors path->lowest_level to prevent descent past a given level
5096  * of the tree.
5097  *
5098  * min_trans indicates the oldest transaction that you are interested
5099  * in walking through.  Any nodes or leaves older than min_trans are
5100  * skipped over (without reading them).
5101  *
5102  * returns zero if something useful was found, < 0 on error and 1 if there
5103  * was nothing in the tree that matched the search criteria.
5104  */
5105 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5106                          struct btrfs_path *path,
5107                          u64 min_trans)
5108 {
5109         struct extent_buffer *cur;
5110         struct btrfs_key found_key;
5111         int slot;
5112         int sret;
5113         u32 nritems;
5114         int level;
5115         int ret = 1;
5116         int keep_locks = path->keep_locks;
5117
5118         path->keep_locks = 1;
5119 again:
5120         cur = btrfs_read_lock_root_node(root);
5121         level = btrfs_header_level(cur);
5122         WARN_ON(path->nodes[level]);
5123         path->nodes[level] = cur;
5124         path->locks[level] = BTRFS_READ_LOCK;
5125
5126         if (btrfs_header_generation(cur) < min_trans) {
5127                 ret = 1;
5128                 goto out;
5129         }
5130         while (1) {
5131                 nritems = btrfs_header_nritems(cur);
5132                 level = btrfs_header_level(cur);
5133                 sret = bin_search(cur, min_key, level, &slot);
5134
5135                 /* at the lowest level, we're done, setup the path and exit */
5136                 if (level == path->lowest_level) {
5137                         if (slot >= nritems)
5138                                 goto find_next_key;
5139                         ret = 0;
5140                         path->slots[level] = slot;
5141                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5142                         goto out;
5143                 }
5144                 if (sret && slot > 0)
5145                         slot--;
5146                 /*
5147                  * check this node pointer against the min_trans parameters.
5148                  * If it is too old, old, skip to the next one.
5149                  */
5150                 while (slot < nritems) {
5151                         u64 gen;
5152
5153                         gen = btrfs_node_ptr_generation(cur, slot);
5154                         if (gen < min_trans) {
5155                                 slot++;
5156                                 continue;
5157                         }
5158                         break;
5159                 }
5160 find_next_key:
5161                 /*
5162                  * we didn't find a candidate key in this node, walk forward
5163                  * and find another one
5164                  */
5165                 if (slot >= nritems) {
5166                         path->slots[level] = slot;
5167                         btrfs_set_path_blocking(path);
5168                         sret = btrfs_find_next_key(root, path, min_key, level,
5169                                                   min_trans);
5170                         if (sret == 0) {
5171                                 btrfs_release_path(path);
5172                                 goto again;
5173                         } else {
5174                                 goto out;
5175                         }
5176                 }
5177                 /* save our key for returning back */
5178                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5179                 path->slots[level] = slot;
5180                 if (level == path->lowest_level) {
5181                         ret = 0;
5182                         goto out;
5183                 }
5184                 btrfs_set_path_blocking(path);
5185                 cur = read_node_slot(root, cur, slot);
5186                 BUG_ON(!cur); /* -ENOMEM */
5187
5188                 btrfs_tree_read_lock(cur);
5189
5190                 path->locks[level - 1] = BTRFS_READ_LOCK;
5191                 path->nodes[level - 1] = cur;
5192                 unlock_up(path, level, 1, 0, NULL);
5193                 btrfs_clear_path_blocking(path, NULL, 0);
5194         }
5195 out:
5196         path->keep_locks = keep_locks;
5197         if (ret == 0) {
5198                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5199                 btrfs_set_path_blocking(path);
5200                 memcpy(min_key, &found_key, sizeof(found_key));
5201         }
5202         return ret;
5203 }
5204
5205 static void tree_move_down(struct btrfs_root *root,
5206                            struct btrfs_path *path,
5207                            int *level, int root_level)
5208 {
5209         BUG_ON(*level == 0);
5210         path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5211                                         path->slots[*level]);
5212         path->slots[*level - 1] = 0;
5213         (*level)--;
5214 }
5215
5216 static int tree_move_next_or_upnext(struct btrfs_root *root,
5217                                     struct btrfs_path *path,
5218                                     int *level, int root_level)
5219 {
5220         int ret = 0;
5221         int nritems;
5222         nritems = btrfs_header_nritems(path->nodes[*level]);
5223
5224         path->slots[*level]++;
5225
5226         while (path->slots[*level] >= nritems) {
5227                 if (*level == root_level)
5228                         return -1;
5229
5230                 /* move upnext */
5231                 path->slots[*level] = 0;
5232                 free_extent_buffer(path->nodes[*level]);
5233                 path->nodes[*level] = NULL;
5234                 (*level)++;
5235                 path->slots[*level]++;
5236
5237                 nritems = btrfs_header_nritems(path->nodes[*level]);
5238                 ret = 1;
5239         }
5240         return ret;
5241 }
5242
5243 /*
5244  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5245  * or down.
5246  */
5247 static int tree_advance(struct btrfs_root *root,
5248                         struct btrfs_path *path,
5249                         int *level, int root_level,
5250                         int allow_down,
5251                         struct btrfs_key *key)
5252 {
5253         int ret;
5254
5255         if (*level == 0 || !allow_down) {
5256                 ret = tree_move_next_or_upnext(root, path, level, root_level);
5257         } else {
5258                 tree_move_down(root, path, level, root_level);
5259                 ret = 0;
5260         }
5261         if (ret >= 0) {
5262                 if (*level == 0)
5263                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5264                                         path->slots[*level]);
5265                 else
5266                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5267                                         path->slots[*level]);
5268         }
5269         return ret;
5270 }
5271
5272 static int tree_compare_item(struct btrfs_root *left_root,
5273                              struct btrfs_path *left_path,
5274                              struct btrfs_path *right_path,
5275                              char *tmp_buf)
5276 {
5277         int cmp;
5278         int len1, len2;
5279         unsigned long off1, off2;
5280
5281         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5282         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5283         if (len1 != len2)
5284                 return 1;
5285
5286         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5287         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5288                                 right_path->slots[0]);
5289
5290         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5291
5292         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5293         if (cmp)
5294                 return 1;
5295         return 0;
5296 }
5297
5298 #define ADVANCE 1
5299 #define ADVANCE_ONLY_NEXT -1
5300
5301 /*
5302  * This function compares two trees and calls the provided callback for
5303  * every changed/new/deleted item it finds.
5304  * If shared tree blocks are encountered, whole subtrees are skipped, making
5305  * the compare pretty fast on snapshotted subvolumes.
5306  *
5307  * This currently works on commit roots only. As commit roots are read only,
5308  * we don't do any locking. The commit roots are protected with transactions.
5309  * Transactions are ended and rejoined when a commit is tried in between.
5310  *
5311  * This function checks for modifications done to the trees while comparing.
5312  * If it detects a change, it aborts immediately.
5313  */
5314 int btrfs_compare_trees(struct btrfs_root *left_root,
5315                         struct btrfs_root *right_root,
5316                         btrfs_changed_cb_t changed_cb, void *ctx)
5317 {
5318         int ret;
5319         int cmp;
5320         struct btrfs_path *left_path = NULL;
5321         struct btrfs_path *right_path = NULL;
5322         struct btrfs_key left_key;
5323         struct btrfs_key right_key;
5324         char *tmp_buf = NULL;
5325         int left_root_level;
5326         int right_root_level;
5327         int left_level;
5328         int right_level;
5329         int left_end_reached;
5330         int right_end_reached;
5331         int advance_left;
5332         int advance_right;
5333         u64 left_blockptr;
5334         u64 right_blockptr;
5335         u64 left_gen;
5336         u64 right_gen;
5337
5338         left_path = btrfs_alloc_path();
5339         if (!left_path) {
5340                 ret = -ENOMEM;
5341                 goto out;
5342         }
5343         right_path = btrfs_alloc_path();
5344         if (!right_path) {
5345                 ret = -ENOMEM;
5346                 goto out;
5347         }
5348
5349         tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5350         if (!tmp_buf) {
5351                 ret = -ENOMEM;
5352                 goto out;
5353         }
5354
5355         left_path->search_commit_root = 1;
5356         left_path->skip_locking = 1;
5357         right_path->search_commit_root = 1;
5358         right_path->skip_locking = 1;
5359
5360         /*
5361          * Strategy: Go to the first items of both trees. Then do
5362          *
5363          * If both trees are at level 0
5364          *   Compare keys of current items
5365          *     If left < right treat left item as new, advance left tree
5366          *       and repeat
5367          *     If left > right treat right item as deleted, advance right tree
5368          *       and repeat
5369          *     If left == right do deep compare of items, treat as changed if
5370          *       needed, advance both trees and repeat
5371          * If both trees are at the same level but not at level 0
5372          *   Compare keys of current nodes/leafs
5373          *     If left < right advance left tree and repeat
5374          *     If left > right advance right tree and repeat
5375          *     If left == right compare blockptrs of the next nodes/leafs
5376          *       If they match advance both trees but stay at the same level
5377          *         and repeat
5378          *       If they don't match advance both trees while allowing to go
5379          *         deeper and repeat
5380          * If tree levels are different
5381          *   Advance the tree that needs it and repeat
5382          *
5383          * Advancing a tree means:
5384          *   If we are at level 0, try to go to the next slot. If that's not
5385          *   possible, go one level up and repeat. Stop when we found a level
5386          *   where we could go to the next slot. We may at this point be on a
5387          *   node or a leaf.
5388          *
5389          *   If we are not at level 0 and not on shared tree blocks, go one
5390          *   level deeper.
5391          *
5392          *   If we are not at level 0 and on shared tree blocks, go one slot to
5393          *   the right if possible or go up and right.
5394          */
5395
5396         down_read(&left_root->fs_info->commit_root_sem);
5397         left_level = btrfs_header_level(left_root->commit_root);
5398         left_root_level = left_level;
5399         left_path->nodes[left_level] = left_root->commit_root;
5400         extent_buffer_get(left_path->nodes[left_level]);
5401
5402         right_level = btrfs_header_level(right_root->commit_root);
5403         right_root_level = right_level;
5404         right_path->nodes[right_level] = right_root->commit_root;
5405         extent_buffer_get(right_path->nodes[right_level]);
5406         up_read(&left_root->fs_info->commit_root_sem);
5407
5408         if (left_level == 0)
5409                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5410                                 &left_key, left_path->slots[left_level]);
5411         else
5412                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5413                                 &left_key, left_path->slots[left_level]);
5414         if (right_level == 0)
5415                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5416                                 &right_key, right_path->slots[right_level]);
5417         else
5418                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5419                                 &right_key, right_path->slots[right_level]);
5420
5421         left_end_reached = right_end_reached = 0;
5422         advance_left = advance_right = 0;
5423
5424         while (1) {
5425                 if (advance_left && !left_end_reached) {
5426                         ret = tree_advance(left_root, left_path, &left_level,
5427                                         left_root_level,
5428                                         advance_left != ADVANCE_ONLY_NEXT,
5429                                         &left_key);
5430                         if (ret < 0)
5431                                 left_end_reached = ADVANCE;
5432                         advance_left = 0;
5433                 }
5434                 if (advance_right && !right_end_reached) {
5435                         ret = tree_advance(right_root, right_path, &right_level,
5436                                         right_root_level,
5437                                         advance_right != ADVANCE_ONLY_NEXT,
5438                                         &right_key);
5439                         if (ret < 0)
5440                                 right_end_reached = ADVANCE;
5441                         advance_right = 0;
5442                 }
5443
5444                 if (left_end_reached && right_end_reached) {
5445                         ret = 0;
5446                         goto out;
5447                 } else if (left_end_reached) {
5448                         if (right_level == 0) {
5449                                 ret = changed_cb(left_root, right_root,
5450                                                 left_path, right_path,
5451                                                 &right_key,
5452                                                 BTRFS_COMPARE_TREE_DELETED,
5453                                                 ctx);
5454                                 if (ret < 0)
5455                                         goto out;
5456                         }
5457                         advance_right = ADVANCE;
5458                         continue;
5459                 } else if (right_end_reached) {
5460                         if (left_level == 0) {
5461                                 ret = changed_cb(left_root, right_root,
5462                                                 left_path, right_path,
5463                                                 &left_key,
5464                                                 BTRFS_COMPARE_TREE_NEW,
5465                                                 ctx);
5466                                 if (ret < 0)
5467                                         goto out;
5468                         }
5469                         advance_left = ADVANCE;
5470                         continue;
5471                 }
5472
5473                 if (left_level == 0 && right_level == 0) {
5474                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5475                         if (cmp < 0) {
5476                                 ret = changed_cb(left_root, right_root,
5477                                                 left_path, right_path,
5478                                                 &left_key,
5479                                                 BTRFS_COMPARE_TREE_NEW,
5480                                                 ctx);
5481                                 if (ret < 0)
5482                                         goto out;
5483                                 advance_left = ADVANCE;
5484                         } else if (cmp > 0) {
5485                                 ret = changed_cb(left_root, right_root,
5486                                                 left_path, right_path,
5487                                                 &right_key,
5488                                                 BTRFS_COMPARE_TREE_DELETED,
5489                                                 ctx);
5490                                 if (ret < 0)
5491                                         goto out;
5492                                 advance_right = ADVANCE;
5493                         } else {
5494                                 enum btrfs_compare_tree_result result;
5495
5496                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5497                                 ret = tree_compare_item(left_root, left_path,
5498                                                 right_path, tmp_buf);
5499                                 if (ret)
5500                                         result = BTRFS_COMPARE_TREE_CHANGED;
5501                                 else
5502                                         result = BTRFS_COMPARE_TREE_SAME;
5503                                 ret = changed_cb(left_root, right_root,
5504                                                  left_path, right_path,
5505                                                  &left_key, result, ctx);
5506                                 if (ret < 0)
5507                                         goto out;
5508                                 advance_left = ADVANCE;
5509                                 advance_right = ADVANCE;
5510                         }
5511                 } else if (left_level == right_level) {
5512                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5513                         if (cmp < 0) {
5514                                 advance_left = ADVANCE;
5515                         } else if (cmp > 0) {
5516                                 advance_right = ADVANCE;
5517                         } else {
5518                                 left_blockptr = btrfs_node_blockptr(
5519                                                 left_path->nodes[left_level],
5520                                                 left_path->slots[left_level]);
5521                                 right_blockptr = btrfs_node_blockptr(
5522                                                 right_path->nodes[right_level],
5523                                                 right_path->slots[right_level]);
5524                                 left_gen = btrfs_node_ptr_generation(
5525                                                 left_path->nodes[left_level],
5526                                                 left_path->slots[left_level]);
5527                                 right_gen = btrfs_node_ptr_generation(
5528                                                 right_path->nodes[right_level],
5529                                                 right_path->slots[right_level]);
5530                                 if (left_blockptr == right_blockptr &&
5531                                     left_gen == right_gen) {
5532                                         /*
5533                                          * As we're on a shared block, don't
5534                                          * allow to go deeper.
5535                                          */
5536                                         advance_left = ADVANCE_ONLY_NEXT;
5537                                         advance_right = ADVANCE_ONLY_NEXT;
5538                                 } else {
5539                                         advance_left = ADVANCE;
5540                                         advance_right = ADVANCE;
5541                                 }
5542                         }
5543                 } else if (left_level < right_level) {
5544                         advance_right = ADVANCE;
5545                 } else {
5546                         advance_left = ADVANCE;
5547                 }
5548         }
5549
5550 out:
5551         btrfs_free_path(left_path);
5552         btrfs_free_path(right_path);
5553         kfree(tmp_buf);
5554         return ret;
5555 }
5556
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks);
	/* walk upwards from 'level' looking for a slot to the right */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			/* no more slots in this node, go up one level */
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1]) {
				/* parent is still locked, safe to just move up */
				level++;
				continue;
			}

			/*
			 * Parent is unlocked, so the cached path above us can
			 * no longer be trusted.  Remember the last key of this
			 * node and redo the search down to this level.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* ret == 0: exact key found again, step past it */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* skip pointers to blocks older than min_trans */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5632
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0 means walk the current version of the tree */
	return btrfs_next_old_leaf(root, path, 0);
}
5642
/*
 * Like btrfs_next_leaf, but walks an old version of the tree when
 * time_seq is non-zero (via btrfs_search_old_slot).
 *
 * returns 0 if a leaf with greater keys was found and loaded into the
 * path, 1 if there are no greater leaves and < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/*
	 * Remember the last key of the current leaf; we drop all locks
	 * below and re-search for it to reposition the path.
	 */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	/* keep upper level locks so we can safely walk to a sibling */
	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leaves
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* walk up until we find a node with a slot to the right of our path */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* drop the buffer we grabbed on a previous loop iteration */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key, 0);
		/* -EAGAIN: locks were dropped during the read, restart */
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* try-lock failed: block and take it the slow way */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* walk back down, loading the leftmost child at each level */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* drop the extra locks taken via keep_locks, restore spinning state */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5815
5816 /*
5817  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5818  * searching until it gets past min_objectid or finds an item of 'type'
5819  *
5820  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5821  */
5822 int btrfs_previous_item(struct btrfs_root *root,
5823                         struct btrfs_path *path, u64 min_objectid,
5824                         int type)
5825 {
5826         struct btrfs_key found_key;
5827         struct extent_buffer *leaf;
5828         u32 nritems;
5829         int ret;
5830
5831         while (1) {
5832                 if (path->slots[0] == 0) {
5833                         btrfs_set_path_blocking(path);
5834                         ret = btrfs_prev_leaf(root, path);
5835                         if (ret != 0)
5836                                 return ret;
5837                 } else {
5838                         path->slots[0]--;
5839                 }
5840                 leaf = path->nodes[0];
5841                 nritems = btrfs_header_nritems(leaf);
5842                 if (nritems == 0)
5843                         return 1;
5844                 if (path->slots[0] == nritems)
5845                         path->slots[0]--;
5846
5847                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5848                 if (found_key.objectid < min_objectid)
5849                         break;
5850                 if (found_key.type == type)
5851                         return 0;
5852                 if (found_key.objectid == min_objectid &&
5853                     found_key.type < type)
5854                         break;
5855         }
5856         return 1;
5857 }
5858
5859 /*
5860  * search in extent tree to find a previous Metadata/Data extent item with
5861  * min objecitd.
5862  *
5863  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5864  */
5865 int btrfs_previous_extent_item(struct btrfs_root *root,
5866                         struct btrfs_path *path, u64 min_objectid)
5867 {
5868         struct btrfs_key found_key;
5869         struct extent_buffer *leaf;
5870         u32 nritems;
5871         int ret;
5872
5873         while (1) {
5874                 if (path->slots[0] == 0) {
5875                         btrfs_set_path_blocking(path);
5876                         ret = btrfs_prev_leaf(root, path);
5877                         if (ret != 0)
5878                                 return ret;
5879                 } else {
5880                         path->slots[0]--;
5881                 }
5882                 leaf = path->nodes[0];
5883                 nritems = btrfs_header_nritems(leaf);
5884                 if (nritems == 0)
5885                         return 1;
5886                 if (path->slots[0] == nritems)
5887                         path->slots[0]--;
5888
5889                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5890                 if (found_key.objectid < min_objectid)
5891                         break;
5892                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5893                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5894                         return 0;
5895                 if (found_key.objectid == min_objectid &&
5896                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5897                         break;
5898         }
5899         return 1;
5900 }