8680110f0a5a18712d8d98c4165ffbb4c418458b
[firefly-linux-kernel-4.4.55.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include "ctree.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "print-tree.h"
25 #include "locking.h"
26
27 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
28                       *root, struct btrfs_path *path, int level);
29 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
30                       *root, struct btrfs_key *ins_key,
31                       struct btrfs_path *path, int data_size, int extend);
32 static int push_node_left(struct btrfs_trans_handle *trans,
33                           struct btrfs_root *root, struct extent_buffer *dst,
34                           struct extent_buffer *src, int empty);
35 static int balance_node_right(struct btrfs_trans_handle *trans,
36                               struct btrfs_root *root,
37                               struct extent_buffer *dst_buf,
38                               struct extent_buffer *src_buf);
39 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40                    struct btrfs_path *path, int level, int slot);
41 static int setup_items_for_insert(struct btrfs_trans_handle *trans,
42                         struct btrfs_root *root, struct btrfs_path *path,
43                         struct btrfs_key *cpu_key, u32 *data_size,
44                         u32 total_data, u32 total_size, int nr);
45
46
47 struct btrfs_path *btrfs_alloc_path(void)
48 {
49         struct btrfs_path *path;
50         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
51         if (path)
52                 path->reada = 1;
53         return path;
54 }
55
56 /*
57  * set all locked nodes in the path to blocking locks.  This should
58  * be done before scheduling
59  */
60 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
61 {
62         int i;
63         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
64                 if (p->nodes[i] && p->locks[i])
65                         btrfs_set_lock_blocking(p->nodes[i]);
66         }
67 }
68
/*
 * Reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held.
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held)
		btrfs_set_lock_blocking(held);
	btrfs_set_path_blocking(p);
#endif

	/* walk from the root (highest level) downward so the spinlocks
	 * are retaken in the order lockdep expects */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i])
			btrfs_clear_lock_blocking(p->nodes[i]);
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* held was forced to blocking above; restore it to spinning */
	if (held)
		btrfs_clear_lock_blocking(held);
#endif
}
104
105 /* this also releases the path */
106 void btrfs_free_path(struct btrfs_path *p)
107 {
108         if (!p)
109                 return;
110         btrfs_release_path(NULL, p);
111         kmem_cache_free(btrfs_path_cachep, p);
112 }
113
114 /*
115  * path release drops references on the extent buffers in the path
116  * and it drops any locks held by this path
117  *
118  * It is safe to call this on paths that no locks or extent buffers held.
119  */
120 noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
121 {
122         int i;
123
124         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
125                 p->slots[i] = 0;
126                 if (!p->nodes[i])
127                         continue;
128                 if (p->locks[i]) {
129                         btrfs_tree_unlock(p->nodes[i]);
130                         p->locks[i] = 0;
131                 }
132                 free_extent_buffer(p->nodes[i]);
133                 p->nodes[i] = NULL;
134         }
135 }
136
137 /*
138  * safely gets a reference on the root node of a tree.  A lock
139  * is not taken, so a concurrent writer may put a different node
140  * at the root of the tree.  See btrfs_lock_root_node for the
141  * looping required.
142  *
143  * The extent buffer returned by this has a reference taken, so
144  * it won't disappear.  It may stop being the root of the tree
145  * at any time because there are no locks held.
146  */
147 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
148 {
149         struct extent_buffer *eb;
150
151         rcu_read_lock();
152         eb = rcu_dereference(root->node);
153         extent_buffer_get(eb);
154         rcu_read_unlock();
155         return eb;
156 }
157
158 /* loop around taking references on and locking the root node of the
159  * tree until you end up with a lock on the root.  A locked buffer
160  * is returned, with a reference held.
161  */
162 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
163 {
164         struct extent_buffer *eb;
165
166         while (1) {
167                 eb = btrfs_root_node(root);
168                 btrfs_tree_lock(eb);
169                 if (eb == root->node)
170                         break;
171                 btrfs_tree_unlock(eb);
172                 free_extent_buffer(eb);
173         }
174         return eb;
175 }
176
177 /* cowonly root (everything not a reference counted cow subvolume), just get
178  * put onto a simple dirty list.  transaction.c walks this to make sure they
179  * get properly updated on disk.
180  */
181 static void add_root_to_dirty_list(struct btrfs_root *root)
182 {
183         if (root->track_dirty && list_empty(&root->dirty_list)) {
184                 list_add(&root->dirty_list,
185                          &root->fs_info->dirty_cowonly_roots);
186         }
187 }
188
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference counted roots must only be copied inside the
	 * transaction they were last modified in */
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	/* use the first key in the block as an allocation hint;
	 * leaves carry item keys, nodes carry key pointers */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the old root and rewrite the header for the new tree */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* take references on everything the copied root points to;
	 * the reloc tree uses full backrefs (last arg 1), other trees
	 * use normal backrefs (0) */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
248
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* old-format blocks predate the mixed backref scheme and may
	 * be shared even when the checks above say otherwise */
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
274
/*
 * Adjust extent backrefs when a tree block is about to be replaced by
 * its COW copy.  *last_ref is set to 1 when this COW drops the final
 * reference on buf (so the caller may free it).  Returns 0; internal
 * failures BUG().
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* possibly shared: read the real refcount and flags
		 * from the extent tree */
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       buf->len, &refs, &flags);
		BUG_ON(ret);
		BUG_ON(refs == 0);
	} else {
		/* not shared: exactly one ref; synthesize the flags */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* reloc-owned blocks must always carry full backrefs */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* block stays shared after the COW */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* convert buf to full backrefs */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret);

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret);
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret);
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret);
		}
		if (new_flags != 0) {
			/* persist the full-backref flag on disk */
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, 0);
			BUG_ON(ret);
		}
	} else {
		/* last reference: move refs from buf to cow, then the
		 * old block can be cleaned and freed */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret);
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret);
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}
369
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	/* if the caller passed buf as the output slot, it expects us to
	 * drop buf's lock once the cow copy replaces it */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* first key of the block is the allocation hint key */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	/* only the reloc tree passes the parent's bytenr as a hint */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* clone the old block and rewrite its header for the copy */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	/* fix extent backrefs; last_ref tells us if buf can be freed */
	update_ref_for_cow(trans, root, buf, cow, &last_ref);

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		/* cowing the root: publish cow as the new root node */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* interior block: repoint the parent slot at cow */
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
	btrfs_mark_buffer_dirty(cow);
	/* cow is returned locked with a reference held */
	*cow_ret = cow;
	return 0;
}
485
486 static inline int should_cow_block(struct btrfs_trans_handle *trans,
487                                    struct btrfs_root *root,
488                                    struct extent_buffer *buf)
489 {
490         if (btrfs_header_generation(buf) == trans->transid &&
491             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
492             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
493               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
494                 return 0;
495         return 1;
496 }
497
498 /*
499  * cows a single block, see __btrfs_cow_block for the real work.
500  * This version of it has extra checks so that a block isn't cow'd more than
501  * once per transaction, as long as it hasn't been written yet
502  */
503 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
504                     struct btrfs_root *root, struct extent_buffer *buf,
505                     struct extent_buffer *parent, int parent_slot,
506                     struct extent_buffer **cow_ret)
507 {
508         u64 search_start;
509         int ret;
510
511         if (trans->transaction != root->fs_info->running_transaction) {
512                 printk(KERN_CRIT "trans %llu running %llu\n",
513                        (unsigned long long)trans->transid,
514                        (unsigned long long)
515                        root->fs_info->running_transaction->transid);
516                 WARN_ON(1);
517         }
518         if (trans->transid != root->fs_info->generation) {
519                 printk(KERN_CRIT "trans %llu running %llu\n",
520                        (unsigned long long)trans->transid,
521                        (unsigned long long)root->fs_info->generation);
522                 WARN_ON(1);
523         }
524
525         if (!should_cow_block(trans, root, buf)) {
526                 *cow_ret = buf;
527                 return 0;
528         }
529
530         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
531
532         if (parent)
533                 btrfs_set_lock_blocking(parent);
534         btrfs_set_lock_blocking(buf);
535
536         ret = __btrfs_cow_block(trans, root, buf, parent,
537                                  parent_slot, cow_ret, search_start, 0);
538         return ret;
539 }
540
541 /*
542  * helper function for defrag to decide if two blocks pointed to by a
543  * node are actually close by
544  */
545 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
546 {
547         if (blocknr < other && other - (blocknr + blocksize) < 32768)
548                 return 1;
549         if (blocknr > other && blocknr - (other + blocksize) < 32768)
550                 return 1;
551         return 0;
552 }
553
554 /*
555  * compare two keys in a memcmp fashion
556  */
557 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
558 {
559         struct btrfs_key k1;
560
561         btrfs_disk_key_to_cpu(&k1, disk);
562
563         return btrfs_comp_cpu_keys(&k1, k2);
564 }
565
566 /*
567  * same as comp_keys only with two btrfs_key's
568  */
569 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
570 {
571         if (k1->objectid > k2->objectid)
572                 return 1;
573         if (k1->objectid < k2->objectid)
574                 return -1;
575         if (k1->type > k2->type)
576                 return 1;
577         if (k1->type < k2->type)
578                 return -1;
579         if (k1->offset > k2->offset)
580                 return 1;
581         if (k1->offset < k2->offset)
582                 return -1;
583         return 0;
584 }
585
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);
	/* cache_only mode only reallocates leaves, so only level-1
	 * nodes (whose children are leaves) are worth walking */
	if (cache_only && parent_level != 1)
		return 0;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN_ON(1);
	if (trans->transid != root->fs_info->generation)
		WARN_ON(1);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	/* a single child has nothing to be reordered against */
	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		/* map the key pointer area so repeated key reads in this
		 * loop avoid per-access mapping overhead */
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
		btrfs_node_key(parent, &disk_key, i);
		/* skip children before the progress key */
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* a block already near its neighbors needs no realloc */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}
		/* drop the mapping before blocking operations below */
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			/* cache_only: never hit the disk for children */
			if (cache_only) {
				free_extent_buffer(cur);
				continue;
			}
			if (!cur) {
				/* NOTE(review): read_tree_block failure
				 * appears unchecked here — TODO confirm
				 * whether cur can be NULL below */
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
			} else if (!uptodate) {
				btrfs_read_buffer(cur, gen);
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		/* COW the child near search_start; empty_size asks for
		 * room after it for the rest of the slots */
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	/* drop any mapping left over from the final iteration */
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
	return err;
}
713
714 /*
715  * The leaf data grows from end-to-front in the node.
716  * this returns the address of the start of the last item,
717  * which is the stop of the leaf data stack
718  */
719 static inline unsigned int leaf_data_end(struct btrfs_root *root,
720                                          struct extent_buffer *leaf)
721 {
722         u32 nr = btrfs_header_nritems(leaf);
723         if (nr == 0)
724                 return BTRFS_LEAF_DATA_SIZE(root);
725         return btrfs_item_offset_nr(leaf, nr - 1);
726 }
727
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * returns 0 if the key was found at *slot, 1 if it was not found
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/* remap only when the key at offset falls outside the
		 * currently mapped window */
		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
			if (map_token) {
				unmap_extent_buffer(eb, map_token, KM_USER0);
				map_token = NULL;
			}

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				/* key straddles a page boundary: copy it
				 * out to an aligned local buffer instead */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			/* exact match */
			*slot = mid;
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
			return 0;
		}
	}
	/* not found: low is the insertion point */
	*slot = low;
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
	return 1;
}
805
806 /*
807  * simple bin_search frontend that does the right thing for
808  * leaves vs nodes
809  */
810 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
811                       int level, int *slot)
812 {
813         if (level == 0) {
814                 return generic_bin_search(eb,
815                                           offsetof(struct btrfs_leaf, items),
816                                           sizeof(struct btrfs_item),
817                                           key, btrfs_header_nritems(eb),
818                                           slot);
819         } else {
820                 return generic_bin_search(eb,
821                                           offsetof(struct btrfs_node, ptrs),
822                                           sizeof(struct btrfs_key_ptr),
823                                           key, btrfs_header_nritems(eb),
824                                           slot);
825         }
826         return -1;
827 }
828
/*
 * exported frontend for bin_search(), for callers outside ctree.c
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot)
{
        return bin_search(eb, key, level, slot);
}
834
835 static void root_add_used(struct btrfs_root *root, u32 size)
836 {
837         spin_lock(&root->accounting_lock);
838         btrfs_set_root_used(&root->root_item,
839                             btrfs_root_used(&root->root_item) + size);
840         spin_unlock(&root->accounting_lock);
841 }
842
843 static void root_sub_used(struct btrfs_root *root, u32 size)
844 {
845         spin_lock(&root->accounting_lock);
846         btrfs_set_root_used(&root->root_item,
847                             btrfs_root_used(&root->root_item) - size);
848         spin_unlock(&root->accounting_lock);
849 }
850
851 /* given a node and slot number, this reads the blocks it points to.  The
852  * extent buffer is returned with a reference taken (but unlocked).
853  * NULL is returned on error.
854  */
855 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
856                                    struct extent_buffer *parent, int slot)
857 {
858         int level = btrfs_header_level(parent);
859         if (slot < 0)
860                 return NULL;
861         if (slot >= btrfs_header_nritems(parent))
862                 return NULL;
863
864         BUG_ON(level == 0);
865
866         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
867                        btrfs_level_size(root, level - 1),
868                        btrfs_node_ptr_generation(parent, slot));
869 }
870
871 /*
872  * node level balancing, used to make sure nodes are in proper order for
873  * item deletion.  We balance from the top down, so we have to make sure
874  * that a deletion won't leave an node completely empty later on.
875  */
876 static noinline int balance_level(struct btrfs_trans_handle *trans,
877                          struct btrfs_root *root,
878                          struct btrfs_path *path, int level)
879 {
880         struct extent_buffer *right = NULL;
881         struct extent_buffer *mid;
882         struct extent_buffer *left = NULL;
883         struct extent_buffer *parent = NULL;
884         int ret = 0;
885         int wret;
886         int pslot;
887         int orig_slot = path->slots[level];
888         u64 orig_ptr;
889
890         if (level == 0)
891                 return 0;
892
893         mid = path->nodes[level];
894
895         WARN_ON(!path->locks[level]);
896         WARN_ON(btrfs_header_generation(mid) != trans->transid);
897
898         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
899
900         if (level < BTRFS_MAX_LEVEL - 1)
901                 parent = path->nodes[level + 1];
902         pslot = path->slots[level + 1];
903
904         /*
905          * deal with the case where there is only one pointer in the root
906          * by promoting the node below to a root
907          */
908         if (!parent) {
909                 struct extent_buffer *child;
910
911                 if (btrfs_header_nritems(mid) != 1)
912                         return 0;
913
914                 /* promote the child to a root */
915                 child = read_node_slot(root, mid, 0);
916                 BUG_ON(!child);
917                 btrfs_tree_lock(child);
918                 btrfs_set_lock_blocking(child);
919                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
920                 if (ret) {
921                         btrfs_tree_unlock(child);
922                         free_extent_buffer(child);
923                         goto enospc;
924                 }
925
926                 rcu_assign_pointer(root->node, child);
927
928                 add_root_to_dirty_list(root);
929                 btrfs_tree_unlock(child);
930
931                 path->locks[level] = 0;
932                 path->nodes[level] = NULL;
933                 clean_tree_block(trans, root, mid);
934                 btrfs_tree_unlock(mid);
935                 /* once for the path */
936                 free_extent_buffer(mid);
937
938                 root_sub_used(root, mid->len);
939                 btrfs_free_tree_block(trans, root, mid, 0, 1);
940                 /* once for the root ptr */
941                 free_extent_buffer(mid);
942                 return 0;
943         }
944         if (btrfs_header_nritems(mid) >
945             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
946                 return 0;
947
948         btrfs_header_nritems(mid);
949
950         left = read_node_slot(root, parent, pslot - 1);
951         if (left) {
952                 btrfs_tree_lock(left);
953                 btrfs_set_lock_blocking(left);
954                 wret = btrfs_cow_block(trans, root, left,
955                                        parent, pslot - 1, &left);
956                 if (wret) {
957                         ret = wret;
958                         goto enospc;
959                 }
960         }
961         right = read_node_slot(root, parent, pslot + 1);
962         if (right) {
963                 btrfs_tree_lock(right);
964                 btrfs_set_lock_blocking(right);
965                 wret = btrfs_cow_block(trans, root, right,
966                                        parent, pslot + 1, &right);
967                 if (wret) {
968                         ret = wret;
969                         goto enospc;
970                 }
971         }
972
973         /* first, try to make some room in the middle buffer */
974         if (left) {
975                 orig_slot += btrfs_header_nritems(left);
976                 wret = push_node_left(trans, root, left, mid, 1);
977                 if (wret < 0)
978                         ret = wret;
979                 btrfs_header_nritems(mid);
980         }
981
982         /*
983          * then try to empty the right most buffer into the middle
984          */
985         if (right) {
986                 wret = push_node_left(trans, root, mid, right, 1);
987                 if (wret < 0 && wret != -ENOSPC)
988                         ret = wret;
989                 if (btrfs_header_nritems(right) == 0) {
990                         clean_tree_block(trans, root, right);
991                         btrfs_tree_unlock(right);
992                         wret = del_ptr(trans, root, path, level + 1, pslot +
993                                        1);
994                         if (wret)
995                                 ret = wret;
996                         root_sub_used(root, right->len);
997                         btrfs_free_tree_block(trans, root, right, 0, 1);
998                         free_extent_buffer(right);
999                         right = NULL;
1000                 } else {
1001                         struct btrfs_disk_key right_key;
1002                         btrfs_node_key(right, &right_key, 0);
1003                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1004                         btrfs_mark_buffer_dirty(parent);
1005                 }
1006         }
1007         if (btrfs_header_nritems(mid) == 1) {
1008                 /*
1009                  * we're not allowed to leave a node with one item in the
1010                  * tree during a delete.  A deletion from lower in the tree
1011                  * could try to delete the only pointer in this node.
1012                  * So, pull some keys from the left.
1013                  * There has to be a left pointer at this point because
1014                  * otherwise we would have pulled some pointers from the
1015                  * right
1016                  */
1017                 BUG_ON(!left);
1018                 wret = balance_node_right(trans, root, mid, left);
1019                 if (wret < 0) {
1020                         ret = wret;
1021                         goto enospc;
1022                 }
1023                 if (wret == 1) {
1024                         wret = push_node_left(trans, root, left, mid, 1);
1025                         if (wret < 0)
1026                                 ret = wret;
1027                 }
1028                 BUG_ON(wret == 1);
1029         }
1030         if (btrfs_header_nritems(mid) == 0) {
1031                 clean_tree_block(trans, root, mid);
1032                 btrfs_tree_unlock(mid);
1033                 wret = del_ptr(trans, root, path, level + 1, pslot);
1034                 if (wret)
1035                         ret = wret;
1036                 root_sub_used(root, mid->len);
1037                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1038                 free_extent_buffer(mid);
1039                 mid = NULL;
1040         } else {
1041                 /* update the parent key to reflect our changes */
1042                 struct btrfs_disk_key mid_key;
1043                 btrfs_node_key(mid, &mid_key, 0);
1044                 btrfs_set_node_key(parent, &mid_key, pslot);
1045                 btrfs_mark_buffer_dirty(parent);
1046         }
1047
1048         /* update the path */
1049         if (left) {
1050                 if (btrfs_header_nritems(left) > orig_slot) {
1051                         extent_buffer_get(left);
1052                         /* left was locked after cow */
1053                         path->nodes[level] = left;
1054                         path->slots[level + 1] -= 1;
1055                         path->slots[level] = orig_slot;
1056                         if (mid) {
1057                                 btrfs_tree_unlock(mid);
1058                                 free_extent_buffer(mid);
1059                         }
1060                 } else {
1061                         orig_slot -= btrfs_header_nritems(left);
1062                         path->slots[level] = orig_slot;
1063                 }
1064         }
1065         /* double check we haven't messed things up */
1066         if (orig_ptr !=
1067             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1068                 BUG();
1069 enospc:
1070         if (right) {
1071                 btrfs_tree_unlock(right);
1072                 free_extent_buffer(right);
1073         }
1074         if (left) {
1075                 if (path->nodes[level] != left)
1076                         btrfs_tree_unlock(left);
1077                 free_extent_buffer(left);
1078         }
1079         return ret;
1080 }
1081
1082 /* Node balancing for insertion.  Here we only split or push nodes around
1083  * when they are completely full.  This is also done top down, so we
1084  * have to be pessimistic.
1085  */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, int level)
{
        struct extent_buffer *right = NULL;
        struct extent_buffer *mid;
        struct extent_buffer *left = NULL;
        struct extent_buffer *parent = NULL;
        int ret = 0;
        int wret;
        int pslot;
        int orig_slot = path->slots[level];

        if (level == 0)
                return 1;

        mid = path->nodes[level];
        WARN_ON(btrfs_header_generation(mid) != trans->transid);

        if (level < BTRFS_MAX_LEVEL - 1)
                parent = path->nodes[level + 1];
        pslot = path->slots[level + 1];

        /* nothing to push into when mid is the root */
        if (!parent)
                return 1;

        left = read_node_slot(root, parent, pslot - 1);

        /* first, try to make some room in the middle buffer */
        if (left) {
                u32 left_nr;

                btrfs_tree_lock(left);
                btrfs_set_lock_blocking(left);

                left_nr = btrfs_header_nritems(left);
                /* don't bother pushing into an already (nearly) full node */
                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, left, parent,
                                              pslot - 1, &left);
                        if (ret)
                                wret = 1;
                        else {
                                wret = push_node_left(trans, root,
                                                      left, mid, 0);
                        }
                }
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        /* items moved left; fix mid's key in the parent */
                        struct btrfs_disk_key disk_key;
                        orig_slot += left_nr;
                        btrfs_node_key(mid, &disk_key, 0);
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(parent);
                        /* point the path at whichever node now holds our slot */
                        if (btrfs_header_nritems(left) > orig_slot) {
                                path->nodes[level] = left;
                                path->slots[level + 1] -= 1;
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                orig_slot -=
                                        btrfs_header_nritems(left);
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(left);
                                free_extent_buffer(left);
                        }
                        return 0;
                }
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
        }
        right = read_node_slot(root, parent, pslot + 1);

        /*
         * then try to empty the right most buffer into the middle
         */
        if (right) {
                u32 right_nr;

                btrfs_tree_lock(right);
                btrfs_set_lock_blocking(right);

                right_nr = btrfs_header_nritems(right);
                /* don't bother pushing into an already (nearly) full node */
                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, right,
                                              parent, pslot + 1,
                                              &right);
                        if (ret)
                                wret = 1;
                        else {
                                wret = balance_node_right(trans, root,
                                                          right, mid);
                        }
                }
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        /* items moved right; fix right's key in the parent */
                        struct btrfs_disk_key disk_key;

                        btrfs_node_key(right, &disk_key, 0);
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);

                        /* point the path at whichever node now holds our slot */
                        if (btrfs_header_nritems(mid) <= orig_slot) {
                                path->nodes[level] = right;
                                path->slots[level + 1] += 1;
                                path->slots[level] = orig_slot -
                                        btrfs_header_nritems(mid);
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                btrfs_tree_unlock(right);
                                free_extent_buffer(right);
                        }
                        return 0;
                }
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        return 1;
}
1212
1213 /*
1214  * readahead one full node of leaves, finding things that are close
1215  * to the block in 'slot', and triggering ra on them.
1216  */
1217 static void reada_for_search(struct btrfs_root *root,
1218                              struct btrfs_path *path,
1219                              int level, int slot, u64 objectid)
1220 {
1221         struct extent_buffer *node;
1222         struct btrfs_disk_key disk_key;
1223         u32 nritems;
1224         u64 search;
1225         u64 target;
1226         u64 nread = 0;
1227         int direction = path->reada;
1228         struct extent_buffer *eb;
1229         u32 nr;
1230         u32 blocksize;
1231         u32 nscan = 0;
1232
1233         if (level != 1)
1234                 return;
1235
1236         if (!path->nodes[level])
1237                 return;
1238
1239         node = path->nodes[level];
1240
1241         search = btrfs_node_blockptr(node, slot);
1242         blocksize = btrfs_level_size(root, level - 1);
1243         eb = btrfs_find_tree_block(root, search, blocksize);
1244         if (eb) {
1245                 free_extent_buffer(eb);
1246                 return;
1247         }
1248
1249         target = search;
1250
1251         nritems = btrfs_header_nritems(node);
1252         nr = slot;
1253         while (1) {
1254                 if (direction < 0) {
1255                         if (nr == 0)
1256                                 break;
1257                         nr--;
1258                 } else if (direction > 0) {
1259                         nr++;
1260                         if (nr >= nritems)
1261                                 break;
1262                 }
1263                 if (path->reada < 0 && objectid) {
1264                         btrfs_node_key(node, &disk_key, nr);
1265                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1266                                 break;
1267                 }
1268                 search = btrfs_node_blockptr(node, nr);
1269                 if ((search <= target && target - search <= 65536) ||
1270                     (search > target && search - target <= 65536)) {
1271                         readahead_tree_block(root, search, blocksize,
1272                                      btrfs_node_ptr_generation(node, nr));
1273                         nread += blocksize;
1274                 }
1275                 nscan++;
1276                 if ((nread > 65536 || nscan > 32))
1277                         break;
1278         }
1279 }
1280
1281 /*
1282  * returns -EAGAIN if it had to drop the path, or zero if everything was in
1283  * cache
1284  */
1285 static noinline int reada_for_balance(struct btrfs_root *root,
1286                                       struct btrfs_path *path, int level)
1287 {
1288         int slot;
1289         int nritems;
1290         struct extent_buffer *parent;
1291         struct extent_buffer *eb;
1292         u64 gen;
1293         u64 block1 = 0;
1294         u64 block2 = 0;
1295         int ret = 0;
1296         int blocksize;
1297
1298         parent = path->nodes[level + 1];
1299         if (!parent)
1300                 return 0;
1301
1302         nritems = btrfs_header_nritems(parent);
1303         slot = path->slots[level + 1];
1304         blocksize = btrfs_level_size(root, level);
1305
1306         if (slot > 0) {
1307                 block1 = btrfs_node_blockptr(parent, slot - 1);
1308                 gen = btrfs_node_ptr_generation(parent, slot - 1);
1309                 eb = btrfs_find_tree_block(root, block1, blocksize);
1310                 if (eb && btrfs_buffer_uptodate(eb, gen))
1311                         block1 = 0;
1312                 free_extent_buffer(eb);
1313         }
1314         if (slot + 1 < nritems) {
1315                 block2 = btrfs_node_blockptr(parent, slot + 1);
1316                 gen = btrfs_node_ptr_generation(parent, slot + 1);
1317                 eb = btrfs_find_tree_block(root, block2, blocksize);
1318                 if (eb && btrfs_buffer_uptodate(eb, gen))
1319                         block2 = 0;
1320                 free_extent_buffer(eb);
1321         }
1322         if (block1 || block2) {
1323                 ret = -EAGAIN;
1324
1325                 /* release the whole path */
1326                 btrfs_release_path(root, path);
1327
1328                 /* read the blocks */
1329                 if (block1)
1330                         readahead_tree_block(root, block1, blocksize, 0);
1331                 if (block2)
1332                         readahead_tree_block(root, block2, blocksize, 0);
1333
1334                 if (block1) {
1335                         eb = read_tree_block(root, block1, blocksize, 0);
1336                         free_extent_buffer(eb);
1337                 }
1338                 if (block2) {
1339                         eb = read_tree_block(root, block2, blocksize, 0);
1340                         free_extent_buffer(eb);
1341                 }
1342         }
1343         return ret;
1344 }
1345
1346
1347 /*
1348  * when we walk down the tree, it is usually safe to unlock the higher layers
1349  * in the tree.  The exceptions are when our path goes through slot 0, because
1350  * operations on the tree might require changing key pointers higher up in the
1351  * tree.
1352  *
1353  * callers might also have set path->keep_locks, which tells this code to keep
1354  * the lock if the path points to the last slot in the block.  This is part of
1355  * walking through the tree, and selecting the next slot in the higher block.
1356  *
1357  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1358  * if lowest_unlock is 1, level 0 won't be unlocked
1359  */
static noinline void unlock_up(struct btrfs_path *path, int level,
                               int lowest_unlock)
{
        int i;
        /* levels at or below skip_level are kept locked */
        int skip_level = level;
        /* set once we've passed a level that was safe to unlock */
        int no_skips = 0;
        struct extent_buffer *t;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                if (!path->nodes[i])
                        break;
                if (!path->locks[i])
                        break;
                /*
                 * slot 0 may require key updates in the parent, so keep
                 * this level (and the one above) locked
                 */
                if (!no_skips && path->slots[i] == 0) {
                        skip_level = i + 1;
                        continue;
                }
                /*
                 * keep_locks: hold the lock when the path points at the
                 * last slot in the block (walking to the next slot may
                 * need the parent)
                 */
                if (!no_skips && path->keep_locks) {
                        u32 nritems;
                        t = path->nodes[i];
                        nritems = btrfs_header_nritems(t);
                        if (nritems < 1 || path->slots[i] >= nritems - 1) {
                                skip_level = i + 1;
                                continue;
                        }
                }
                if (skip_level < i && i >= lowest_unlock)
                        no_skips = 1;

                t = path->nodes[i];
                if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
                        btrfs_tree_unlock(t);
                        path->locks[i] = 0;
                }
        }
}
1396
1397 /*
1398  * This releases any locks held in the path starting at level and
1399  * going all the way up to the root.
1400  *
1401  * btrfs_search_slot will keep the lock held on higher nodes in a few
1402  * corner cases, such as COW of the block at slot zero in the node.  This
1403  * ignores those rules, and it should only be called when there are no
1404  * more updates to be done higher up in the tree.
1405  */
1406 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1407 {
1408         int i;
1409
1410         if (path->keep_locks)
1411                 return;
1412
1413         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1414                 if (!path->nodes[i])
1415                         continue;
1416                 if (!path->locks[i])
1417                         continue;
1418                 btrfs_tree_unlock(path->nodes[i]);
1419                 path->locks[i] = 0;
1420         }
1421 }
1422
1423 /*
1424  * helper function for btrfs_search_slot.  The goal is to find a block
1425  * in cache without setting the path to blocking.  If we find the block
1426  * we return zero and the path is unchanged.
1427  *
1428  * If we can't find the block, we set the path blocking and do some
1429  * reada.  -EAGAIN is returned and the search must be repeated.
1430  */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer **eb_ret, int level, int slot,
                       struct btrfs_key *key)
{
        u64 blocknr;
        u64 gen;
        u32 blocksize;
        struct extent_buffer *b = *eb_ret;
        struct extent_buffer *tmp;
        int ret;

        blocknr = btrfs_node_blockptr(b, slot);
        gen = btrfs_node_ptr_generation(b, slot);
        blocksize = btrfs_level_size(root, level - 1);

        /* fast path: is the child block already in the cache? */
        tmp = btrfs_find_tree_block(root, blocknr, blocksize);
        if (tmp) {
                if (btrfs_buffer_uptodate(tmp, 0)) {
                        if (btrfs_buffer_uptodate(tmp, gen)) {
                                /*
                                 * we found an up to date block without
                                 * sleeping, return
                                 * right away
                                 */
                                *eb_ret = tmp;
                                return 0;
                        }
                        /* the pages were up to date, but we failed
                         * the generation number check.  Do a full
                         * read for the generation number that is correct.
                         * We must do this without dropping locks so
                         * we can trust our generation number
                         */
                        free_extent_buffer(tmp);
                        tmp = read_tree_block(root, blocknr, blocksize, gen);
                        if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
                                *eb_ret = tmp;
                                return 0;
                        }
                        /* still stale after the read: give up with -EIO */
                        free_extent_buffer(tmp);
                        btrfs_release_path(NULL, p);
                        return -EIO;
                }
        }

        /*
         * reduce lock contention at high levels
         * of the btree by dropping locks before
         * we read.  Don't release the lock on the current
         * level because we need to walk this node to figure
         * out which blocks to read.
         */
        btrfs_unlock_up_safe(p, level + 1);
        btrfs_set_path_blocking(p);

        /* drop the ref from the (not up to date) cached buffer, if any */
        free_extent_buffer(tmp);
        if (p->reada)
                reada_for_search(root, p, level, slot, key->objectid);

        /* the search must restart from scratch after this */
        btrfs_release_path(NULL, p);

        ret = -EAGAIN;
        tmp = read_tree_block(root, blocknr, blocksize, 0);
        if (tmp) {
                /*
                 * If the read above didn't mark this buffer up to date,
                 * it will never end up being up to date.  Set ret to EIO now
                 * and give up so that our caller doesn't loop forever
                 * on our EAGAINs.
                 */
                if (!btrfs_buffer_uptodate(tmp, 0))
                        ret = -EIO;
                free_extent_buffer(tmp);
        }
        return ret;
}
1509
1510 /*
1511  * helper function for btrfs_search_slot.  This does all of the checks
1512  * for node-level blocks and does any balancing required based on
1513  * the ins_len.
1514  *
1515  * If no extra work was required, zero is returned.  If we had to
1516  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1517  * start over
1518  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer *b, int level, int ins_len)
{
        int ret;
        /* node nearly full and we may insert: split it */
        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
            BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
                int sret;

                /* reada_for_balance dropped the path: restart with -EAGAIN */
                sret = reada_for_balance(root, p, level);
                if (sret)
                        goto again;

                btrfs_set_path_blocking(p);
                sret = split_node(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL);

                BUG_ON(sret > 0);
                if (sret) {
                        ret = sret;
                        goto done;
                }
                /*
                 * NOTE(review): this updates only the local 'b'; presumably
                 * the caller re-reads p->nodes[level] after we return —
                 * confirm against btrfs_search_slot
                 */
                b = p->nodes[level];
        } else if (ins_len < 0 && btrfs_header_nritems(b) <
                   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
                /* node less than half full and we may delete: rebalance it */
                int sret;

                /* reada_for_balance dropped the path: restart with -EAGAIN */
                sret = reada_for_balance(root, p, level);
                if (sret)
                        goto again;

                btrfs_set_path_blocking(p);
                sret = balance_level(trans, root, p, level);
                btrfs_clear_path_blocking(p, NULL);

                if (sret) {
                        ret = sret;
                        goto done;
                }
                b = p->nodes[level];
                /* balance_level emptied this node entirely: restart */
                if (!b) {
                        btrfs_release_path(NULL, p);
                        goto again;
                }
                BUG_ON(btrfs_header_nritems(b) == 1);
        }
        return 0;

again:
        ret = -EAGAIN;
done:
        return ret;
}
1573
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
 *
 * @trans: running transaction, required when cow != 0
 * @root:  tree to search
 * @key:   key to look for
 * @p:     path to fill in; caller releases it
 * @cow:   nonzero to COW every block we descend through
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);

	/*
	 * deletions need the parent of the leaf locked too so we can
	 * fix up keys/pointers after removing items, hence unlock only
	 * down to level 2 instead of level 1
	 */
	if (ins_len < 0)
		lowest_unlock = 2;

again:
	if (p->search_commit_root) {
		/* search the last committed tree instead of the live one */
		b = root->commit_root;
		extent_buffer_get(b);
		if (!p->skip_locking)
			btrfs_tree_lock(b);
	} else {
		if (p->skip_locking)
			b = btrfs_root_node(root);
		else
			b = btrfs_lock_root_node(root);
	}

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			btrfs_set_path_blocking(p);

			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		BUG_ON(!cow && ins_len);
		if (level != btrfs_header_level(b))
			WARN_ON(1);
		level = btrfs_header_level(b);

		/* re-record b: btrfs_cow_block may have swapped the buffer */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

		btrfs_clear_path_blocking(p, NULL);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * no exact match: step back to the slot whose key
			 * is <= the search key; remember so lowest_level
			 * callers get the insertion slot back
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
						     ins_len);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			unlock_up(p, level, lowest_unlock);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				/*
				 * try the cheap spinning lock first; only
				 * fall back to a blocking lock on contention
				 */
				btrfs_clear_path_blocking(p, NULL);
				err = btrfs_try_spin_lock(b);

				if (!err) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
					btrfs_clear_path_blocking(p, b);
				}
			}
		} else {
			/* leaf level: make room if an insert is coming */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(root, p);
	return ret;
}
1754
1755 /*
1756  * adjust the pointers going up the tree, starting at level
1757  * making sure the right key of each node is points to 'key'.
1758  * This is used after shifting pointers to the left, so it stops
1759  * fixing up pointers when a given leaf/node is not in slot 0 of the
1760  * higher levels
1761  *
1762  * If this fails to write a tree block, it returns -1, but continues
1763  * fixing up the blocks in ram so the tree is consistent.
1764  */
1765 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1766                           struct btrfs_root *root, struct btrfs_path *path,
1767                           struct btrfs_disk_key *key, int level)
1768 {
1769         int i;
1770         int ret = 0;
1771         struct extent_buffer *t;
1772
1773         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1774                 int tslot = path->slots[i];
1775                 if (!path->nodes[i])
1776                         break;
1777                 t = path->nodes[i];
1778                 btrfs_set_node_key(t, key, tslot);
1779                 btrfs_mark_buffer_dirty(path->nodes[i]);
1780                 if (tslot != 0)
1781                         break;
1782         }
1783         return ret;
1784 }
1785
1786 /*
1787  * update item key.
1788  *
1789  * This function isn't completely safe. It's the caller's responsibility
1790  * that the new key won't break the order
1791  */
1792 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1793                             struct btrfs_root *root, struct btrfs_path *path,
1794                             struct btrfs_key *new_key)
1795 {
1796         struct btrfs_disk_key disk_key;
1797         struct extent_buffer *eb;
1798         int slot;
1799
1800         eb = path->nodes[0];
1801         slot = path->slots[0];
1802         if (slot > 0) {
1803                 btrfs_item_key(eb, &disk_key, slot - 1);
1804                 if (comp_keys(&disk_key, new_key) >= 0)
1805                         return -1;
1806         }
1807         if (slot < btrfs_header_nritems(eb) - 1) {
1808                 btrfs_item_key(eb, &disk_key, slot + 1);
1809                 if (comp_keys(&disk_key, new_key) <= 0)
1810                         return -1;
1811         }
1812
1813         btrfs_cpu_key_to_disk(&disk_key, new_key);
1814         btrfs_set_item_key(eb, &disk_key, slot);
1815         btrfs_mark_buffer_dirty(eb);
1816         if (slot == 0)
1817                 fixup_low_keys(trans, root, path, &disk_key, 1);
1818         return 0;
1819 }
1820
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * @dst:   the left sibling receiving pointers (must be this transaction's)
 * @src:   the right node pointers are taken from
 * @empty: nonzero when the caller wants src drained completely
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* room left in dst is the most we could ever push */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	/* both buffers must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* not worth rebalancing a node that is already nearly empty */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* append the first push_items pointers of src onto the end of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	/* slide the remainder of src down to slot 0 */
	if (push_items < src_nritems) {
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
1882
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * @dst: the right sibling receiving pointers
 * @src: the left node pointers are taken from the tail of
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both buffers must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* room left in dst bounds how much we can push */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* too small to be worth balancing */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing pointers right to make room at slot 0 */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* move the tail of src into the hole at the front of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
1941
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * On success the path gains an entry at 'level' holding an extra
 * reference and the lock on the new root node.
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	/* the current root must sit directly below the level we create */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the first key of the old root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, &lower_key,
				   level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header and identity fields of the new root */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

	/* single pointer: key + blockptr + generation of the old root */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* publish the new root; readers may be walking root->node */
	old = root->node;
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = 1;
	path->slots[level] = 0;
	return 0;
}
2014
2015 /*
2016  * worker function to insert a single pointer in a node.
2017  * the node should have enough room for the pointer already
2018  *
2019  * slot and level indicate where you want the key to go, and
2020  * blocknr is the block the key points to.
2021  *
2022  * returns zero on success and < 0 on any error
2023  */
2024 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2025                       *root, struct btrfs_path *path, struct btrfs_disk_key
2026                       *key, u64 bytenr, int slot, int level)
2027 {
2028         struct extent_buffer *lower;
2029         int nritems;
2030
2031         BUG_ON(!path->nodes[level]);
2032         btrfs_assert_tree_locked(path->nodes[level]);
2033         lower = path->nodes[level];
2034         nritems = btrfs_header_nritems(lower);
2035         BUG_ON(slot > nritems);
2036         if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2037                 BUG();
2038         if (slot != nritems) {
2039                 memmove_extent_buffer(lower,
2040                               btrfs_node_key_ptr_offset(slot + 1),
2041                               btrfs_node_key_ptr_offset(slot),
2042                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2043         }
2044         btrfs_set_node_key(lower, key, slot);
2045         btrfs_set_node_blockptr(lower, slot, bytenr);
2046         WARN_ON(trans->transid == 0);
2047         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2048         btrfs_set_header_nritems(lower, nritems + 1);
2049         btrfs_mark_buffer_dirty(lower);
2050         return 0;
2051 }
2052
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	int wret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/* trying to split the root, lets make a new one */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try pushing into siblings first; a split may be avoidable */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* upper half of c, starting at mid, moves to the new block */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
					root->root_key.objectid,
					&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the new node's header and identity fields */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);


	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, just after c */
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
			  path->slots[level + 1] + 1,
			  level + 1);
	if (wret)
		ret = wret;

	/*
	 * fix the path: keep whichever half now contains our slot,
	 * and drop the lock and reference on the other one
	 */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
2146
2147 /*
2148  * how many bytes are required to store the items in a leaf.  start
2149  * and nr indicate which items in the leaf to check.  This totals up the
2150  * space used both by the item structs and the item data
2151  */
2152 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2153 {
2154         int data_len;
2155         int nritems = btrfs_header_nritems(l);
2156         int end = min(nritems, start + nr) - 1;
2157
2158         if (!nr)
2159                 return 0;
2160         data_len = btrfs_item_end_nr(l, start);
2161         data_len = data_len - btrfs_item_offset_nr(l, end);
2162         data_len += sizeof(struct btrfs_item) * nr;
2163         WARN_ON(data_len < 0);
2164         return data_len;
2165 }
2166
2167 /*
2168  * The space between the end of the leaf items and
2169  * the start of the leaf data.  IOW, how much room
2170  * the leaf has left for both items and data
2171  */
2172 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2173                                    struct extent_buffer *leaf)
2174 {
2175         int nritems = btrfs_header_nritems(leaf);
2176         int ret;
2177         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2178         if (ret < 0) {
2179                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2180                        "used %d nritems %d\n",
2181                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2182                        leaf_space_used(leaf, 0, nritems), nritems);
2183         }
2184         return ret;
2185 }
2186
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * @right:       the already-locked right sibling leaf (unlocked/freed here
 *               on the failure path, or when the path keeps the left leaf)
 * @free_space:  free bytes available in 'right'
 * @left_nritems: item count of the left leaf (path->nodes[0])
 * @empty:       nonzero when the caller wants the left leaf drained
 *
 * returns 0 if items were pushed, 1 if nothing fit
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* if our slot would be pushed out, reserve room for the new item */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	/* walk backwards from the last item, counting what fits in right */
	while (i >= nr) {
		item = btrfs_item_nr(left, i);

		if (!empty && push_items > 0) {
			/* don't push our own slot away unless we must */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		/* cache a kmap of the item region to avoid per-field maps */
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}

	if (push_items == 0)
		goto out_unlock;

	if (!empty && push_items == left_nritems)
		WARN_ON(1);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total data bytes being moved out of the left leaf */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* shift right's item headers to make room at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}
	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(trans, root, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; fix the parent's pointer to it */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
2350
2351 /*
2352  * push some data in the path leaf to the right, trying to free up at
2353  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2354  *
2355  * returns 1 if the push failed because the other node didn't have enough
2356  * room, 0 if everything worked out and < 0 if there were major errors.
2357  *
2358  * this will push starting from min_slot to the end of the leaf.  It won't
2359  * push any slot lower than min_slot
2360  */
2361 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2362                            *root, struct btrfs_path *path,
2363                            int min_data_size, int data_size,
2364                            int empty, u32 min_slot)
2365 {
2366         struct extent_buffer *left = path->nodes[0];
2367         struct extent_buffer *right;
2368         struct extent_buffer *upper;
2369         int slot;
2370         int free_space;
2371         u32 left_nritems;
2372         int ret;
2373
2374         if (!path->nodes[1])
2375                 return 1;
2376
2377         slot = path->slots[1];
2378         upper = path->nodes[1];
2379         if (slot >= btrfs_header_nritems(upper) - 1)
2380                 return 1;
2381
2382         btrfs_assert_tree_locked(path->nodes[1]);
2383
2384         right = read_node_slot(root, upper, slot + 1);
2385         if (right == NULL)
2386                 return 1;
2387
2388         btrfs_tree_lock(right);
2389         btrfs_set_lock_blocking(right);
2390
2391         free_space = btrfs_leaf_free_space(root, right);
2392         if (free_space < data_size)
2393                 goto out_unlock;
2394
2395         /* cow and double check */
2396         ret = btrfs_cow_block(trans, root, right, upper,
2397                               slot + 1, &right);
2398         if (ret)
2399                 goto out_unlock;
2400
2401         free_space = btrfs_leaf_free_space(root, right);
2402         if (free_space < data_size)
2403                 goto out_unlock;
2404
2405         left_nritems = btrfs_header_nritems(left);
2406         if (left_nritems == 0)
2407                 goto out_unlock;
2408
2409         return __push_leaf_right(trans, root, path, min_data_size, empty,
2410                                 right, free_space, left_nritems, min_slot);
2411 out_unlock:
2412         btrfs_tree_unlock(right);
2413         free_extent_buffer(right);
2414         return 1;
2415 }
2416
2417 /*
2418  * push some data in the path leaf to the left, trying to free up at
2419  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2420  *
2421  * max_slot can put a limit on how far into the leaf we'll push items.  The
2422  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2423  * items
2424  */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	int wret;
	u32 this_item_size;
	u32 old_left_item_size;

	/* unless we're emptying the leaf, always keep at least one item */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* figure out how many leading items fit in the left sibling */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(right, i);
		/* map_extent_buffer caches a kmap of the page holding 'item' */
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		if (!empty && push_items > 0) {
			/* never push the slot we are making room in past us */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* reserve data_size when the target slot itself moves over */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	if (!empty && push_items == btrfs_header_nritems(right))
		WARN_ON(1);

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* bytes of item data moving; data grows down from the leaf end */
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the offsets of the items that just arrived on the left */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(left, i);
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}

	/* fixup right node */
	if (push_items > right_nritems) {
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		/* compact the remaining data against the end of the leaf */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		/* and slide the surviving item headers down to slot 0 */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute every remaining offset from the end of the data area */
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(trans, root, right);

	/* the right leaf's first key changed; propagate it up the tree */
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
	if (wret)
		ret = wret;

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* our slot moved into the left leaf; retarget the path */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
2605
2606 /*
2607  * push some data in the path leaf to the left, trying to free up at
2608  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2609  *
2610  * max_slot can put a limit on how far into the leaf we'll push items.  The
2611  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
2612  * items
2613  */
2614 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2615                           *root, struct btrfs_path *path, int min_data_size,
2616                           int data_size, int empty, u32 max_slot)
2617 {
2618         struct extent_buffer *right = path->nodes[0];
2619         struct extent_buffer *left;
2620         int slot;
2621         int free_space;
2622         u32 right_nritems;
2623         int ret = 0;
2624
2625         slot = path->slots[1];
2626         if (slot == 0)
2627                 return 1;
2628         if (!path->nodes[1])
2629                 return 1;
2630
2631         right_nritems = btrfs_header_nritems(right);
2632         if (right_nritems == 0)
2633                 return 1;
2634
2635         btrfs_assert_tree_locked(path->nodes[1]);
2636
2637         left = read_node_slot(root, path->nodes[1], slot - 1);
2638         if (left == NULL)
2639                 return 1;
2640
2641         btrfs_tree_lock(left);
2642         btrfs_set_lock_blocking(left);
2643
2644         free_space = btrfs_leaf_free_space(root, left);
2645         if (free_space < data_size) {
2646                 ret = 1;
2647                 goto out;
2648         }
2649
2650         /* cow and double check */
2651         ret = btrfs_cow_block(trans, root, left,
2652                               path->nodes[1], slot - 1, &left);
2653         if (ret) {
2654                 /* we hit -ENOSPC, but it isn't fatal here */
2655                 ret = 1;
2656                 goto out;
2657         }
2658
2659         free_space = btrfs_leaf_free_space(root, left);
2660         if (free_space < data_size) {
2661                 ret = 1;
2662                 goto out;
2663         }
2664
2665         return __push_leaf_left(trans, root, path, min_data_size,
2666                                empty, left, free_space, right_nritems,
2667                                max_slot);
2668 out:
2669         btrfs_tree_unlock(left);
2670         free_extent_buffer(left);
2671         return ret;
2672 }
2673
/*
 * copy_for_split does the data-movement half of a leaf split: it moves
 * the items from slot 'mid' onward out of leaf 'l' into the freshly
 * allocated, already-locked leaf 'right', links 'right' into the parent
 * node, and re-points the path at whichever leaf now holds the target
 * slot.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int copy_for_split(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path,
                               struct extent_buffer *l,
                               struct extent_buffer *right,
                               int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	int ret = 0;
	int wret;
	struct btrfs_disk_key disk_key;

	/* the right leaf receives everything from slot 'mid' onward */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and the item data, packed against the end of the new leaf */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/* every copied item offset shifts by this much in the new leaf */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
		u32 ioff;

		/* map_extent_buffer caches a kmap of the page holding 'item' */
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	btrfs_set_header_nritems(l, mid);
	ret = 0;
	/* link the new leaf into the parent, right after the old one */
	btrfs_item_key(right, &disk_key, 0);
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
	if (wret)
		ret = wret;

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now holds the target slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);

	return ret;
}
2758
2759 /*
2760  * double splits happen when we need to insert a big item in the middle
2761  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
2762  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2763  *          A                 B                 C
2764  *
2765  * We avoid this by trying to push the items on either side of our target
2766  * into the adjacent leaves.  If all goes well we can avoid the double split
2767  * completely.
2768  */
2769 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2770                                           struct btrfs_root *root,
2771                                           struct btrfs_path *path,
2772                                           int data_size)
2773 {
2774         int ret;
2775         int progress = 0;
2776         int slot;
2777         u32 nritems;
2778
2779         slot = path->slots[0];
2780
2781         /*
2782          * try to push all the items after our slot into the
2783          * right leaf
2784          */
2785         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2786         if (ret < 0)
2787                 return ret;
2788
2789         if (ret == 0)
2790                 progress++;
2791
2792         nritems = btrfs_header_nritems(path->nodes[0]);
2793         /*
2794          * our goal is to get our slot at the start or end of a leaf.  If
2795          * we've done so we're done
2796          */
2797         if (path->slots[0] == 0 || path->slots[0] == nritems)
2798                 return 0;
2799
2800         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2801                 return 0;
2802
2803         /* try to push all the items before our slot into the next leaf */
2804         slot = path->slots[0];
2805         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2806         if (ret < 0)
2807                 return ret;
2808
2809         if (ret == 0)
2810                 progress++;
2811
2812         if (progress)
2813                 return 0;
2814         return 1;
2815 }
2816
2817 /*
2818  * split the path's leaf in two, making sure there is at least data_size
2819  * available for the resulting leaf level of the path.
2820  *
2821  * returns 0 if all went well and < 0 on failure.
2822  */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_key *ins_key,
                               struct btrfs_path *path, int data_size,
                               int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	int ret = 0;
	int wret;
	/* 0: new leaf starts empty, 1: split at mid, 2: needs a second split */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item can never exceed one leaf's entire data area */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size) {
		wret = push_leaf_right(trans, root, path, data_size,
				       data_size, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			wret = push_leaf_left(trans, root, path, data_size,
					      data_size, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(root, l) >= data_size)
			return 0;
	}

	/* splitting the root leaf requires a new root node above it */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/*
	 * choose the split point: adjust 'mid' (and possibly 'split') so
	 * that the half receiving the new item will actually have room
	 */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				/* appending past the end: empty right leaf */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (!extend && data_size && slot == 0) {
				/* inserting before everything: empty left leaf */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2 ;
				}
			}
		}
	}

	/* the new leaf's first key: the insert key when it starts empty */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
					root->root_key.objectid,
					&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, root->leafsize);

	/* initialize the header of the new, empty leaf */
	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);

	if (split == 0) {
		/* no items move: link the empty leaf in, retarget the path */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			wret = insert_ptr(trans, root, path,
					  &disk_key, right->start,
					  path->slots[1] + 1, 1);
			if (wret)
				ret = wret;

			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			wret = insert_ptr(trans, root, path,
					  &disk_key,
					  right->start,
					  path->slots[1], 1);
			if (wret)
				ret = wret;
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			/* new leaf landed in slot 0: propagate its key upward */
			if (path->slots[1] == 0) {
				wret = fixup_low_keys(trans, root,
						path, &disk_key, 1);
				if (wret)
					ret = wret;
			}
		}
		btrfs_mark_buffer_dirty(right);
		return ret;
	}

	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
	BUG_ON(ret);

	/* a double split: go around once more to split the target half */
	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return ret;

push_for_double:
	/* try to dodge the double split by pushing into the neighbors */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
2997
2998 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
2999                                          struct btrfs_root *root,
3000                                          struct btrfs_path *path, int ins_len)
3001 {
3002         struct btrfs_key key;
3003         struct extent_buffer *leaf;
3004         struct btrfs_file_extent_item *fi;
3005         u64 extent_len = 0;
3006         u32 item_size;
3007         int ret;
3008
3009         leaf = path->nodes[0];
3010         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3011
3012         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3013                key.type != BTRFS_EXTENT_CSUM_KEY);
3014
3015         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3016                 return 0;
3017
3018         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3019         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3020                 fi = btrfs_item_ptr(leaf, path->slots[0],
3021                                     struct btrfs_file_extent_item);
3022                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3023         }
3024         btrfs_release_path(root, path);
3025
3026         path->keep_locks = 1;
3027         path->search_for_split = 1;
3028         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3029         path->search_for_split = 0;
3030         if (ret < 0)
3031                 goto err;
3032
3033         ret = -EAGAIN;
3034         leaf = path->nodes[0];
3035         /* if our item isn't there or got smaller, return now */
3036         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3037                 goto err;
3038
3039         /* the leaf has  changed, it now has room.  return now */
3040         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3041                 goto err;
3042
3043         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044                 fi = btrfs_item_ptr(leaf, path->slots[0],
3045                                     struct btrfs_file_extent_item);
3046                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3047                         goto err;
3048         }
3049
3050         btrfs_set_path_blocking(path);
3051         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3052         if (ret)
3053                 goto err;
3054
3055         path->keep_locks = 0;
3056         btrfs_unlock_up_safe(path, 1);
3057         return 0;
3058 err:
3059         path->keep_locks = 0;
3060         return ret;
3061 }
3062
static noinline int split_item(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path,
                               struct btrfs_key *new_key,
                               unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* caller must guarantee room for one more struct btrfs_item */
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	/* kmalloc below may sleep: drop spinning locks first */
	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data; it gets rewritten in two pieces */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* make room for the new item header right after the original */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	/* new item holds the tail of the data: [split_offset, item_size) */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* original item keeps the head: [0, split_offset) */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
3133
3134 /*
3135  * This function splits a single item into two items,
3136  * giving 'new_key' to the new item and splitting the
3137  * old one at split_offset (from the start of the item).
3138  *
3139  * The path may be released by this operation.  After
3140  * the split, the path is pointing to the old item.  The
3141  * new item is going to be in the same node as the old one.
3142  *
3143  * Note, the item being split must be smaller enough to live alone on
3144  * a tree block with room for one extra struct btrfs_item
3145  *
3146  * This allows us to split the item in place, keeping a lock on the
3147  * leaf the entire time.
3148  */
3149 int btrfs_split_item(struct btrfs_trans_handle *trans,
3150                      struct btrfs_root *root,
3151                      struct btrfs_path *path,
3152                      struct btrfs_key *new_key,
3153                      unsigned long split_offset)
3154 {
3155         int ret;
3156         ret = setup_leaf_for_split(trans, root, path,
3157                                    sizeof(struct btrfs_item));
3158         if (ret)
3159                 return ret;
3160
3161         ret = split_item(trans, root, path, new_key, split_offset);
3162         return ret;
3163 }
3164
3165 /*
3166  * This function duplicate a item, giving 'new_key' to the new item.
3167  * It guarantees both items live in the same tree leaf and the new item
3168  * is contiguous with the original item.
3169  *
3170  * This allows us to split file extent in place, keeping a lock on the
3171  * leaf the entire time.
3172  */
3173 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3174                          struct btrfs_root *root,
3175                          struct btrfs_path *path,
3176                          struct btrfs_key *new_key)
3177 {
3178         struct extent_buffer *leaf;
3179         int ret;
3180         u32 item_size;
3181
3182         leaf = path->nodes[0];
3183         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3184         ret = setup_leaf_for_split(trans, root, path,
3185                                    item_size + sizeof(struct btrfs_item));
3186         if (ret)
3187                 return ret;
3188
3189         path->slots[0]++;
3190         ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
3191                                      item_size, item_size +
3192                                      sizeof(struct btrfs_item), 1);
3193         BUG_ON(ret);
3194
3195         leaf = path->nodes[0];
3196         memcpy_extent_buffer(leaf,
3197                              btrfs_item_ptr_offset(leaf, path->slots[0]),
3198                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3199                              item_size);
3200         return 0;
3201 }
3202
3203 /*
3204  * make the item pointed to by the path smaller.  new_size indicates
3205  * how small to make it, and from_end tells us if we just chop bytes
3206  * off the end of the item or if we shift the item to chop bytes off
3207  * the front.
3208  */
3209 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3210                         struct btrfs_root *root,
3211                         struct btrfs_path *path,
3212                         u32 new_size, int from_end)
3213 {
3214         int ret = 0;
3215         int slot;
3216         struct extent_buffer *leaf;
3217         struct btrfs_item *item;
3218         u32 nritems;
3219         unsigned int data_end;
3220         unsigned int old_data_start;
3221         unsigned int old_size;
3222         unsigned int size_diff;
3223         int i;
3224
3225         leaf = path->nodes[0];
3226         slot = path->slots[0];
3227
3228         old_size = btrfs_item_size_nr(leaf, slot);
3229         if (old_size == new_size)
3230                 return 0;
3231
3232         nritems = btrfs_header_nritems(leaf);
3233         data_end = leaf_data_end(root, leaf);
3234
3235         old_data_start = btrfs_item_offset_nr(leaf, slot);
3236
3237         size_diff = old_size - new_size;
3238
3239         BUG_ON(slot < 0);
3240         BUG_ON(slot >= nritems);
3241
3242         /*
3243          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3244          */
3245         /* first correct the data pointers */
3246         for (i = slot; i < nritems; i++) {
3247                 u32 ioff;
3248                 item = btrfs_item_nr(leaf, i);
3249
3250                 if (!leaf->map_token) {
3251                         map_extent_buffer(leaf, (unsigned long)item,
3252                                         sizeof(struct btrfs_item),
3253                                         &leaf->map_token, &leaf->kaddr,
3254                                         &leaf->map_start, &leaf->map_len,
3255                                         KM_USER1);
3256                 }
3257
3258                 ioff = btrfs_item_offset(leaf, item);
3259                 btrfs_set_item_offset(leaf, item, ioff + size_diff);
3260         }
3261
3262         if (leaf->map_token) {
3263                 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3264                 leaf->map_token = NULL;
3265         }
3266
3267         /* shift the data */
3268         if (from_end) {
3269                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3270                               data_end + size_diff, btrfs_leaf_data(leaf) +
3271                               data_end, old_data_start + new_size - data_end);
3272         } else {
3273                 struct btrfs_disk_key disk_key;
3274                 u64 offset;
3275
3276                 btrfs_item_key(leaf, &disk_key, slot);
3277
3278                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3279                         unsigned long ptr;
3280                         struct btrfs_file_extent_item *fi;
3281
3282                         fi = btrfs_item_ptr(leaf, slot,
3283                                             struct btrfs_file_extent_item);
3284                         fi = (struct btrfs_file_extent_item *)(
3285                              (unsigned long)fi - size_diff);
3286
3287                         if (btrfs_file_extent_type(leaf, fi) ==
3288                             BTRFS_FILE_EXTENT_INLINE) {
3289                                 ptr = btrfs_item_ptr_offset(leaf, slot);
3290                                 memmove_extent_buffer(leaf, ptr,
3291                                       (unsigned long)fi,
3292                                       offsetof(struct btrfs_file_extent_item,
3293                                                  disk_bytenr));
3294                         }
3295                 }
3296
3297                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3298                               data_end + size_diff, btrfs_leaf_data(leaf) +
3299                               data_end, old_data_start - data_end);
3300
3301                 offset = btrfs_disk_key_offset(&disk_key);
3302                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3303                 btrfs_set_item_key(leaf, &disk_key, slot);
3304                 if (slot == 0)
3305                         fixup_low_keys(trans, root, path, &disk_key, 1);
3306         }
3307
3308         item = btrfs_item_nr(leaf, slot);
3309         btrfs_set_item_size(leaf, item, new_size);
3310         btrfs_mark_buffer_dirty(leaf);
3311
3312         ret = 0;
3313         if (btrfs_leaf_free_space(root, leaf) < 0) {
3314                 btrfs_print_leaf(root, leaf);
3315                 BUG();
3316         }
3317         return ret;
3318 }
3319
3320 /*
3321  * make the item pointed to by the path bigger, data_size is the new size.
3322  */
3323 int btrfs_extend_item(struct btrfs_trans_handle *trans,
3324                       struct btrfs_root *root, struct btrfs_path *path,
3325                       u32 data_size)
3326 {
3327         int ret = 0;
3328         int slot;
3329         struct extent_buffer *leaf;
3330         struct btrfs_item *item;
3331         u32 nritems;
3332         unsigned int data_end;
3333         unsigned int old_data;
3334         unsigned int old_size;
3335         int i;
3336
3337         leaf = path->nodes[0];
3338
3339         nritems = btrfs_header_nritems(leaf);
3340         data_end = leaf_data_end(root, leaf);
3341
3342         if (btrfs_leaf_free_space(root, leaf) < data_size) {
3343                 btrfs_print_leaf(root, leaf);
3344                 BUG();
3345         }
3346         slot = path->slots[0];
3347         old_data = btrfs_item_end_nr(leaf, slot);
3348
3349         BUG_ON(slot < 0);
3350         if (slot >= nritems) {
3351                 btrfs_print_leaf(root, leaf);
3352                 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3353                        slot, nritems);
3354                 BUG_ON(1);
3355         }
3356
3357         /*
3358          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3359          */
3360         /* first correct the data pointers */
3361         for (i = slot; i < nritems; i++) {
3362                 u32 ioff;
3363                 item = btrfs_item_nr(leaf, i);
3364
3365                 if (!leaf->map_token) {
3366                         map_extent_buffer(leaf, (unsigned long)item,
3367                                         sizeof(struct btrfs_item),
3368                                         &leaf->map_token, &leaf->kaddr,
3369                                         &leaf->map_start, &leaf->map_len,
3370                                         KM_USER1);
3371                 }
3372                 ioff = btrfs_item_offset(leaf, item);
3373                 btrfs_set_item_offset(leaf, item, ioff - data_size);
3374         }
3375
3376         if (leaf->map_token) {
3377                 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3378                 leaf->map_token = NULL;
3379         }
3380
3381         /* shift the data */
3382         memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3383                       data_end - data_size, btrfs_leaf_data(leaf) +
3384                       data_end, old_data - data_end);
3385
3386         data_end = old_data;
3387         old_size = btrfs_item_size_nr(leaf, slot);
3388         item = btrfs_item_nr(leaf, slot);
3389         btrfs_set_item_size(leaf, item, old_size + data_size);
3390         btrfs_mark_buffer_dirty(leaf);
3391
3392         ret = 0;
3393         if (btrfs_leaf_free_space(root, leaf) < 0) {
3394                 btrfs_print_leaf(root, leaf);
3395                 BUG();
3396         }
3397         return ret;
3398 }
3399
3400 /*
3401  * Given a key and some data, insert items into the tree.
3402  * This does all the path init required, making room in the tree if needed.
3403  * Returns the number of keys that were inserted.
3404  */
3405 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3406                             struct btrfs_root *root,
3407                             struct btrfs_path *path,
3408                             struct btrfs_key *cpu_key, u32 *data_size,
3409                             int nr)
3410 {
3411         struct extent_buffer *leaf;
3412         struct btrfs_item *item;
3413         int ret = 0;
3414         int slot;
3415         int i;
3416         u32 nritems;
3417         u32 total_data = 0;
3418         u32 total_size = 0;
3419         unsigned int data_end;
3420         struct btrfs_disk_key disk_key;
3421         struct btrfs_key found_key;
3422
3423         for (i = 0; i < nr; i++) {
3424                 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3425                     BTRFS_LEAF_DATA_SIZE(root)) {
3426                         break;
3427                         nr = i;
3428                 }
3429                 total_data += data_size[i];
3430                 total_size += data_size[i] + sizeof(struct btrfs_item);
3431         }
3432         BUG_ON(nr == 0);
3433
3434         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3435         if (ret == 0)
3436                 return -EEXIST;
3437         if (ret < 0)
3438                 goto out;
3439
3440         leaf = path->nodes[0];
3441
3442         nritems = btrfs_header_nritems(leaf);
3443         data_end = leaf_data_end(root, leaf);
3444
3445         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3446                 for (i = nr; i >= 0; i--) {
3447                         total_data -= data_size[i];
3448                         total_size -= data_size[i] + sizeof(struct btrfs_item);
3449                         if (total_size < btrfs_leaf_free_space(root, leaf))
3450                                 break;
3451                 }
3452                 nr = i;
3453         }
3454
3455         slot = path->slots[0];
3456         BUG_ON(slot < 0);
3457
3458         if (slot != nritems) {
3459                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3460
3461                 item = btrfs_item_nr(leaf, slot);
3462                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3463
3464                 /* figure out how many keys we can insert in here */
3465                 total_data = data_size[0];
3466                 for (i = 1; i < nr; i++) {
3467                         if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3468                                 break;
3469                         total_data += data_size[i];
3470                 }
3471                 nr = i;
3472
3473                 if (old_data < data_end) {
3474                         btrfs_print_leaf(root, leaf);
3475                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3476                                slot, old_data, data_end);
3477                         BUG_ON(1);
3478                 }
3479                 /*
3480                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3481                  */
3482                 /* first correct the data pointers */
3483                 WARN_ON(leaf->map_token);
3484                 for (i = slot; i < nritems; i++) {
3485                         u32 ioff;
3486
3487                         item = btrfs_item_nr(leaf, i);
3488                         if (!leaf->map_token) {
3489                                 map_extent_buffer(leaf, (unsigned long)item,
3490                                         sizeof(struct btrfs_item),
3491                                         &leaf->map_token, &leaf->kaddr,
3492                                         &leaf->map_start, &leaf->map_len,
3493                                         KM_USER1);
3494                         }
3495
3496                         ioff = btrfs_item_offset(leaf, item);
3497                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3498                 }
3499                 if (leaf->map_token) {
3500                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3501                         leaf->map_token = NULL;
3502                 }
3503
3504                 /* shift the items */
3505                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3506                               btrfs_item_nr_offset(slot),
3507                               (nritems - slot) * sizeof(struct btrfs_item));
3508
3509                 /* shift the data */
3510                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3511                               data_end - total_data, btrfs_leaf_data(leaf) +
3512                               data_end, old_data - data_end);
3513                 data_end = old_data;
3514         } else {
3515                 /*
3516                  * this sucks but it has to be done, if we are inserting at
3517                  * the end of the leaf only insert 1 of the items, since we
3518                  * have no way of knowing whats on the next leaf and we'd have
3519                  * to drop our current locks to figure it out
3520                  */
3521                 nr = 1;
3522         }
3523
3524         /* setup the item for the new data */
3525         for (i = 0; i < nr; i++) {
3526                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3527                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3528                 item = btrfs_item_nr(leaf, slot + i);
3529                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3530                 data_end -= data_size[i];
3531                 btrfs_set_item_size(leaf, item, data_size[i]);
3532         }
3533         btrfs_set_header_nritems(leaf, nritems + nr);
3534         btrfs_mark_buffer_dirty(leaf);
3535
3536         ret = 0;
3537         if (slot == 0) {
3538                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3539                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3540         }
3541
3542         if (btrfs_leaf_free_space(root, leaf) < 0) {
3543                 btrfs_print_leaf(root, leaf);
3544                 BUG();
3545         }
3546 out:
3547         if (!ret)
3548                 ret = nr;
3549         return ret;
3550 }
3551
3552 /*
3553  * this is a helper for btrfs_insert_empty_items, the main goal here is
3554  * to save stack depth by doing the bulk of the work in a function
3555  * that doesn't call btrfs_search_slot
3556  */
3557 static noinline_for_stack int
3558 setup_items_for_insert(struct btrfs_trans_handle *trans,
3559                       struct btrfs_root *root, struct btrfs_path *path,
3560                       struct btrfs_key *cpu_key, u32 *data_size,
3561                       u32 total_data, u32 total_size, int nr)
3562 {
3563         struct btrfs_item *item;
3564         int i;
3565         u32 nritems;
3566         unsigned int data_end;
3567         struct btrfs_disk_key disk_key;
3568         int ret;
3569         struct extent_buffer *leaf;
3570         int slot;
3571
3572         leaf = path->nodes[0];
3573         slot = path->slots[0];
3574
3575         nritems = btrfs_header_nritems(leaf);
3576         data_end = leaf_data_end(root, leaf);
3577
3578         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3579                 btrfs_print_leaf(root, leaf);
3580                 printk(KERN_CRIT "not enough freespace need %u have %d\n",
3581                        total_size, btrfs_leaf_free_space(root, leaf));
3582                 BUG();
3583         }
3584
3585         if (slot != nritems) {
3586                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3587
3588                 if (old_data < data_end) {
3589                         btrfs_print_leaf(root, leaf);
3590                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3591                                slot, old_data, data_end);
3592                         BUG_ON(1);
3593                 }
3594                 /*
3595                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3596                  */
3597                 /* first correct the data pointers */
3598                 WARN_ON(leaf->map_token);
3599                 for (i = slot; i < nritems; i++) {
3600                         u32 ioff;
3601
3602                         item = btrfs_item_nr(leaf, i);
3603                         if (!leaf->map_token) {
3604                                 map_extent_buffer(leaf, (unsigned long)item,
3605                                         sizeof(struct btrfs_item),
3606                                         &leaf->map_token, &leaf->kaddr,
3607                                         &leaf->map_start, &leaf->map_len,
3608                                         KM_USER1);
3609                         }
3610
3611                         ioff = btrfs_item_offset(leaf, item);
3612                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3613                 }
3614                 if (leaf->map_token) {
3615                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3616                         leaf->map_token = NULL;
3617                 }
3618
3619                 /* shift the items */
3620                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3621                               btrfs_item_nr_offset(slot),
3622                               (nritems - slot) * sizeof(struct btrfs_item));
3623
3624                 /* shift the data */
3625                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3626                               data_end - total_data, btrfs_leaf_data(leaf) +
3627                               data_end, old_data - data_end);
3628                 data_end = old_data;
3629         }
3630
3631         /* setup the item for the new data */
3632         for (i = 0; i < nr; i++) {
3633                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3634                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3635                 item = btrfs_item_nr(leaf, slot + i);
3636                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3637                 data_end -= data_size[i];
3638                 btrfs_set_item_size(leaf, item, data_size[i]);
3639         }
3640
3641         btrfs_set_header_nritems(leaf, nritems + nr);
3642
3643         ret = 0;
3644         if (slot == 0) {
3645                 struct btrfs_disk_key disk_key;
3646                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3647                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3648         }
3649         btrfs_unlock_up_safe(path, 1);
3650         btrfs_mark_buffer_dirty(leaf);
3651
3652         if (btrfs_leaf_free_space(root, leaf) < 0) {
3653                 btrfs_print_leaf(root, leaf);
3654                 BUG();
3655         }
3656         return ret;
3657 }
3658
3659 /*
3660  * Given a key and some data, insert items into the tree.
3661  * This does all the path init required, making room in the tree if needed.
3662  */
3663 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3664                             struct btrfs_root *root,
3665                             struct btrfs_path *path,
3666                             struct btrfs_key *cpu_key, u32 *data_size,
3667                             int nr)
3668 {
3669         int ret = 0;
3670         int slot;
3671         int i;
3672         u32 total_size = 0;
3673         u32 total_data = 0;
3674
3675         for (i = 0; i < nr; i++)
3676                 total_data += data_size[i];
3677
3678         total_size = total_data + (nr * sizeof(struct btrfs_item));
3679         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3680         if (ret == 0)
3681                 return -EEXIST;
3682         if (ret < 0)
3683                 goto out;
3684
3685         slot = path->slots[0];
3686         BUG_ON(slot < 0);
3687
3688         ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3689                                total_data, total_size, nr);
3690
3691 out:
3692         return ret;
3693 }
3694
3695 /*
3696  * Given a key and some data, insert an item into the tree.
3697  * This does all the path init required, making room in the tree if needed.
3698  */
3699 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3700                       *root, struct btrfs_key *cpu_key, void *data, u32
3701                       data_size)
3702 {
3703         int ret = 0;
3704         struct btrfs_path *path;
3705         struct extent_buffer *leaf;
3706         unsigned long ptr;
3707
3708         path = btrfs_alloc_path();
3709         BUG_ON(!path);
3710         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3711         if (!ret) {
3712                 leaf = path->nodes[0];
3713                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3714                 write_extent_buffer(leaf, data, ptr, data_size);
3715                 btrfs_mark_buffer_dirty(leaf);
3716         }
3717         btrfs_free_path(path);
3718         return ret;
3719 }
3720
3721 /*
3722  * delete the pointer from a given node.
3723  *
3724  * the tree should have been previously balanced so the deletion does not
3725  * empty a node.
3726  */
3727 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3728                    struct btrfs_path *path, int level, int slot)
3729 {
3730         struct extent_buffer *parent = path->nodes[level];
3731         u32 nritems;
3732         int ret = 0;
3733         int wret;
3734
3735         nritems = btrfs_header_nritems(parent);
3736         if (slot != nritems - 1) {
3737                 memmove_extent_buffer(parent,
3738                               btrfs_node_key_ptr_offset(slot),
3739                               btrfs_node_key_ptr_offset(slot + 1),
3740                               sizeof(struct btrfs_key_ptr) *
3741                               (nritems - slot - 1));
3742         }
3743         nritems--;
3744         btrfs_set_header_nritems(parent, nritems);
3745         if (nritems == 0 && parent == root->node) {
3746                 BUG_ON(btrfs_header_level(root->node) != 1);
3747                 /* just turn the root into a leaf and break */
3748                 btrfs_set_header_level(root->node, 0);
3749         } else if (slot == 0) {
3750                 struct btrfs_disk_key disk_key;
3751
3752                 btrfs_node_key(parent, &disk_key, 0);
3753                 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3754                 if (wret)
3755                         ret = wret;
3756         }
3757         btrfs_mark_buffer_dirty(parent);
3758         return ret;
3759 }
3760
3761 /*
3762  * a helper function to delete the leaf pointed to by path->slots[1] and
3763  * path->nodes[1].
3764  *
3765  * This deletes the pointer in path->nodes[1] and frees the leaf
3766  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3767  *
3768  * The path must have already been setup for deleting the leaf, including
3769  * all the proper balancing.  path->nodes[1] must be locked.
3770  */
3771 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3772                                    struct btrfs_root *root,
3773                                    struct btrfs_path *path,
3774                                    struct extent_buffer *leaf)
3775 {
3776         int ret;
3777
3778         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3779         ret = del_ptr(trans, root, path, 1, path->slots[1]);
3780         if (ret)
3781                 return ret;
3782
3783         /*
3784          * btrfs_free_extent is expensive, we want to make sure we
3785          * aren't holding any locks when we call it
3786          */
3787         btrfs_unlock_up_safe(path, 0);
3788
3789         root_sub_used(root, leaf->len);
3790
3791         btrfs_free_tree_block(trans, root, leaf, 0, 1);
3792         return 0;
3793 }
3794 /*
3795  * delete the item at the leaf level in path.  If that empties
3796  * the leaf, remove it from the tree
3797  */
3798 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3799                     struct btrfs_path *path, int slot, int nr)
3800 {
3801         struct extent_buffer *leaf;
3802         struct btrfs_item *item;
3803         int last_off;
3804         int dsize = 0;
3805         int ret = 0;
3806         int wret;
3807         int i;
3808         u32 nritems;
3809
3810         leaf = path->nodes[0];
3811         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3812
3813         for (i = 0; i < nr; i++)
3814                 dsize += btrfs_item_size_nr(leaf, slot + i);
3815
3816         nritems = btrfs_header_nritems(leaf);
3817
3818         if (slot + nr != nritems) {
3819                 int data_end = leaf_data_end(root, leaf);
3820
3821                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3822                               data_end + dsize,
3823                               btrfs_leaf_data(leaf) + data_end,
3824                               last_off - data_end);
3825
3826                 for (i = slot + nr; i < nritems; i++) {
3827                         u32 ioff;
3828
3829                         item = btrfs_item_nr(leaf, i);
3830                         if (!leaf->map_token) {
3831                                 map_extent_buffer(leaf, (unsigned long)item,
3832                                         sizeof(struct btrfs_item),
3833                                         &leaf->map_token, &leaf->kaddr,
3834                                         &leaf->map_start, &leaf->map_len,
3835                                         KM_USER1);
3836                         }
3837                         ioff = btrfs_item_offset(leaf, item);
3838                         btrfs_set_item_offset(leaf, item, ioff + dsize);
3839                 }
3840
3841                 if (leaf->map_token) {
3842                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3843                         leaf->map_token = NULL;
3844                 }
3845
3846                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3847                               btrfs_item_nr_offset(slot + nr),
3848                               sizeof(struct btrfs_item) *
3849                               (nritems - slot - nr));
3850         }
3851         btrfs_set_header_nritems(leaf, nritems - nr);
3852         nritems -= nr;
3853
3854         /* delete the leaf if we've emptied it */
3855         if (nritems == 0) {
3856                 if (leaf == root->node) {
3857                         btrfs_set_header_level(leaf, 0);
3858                 } else {
3859                         btrfs_set_path_blocking(path);
3860                         clean_tree_block(trans, root, leaf);
3861                         ret = btrfs_del_leaf(trans, root, path, leaf);
3862                         BUG_ON(ret);
3863                 }
3864         } else {
3865                 int used = leaf_space_used(leaf, 0, nritems);
3866                 if (slot == 0) {
3867                         struct btrfs_disk_key disk_key;
3868
3869                         btrfs_item_key(leaf, &disk_key, 0);
3870                         wret = fixup_low_keys(trans, root, path,
3871                                               &disk_key, 1);
3872                         if (wret)
3873                                 ret = wret;
3874                 }
3875
3876                 /* delete the leaf if it is mostly empty */
3877                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
3878                         /* push_leaf_left fixes the path.
3879                          * make sure the path still points to our leaf
3880                          * for possible call to del_ptr below
3881                          */
3882                         slot = path->slots[1];
3883                         extent_buffer_get(leaf);
3884
3885                         btrfs_set_path_blocking(path);
3886                         wret = push_leaf_left(trans, root, path, 1, 1,
3887                                               1, (u32)-1);
3888                         if (wret < 0 && wret != -ENOSPC)
3889                                 ret = wret;
3890
3891                         if (path->nodes[0] == leaf &&
3892                             btrfs_header_nritems(leaf)) {
3893                                 wret = push_leaf_right(trans, root, path, 1,
3894                                                        1, 1, 0);
3895                                 if (wret < 0 && wret != -ENOSPC)
3896                                         ret = wret;
3897                         }
3898
3899                         if (btrfs_header_nritems(leaf) == 0) {
3900                                 path->slots[1] = slot;
3901                                 ret = btrfs_del_leaf(trans, root, path, leaf);
3902                                 BUG_ON(ret);
3903                                 free_extent_buffer(leaf);
3904                         } else {
3905                                 /* if we're still in the path, make sure
3906                                  * we're dirty.  Otherwise, one of the
3907                                  * push_leaf functions must have already
3908                                  * dirtied this buffer
3909                                  */
3910                                 if (path->nodes[0] == leaf)
3911                                         btrfs_mark_buffer_dirty(leaf);
3912                                 free_extent_buffer(leaf);
3913                         }
3914                 } else {
3915                         btrfs_mark_buffer_dirty(leaf);
3916                 }
3917         }
3918         return ret;
3919 }
3920
3921 /*
3922  * search the tree again to find a leaf with lesser keys
3923  * returns 0 if it found something or 1 if there are no lesser leaves.
3924  * returns < 0 on io errors.
3925  *
3926  * This may release the path, and so you may lose any locks held at the
3927  * time you call it.
3928  */
3929 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3930 {
3931         struct btrfs_key key;
3932         struct btrfs_disk_key found_key;
3933         int ret;
3934
3935         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3936
3937         if (key.offset > 0)
3938                 key.offset--;
3939         else if (key.type > 0)
3940                 key.type--;
3941         else if (key.objectid > 0)
3942                 key.objectid--;
3943         else
3944                 return 1;
3945
3946         btrfs_release_path(root, path);
3947         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3948         if (ret < 0)
3949                 return ret;
3950         btrfs_item_key(path->nodes[0], &found_key, 0);
3951         ret = comp_keys(&found_key, &key);
3952         if (ret < 0)
3953                 return 0;
3954         return 1;
3955 }
3956
3957 /*
3958  * A helper function to walk down the tree starting at min_key, and looking
3959  * for nodes or leaves that are either in cache or have a minimum
3960  * transaction id.  This is used by the btree defrag code, and tree logging
3961  *
3962  * This does not cow, but it does stuff the starting key it finds back
3963  * into min_key, so you can call btrfs_search_slot with cow=1 on the
3964  * key and get a writable path.
3965  *
3966  * This does lock as it descends, and path->keep_locks should be set
3967  * to 1 by the caller.
3968  *
3969  * This honors path->lowest_level to prevent descent past a given level
3970  * of the tree.
3971  *
3972  * min_trans indicates the oldest transaction that you are interested
3973  * in walking through.  Any nodes or leaves older than min_trans are
3974  * skipped over (without reading them).
3975  *
3976  * returns zero if something useful was found, < 0 on error and 1 if there
3977  * was nothing in the tree that matched the search criteria.
3978  */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_key *max_key,
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;	/* default: nothing in the tree matched */

	WARN_ON(!path->keep_locks);
again:
	/* start (or restart after a forward jump) from a locked root */
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = 1;

	/* the whole tree is older than min_trans: nothing can match */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* no exact match: back up to the pointer covering min_key */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
			struct btrfs_disk_key disk_key;

			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			/* child too old to be interesting, try the next one */
			if (gen < min_trans) {
				slot++;
				continue;
			}
			/* not restricted to cached blocks: this one will do */
			if (!cache_only)
				break;

			/* don't walk past the caller's upper bound */
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

			/* cache_only: accept the child only if it is in
			 * memory and up to date; otherwise skip it without
			 * triggering any read from disk
			 */
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  cache_only, min_trans);
			if (sret == 0) {
				/* found a higher key: restart the descent
				 * from the root with the updated min_key
				 */
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
		/* descend: lock the child before unlock_up drops the
		 * now-unneeded locks above it (hand-over-hand locking)
		 */
		btrfs_set_path_blocking(path);
		cur = read_node_slot(root, cur, slot);

		btrfs_tree_lock(cur);

		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1);
		btrfs_clear_path_blocking(path, NULL);
	}
out:
	/* on success, hand the key we stopped at back to the caller */
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	btrfs_set_path_blocking(path);
	return ret;
}
4100
4101 /*
4102  * this is similar to btrfs_next_leaf, but does not try to preserve
4103  * and fixup the path.  It looks for and returns the next key in the
4104  * tree based on the current path and the cache_only and min_trans
4105  * parameters.
4106  *
4107  * 0 is returned if another key is found, < 0 if there are any errors
4108  * and 1 is returned if there are no higher keys in the tree
4109  *
4110  * path->keep_locks should be set to 1 on the search made before
4111  * calling this function.
4112  */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level,
			int cache_only, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* candidate: the slot just after the current position */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			/* ran off the end of this node and there is no
			 * parent to continue in: no higher key exists
			 */
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/* the parent is locked, so it is safe to walk up
			 * into it directly
			 */
			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			/* parent is unlocked: re-search from the last key of
			 * this node with lowest_level pinned to the current
			 * level, so the path (and its locks) get rebuilt
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(root, path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* exact match: step past the key we searched for */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* honor the cache_only filter: skip children that
			 * are not in memory or not up to date
			 */
			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			/* skip children older than min_trans */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
4191
4192 /*
4193  * search the tree again to find a leaf with greater keys
4194  * returns 0 if it found something or 1 if there are no greater leaves.
4195  * returns < 0 on io errors.
4196  */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int force_blocking = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/*
	 * we take the blocks in an order that upsets lockdep.  Using
	 * blocking mode is the only way around it.
	 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	force_blocking = 1;
#endif

	/* remember the last key of the current leaf so we can re-find our
	 * position after dropping all locks below
	 */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(root, path);

	path->keep_locks = 1;

	if (!force_blocking)
		path->leave_spinning = 1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}

	/* walk up the tree until we find a node with a slot to the right */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				/* ran off the top: no leaf to the right */
				ret = 1;
				goto done;
			}
			continue;
		}

		/* drop a child we grabbed on an earlier pass of this loop */
		if (next) {
			btrfs_tree_unlock(next);
			free_extent_buffer(next);
		}

		next = c;
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key);
		/* EAGAIN means the block changed under us: restart the walk */
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(root, path);
			goto done;
		}

		if (!path->skip_locking) {
			/* try the cheap spinning lock first; fall back to a
			 * blocking lock if it is contended
			 */
			ret = btrfs_try_spin_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_lock(next);
				if (!force_blocking)
					btrfs_clear_path_blocking(path, next);
			}
			if (force_blocking)
				btrfs_set_lock_blocking(next);
		}
		break;
	}
	path->slots[level] = slot;
	/* walk back down the left-most edge of the next subtree, replacing
	 * the path nodes level by level until we reach the leaf
	 */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock(c);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = 1;

		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(root, path);
			goto done;
		}

		if (!path->skip_locking) {
			btrfs_assert_tree_locked(path->nodes[level]);
			ret = btrfs_try_spin_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_lock(next);
				if (!force_blocking)
					btrfs_clear_path_blocking(path, next);
			}
			if (force_blocking)
				btrfs_set_lock_blocking(next);
		}
	}
	ret = 0;
done:
	/* keep only the leaf lock and restore the caller's spinning mode */
	unlock_up(path, 0, 1);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
4346
4347 /*
4348  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4349  * searching until it gets past min_objectid or finds an item of 'type'
4350  *
4351  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4352  */
4353 int btrfs_previous_item(struct btrfs_root *root,
4354                         struct btrfs_path *path, u64 min_objectid,
4355                         int type)
4356 {
4357         struct btrfs_key found_key;
4358         struct extent_buffer *leaf;
4359         u32 nritems;
4360         int ret;
4361
4362         while (1) {
4363                 if (path->slots[0] == 0) {
4364                         btrfs_set_path_blocking(path);
4365                         ret = btrfs_prev_leaf(root, path);
4366                         if (ret != 0)
4367                                 return ret;
4368                 } else {
4369                         path->slots[0]--;
4370                 }
4371                 leaf = path->nodes[0];
4372                 nritems = btrfs_header_nritems(leaf);
4373                 if (nritems == 0)
4374                         return 1;
4375                 if (path->slots[0] == nritems)
4376                         path->slots[0]--;
4377
4378                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4379                 if (found_key.objectid < min_objectid)
4380                         break;
4381                 if (found_key.type == type)
4382                         return 0;
4383                 if (found_key.objectid == min_objectid &&
4384                     found_key.type < type)
4385                         break;
4386         }
4387         return 1;
4388 }