Btrfs: map the node block when looking for readahead targets
[firefly-linux-kernel-4.4.55.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include "ctree.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "print-tree.h"
25 #include "locking.h"
26
/*
 * forward declarations for the tree balancing helpers defined later in
 * this file.  split/push/balance/delete call into each other while
 * rebalancing, so they need to be visible before their definitions.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
static int setup_items_for_insert(struct btrfs_trans_handle *trans,
			struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *cpu_key, u32 *data_size,
			u32 total_data, u32 total_size, int nr);
45
46
47 struct btrfs_path *btrfs_alloc_path(void)
48 {
49         struct btrfs_path *path;
50         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
51         if (path)
52                 path->reada = 1;
53         return path;
54 }
55
56 /*
57  * set all locked nodes in the path to blocking locks.  This should
58  * be done before scheduling
59  */
60 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
61 {
62         int i;
63         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
64                 if (p->nodes[i] && p->locks[i])
65                         btrfs_set_lock_blocking(p->nodes[i]);
66         }
67 }
68
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held)
		btrfs_set_lock_blocking(held);
	btrfs_set_path_blocking(p);
#endif

	/* walk from the top of the tree down so the spinlocks are
	 * reacquired in the order lockdep expects
	 */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i])
			btrfs_clear_lock_blocking(p->nodes[i]);
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* 'held' was only made blocking for ordering; restore it last */
	if (held)
		btrfs_clear_lock_blocking(held);
#endif
}
104
105 /* this also releases the path */
106 void btrfs_free_path(struct btrfs_path *p)
107 {
108         if (!p)
109                 return;
110         btrfs_release_path(NULL, p);
111         kmem_cache_free(btrfs_path_cachep, p);
112 }
113
114 /*
115  * path release drops references on the extent buffers in the path
116  * and it drops any locks held by this path
117  *
118  * It is safe to call this on paths that no locks or extent buffers held.
119  */
120 noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
121 {
122         int i;
123
124         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
125                 p->slots[i] = 0;
126                 if (!p->nodes[i])
127                         continue;
128                 if (p->locks[i]) {
129                         btrfs_tree_unlock(p->nodes[i]);
130                         p->locks[i] = 0;
131                 }
132                 free_extent_buffer(p->nodes[i]);
133                 p->nodes[i] = NULL;
134         }
135 }
136
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = rcu_dereference(root->node);
	/* take the reference while still inside the rcu critical
	 * section, so the buffer can't go away between the
	 * dereference and the get
	 */
	extent_buffer_get(eb);
	rcu_read_unlock();
	return eb;
}
157
158 /* loop around taking references on and locking the root node of the
159  * tree until you end up with a lock on the root.  A locked buffer
160  * is returned, with a reference held.
161  */
162 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
163 {
164         struct extent_buffer *eb;
165
166         while (1) {
167                 eb = btrfs_root_node(root);
168                 btrfs_tree_lock(eb);
169                 if (eb == root->node)
170                         break;
171                 btrfs_tree_unlock(eb);
172                 free_extent_buffer(eb);
173         }
174         return eb;
175 }
176
177 /* cowonly root (everything not a reference counted cow subvolume), just get
178  * put onto a simple dirty list.  transaction.c walks this to make sure they
179  * get properly updated on disk.
180  */
181 static void add_root_to_dirty_list(struct btrfs_root *root)
182 {
183         if (root->track_dirty && list_empty(&root->dirty_list)) {
184                 list_add(&root->dirty_list,
185                          &root->fs_info->dirty_cowonly_roots);
186         }
187 }
188
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference counted trees may only be modified inside the
	 * currently running transaction
	 */
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	/* the first key in the block is used as the allocation hint key */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the old root, then rewrite the header fields that
	 * identify the new copy
	 */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* bump the refcount on everything the new root points to;
	 * relocation trees use full backrefs (last argument == 1)
	 */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
248
249 /*
250  * check if the tree block can be shared by multiple trees
251  */
252 int btrfs_block_can_be_shared(struct btrfs_root *root,
253                               struct extent_buffer *buf)
254 {
255         /*
256          * Tree blocks not in refernece counted trees and tree roots
257          * are never shared. If a block was allocated after the last
258          * snapshot and the block was not allocated by tree relocation,
259          * we know the block is not shared.
260          */
261         if (root->ref_cows &&
262             buf != root->node && buf != root->commit_root &&
263             (btrfs_header_generation(buf) <=
264              btrfs_root_last_snapshot(&root->root_item) ||
265              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
266                 return 1;
267 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
268         if (root->ref_cows &&
269             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
270                 return 1;
271 #endif
272         return 0;
273 }
274
/*
 * update the extent backrefs when 'buf' is cowed into 'cow'.  Sets
 * *last_ref to 1 when this cow drops the final reference on 'buf'
 * so the caller knows the old block can be freed.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* possibly shared: look up the real refcount and flags
		 * from the extent tree
		 */
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       buf->len, &refs, &flags);
		BUG_ON(ret);
		BUG_ON(refs == 0);
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* the old block stays alive in another tree; convert it
		 * to full backrefs if the owner tree is giving it up
		 */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret);

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret);
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret);
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret);
		}
		if (new_flags != 0) {
			/* persist the full-backref conversion in the
			 * extent tree
			 */
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, 0);
			BUG_ON(ret);
		}
	} else {
		/* last reference: move the refs over to the new copy and
		 * clean the old block out of the dirty trackers
		 */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret);
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret);
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}
369
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	/* if the caller passed buf itself as the result slot, it still
	 * holds the lock on buf and expects us to release it
	 */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* first key of the block is the allocator hint key */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	/* fix up the extent tree backrefs; last_ref tells us whether the
	 * old block can be freed below
	 */
	update_ref_for_cow(trans, root, buf, cow, &last_ref);

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		/* cowing the root: swap the root pointer under rcu */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* point the parent slot at the new copy */
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
	btrfs_mark_buffer_dirty(cow);
	/* cow is returned locked and dirty */
	*cow_ret = cow;
	return 0;
}
485
486 static inline int should_cow_block(struct btrfs_trans_handle *trans,
487                                    struct btrfs_root *root,
488                                    struct extent_buffer *buf)
489 {
490         if (btrfs_header_generation(buf) == trans->transid &&
491             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
492             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
493               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
494                 return 0;
495         return 1;
496 }
497
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	/* sanity checks: cows must happen inside the running transaction */
	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
		WARN_ON(1);
	}

	/* already cowed this transaction and not yet written: reuse it */
	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	/* hint the allocator toward the 1GB-aligned region of the old block */
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
543
544 /*
545  * helper function for defrag to decide if two blocks pointed to by a
546  * node are actually close by
547  */
548 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
549 {
550         if (blocknr < other && other - (blocknr + blocksize) < 32768)
551                 return 1;
552         if (blocknr > other && blocknr - (other + blocksize) < 32768)
553                 return 1;
554         return 0;
555 }
556
557 /*
558  * compare two keys in a memcmp fashion
559  */
560 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
561 {
562         struct btrfs_key k1;
563
564         btrfs_disk_key_to_cpu(&k1, disk);
565
566         return btrfs_comp_cpu_keys(&k1, k2);
567 }
568
569 /*
570  * same as comp_keys only with two btrfs_key's
571  */
572 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
573 {
574         if (k1->objectid > k2->objectid)
575                 return 1;
576         if (k1->objectid < k2->objectid)
577                 return -1;
578         if (k1->type > k2->type)
579                 return 1;
580         if (k1->type < k2->type)
581                 return -1;
582         if (k1->offset > k2->offset)
583                 return 1;
584         if (k1->offset < k2->offset)
585                 return -1;
586         return 0;
587 }
588
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);
	/* in cache_only mode we only defrag leaves, so only level-1
	 * parents are interesting
	 */
	if (cache_only && parent_level != 1)
		return 0;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN_ON(1);
	if (trans->transid != root->fs_info->generation)
		WARN_ON(1);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		/* map the key pointer area of the node so repeated key
		 * and blockptr reads below don't remap per access
		 */
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
		btrfs_node_key(parent, &disk_key, i);
		/* skip everything before the progress key from the last pass */
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* check the neighbors on disk; if this block is already
		 * close to one of them there is nothing to gain
		 */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}
		/* we're going to sleep in read/cow below; drop the atomic
		 * mapping first
		 */
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			/* cache_only mode never reads from disk */
			if (cache_only) {
				free_extent_buffer(cur);
				continue;
			}
			if (!cur) {
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
				if (!cur)
					return -EIO;
			} else if (!uptodate) {
				btrfs_read_buffer(cur, gen);
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/* cow the block into a spot near its neighbors */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	/* drop any mapping left over from the final loop iteration */
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
	return err;
}
718
719 /*
720  * The leaf data grows from end-to-front in the node.
721  * this returns the address of the start of the last item,
722  * which is the stop of the leaf data stack
723  */
724 static inline unsigned int leaf_data_end(struct btrfs_root *root,
725                                          struct extent_buffer *leaf)
726 {
727         u32 nr = btrfs_header_nritems(leaf);
728         if (nr == 0)
729                 return BTRFS_LEAF_DATA_SIZE(root);
730         return btrfs_item_offset_nr(leaf, nr - 1);
731 }
732
733
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * returns 0 if the key was found in the buffer, 1 otherwise.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/* remap only when the key we need isn't inside the
		 * currently mapped window of the buffer
		 */
		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
			if (map_token) {
				unmap_extent_buffer(eb, map_token, KM_USER0);
				map_token = NULL;
			}

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				/* the key straddles a page boundary; fall
				 * back to a bounce copy
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			/* exact match */
			*slot = mid;
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
			return 0;
		}
	}
	/* not found; low is the insertion point */
	*slot = low;
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
	return 1;
}
810
811 /*
812  * simple bin_search frontend that does the right thing for
813  * leaves vs nodes
814  */
815 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
816                       int level, int *slot)
817 {
818         if (level == 0) {
819                 return generic_bin_search(eb,
820                                           offsetof(struct btrfs_leaf, items),
821                                           sizeof(struct btrfs_item),
822                                           key, btrfs_header_nritems(eb),
823                                           slot);
824         } else {
825                 return generic_bin_search(eb,
826                                           offsetof(struct btrfs_node, ptrs),
827                                           sizeof(struct btrfs_key_ptr),
828                                           key, btrfs_header_nritems(eb),
829                                           slot);
830         }
831         return -1;
832 }
833
/*
 * exported frontend for bin_search(); searches @eb for @key at the
 * given tree @level and fills in *@slot with the result position.
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	int ret;

	ret = bin_search(eb, key, level, slot);
	return ret;
}
839
840 static void root_add_used(struct btrfs_root *root, u32 size)
841 {
842         spin_lock(&root->accounting_lock);
843         btrfs_set_root_used(&root->root_item,
844                             btrfs_root_used(&root->root_item) + size);
845         spin_unlock(&root->accounting_lock);
846 }
847
848 static void root_sub_used(struct btrfs_root *root, u32 size)
849 {
850         spin_lock(&root->accounting_lock);
851         btrfs_set_root_used(&root->root_item,
852                             btrfs_root_used(&root->root_item) - size);
853         spin_unlock(&root->accounting_lock);
854 }
855
856 /* given a node and slot number, this reads the blocks it points to.  The
857  * extent buffer is returned with a reference taken (but unlocked).
858  * NULL is returned on error.
859  */
860 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
861                                    struct extent_buffer *parent, int slot)
862 {
863         int level = btrfs_header_level(parent);
864         if (slot < 0)
865                 return NULL;
866         if (slot >= btrfs_header_nritems(parent))
867                 return NULL;
868
869         BUG_ON(level == 0);
870
871         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
872                        btrfs_level_size(root, level - 1),
873                        btrfs_node_ptr_generation(parent, slot));
874 }
875
876 /*
877  * node level balancing, used to make sure nodes are in proper order for
878  * item deletion.  We balance from the top down, so we have to make sure
879  * that a deletion won't leave an node completely empty later on.
880  */
881 static noinline int balance_level(struct btrfs_trans_handle *trans,
882                          struct btrfs_root *root,
883                          struct btrfs_path *path, int level)
884 {
885         struct extent_buffer *right = NULL;
886         struct extent_buffer *mid;
887         struct extent_buffer *left = NULL;
888         struct extent_buffer *parent = NULL;
889         int ret = 0;
890         int wret;
891         int pslot;
892         int orig_slot = path->slots[level];
893         u64 orig_ptr;
894
895         if (level == 0)
896                 return 0;
897
898         mid = path->nodes[level];
899
900         WARN_ON(!path->locks[level]);
901         WARN_ON(btrfs_header_generation(mid) != trans->transid);
902
903         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
904
905         if (level < BTRFS_MAX_LEVEL - 1)
906                 parent = path->nodes[level + 1];
907         pslot = path->slots[level + 1];
908
909         /*
910          * deal with the case where there is only one pointer in the root
911          * by promoting the node below to a root
912          */
913         if (!parent) {
914                 struct extent_buffer *child;
915
916                 if (btrfs_header_nritems(mid) != 1)
917                         return 0;
918
919                 /* promote the child to a root */
920                 child = read_node_slot(root, mid, 0);
921                 BUG_ON(!child);
922                 btrfs_tree_lock(child);
923                 btrfs_set_lock_blocking(child);
924                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
925                 if (ret) {
926                         btrfs_tree_unlock(child);
927                         free_extent_buffer(child);
928                         goto enospc;
929                 }
930
931                 rcu_assign_pointer(root->node, child);
932
933                 add_root_to_dirty_list(root);
934                 btrfs_tree_unlock(child);
935
936                 path->locks[level] = 0;
937                 path->nodes[level] = NULL;
938                 clean_tree_block(trans, root, mid);
939                 btrfs_tree_unlock(mid);
940                 /* once for the path */
941                 free_extent_buffer(mid);
942
943                 root_sub_used(root, mid->len);
944                 btrfs_free_tree_block(trans, root, mid, 0, 1);
945                 /* once for the root ptr */
946                 free_extent_buffer(mid);
947                 return 0;
948         }
949         if (btrfs_header_nritems(mid) >
950             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
951                 return 0;
952
953         btrfs_header_nritems(mid);
954
955         left = read_node_slot(root, parent, pslot - 1);
956         if (left) {
957                 btrfs_tree_lock(left);
958                 btrfs_set_lock_blocking(left);
959                 wret = btrfs_cow_block(trans, root, left,
960                                        parent, pslot - 1, &left);
961                 if (wret) {
962                         ret = wret;
963                         goto enospc;
964                 }
965         }
966         right = read_node_slot(root, parent, pslot + 1);
967         if (right) {
968                 btrfs_tree_lock(right);
969                 btrfs_set_lock_blocking(right);
970                 wret = btrfs_cow_block(trans, root, right,
971                                        parent, pslot + 1, &right);
972                 if (wret) {
973                         ret = wret;
974                         goto enospc;
975                 }
976         }
977
978         /* first, try to make some room in the middle buffer */
979         if (left) {
980                 orig_slot += btrfs_header_nritems(left);
981                 wret = push_node_left(trans, root, left, mid, 1);
982                 if (wret < 0)
983                         ret = wret;
984                 btrfs_header_nritems(mid);
985         }
986
987         /*
988          * then try to empty the right most buffer into the middle
989          */
990         if (right) {
991                 wret = push_node_left(trans, root, mid, right, 1);
992                 if (wret < 0 && wret != -ENOSPC)
993                         ret = wret;
994                 if (btrfs_header_nritems(right) == 0) {
995                         clean_tree_block(trans, root, right);
996                         btrfs_tree_unlock(right);
997                         wret = del_ptr(trans, root, path, level + 1, pslot +
998                                        1);
999                         if (wret)
1000                                 ret = wret;
1001                         root_sub_used(root, right->len);
1002                         btrfs_free_tree_block(trans, root, right, 0, 1);
1003                         free_extent_buffer(right);
1004                         right = NULL;
1005                 } else {
1006                         struct btrfs_disk_key right_key;
1007                         btrfs_node_key(right, &right_key, 0);
1008                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1009                         btrfs_mark_buffer_dirty(parent);
1010                 }
1011         }
1012         if (btrfs_header_nritems(mid) == 1) {
1013                 /*
1014                  * we're not allowed to leave a node with one item in the
1015                  * tree during a delete.  A deletion from lower in the tree
1016                  * could try to delete the only pointer in this node.
1017                  * So, pull some keys from the left.
1018                  * There has to be a left pointer at this point because
1019                  * otherwise we would have pulled some pointers from the
1020                  * right
1021                  */
1022                 BUG_ON(!left);
1023                 wret = balance_node_right(trans, root, mid, left);
1024                 if (wret < 0) {
1025                         ret = wret;
1026                         goto enospc;
1027                 }
1028                 if (wret == 1) {
1029                         wret = push_node_left(trans, root, left, mid, 1);
1030                         if (wret < 0)
1031                                 ret = wret;
1032                 }
1033                 BUG_ON(wret == 1);
1034         }
1035         if (btrfs_header_nritems(mid) == 0) {
1036                 clean_tree_block(trans, root, mid);
1037                 btrfs_tree_unlock(mid);
1038                 wret = del_ptr(trans, root, path, level + 1, pslot);
1039                 if (wret)
1040                         ret = wret;
1041                 root_sub_used(root, mid->len);
1042                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1043                 free_extent_buffer(mid);
1044                 mid = NULL;
1045         } else {
1046                 /* update the parent key to reflect our changes */
1047                 struct btrfs_disk_key mid_key;
1048                 btrfs_node_key(mid, &mid_key, 0);
1049                 btrfs_set_node_key(parent, &mid_key, pslot);
1050                 btrfs_mark_buffer_dirty(parent);
1051         }
1052
1053         /* update the path */
1054         if (left) {
1055                 if (btrfs_header_nritems(left) > orig_slot) {
1056                         extent_buffer_get(left);
1057                         /* left was locked after cow */
1058                         path->nodes[level] = left;
1059                         path->slots[level + 1] -= 1;
1060                         path->slots[level] = orig_slot;
1061                         if (mid) {
1062                                 btrfs_tree_unlock(mid);
1063                                 free_extent_buffer(mid);
1064                         }
1065                 } else {
1066                         orig_slot -= btrfs_header_nritems(left);
1067                         path->slots[level] = orig_slot;
1068                 }
1069         }
1070         /* double check we haven't messed things up */
1071         if (orig_ptr !=
1072             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1073                 BUG();
1074 enospc:
1075         if (right) {
1076                 btrfs_tree_unlock(right);
1077                 free_extent_buffer(right);
1078         }
1079         if (left) {
1080                 if (path->nodes[level] != left)
1081                         btrfs_tree_unlock(left);
1082                 free_extent_buffer(left);
1083         }
1084         return ret;
1085 }
1086
1087 /* Node balancing for insertion.  Here we only split or push nodes around
1088  * when they are completely full.  This is also done top down, so we
1089  * have to be pessimistic.
1090  */
1091 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1092                                           struct btrfs_root *root,
1093                                           struct btrfs_path *path, int level)
1094 {
1095         struct extent_buffer *right = NULL;
1096         struct extent_buffer *mid;
1097         struct extent_buffer *left = NULL;
1098         struct extent_buffer *parent = NULL;
1099         int ret = 0;
1100         int wret;
1101         int pslot;
1102         int orig_slot = path->slots[level];
1103
1104         if (level == 0)
1105                 return 1;
1106
1107         mid = path->nodes[level];
1108         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1109
1110         if (level < BTRFS_MAX_LEVEL - 1)
1111                 parent = path->nodes[level + 1];
1112         pslot = path->slots[level + 1];
1113
1114         if (!parent)
1115                 return 1;
1116
1117         left = read_node_slot(root, parent, pslot - 1);
1118
1119         /* first, try to make some room in the middle buffer */
1120         if (left) {
1121                 u32 left_nr;
1122
1123                 btrfs_tree_lock(left);
1124                 btrfs_set_lock_blocking(left);
1125
1126                 left_nr = btrfs_header_nritems(left);
1127                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1128                         wret = 1;
1129                 } else {
1130                         ret = btrfs_cow_block(trans, root, left, parent,
1131                                               pslot - 1, &left);
1132                         if (ret)
1133                                 wret = 1;
1134                         else {
1135                                 wret = push_node_left(trans, root,
1136                                                       left, mid, 0);
1137                         }
1138                 }
1139                 if (wret < 0)
1140                         ret = wret;
1141                 if (wret == 0) {
1142                         struct btrfs_disk_key disk_key;
1143                         orig_slot += left_nr;
1144                         btrfs_node_key(mid, &disk_key, 0);
1145                         btrfs_set_node_key(parent, &disk_key, pslot);
1146                         btrfs_mark_buffer_dirty(parent);
1147                         if (btrfs_header_nritems(left) > orig_slot) {
1148                                 path->nodes[level] = left;
1149                                 path->slots[level + 1] -= 1;
1150                                 path->slots[level] = orig_slot;
1151                                 btrfs_tree_unlock(mid);
1152                                 free_extent_buffer(mid);
1153                         } else {
1154                                 orig_slot -=
1155                                         btrfs_header_nritems(left);
1156                                 path->slots[level] = orig_slot;
1157                                 btrfs_tree_unlock(left);
1158                                 free_extent_buffer(left);
1159                         }
1160                         return 0;
1161                 }
1162                 btrfs_tree_unlock(left);
1163                 free_extent_buffer(left);
1164         }
1165         right = read_node_slot(root, parent, pslot + 1);
1166
1167         /*
1168          * then try to empty the right most buffer into the middle
1169          */
1170         if (right) {
1171                 u32 right_nr;
1172
1173                 btrfs_tree_lock(right);
1174                 btrfs_set_lock_blocking(right);
1175
1176                 right_nr = btrfs_header_nritems(right);
1177                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1178                         wret = 1;
1179                 } else {
1180                         ret = btrfs_cow_block(trans, root, right,
1181                                               parent, pslot + 1,
1182                                               &right);
1183                         if (ret)
1184                                 wret = 1;
1185                         else {
1186                                 wret = balance_node_right(trans, root,
1187                                                           right, mid);
1188                         }
1189                 }
1190                 if (wret < 0)
1191                         ret = wret;
1192                 if (wret == 0) {
1193                         struct btrfs_disk_key disk_key;
1194
1195                         btrfs_node_key(right, &disk_key, 0);
1196                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1197                         btrfs_mark_buffer_dirty(parent);
1198
1199                         if (btrfs_header_nritems(mid) <= orig_slot) {
1200                                 path->nodes[level] = right;
1201                                 path->slots[level + 1] += 1;
1202                                 path->slots[level] = orig_slot -
1203                                         btrfs_header_nritems(mid);
1204                                 btrfs_tree_unlock(mid);
1205                                 free_extent_buffer(mid);
1206                         } else {
1207                                 btrfs_tree_unlock(right);
1208                                 free_extent_buffer(right);
1209                         }
1210                         return 0;
1211                 }
1212                 btrfs_tree_unlock(right);
1213                 free_extent_buffer(right);
1214         }
1215         return 1;
1216 }
1217
1218 /*
1219  * readahead one full node of leaves, finding things that are close
1220  * to the block in 'slot', and triggering ra on them.
1221  */
static void reada_for_search(struct btrfs_root *root,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 gen;
	int direction = path->reada;	/* <0 scan backwards, >0 forwards */
	struct extent_buffer *eb;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	/* only readahead from level 1 nodes (the parents of leaves) */
	if (level != 1)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	search = btrfs_node_blockptr(node, slot);
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
	if (eb) {
		/* the target block is already cached, nothing to gain */
		free_extent_buffer(eb);
		return;
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;
	while (1) {
		/*
		 * keep the node's key/ptr area mapped (KM_USER1) while
		 * scanning neighboring slots so the per-field accessors
		 * below don't each remap the page
		 */
		if (!node->map_token) {
			unsigned long offset = btrfs_node_key_ptr_offset(nr);
			map_private_extent_buffer(node, offset,
						  sizeof(struct btrfs_key_ptr),
						  &node->map_token,
						  &node->kaddr,
						  &node->map_start,
						  &node->map_len, KM_USER1);
		}
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada < 0 && objectid) {
			/* walking backwards: stop at the objectid boundary */
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		/* only readahead blocks within 64k of the target on disk */
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			gen = btrfs_node_ptr_generation(node, nr);
			/* drop the atomic kmap before triggering read I/O */
			if (node->map_token) {
				unmap_extent_buffer(node, node->map_token,
						    KM_USER1);
				node->map_token = NULL;
			}
			readahead_tree_block(root, search, blocksize, gen);
			nread += blocksize;
		}
		nscan++;
		/* bound the work: at most 64k read ahead or 32 slots scanned */
		if ((nread > 65536 || nscan > 32))
			break;
	}
	if (node->map_token) {
		unmap_extent_buffer(node, node->map_token, KM_USER1);
		node->map_token = NULL;
	}
}
1304
1305 /*
1306  * returns -EAGAIN if it had to drop the path, or zero if everything was in
1307  * cache
1308  */
1309 static noinline int reada_for_balance(struct btrfs_root *root,
1310                                       struct btrfs_path *path, int level)
1311 {
1312         int slot;
1313         int nritems;
1314         struct extent_buffer *parent;
1315         struct extent_buffer *eb;
1316         u64 gen;
1317         u64 block1 = 0;
1318         u64 block2 = 0;
1319         int ret = 0;
1320         int blocksize;
1321
1322         parent = path->nodes[level + 1];
1323         if (!parent)
1324                 return 0;
1325
1326         nritems = btrfs_header_nritems(parent);
1327         slot = path->slots[level + 1];
1328         blocksize = btrfs_level_size(root, level);
1329
1330         if (slot > 0) {
1331                 block1 = btrfs_node_blockptr(parent, slot - 1);
1332                 gen = btrfs_node_ptr_generation(parent, slot - 1);
1333                 eb = btrfs_find_tree_block(root, block1, blocksize);
1334                 if (eb && btrfs_buffer_uptodate(eb, gen))
1335                         block1 = 0;
1336                 free_extent_buffer(eb);
1337         }
1338         if (slot + 1 < nritems) {
1339                 block2 = btrfs_node_blockptr(parent, slot + 1);
1340                 gen = btrfs_node_ptr_generation(parent, slot + 1);
1341                 eb = btrfs_find_tree_block(root, block2, blocksize);
1342                 if (eb && btrfs_buffer_uptodate(eb, gen))
1343                         block2 = 0;
1344                 free_extent_buffer(eb);
1345         }
1346         if (block1 || block2) {
1347                 ret = -EAGAIN;
1348
1349                 /* release the whole path */
1350                 btrfs_release_path(root, path);
1351
1352                 /* read the blocks */
1353                 if (block1)
1354                         readahead_tree_block(root, block1, blocksize, 0);
1355                 if (block2)
1356                         readahead_tree_block(root, block2, blocksize, 0);
1357
1358                 if (block1) {
1359                         eb = read_tree_block(root, block1, blocksize, 0);
1360                         free_extent_buffer(eb);
1361                 }
1362                 if (block2) {
1363                         eb = read_tree_block(root, block2, blocksize, 0);
1364                         free_extent_buffer(eb);
1365                 }
1366         }
1367         return ret;
1368 }
1369
1370
1371 /*
1372  * when we walk down the tree, it is usually safe to unlock the higher layers
1373  * in the tree.  The exceptions are when our path goes through slot 0, because
1374  * operations on the tree might require changing key pointers higher up in the
1375  * tree.
1376  *
1377  * callers might also have set path->keep_locks, which tells this code to keep
1378  * the lock if the path points to the last slot in the block.  This is part of
1379  * walking through the tree, and selecting the next slot in the higher block.
1380  *
1381  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1382  * if lowest_unlock is 1, level 0 won't be unlocked
1383  */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
{
	int i;
	int skip_level = level;	/* levels <= skip_level keep their locks */
	int no_skips = 0;	/* once set, stop raising skip_level */
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/*
		 * slot 0 means a key change at this level propagates into
		 * the parent, so the parent must stay locked as well
		 */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/*
		 * with keep_locks, the last slot in a block keeps its lock
		 * (and its parent's) so the caller can step to the next
		 * block safely
		 */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		/* past the first level we actually unlocked, stop skipping */
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}
1420
1421 /*
1422  * This releases any locks held in the path starting at level and
1423  * going all the way up to the root.
1424  *
1425  * btrfs_search_slot will keep the lock held on higher nodes in a few
1426  * corner cases, such as COW of the block at slot zero in the node.  This
1427  * ignores those rules, and it should only be called when there are no
1428  * more updates to be done higher up in the tree.
1429  */
1430 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1431 {
1432         int i;
1433
1434         if (path->keep_locks)
1435                 return;
1436
1437         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1438                 if (!path->nodes[i])
1439                         continue;
1440                 if (!path->locks[i])
1441                         continue;
1442                 btrfs_tree_unlock(path->nodes[i]);
1443                 path->locks[i] = 0;
1444         }
1445 }
1446
1447 /*
1448  * helper function for btrfs_search_slot.  The goal is to find a block
1449  * in cache without setting the path to blocking.  If we find the block
1450  * we return zero and the path is unchanged.
1451  *
1452  * If we can't find the block, we set the path blocking and do some
1453  * reada.  -EAGAIN is returned and the search must be repeated.
1454  */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key)
{
	u64 blocknr;
	u64 gen;
	u32 blocksize;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	blocksize = btrfs_level_size(root, level - 1);

	/* first try the block cache without blocking */
	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
	if (tmp) {
		if (btrfs_buffer_uptodate(tmp, 0)) {
			if (btrfs_buffer_uptodate(tmp, gen)) {
				/*
				 * we found an up to date block without
				 * sleeping, return
				 * right away
				 */
				*eb_ret = tmp;
				return 0;
			}
			/* the pages were up to date, but we failed
			 * the generation number check.  Do a full
			 * read for the generation number that is correct.
			 * We must do this without dropping locks so
			 * we can trust our generation number
			 */
			free_extent_buffer(tmp);
			tmp = read_tree_block(root, blocknr, blocksize, gen);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				*eb_ret = tmp;
				return 0;
			}
			free_extent_buffer(tmp);
			btrfs_release_path(NULL, p);
			return -EIO;
		}
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	/* tmp may still hold a not-uptodate buffer from the cache lookup */
	free_extent_buffer(tmp);
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	/* drop the whole path; the caller must restart the search */
	btrfs_release_path(NULL, p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, blocksize, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}
1533
1534 /*
1535  * helper function for btrfs_search_slot.  This does all of the checks
1536  * for node-level blocks and does any balancing required based on
1537  * the ins_len.
1538  *
1539  * If no extra work was required, zero is returned.  If we had to
1540  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1541  * start over
1542  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len)
{
	int ret;
	/*
	 * inserting: split nodes that are within 3 pointers of full so a
	 * later insertion at this level can't overflow the block
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* nonzero means reada dropped the path; retry the search */
		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/* deleting: merge under-half-full nodes with a neighbor */
		int sret;

		/* nonzero means reada dropped the path; retry the search */
		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		if (!b) {
			/* balancing collapsed this level; search again */
			btrfs_release_path(NULL, p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
1597
1598 /*
1599  * look for key in the tree.  path is filled in with nodes along the way
1600  * if key is found, we return zero and you can find the item in the leaf
1601  * level of the path (level 0)
1602  *
1603  * If the key isn't found, the path points to the slot where it should
1604  * be inserted, and 1 is returned.  If there are other errors during the
1605  * search a negative error number is returned.
1606  *
1607  * if ins_len > 0, nodes and leaves will be split as we walk down the
1608  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
1609  * possible)
1610  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);

	/*
	 * when deleting (ins_len < 0) we keep the leaf's parent locked
	 * too, since rebalancing after the delete may touch it
	 */
	if (ins_len < 0)
		lowest_unlock = 2;

again:
	if (p->search_commit_root) {
		/* walk the committed root instead of the live one */
		b = root->commit_root;
		extent_buffer_get(b);
		if (!p->skip_locking)
			btrfs_tree_lock(b);
	} else {
		if (p->skip_locking)
			b = btrfs_root_node(root);
		else
			b = btrfs_lock_root_node(root);
	}

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			btrfs_set_path_blocking(p);

			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		BUG_ON(!cow && ins_len);
		/* cow should never hand back a block at another level */
		if (level != btrfs_header_level(b))
			WARN_ON(1);
		level = btrfs_header_level(b);

		/* re-record the (possibly cowed) block in the path */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

		btrfs_clear_path_blocking(p, NULL);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * key not present in this node: step back one
			 * slot so we descend into the child covering it
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
						     ins_len);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			/* splitting/balancing may have moved our slot */
			b = p->nodes[level];
			slot = p->slots[level];

			unlock_up(p, level, lowest_unlock);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				/*
				 * try a cheap spinning lock first; fall
				 * back to a blocking lock on contention
				 */
				btrfs_clear_path_blocking(p, NULL);
				err = btrfs_try_spin_lock(b);

				if (!err) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
					btrfs_clear_path_blocking(p, b);
				}
			}
		} else {
			/* leaf level: split if the insert will not fit */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(root, p);
	return ret;
}
1778
1779 /*
1780  * adjust the pointers going up the tree, starting at level
1781  * making sure the right key of each node is points to 'key'.
1782  * This is used after shifting pointers to the left, so it stops
1783  * fixing up pointers when a given leaf/node is not in slot 0 of the
1784  * higher levels
1785  *
1786  * If this fails to write a tree block, it returns -1, but continues
1787  * fixing up the blocks in ram so the tree is consistent.
1788  */
1789 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1790                           struct btrfs_root *root, struct btrfs_path *path,
1791                           struct btrfs_disk_key *key, int level)
1792 {
1793         int i;
1794         int ret = 0;
1795         struct extent_buffer *t;
1796
1797         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1798                 int tslot = path->slots[i];
1799                 if (!path->nodes[i])
1800                         break;
1801                 t = path->nodes[i];
1802                 btrfs_set_node_key(t, key, tslot);
1803                 btrfs_mark_buffer_dirty(path->nodes[i]);
1804                 if (tslot != 0)
1805                         break;
1806         }
1807         return ret;
1808 }
1809
1810 /*
1811  * update item key.
1812  *
1813  * This function isn't completely safe. It's the caller's responsibility
1814  * that the new key won't break the order
1815  */
1816 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1817                             struct btrfs_root *root, struct btrfs_path *path,
1818                             struct btrfs_key *new_key)
1819 {
1820         struct btrfs_disk_key disk_key;
1821         struct extent_buffer *eb;
1822         int slot;
1823
1824         eb = path->nodes[0];
1825         slot = path->slots[0];
1826         if (slot > 0) {
1827                 btrfs_item_key(eb, &disk_key, slot - 1);
1828                 if (comp_keys(&disk_key, new_key) >= 0)
1829                         return -1;
1830         }
1831         if (slot < btrfs_header_nritems(eb) - 1) {
1832                 btrfs_item_key(eb, &disk_key, slot + 1);
1833                 if (comp_keys(&disk_key, new_key) <= 0)
1834                         return -1;
1835         }
1836
1837         btrfs_cpu_key_to_disk(&disk_key, new_key);
1838         btrfs_set_item_key(eb, &disk_key, slot);
1839         btrfs_mark_buffer_dirty(eb);
1840         if (slot == 0)
1841                 fixup_low_keys(trans, root, path, &disk_key, 1);
1842         return 0;
1843 }
1844
1845 /*
1846  * try to push data from one node into the next node left in the
1847  * tree.
1848  *
1849  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1850  * error, and > 0 if there was no room in the left hand block.
1851  */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* room remaining in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	/* both blocks must already be cowed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* src is already small; not worth pushing unless emptying it */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* append the first push_items ptrs of src to the end of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/* slide the remaining src ptrs down to slot 0 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
1906
1907 /*
1908  * try to push data from one node into the next node right in the
1909  * tree.
1910  *
1911  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1912  * error, and > 0 if there was no room in the right hand block.
1913  *
1914  * this will  only push up to 1/2 the contents of the left node over
1915  */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both blocks must already be cowed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* room remaining in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* src is nearly empty already; not worth pushing */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing ptrs right to open up the front */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* move the tail of src into the front of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
1965
1966 /*
1967  * helper function to insert a new root level in the tree.
1968  * A new node is allocated, and a single item is inserted to
1969  * point to the existing root
1970  *
1971  * returns zero on success or < 0 on failure.
1972  */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* first key of the current root becomes the new root's only key */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, &lower_key,
				   level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new root block */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

	/* single pointer down to the old root */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* publish the new root; rcu because lockless readers follow it */
	old = root->node;
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* record the new root in the path, holding an extra ref on it */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = 1;
	path->slots[level] = 0;
	return 0;
}
2038
2039 /*
2040  * worker function to insert a single pointer in a node.
2041  * the node should have enough room for the pointer already
2042  *
2043  * slot and level indicate where you want the key to go, and
2044  * blocknr is the block the key points to.
2045  *
2046  * returns zero on success and < 0 on any error
2047  */
2048 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2049                       *root, struct btrfs_path *path, struct btrfs_disk_key
2050                       *key, u64 bytenr, int slot, int level)
2051 {
2052         struct extent_buffer *lower;
2053         int nritems;
2054
2055         BUG_ON(!path->nodes[level]);
2056         btrfs_assert_tree_locked(path->nodes[level]);
2057         lower = path->nodes[level];
2058         nritems = btrfs_header_nritems(lower);
2059         BUG_ON(slot > nritems);
2060         if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2061                 BUG();
2062         if (slot != nritems) {
2063                 memmove_extent_buffer(lower,
2064                               btrfs_node_key_ptr_offset(slot + 1),
2065                               btrfs_node_key_ptr_offset(slot),
2066                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2067         }
2068         btrfs_set_node_key(lower, key, slot);
2069         btrfs_set_node_blockptr(lower, slot, bytenr);
2070         WARN_ON(trans->transid == 0);
2071         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2072         btrfs_set_header_nritems(lower, nritems + 1);
2073         btrfs_mark_buffer_dirty(lower);
2074         return 0;
2075 }
2076
2077 /*
2078  * split the node at the specified level in path in two.
2079  * The path is corrected to point to the appropriate node after the split
2080  *
2081  * Before splitting this tries to make some room in the node by pushing
2082  * left and right, if either one works, it returns right away.
2083  *
2084  * returns 0 on success and < 0 on failure
2085  */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	int wret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/* trying to split the root, lets make a new one */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try pushing into siblings before actually splitting */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* split roughly in half; ptrs [mid, end) go to the new block */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
					root->root_key.objectid,
					&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the new block's header */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);


	/* move the upper half of the ptrs into the new block */
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new block into the parent, just after c */
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
			  path->slots[level + 1] + 1,
			  level + 1);
	if (wret)
		ret = wret;

	/* fix the path if our slot moved into the new block */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
2170
2171 /*
2172  * how many bytes are required to store the items in a leaf.  start
2173  * and nr indicate which items in the leaf to check.  This totals up the
2174  * space used both by the item structs and the item data
2175  */
2176 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2177 {
2178         int data_len;
2179         int nritems = btrfs_header_nritems(l);
2180         int end = min(nritems, start + nr) - 1;
2181
2182         if (!nr)
2183                 return 0;
2184         data_len = btrfs_item_end_nr(l, start);
2185         data_len = data_len - btrfs_item_offset_nr(l, end);
2186         data_len += sizeof(struct btrfs_item) * nr;
2187         WARN_ON(data_len < 0);
2188         return data_len;
2189 }
2190
2191 /*
2192  * The space between the end of the leaf items and
2193  * the start of the leaf data.  IOW, how much room
2194  * the leaf has left for both items and data
2195  */
2196 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2197                                    struct extent_buffer *leaf)
2198 {
2199         int nritems = btrfs_header_nritems(leaf);
2200         int ret;
2201         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2202         if (ret < 0) {
2203                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2204                        "used %d nritems %d\n",
2205                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2206                        leaf_space_used(leaf, 0, nritems), nritems);
2207         }
2208         return ret;
2209 }
2210
2211 /*
2212  * min slot controls the lowest index we're willing to push to the
2213  * right.  We'll push up to and including min_slot, but no lower
2214  */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	/* when emptying we may push everything, even slot 0 */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* reserve room for the pending insert if it lands past the end */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/* walk backwards from the last item, deciding how many to push */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(left, i);

		if (!empty && push_items > 0) {
			/* never push the slot our insert will use */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		/* map the item header so btrfs_item_size can read it */
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}

	if (push_items == 0)
		goto out_unlock;

	if (!empty && push_items == left_nritems)
		WARN_ON(1);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total bytes of item data being moved */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* shift right's item headers to open up the first slots */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}
	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(trans, root, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; update it in the parent node */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
2374
2375 /*
2376  * push some data in the path leaf to the right, trying to free up at
2377  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2378  *
2379  * returns 1 if the push failed because the other node didn't have enough
2380  * room, 0 if everything worked out and < 0 if there were major errors.
2381  *
2382  * this will push starting from min_slot to the end of the leaf.  It won't
2383  * push any slot lower than min_slot
2384  */
2385 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2386                            *root, struct btrfs_path *path,
2387                            int min_data_size, int data_size,
2388                            int empty, u32 min_slot)
2389 {
2390         struct extent_buffer *left = path->nodes[0];
2391         struct extent_buffer *right;
2392         struct extent_buffer *upper;
2393         int slot;
2394         int free_space;
2395         u32 left_nritems;
2396         int ret;
2397
2398         if (!path->nodes[1])
2399                 return 1;
2400
2401         slot = path->slots[1];
2402         upper = path->nodes[1];
2403         if (slot >= btrfs_header_nritems(upper) - 1)
2404                 return 1;
2405
2406         btrfs_assert_tree_locked(path->nodes[1]);
2407
2408         right = read_node_slot(root, upper, slot + 1);
2409         if (right == NULL)
2410                 return 1;
2411
2412         btrfs_tree_lock(right);
2413         btrfs_set_lock_blocking(right);
2414
2415         free_space = btrfs_leaf_free_space(root, right);
2416         if (free_space < data_size)
2417                 goto out_unlock;
2418
2419         /* cow and double check */
2420         ret = btrfs_cow_block(trans, root, right, upper,
2421                               slot + 1, &right);
2422         if (ret)
2423                 goto out_unlock;
2424
2425         free_space = btrfs_leaf_free_space(root, right);
2426         if (free_space < data_size)
2427                 goto out_unlock;
2428
2429         left_nritems = btrfs_header_nritems(left);
2430         if (left_nritems == 0)
2431                 goto out_unlock;
2432
2433         return __push_leaf_right(trans, root, path, min_data_size, empty,
2434                                 right, free_space, left_nritems, min_slot);
2435 out_unlock:
2436         btrfs_tree_unlock(right);
2437         free_extent_buffer(right);
2438         return 1;
2439 }
2440
2441 /*
2442  * push some data in the path leaf to the left, trying to free up at
2443  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2444  *
2445  * max_slot can put a limit on how far into the leaf we'll push items.  The
2446  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2447  * items
2448  */
/*
 * Worker for push_leaf_left(): move up to 'max_slot' items from the start of
 * the path's leaf (right) into the tail of its left sibling (left).
 * 'left' arrives locked and COWed with 'free_space' bytes available;
 * 'right_nritems' is the item count of the path's leaf.  On success the path
 * is fixed up to keep pointing at the same logical item.
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	int wret;
	u32 this_item_size;
	u32 old_left_item_size;

	/* unless emptying the leaf, always keep at least one item behind */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/*
	 * Pass 1: walk the items from slot 0 and count how many
	 * (push_items) and how many data bytes (push_space) fit in 'left'.
	 */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(right, i);
		/* keep the item area mapped to avoid per-item kmap cost */
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		if (!empty && push_items > 0) {
			/* don't push past the slot the caller is working on */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				/* stop if pushing more wouldn't pay off here */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* reserve room for the caller's pending insertion */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	if (!empty && push_items == btrfs_header_nritems(right))
		WARN_ON(1);

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* item data grows down from the end of the leaf data area */
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the offsets of the newly arrived items in 'left' */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(left, i);
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}

	/* fixup right node */
	if (push_items > right_nritems) {
		printk(KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		/* slide the remaining data to the end of the data area */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		/* and slide the remaining item headers down to slot 0 */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute every remaining item's data offset from the leaf end */
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(trans, root, right);

	/* the right leaf's first key changed; propagate it up the tree */
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
	if (wret)
		ret = wret;

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* the target item moved into 'left'; repoint the path there */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
2629
2630 /*
2631  * push some data in the path leaf to the left, trying to free up at
2632  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2633  *
2634  * max_slot can put a limit on how far into the leaf we'll push items.  The
2635  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
2636  * items
2637  */
2638 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2639                           *root, struct btrfs_path *path, int min_data_size,
2640                           int data_size, int empty, u32 max_slot)
2641 {
2642         struct extent_buffer *right = path->nodes[0];
2643         struct extent_buffer *left;
2644         int slot;
2645         int free_space;
2646         u32 right_nritems;
2647         int ret = 0;
2648
2649         slot = path->slots[1];
2650         if (slot == 0)
2651                 return 1;
2652         if (!path->nodes[1])
2653                 return 1;
2654
2655         right_nritems = btrfs_header_nritems(right);
2656         if (right_nritems == 0)
2657                 return 1;
2658
2659         btrfs_assert_tree_locked(path->nodes[1]);
2660
2661         left = read_node_slot(root, path->nodes[1], slot - 1);
2662         if (left == NULL)
2663                 return 1;
2664
2665         btrfs_tree_lock(left);
2666         btrfs_set_lock_blocking(left);
2667
2668         free_space = btrfs_leaf_free_space(root, left);
2669         if (free_space < data_size) {
2670                 ret = 1;
2671                 goto out;
2672         }
2673
2674         /* cow and double check */
2675         ret = btrfs_cow_block(trans, root, left,
2676                               path->nodes[1], slot - 1, &left);
2677         if (ret) {
2678                 /* we hit -ENOSPC, but it isn't fatal here */
2679                 ret = 1;
2680                 goto out;
2681         }
2682
2683         free_space = btrfs_leaf_free_space(root, left);
2684         if (free_space < data_size) {
2685                 ret = 1;
2686                 goto out;
2687         }
2688
2689         return __push_leaf_left(trans, root, path, min_data_size,
2690                                empty, left, free_space, right_nritems,
2691                                max_slot);
2692 out:
2693         btrfs_tree_unlock(left);
2694         free_extent_buffer(left);
2695         return ret;
2696 }
2697
/*
 * second half of a leaf split: copy the items from slot 'mid' onward out of
 * leaf 'l' into the freshly allocated leaf 'right', rebase their data
 * offsets, link 'right' into the parent and fix up the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int copy_for_split(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct extent_buffer *l,
			       struct extent_buffer *right,
			       int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	int ret = 0;
	int wret;
	struct btrfs_disk_key disk_key;

	/* items mid..nritems-1 of 'l' become items 0..nritems-mid-1 of 'right' */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy the item data, packed against the end of the new leaf */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/* how far each copied item's data offset shifts in the new leaf */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
		u32 ioff;

		/* keep the item area mapped to avoid per-item kmap cost */
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

	/* the original leaf now only keeps the first 'mid' items */
	btrfs_set_header_nritems(l, mid);
	ret = 0;
	/* link the new leaf into the parent, right after the old one */
	btrfs_item_key(right, &disk_key, 0);
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
	if (wret)
		ret = wret;

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* if the target slot moved into the new leaf, repoint the path */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);

	return ret;
}
2782
2783 /*
2784  * double splits happen when we need to insert a big item in the middle
2785  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
2786  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2787  *          A                 B                 C
2788  *
2789  * We avoid this by trying to push the items on either side of our target
2790  * into the adjacent leaves.  If all goes well we can avoid the double split
2791  * completely.
2792  */
2793 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2794                                           struct btrfs_root *root,
2795                                           struct btrfs_path *path,
2796                                           int data_size)
2797 {
2798         int ret;
2799         int progress = 0;
2800         int slot;
2801         u32 nritems;
2802
2803         slot = path->slots[0];
2804
2805         /*
2806          * try to push all the items after our slot into the
2807          * right leaf
2808          */
2809         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2810         if (ret < 0)
2811                 return ret;
2812
2813         if (ret == 0)
2814                 progress++;
2815
2816         nritems = btrfs_header_nritems(path->nodes[0]);
2817         /*
2818          * our goal is to get our slot at the start or end of a leaf.  If
2819          * we've done so we're done
2820          */
2821         if (path->slots[0] == 0 || path->slots[0] == nritems)
2822                 return 0;
2823
2824         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2825                 return 0;
2826
2827         /* try to push all the items before our slot into the next leaf */
2828         slot = path->slots[0];
2829         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2830         if (ret < 0)
2831                 return ret;
2832
2833         if (ret == 0)
2834                 progress++;
2835
2836         if (progress)
2837                 return 0;
2838         return 1;
2839 }
2840
2841 /*
2842  * split the path's leaf in two, making sure there is at least data_size
2843  * available for the resulting leaf level of the path.
2844  *
2845  * returns 0 if all went well and < 0 on failure.
2846  */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	int ret = 0;
	int wret;
	/*
	 * split selects how the new leaf is used:
	 *   0 - no items are copied; the new (empty) leaf takes the
	 *       insertion itself
	 *   1 - a normal single split at 'mid'
	 *   2 - a double split is still needed after this one
	 */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item can never exceed the leaf data area */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size) {
		wret = push_leaf_right(trans, root, path, data_size,
				       data_size, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			wret = push_leaf_left(trans, root, path, data_size,
					      data_size, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(root, l) >= data_size)
			return 0;
	}

	/* splitting the only leaf needs a new root above it first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/* pick the split point so the half holding 'slot' has room */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				/* appending: the new empty leaf is enough */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					/* try pushing before a double split */
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (!extend && data_size && slot == 0) {
				/* prepending: give the item its own new leaf */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					/* try pushing before a double split */
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2 ;
				}
			}
		}
	}

	/* key the new leaf by the insertion key or the split point */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
					root->root_key.objectid,
					&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, root->leafsize);

	/* initialize the new leaf's header */
	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);

	if (split == 0) {
		/* no items move: just link the empty leaf into the parent */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			wret = insert_ptr(trans, root, path,
					  &disk_key, right->start,
					  path->slots[1] + 1, 1);
			if (wret)
				ret = wret;

			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			wret = insert_ptr(trans, root, path,
					  &disk_key,
					  right->start,
					  path->slots[1], 1);
			if (wret)
				ret = wret;
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			/* new leaf is leftmost: fix the keys above us */
			if (path->slots[1] == 0) {
				wret = fixup_low_keys(trans, root,
						path, &disk_key, 1);
				if (wret)
					ret = wret;
			}
		}
		btrfs_mark_buffer_dirty(right);
		return ret;
	}

	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
	BUG_ON(ret);

	/* a double split is allowed to recurse exactly once */
	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return ret;

push_for_double:
	/* push neighbors to try to dodge the double split entirely */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
3021
3022 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3023                                          struct btrfs_root *root,
3024                                          struct btrfs_path *path, int ins_len)
3025 {
3026         struct btrfs_key key;
3027         struct extent_buffer *leaf;
3028         struct btrfs_file_extent_item *fi;
3029         u64 extent_len = 0;
3030         u32 item_size;
3031         int ret;
3032
3033         leaf = path->nodes[0];
3034         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3035
3036         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3037                key.type != BTRFS_EXTENT_CSUM_KEY);
3038
3039         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3040                 return 0;
3041
3042         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3043         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044                 fi = btrfs_item_ptr(leaf, path->slots[0],
3045                                     struct btrfs_file_extent_item);
3046                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3047         }
3048         btrfs_release_path(root, path);
3049
3050         path->keep_locks = 1;
3051         path->search_for_split = 1;
3052         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3053         path->search_for_split = 0;
3054         if (ret < 0)
3055                 goto err;
3056
3057         ret = -EAGAIN;
3058         leaf = path->nodes[0];
3059         /* if our item isn't there or got smaller, return now */
3060         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3061                 goto err;
3062
3063         /* the leaf has  changed, it now has room.  return now */
3064         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3065                 goto err;
3066
3067         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3068                 fi = btrfs_item_ptr(leaf, path->slots[0],
3069                                     struct btrfs_file_extent_item);
3070                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3071                         goto err;
3072         }
3073
3074         btrfs_set_path_blocking(path);
3075         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3076         if (ret)
3077                 goto err;
3078
3079         path->keep_locks = 0;
3080         btrfs_unlock_up_safe(path, 1);
3081         return 0;
3082 err:
3083         path->keep_locks = 0;
3084         return ret;
3085 }
3086
/*
 * split_item - split the leaf item at path->slots[0] into two items
 *
 * The original item keeps the first split_offset bytes of its data and
 * its original key; a new item holding the remaining bytes is inserted
 * in the next slot and given 'new_key'.  The caller must already have
 * guaranteed room for one extra struct btrfs_item in this leaf (see
 * setup_leaf_for_split), which is what the BUG_ON below asserts.
 *
 * Returns 0 on success or -ENOMEM if the temporary copy of the item
 * data cannot be allocated.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path,
                               struct btrfs_key *new_key,
                               unsigned long split_offset)
{
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        struct btrfs_item *new_item;
        int slot;
        char *buf;
        u32 nritems;
        u32 item_size;
        u32 orig_offset;
        struct btrfs_disk_key disk_key;

        leaf = path->nodes[0];
        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

        /* we may sleep below (kmalloc), so drop spinning locks */
        btrfs_set_path_blocking(path);

        item = btrfs_item_nr(leaf, path->slots[0]);
        orig_offset = btrfs_item_offset(leaf, item);
        item_size = btrfs_item_size(leaf, item);

        /* stash a copy of the item data before we rearrange the leaf */
        buf = kmalloc(item_size, GFP_NOFS);
        if (!buf)
                return -ENOMEM;

        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
                            path->slots[0]), item_size);

        /* the new item goes directly after the one being split */
        slot = path->slots[0] + 1;
        nritems = btrfs_header_nritems(leaf);
        if (slot != nritems) {
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
                                btrfs_item_nr_offset(slot),
                                (nritems - slot) * sizeof(struct btrfs_item));
        }

        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(leaf, &disk_key, slot);

        new_item = btrfs_item_nr(leaf, slot);

        /* new item owns the tail: bytes [split_offset, item_size) */
        btrfs_set_item_offset(leaf, new_item, orig_offset);
        btrfs_set_item_size(leaf, new_item, item_size - split_offset);

        /* original item shrinks to the head: bytes [0, split_offset) */
        btrfs_set_item_offset(leaf, item,
                              orig_offset + item_size - split_offset);
        btrfs_set_item_size(leaf, item, split_offset);

        btrfs_set_header_nritems(leaf, nritems + 1);

        /* write the data for the start of the original item */
        write_extent_buffer(leaf, buf,
                            btrfs_item_ptr_offset(leaf, path->slots[0]),
                            split_offset);

        /* write the data for the new item */
        write_extent_buffer(leaf, buf + split_offset,
                            btrfs_item_ptr_offset(leaf, slot),
                            item_size - split_offset);
        btrfs_mark_buffer_dirty(leaf);

        BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
        kfree(buf);
        return 0;
}
3157
3158 /*
3159  * This function splits a single item into two items,
3160  * giving 'new_key' to the new item and splitting the
3161  * old one at split_offset (from the start of the item).
3162  *
3163  * The path may be released by this operation.  After
3164  * the split, the path is pointing to the old item.  The
3165  * new item is going to be in the same node as the old one.
3166  *
3167  * Note, the item being split must be smaller enough to live alone on
3168  * a tree block with room for one extra struct btrfs_item
3169  *
3170  * This allows us to split the item in place, keeping a lock on the
3171  * leaf the entire time.
3172  */
3173 int btrfs_split_item(struct btrfs_trans_handle *trans,
3174                      struct btrfs_root *root,
3175                      struct btrfs_path *path,
3176                      struct btrfs_key *new_key,
3177                      unsigned long split_offset)
3178 {
3179         int ret;
3180         ret = setup_leaf_for_split(trans, root, path,
3181                                    sizeof(struct btrfs_item));
3182         if (ret)
3183                 return ret;
3184
3185         ret = split_item(trans, root, path, new_key, split_offset);
3186         return ret;
3187 }
3188
3189 /*
3190  * This function duplicate a item, giving 'new_key' to the new item.
3191  * It guarantees both items live in the same tree leaf and the new item
3192  * is contiguous with the original item.
3193  *
3194  * This allows us to split file extent in place, keeping a lock on the
3195  * leaf the entire time.
3196  */
3197 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3198                          struct btrfs_root *root,
3199                          struct btrfs_path *path,
3200                          struct btrfs_key *new_key)
3201 {
3202         struct extent_buffer *leaf;
3203         int ret;
3204         u32 item_size;
3205
3206         leaf = path->nodes[0];
3207         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3208         ret = setup_leaf_for_split(trans, root, path,
3209                                    item_size + sizeof(struct btrfs_item));
3210         if (ret)
3211                 return ret;
3212
3213         path->slots[0]++;
3214         ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
3215                                      item_size, item_size +
3216                                      sizeof(struct btrfs_item), 1);
3217         BUG_ON(ret);
3218
3219         leaf = path->nodes[0];
3220         memcpy_extent_buffer(leaf,
3221                              btrfs_item_ptr_offset(leaf, path->slots[0]),
3222                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3223                              item_size);
3224         return 0;
3225 }
3226
3227 /*
3228  * make the item pointed to by the path smaller.  new_size indicates
3229  * how small to make it, and from_end tells us if we just chop bytes
3230  * off the end of the item or if we shift the item to chop bytes off
3231  * the front.
3232  */
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct btrfs_path *path,
                        u32 new_size, int from_end)
{
        int ret = 0;
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data_start;
        unsigned int old_size;
        unsigned int size_diff;
        int i;

        leaf = path->nodes[0];
        slot = path->slots[0];

        old_size = btrfs_item_size_nr(leaf, slot);
        /* nothing to do if the item is already the requested size */
        if (old_size == new_size)
                return 0;

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        old_data_start = btrfs_item_offset_nr(leaf, slot);

        /* number of bytes we are chopping off the item */
        size_diff = old_size - new_size;

        BUG_ON(slot < 0);
        BUG_ON(slot >= nritems);

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(leaf, i);

                /*
                 * keep one mapping of the item array live across the
                 * loop so each offset update doesn't re-map the page
                 */
                if (!leaf->map_token) {
                        map_extent_buffer(leaf, (unsigned long)item,
                                        sizeof(struct btrfs_item),
                                        &leaf->map_token, &leaf->kaddr,
                                        &leaf->map_start, &leaf->map_len,
                                        KM_USER1);
                }

                /* data grows toward the front, so offsets move up */
                ioff = btrfs_item_offset(leaf, item);
                btrfs_set_item_offset(leaf, item, ioff + size_diff);
        }

        if (leaf->map_token) {
                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
                leaf->map_token = NULL;
        }

        /* shift the data */
        if (from_end) {
                /* chop off the tail: slide everything after the item up */
                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + size_diff, btrfs_leaf_data(leaf) +
                              data_end, old_data_start + new_size - data_end);
        } else {
                /* chopping off the front also shifts the item's key offset */
                struct btrfs_disk_key disk_key;
                u64 offset;

                btrfs_item_key(leaf, &disk_key, slot);

                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
                        unsigned long ptr;
                        struct btrfs_file_extent_item *fi;

                        /* fi's new location after the data move below */
                        fi = btrfs_item_ptr(leaf, slot,
                                            struct btrfs_file_extent_item);
                        fi = (struct btrfs_file_extent_item *)(
                             (unsigned long)fi - size_diff);

                        /*
                         * for inline extents only the inline data shrinks;
                         * preserve the fixed header fields up front
                         */
                        if (btrfs_file_extent_type(leaf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE) {
                                ptr = btrfs_item_ptr_offset(leaf, slot);
                                memmove_extent_buffer(leaf, ptr,
                                      (unsigned long)fi,
                                      offsetof(struct btrfs_file_extent_item,
                                                 disk_bytenr));
                        }
                }

                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end + size_diff, btrfs_leaf_data(leaf) +
                              data_end, old_data_start - data_end);

                /* the key's offset advances by the bytes removed */
                offset = btrfs_disk_key_offset(&disk_key);
                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
                btrfs_set_item_key(leaf, &disk_key, slot);
                if (slot == 0)
                        fixup_low_keys(trans, root, path, &disk_key, 1);
        }

        item = btrfs_item_nr(leaf, slot);
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        return ret;
}
3343
3344 /*
3345  * make the item pointed to by the path bigger, data_size is the new size.
3346  */
int btrfs_extend_item(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, struct btrfs_path *path,
                      u32 data_size)
{
        int ret = 0;
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data;
        unsigned int old_size;
        int i;

        leaf = path->nodes[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        /* caller must have made sure the leaf can hold the growth */
        if (btrfs_leaf_free_space(root, leaf) < data_size) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        slot = path->slots[0];
        old_data = btrfs_item_end_nr(leaf, slot);

        BUG_ON(slot < 0);
        if (slot >= nritems) {
                btrfs_print_leaf(root, leaf);
                printk(KERN_CRIT "slot %d too large, nritems %d\n",
                       slot, nritems);
                BUG_ON(1);
        }

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        for (i = slot; i < nritems; i++) {
                u32 ioff;
                item = btrfs_item_nr(leaf, i);

                /*
                 * map the item array once and reuse the mapping for
                 * every offset update in this loop
                 */
                if (!leaf->map_token) {
                        map_extent_buffer(leaf, (unsigned long)item,
                                        sizeof(struct btrfs_item),
                                        &leaf->map_token, &leaf->kaddr,
                                        &leaf->map_start, &leaf->map_len,
                                        KM_USER1);
                }
                /* data grows toward the front, so offsets move down */
                ioff = btrfs_item_offset(leaf, item);
                btrfs_set_item_offset(leaf, item, ioff - data_size);
        }

        if (leaf->map_token) {
                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
                leaf->map_token = NULL;
        }

        /* shift the data */
        memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                      data_end - data_size, btrfs_leaf_data(leaf) +
                      data_end, old_data - data_end);

        /* finally grow the item itself by data_size bytes */
        data_end = old_data;
        old_size = btrfs_item_size_nr(leaf, slot);
        item = btrfs_item_nr(leaf, slot);
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        return ret;
}
3423
3424 /*
3425  * Given a key and some data, insert items into the tree.
3426  * This does all the path init required, making room in the tree if needed.
3427  * Returns the number of keys that were inserted.
3428  */
3429 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3430                             struct btrfs_root *root,
3431                             struct btrfs_path *path,
3432                             struct btrfs_key *cpu_key, u32 *data_size,
3433                             int nr)
3434 {
3435         struct extent_buffer *leaf;
3436         struct btrfs_item *item;
3437         int ret = 0;
3438         int slot;
3439         int i;
3440         u32 nritems;
3441         u32 total_data = 0;
3442         u32 total_size = 0;
3443         unsigned int data_end;
3444         struct btrfs_disk_key disk_key;
3445         struct btrfs_key found_key;
3446
3447         for (i = 0; i < nr; i++) {
3448                 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3449                     BTRFS_LEAF_DATA_SIZE(root)) {
3450                         break;
3451                         nr = i;
3452                 }
3453                 total_data += data_size[i];
3454                 total_size += data_size[i] + sizeof(struct btrfs_item);
3455         }
3456         BUG_ON(nr == 0);
3457
3458         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3459         if (ret == 0)
3460                 return -EEXIST;
3461         if (ret < 0)
3462                 goto out;
3463
3464         leaf = path->nodes[0];
3465
3466         nritems = btrfs_header_nritems(leaf);
3467         data_end = leaf_data_end(root, leaf);
3468
3469         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3470                 for (i = nr; i >= 0; i--) {
3471                         total_data -= data_size[i];
3472                         total_size -= data_size[i] + sizeof(struct btrfs_item);
3473                         if (total_size < btrfs_leaf_free_space(root, leaf))
3474                                 break;
3475                 }
3476                 nr = i;
3477         }
3478
3479         slot = path->slots[0];
3480         BUG_ON(slot < 0);
3481
3482         if (slot != nritems) {
3483                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3484
3485                 item = btrfs_item_nr(leaf, slot);
3486                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3487
3488                 /* figure out how many keys we can insert in here */
3489                 total_data = data_size[0];
3490                 for (i = 1; i < nr; i++) {
3491                         if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3492                                 break;
3493                         total_data += data_size[i];
3494                 }
3495                 nr = i;
3496
3497                 if (old_data < data_end) {
3498                         btrfs_print_leaf(root, leaf);
3499                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3500                                slot, old_data, data_end);
3501                         BUG_ON(1);
3502                 }
3503                 /*
3504                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3505                  */
3506                 /* first correct the data pointers */
3507                 WARN_ON(leaf->map_token);
3508                 for (i = slot; i < nritems; i++) {
3509                         u32 ioff;
3510
3511                         item = btrfs_item_nr(leaf, i);
3512                         if (!leaf->map_token) {
3513                                 map_extent_buffer(leaf, (unsigned long)item,
3514                                         sizeof(struct btrfs_item),
3515                                         &leaf->map_token, &leaf->kaddr,
3516                                         &leaf->map_start, &leaf->map_len,
3517                                         KM_USER1);
3518                         }
3519
3520                         ioff = btrfs_item_offset(leaf, item);
3521                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3522                 }
3523                 if (leaf->map_token) {
3524                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3525                         leaf->map_token = NULL;
3526                 }
3527
3528                 /* shift the items */
3529                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3530                               btrfs_item_nr_offset(slot),
3531                               (nritems - slot) * sizeof(struct btrfs_item));
3532
3533                 /* shift the data */
3534                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3535                               data_end - total_data, btrfs_leaf_data(leaf) +
3536                               data_end, old_data - data_end);
3537                 data_end = old_data;
3538         } else {
3539                 /*
3540                  * this sucks but it has to be done, if we are inserting at
3541                  * the end of the leaf only insert 1 of the items, since we
3542                  * have no way of knowing whats on the next leaf and we'd have
3543                  * to drop our current locks to figure it out
3544                  */
3545                 nr = 1;
3546         }
3547
3548         /* setup the item for the new data */
3549         for (i = 0; i < nr; i++) {
3550                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3551                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3552                 item = btrfs_item_nr(leaf, slot + i);
3553                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3554                 data_end -= data_size[i];
3555                 btrfs_set_item_size(leaf, item, data_size[i]);
3556         }
3557         btrfs_set_header_nritems(leaf, nritems + nr);
3558         btrfs_mark_buffer_dirty(leaf);
3559
3560         ret = 0;
3561         if (slot == 0) {
3562                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3563                 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3564         }
3565
3566         if (btrfs_leaf_free_space(root, leaf) < 0) {
3567                 btrfs_print_leaf(root, leaf);
3568                 BUG();
3569         }
3570 out:
3571         if (!ret)
3572                 ret = nr;
3573         return ret;
3574 }
3575
3576 /*
3577  * this is a helper for btrfs_insert_empty_items, the main goal here is
3578  * to save stack depth by doing the bulk of the work in a function
3579  * that doesn't call btrfs_search_slot
3580  */
/*
 * setup_items_for_insert - reserve space for 'nr' new items in a leaf
 *
 * path must already point at the insertion slot in a leaf with at least
 * total_size bytes free (total_data bytes of item data plus one struct
 * btrfs_item header per key).  The new items are created with their keys
 * and sizes set but their data left uninitialized for the caller to fill.
 */
static noinline_for_stack int
setup_items_for_insert(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, struct btrfs_path *path,
                      struct btrfs_key *cpu_key, u32 *data_size,
                      u32 total_data, u32 total_size, int nr)
{
        struct btrfs_item *item;
        int i;
        u32 nritems;
        unsigned int data_end;
        struct btrfs_disk_key disk_key;
        int ret;
        struct extent_buffer *leaf;
        int slot;

        leaf = path->nodes[0];
        slot = path->slots[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);

        /* the caller was responsible for making enough room */
        if (btrfs_leaf_free_space(root, leaf) < total_size) {
                btrfs_print_leaf(root, leaf);
                printk(KERN_CRIT "not enough freespace need %u have %d\n",
                       total_size, btrfs_leaf_free_space(root, leaf));
                BUG();
        }

        /* inserting in the middle: make a gap for items and data */
        if (slot != nritems) {
                unsigned int old_data = btrfs_item_end_nr(leaf, slot);

                if (old_data < data_end) {
                        btrfs_print_leaf(root, leaf);
                        printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
                               slot, old_data, data_end);
                        BUG_ON(1);
                }
                /*
                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
                 */
                /* first correct the data pointers */
                WARN_ON(leaf->map_token);
                for (i = slot; i < nritems; i++) {
                        u32 ioff;

                        item = btrfs_item_nr(leaf, i);
                        /* keep one mapping alive across the whole loop */
                        if (!leaf->map_token) {
                                map_extent_buffer(leaf, (unsigned long)item,
                                        sizeof(struct btrfs_item),
                                        &leaf->map_token, &leaf->kaddr,
                                        &leaf->map_start, &leaf->map_len,
                                        KM_USER1);
                        }

                        ioff = btrfs_item_offset(leaf, item);
                        btrfs_set_item_offset(leaf, item, ioff - total_data);
                }
                if (leaf->map_token) {
                        unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
                        leaf->map_token = NULL;
                }

                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
                              btrfs_item_nr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_item));

                /* shift the data */
                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
                              data_end - total_data, btrfs_leaf_data(leaf) +
                              data_end, old_data - data_end);
                data_end = old_data;
        }

        /* setup the item for the new data */
        for (i = 0; i < nr; i++) {
                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
                item = btrfs_item_nr(leaf, slot + i);
                btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
                data_end -= data_size[i];
                btrfs_set_item_size(leaf, item, data_size[i]);
        }

        btrfs_set_header_nritems(leaf, nritems + nr);

        ret = 0;
        if (slot == 0) {
                /* the smallest key in the leaf changed, fix the parents */
                struct btrfs_disk_key disk_key;
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
        }
        btrfs_unlock_up_safe(path, 1);
        btrfs_mark_buffer_dirty(leaf);

        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
        return ret;
}
3682
3683 /*
3684  * Given a key and some data, insert items into the tree.
3685  * This does all the path init required, making room in the tree if needed.
3686  */
3687 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3688                             struct btrfs_root *root,
3689                             struct btrfs_path *path,
3690                             struct btrfs_key *cpu_key, u32 *data_size,
3691                             int nr)
3692 {
3693         int ret = 0;
3694         int slot;
3695         int i;
3696         u32 total_size = 0;
3697         u32 total_data = 0;
3698
3699         for (i = 0; i < nr; i++)
3700                 total_data += data_size[i];
3701
3702         total_size = total_data + (nr * sizeof(struct btrfs_item));
3703         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3704         if (ret == 0)
3705                 return -EEXIST;
3706         if (ret < 0)
3707                 goto out;
3708
3709         slot = path->slots[0];
3710         BUG_ON(slot < 0);
3711
3712         ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3713                                total_data, total_size, nr);
3714
3715 out:
3716         return ret;
3717 }
3718
3719 /*
3720  * Given a key and some data, insert an item into the tree.
3721  * This does all the path init required, making room in the tree if needed.
3722  */
3723 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3724                       *root, struct btrfs_key *cpu_key, void *data, u32
3725                       data_size)
3726 {
3727         int ret = 0;
3728         struct btrfs_path *path;
3729         struct extent_buffer *leaf;
3730         unsigned long ptr;
3731
3732         path = btrfs_alloc_path();
3733         if (!path)
3734                 return -ENOMEM;
3735         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3736         if (!ret) {
3737                 leaf = path->nodes[0];
3738                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3739                 write_extent_buffer(leaf, data, ptr, data_size);
3740                 btrfs_mark_buffer_dirty(leaf);
3741         }
3742         btrfs_free_path(path);
3743         return ret;
3744 }
3745
3746 /*
3747  * delete the pointer from a given node.
3748  *
3749  * the tree should have been previously balanced so the deletion does not
3750  * empty a node.
3751  */
3752 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3753                    struct btrfs_path *path, int level, int slot)
3754 {
3755         struct extent_buffer *parent = path->nodes[level];
3756         u32 nritems;
3757         int ret = 0;
3758         int wret;
3759
3760         nritems = btrfs_header_nritems(parent);
3761         if (slot != nritems - 1) {
3762                 memmove_extent_buffer(parent,
3763                               btrfs_node_key_ptr_offset(slot),
3764                               btrfs_node_key_ptr_offset(slot + 1),
3765                               sizeof(struct btrfs_key_ptr) *
3766                               (nritems - slot - 1));
3767         }
3768         nritems--;
3769         btrfs_set_header_nritems(parent, nritems);
3770         if (nritems == 0 && parent == root->node) {
3771                 BUG_ON(btrfs_header_level(root->node) != 1);
3772                 /* just turn the root into a leaf and break */
3773                 btrfs_set_header_level(root->node, 0);
3774         } else if (slot == 0) {
3775                 struct btrfs_disk_key disk_key;
3776
3777                 btrfs_node_key(parent, &disk_key, 0);
3778                 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3779                 if (wret)
3780                         ret = wret;
3781         }
3782         btrfs_mark_buffer_dirty(parent);
3783         return ret;
3784 }
3785
3786 /*
3787  * a helper function to delete the leaf pointed to by path->slots[1] and
3788  * path->nodes[1].
3789  *
3790  * This deletes the pointer in path->nodes[1] and frees the leaf
3791  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3792  *
3793  * The path must have already been setup for deleting the leaf, including
3794  * all the proper balancing.  path->nodes[1] must be locked.
3795  */
3796 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3797                                    struct btrfs_root *root,
3798                                    struct btrfs_path *path,
3799                                    struct extent_buffer *leaf)
3800 {
3801         int ret;
3802
3803         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3804         ret = del_ptr(trans, root, path, 1, path->slots[1]);
3805         if (ret)
3806                 return ret;
3807
3808         /*
3809          * btrfs_free_extent is expensive, we want to make sure we
3810          * aren't holding any locks when we call it
3811          */
3812         btrfs_unlock_up_safe(path, 0);
3813
3814         root_sub_used(root, leaf->len);
3815
3816         btrfs_free_tree_block(trans, root, leaf, 0, 1);
3817         return 0;
3818 }
3819 /*
3820  * delete the item at the leaf level in path.  If that empties
3821  * the leaf, remove it from the tree
3822  */
3823 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3824                     struct btrfs_path *path, int slot, int nr)
3825 {
3826         struct extent_buffer *leaf;
3827         struct btrfs_item *item;
3828         int last_off;
3829         int dsize = 0;
3830         int ret = 0;
3831         int wret;
3832         int i;
3833         u32 nritems;
3834
3835         leaf = path->nodes[0];
3836         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3837
3838         for (i = 0; i < nr; i++)
3839                 dsize += btrfs_item_size_nr(leaf, slot + i);
3840
3841         nritems = btrfs_header_nritems(leaf);
3842
3843         if (slot + nr != nritems) {
3844                 int data_end = leaf_data_end(root, leaf);
3845
3846                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3847                               data_end + dsize,
3848                               btrfs_leaf_data(leaf) + data_end,
3849                               last_off - data_end);
3850
3851                 for (i = slot + nr; i < nritems; i++) {
3852                         u32 ioff;
3853
3854                         item = btrfs_item_nr(leaf, i);
3855                         if (!leaf->map_token) {
3856                                 map_extent_buffer(leaf, (unsigned long)item,
3857                                         sizeof(struct btrfs_item),
3858                                         &leaf->map_token, &leaf->kaddr,
3859                                         &leaf->map_start, &leaf->map_len,
3860                                         KM_USER1);
3861                         }
3862                         ioff = btrfs_item_offset(leaf, item);
3863                         btrfs_set_item_offset(leaf, item, ioff + dsize);
3864                 }
3865
3866                 if (leaf->map_token) {
3867                         unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3868                         leaf->map_token = NULL;
3869                 }
3870
3871                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3872                               btrfs_item_nr_offset(slot + nr),
3873                               sizeof(struct btrfs_item) *
3874                               (nritems - slot - nr));
3875         }
3876         btrfs_set_header_nritems(leaf, nritems - nr);
3877         nritems -= nr;
3878
3879         /* delete the leaf if we've emptied it */
3880         if (nritems == 0) {
3881                 if (leaf == root->node) {
3882                         btrfs_set_header_level(leaf, 0);
3883                 } else {
3884                         btrfs_set_path_blocking(path);
3885                         clean_tree_block(trans, root, leaf);
3886                         ret = btrfs_del_leaf(trans, root, path, leaf);
3887                         BUG_ON(ret);
3888                 }
3889         } else {
3890                 int used = leaf_space_used(leaf, 0, nritems);
3891                 if (slot == 0) {
3892                         struct btrfs_disk_key disk_key;
3893
3894                         btrfs_item_key(leaf, &disk_key, 0);
3895                         wret = fixup_low_keys(trans, root, path,
3896                                               &disk_key, 1);
3897                         if (wret)
3898                                 ret = wret;
3899                 }
3900
3901                 /* delete the leaf if it is mostly empty */
3902                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
3903                         /* push_leaf_left fixes the path.
3904                          * make sure the path still points to our leaf
3905                          * for possible call to del_ptr below
3906                          */
3907                         slot = path->slots[1];
3908                         extent_buffer_get(leaf);
3909
3910                         btrfs_set_path_blocking(path);
3911                         wret = push_leaf_left(trans, root, path, 1, 1,
3912                                               1, (u32)-1);
3913                         if (wret < 0 && wret != -ENOSPC)
3914                                 ret = wret;
3915
3916                         if (path->nodes[0] == leaf &&
3917                             btrfs_header_nritems(leaf)) {
3918                                 wret = push_leaf_right(trans, root, path, 1,
3919                                                        1, 1, 0);
3920                                 if (wret < 0 && wret != -ENOSPC)
3921                                         ret = wret;
3922                         }
3923
3924                         if (btrfs_header_nritems(leaf) == 0) {
3925                                 path->slots[1] = slot;
3926                                 ret = btrfs_del_leaf(trans, root, path, leaf);
3927                                 BUG_ON(ret);
3928                                 free_extent_buffer(leaf);
3929                         } else {
3930                                 /* if we're still in the path, make sure
3931                                  * we're dirty.  Otherwise, one of the
3932                                  * push_leaf functions must have already
3933                                  * dirtied this buffer
3934                                  */
3935                                 if (path->nodes[0] == leaf)
3936                                         btrfs_mark_buffer_dirty(leaf);
3937                                 free_extent_buffer(leaf);
3938                         }
3939                 } else {
3940                         btrfs_mark_buffer_dirty(leaf);
3941                 }
3942         }
3943         return ret;
3944 }
3945
3946 /*
3947  * search the tree again to find a leaf with lesser keys
3948  * returns 0 if it found something or 1 if there are no lesser leaves.
3949  * returns < 0 on io errors.
3950  *
3951  * This may release the path, and so you may lose any locks held at the
3952  * time you call it.
3953  */
3954 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3955 {
3956         struct btrfs_key key;
3957         struct btrfs_disk_key found_key;
3958         int ret;
3959
3960         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3961
3962         if (key.offset > 0)
3963                 key.offset--;
3964         else if (key.type > 0)
3965                 key.type--;
3966         else if (key.objectid > 0)
3967                 key.objectid--;
3968         else
3969                 return 1;
3970
3971         btrfs_release_path(root, path);
3972         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3973         if (ret < 0)
3974                 return ret;
3975         btrfs_item_key(path->nodes[0], &found_key, 0);
3976         ret = comp_keys(&found_key, &key);
3977         if (ret < 0)
3978                 return 0;
3979         return 1;
3980 }
3981
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
 * transaction id.  This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_key *max_key,
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;

	WARN_ON(!path->keep_locks);
again:
	/* start (or restart) the descent from a freshly locked root node */
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = 1;

	/* the whole tree predates min_trans, so there is nothing to find */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* inexact match: back up to the slot that covers min_key */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
			struct btrfs_disk_key disk_key;

			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			/* child is too old, skip it without reading it */
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

			/* stop once the walk goes past the caller's max_key */
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

			/*
			 * cache_only mode: accept this child only if it is
			 * already in memory and uptodate for this generation
			 */
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  cache_only, min_trans);
			if (sret == 0) {
				/* found a new min_key, restart from the top */
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
		/* descend one level; reading may sleep, so go blocking first */
		btrfs_set_path_blocking(path);
		cur = read_node_slot(root, cur, slot);
		BUG_ON(!cur);

		btrfs_tree_lock(cur);

		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		/* release locks no longer needed above the new level */
		unlock_up(path, level, 1);
		btrfs_clear_path_blocking(path, NULL);
	}
out:
	/* hand the key we stopped at back to the caller via min_key */
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	btrfs_set_path_blocking(path);
	return ret;
}
4126
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the cache_only and min_trans
 * parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level,
			int cache_only, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* look at the slot just past the current position */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			/* ran off the end with nothing above us: no next key */
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/* the parent is still locked, safe to just move up */
			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			/*
			 * the parent is unlocked, so the cached path may be
			 * stale.  Re-search from the last key in this node
			 * to rebuild a valid path down to this level.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(root, path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* exact match: step past the key we searched for */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/*
			 * cache_only mode: skip children that aren't already
			 * in memory and uptodate for their generation
			 */
			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			/* child predates min_trans, keep scanning forward */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
4217
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int force_blocking = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/* an empty leaf has no successor */
	if (nritems == 0)
		return 1;

	/*
	 * we take the blocks in an order that upsets lockdep.  Using
	 * blocking mode is the only way around it.
	 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	force_blocking = 1;
#endif

	/* remember the last key of the current leaf so we can re-search */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(root, path);

	path->keep_locks = 1;

	if (!force_blocking)
		path->leave_spinning = 1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}

	/* walk up until we find a node with a slot to the right of us */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				/* no right neighbor anywhere in the tree */
				ret = 1;
				goto done;
			}
			continue;
		}

		if (next) {
			btrfs_tree_unlock(next);
			free_extent_buffer(next);
		}

		next = c;
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key);
		/* the block changed underneath us, redo the whole search */
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(root, path);
			goto done;
		}

		if (!path->skip_locking) {
			/* try the cheap spinning lock before going blocking */
			ret = btrfs_try_spin_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_lock(next);
				if (!force_blocking)
					btrfs_clear_path_blocking(path, next);
			}
			if (force_blocking)
				btrfs_set_lock_blocking(next);
		}
		break;
	}
	path->slots[level] = slot;
	/* now walk down the left-most edge of the subtree we found */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock(c);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = 1;

		/* reached the leaf level, the path now points at it */
		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(root, path);
			goto done;
		}

		if (!path->skip_locking) {
			btrfs_assert_tree_locked(path->nodes[level]);
			ret = btrfs_try_spin_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_lock(next);
				if (!force_blocking)
					btrfs_clear_path_blocking(path, next);
			}
			if (force_blocking)
				btrfs_set_lock_blocking(next);
		}
	}
	ret = 0;
done:
	/* drop every lock except the leaf and restore the spinning mode */
	unlock_up(path, 0, 1);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
4372
4373 /*
4374  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4375  * searching until it gets past min_objectid or finds an item of 'type'
4376  *
4377  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4378  */
4379 int btrfs_previous_item(struct btrfs_root *root,
4380                         struct btrfs_path *path, u64 min_objectid,
4381                         int type)
4382 {
4383         struct btrfs_key found_key;
4384         struct extent_buffer *leaf;
4385         u32 nritems;
4386         int ret;
4387
4388         while (1) {
4389                 if (path->slots[0] == 0) {
4390                         btrfs_set_path_blocking(path);
4391                         ret = btrfs_prev_leaf(root, path);
4392                         if (ret != 0)
4393                                 return ret;
4394                 } else {
4395                         path->slots[0]--;
4396                 }
4397                 leaf = path->nodes[0];
4398                 nritems = btrfs_header_nritems(leaf);
4399                 if (nritems == 0)
4400                         return 1;
4401                 if (path->slots[0] == nritems)
4402                         path->slots[0]--;
4403
4404                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4405                 if (found_key.objectid < min_objectid)
4406                         break;
4407                 if (found_key.type == type)
4408                         return 0;
4409                 if (found_key.objectid == min_objectid &&
4410                     found_key.type < type)
4411                         break;
4412         }
4413         return 1;
4414 }