Btrfs: merge on the way down during deletes
[firefly-linux-kernel-4.4.55.git] / fs / btrfs / ctree.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include "kerncompat.h"
4 #include "radix-tree.h"
5 #include "ctree.h"
6 #include "disk-io.h"
7 #include "print-tree.h"
8
9 static int split_node(struct ctree_root *root, struct ctree_path *path,
10                       int level);
11 static int split_leaf(struct ctree_root *root, struct ctree_path *path,
12                       int data_size);
13 static int push_node_left(struct ctree_root *root, struct tree_buffer *dst,
14                           struct tree_buffer *src);
15 static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
16                    int slot);
17
18 inline void init_path(struct ctree_path *p)
19 {
20         memset(p, 0, sizeof(*p));
21 }
22
23 void release_path(struct ctree_root *root, struct ctree_path *p)
24 {
25         int i;
26         for (i = 0; i < MAX_LEVEL; i++) {
27                 if (!p->nodes[i])
28                         break;
29                 tree_block_release(root, p->nodes[i]);
30         }
31         memset(p, 0, sizeof(*p));
32 }
33
34 /*
35  * The leaf data grows from end-to-front in the node.
36  * this returns the address of the start of the last item,
37  * which is the stop of the leaf data stack
38  */
39 static inline unsigned int leaf_data_end(struct leaf *leaf)
40 {
41         unsigned int nr = leaf->header.nritems;
42         if (nr == 0)
43                 return sizeof(leaf->data);
44         return leaf->items[nr-1].offset;
45 }
46
47 /*
48  * The space between the end of the leaf items and
49  * the start of the leaf data.  IOW, how much room
50  * the leaf has left for both items and data
51  */
52 int leaf_free_space(struct leaf *leaf)
53 {
54         int data_end = leaf_data_end(leaf);
55         int nritems = leaf->header.nritems;
56         char *items_end = (char *)(leaf->items + nritems + 1);
57         return (char *)(leaf->data + data_end) - (char *)items_end;
58 }
59
60 /*
61  * compare two keys in a memcmp fashion
62  */
63 int comp_keys(struct key *k1, struct key *k2)
64 {
65         if (k1->objectid > k2->objectid)
66                 return 1;
67         if (k1->objectid < k2->objectid)
68                 return -1;
69         if (k1->flags > k2->flags)
70                 return 1;
71         if (k1->flags < k2->flags)
72                 return -1;
73         if (k1->offset > k2->offset)
74                 return 1;
75         if (k1->offset < k2->offset)
76                 return -1;
77         return 0;
78 }
79
80 int check_node(struct ctree_path *path, int level)
81 {
82         int i;
83         struct node *parent = NULL;
84         struct node *node = &path->nodes[level]->node;
85         int parent_slot;
86
87         if (path->nodes[level + 1])
88                 parent = &path->nodes[level + 1]->node;
89         parent_slot = path->slots[level + 1];
90         if (parent && node->header.nritems > 0) {
91                 struct key *parent_key;
92                 parent_key = &parent->keys[parent_slot];
93                 BUG_ON(memcmp(parent_key, node->keys, sizeof(struct key)));
94                 BUG_ON(parent->blockptrs[parent_slot] != node->header.blocknr);
95         }
96         BUG_ON(node->header.nritems > NODEPTRS_PER_BLOCK);
97         for (i = 0; i < node->header.nritems - 2; i++) {
98                 BUG_ON(comp_keys(&node->keys[i], &node->keys[i+1]) >= 0);
99         }
100         return 0;
101 }
102
103 int check_leaf(struct ctree_path *path, int level)
104 {
105         int i;
106         struct leaf *leaf = &path->nodes[level]->leaf;
107         struct node *parent = NULL;
108         int parent_slot;
109
110         if (path->nodes[level + 1])
111                 parent = &path->nodes[level + 1]->node;
112         parent_slot = path->slots[level + 1];
113         if (parent && leaf->header.nritems > 0) {
114                 struct key *parent_key;
115                 parent_key = &parent->keys[parent_slot];
116                 BUG_ON(memcmp(parent_key, &leaf->items[0].key,
117                        sizeof(struct key)));
118                 BUG_ON(parent->blockptrs[parent_slot] != leaf->header.blocknr);
119         }
120         for (i = 0; i < leaf->header.nritems - 2; i++) {
121                 BUG_ON(comp_keys(&leaf->items[i].key,
122                                  &leaf->items[i+1].key) >= 0);
123                 BUG_ON(leaf->items[i].offset != leaf->items[i + 1].offset +
124                     leaf->items[i + 1].size);
125                 if (i == 0) {
126                         BUG_ON(leaf->items[i].offset + leaf->items[i].size !=
127                                 LEAF_DATA_SIZE);
128                 }
129         }
130         BUG_ON(leaf_free_space(leaf) < 0);
131         return 0;
132 }
133
134 int check_block(struct ctree_path *path, int level)
135 {
136         if (level == 0)
137                 return check_leaf(path, level);
138         return check_node(path, level);
139 }
140
141 /*
142  * search for key in the array p.  items p are item_size apart
143  * and there are 'max' items in p
144  * the slot in the array is returned via slot, and it points to
145  * the place where you would insert key if it is not found in
146  * the array.
147  *
148  * slot may point to max if the key is bigger than all of the keys
149  */
150 int generic_bin_search(char *p, int item_size, struct key *key,
151                        int max, int *slot)
152 {
153         int low = 0;
154         int high = max;
155         int mid;
156         int ret;
157         struct key *tmp;
158
159         while(low < high) {
160                 mid = (low + high) / 2;
161                 tmp = (struct key *)(p + mid * item_size);
162                 ret = comp_keys(tmp, key);
163
164                 if (ret < 0)
165                         low = mid + 1;
166                 else if (ret > 0)
167                         high = mid;
168                 else {
169                         *slot = mid;
170                         return 0;
171                 }
172         }
173         *slot = low;
174         return 1;
175 }
176
177 /*
178  * simple bin_search frontend that does the right thing for
179  * leaves vs nodes
180  */
181 int bin_search(struct node *c, struct key *key, int *slot)
182 {
183         if (is_leaf(c->header.flags)) {
184                 struct leaf *l = (struct leaf *)c;
185                 return generic_bin_search((void *)l->items, sizeof(struct item),
186                                           key, c->header.nritems, slot);
187         } else {
188                 return generic_bin_search((void *)c->keys, sizeof(struct key),
189                                           key, c->header.nritems, slot);
190         }
191         return -1;
192 }
193
194 struct tree_buffer *read_node_slot(struct ctree_root *root,
195                                    struct tree_buffer *parent_buf,
196                                    int slot)
197 {
198         struct node *node = &parent_buf->node;
199         if (slot < 0)
200                 return NULL;
201         if (slot >= node->header.nritems)
202                 return NULL;
203         return read_tree_block(root, node->blockptrs[slot]);
204 }
205
206 static int balance_level(struct ctree_root *root, struct ctree_path *path,
207                         int level)
208 {
209         struct tree_buffer *right_buf;
210         struct tree_buffer *mid_buf;
211         struct tree_buffer *left_buf;
212         struct tree_buffer *parent_buf = NULL;
213         struct node *right = NULL;
214         struct node *mid;
215         struct node *left = NULL;
216         struct node *parent = NULL;
217         int ret = 0;
218         int wret;
219         int pslot;
220         int used = 0;
221         int count;
222         int orig_slot = path->slots[level];
223
224         if (level == 0)
225                 return 0;
226
227         mid_buf = path->nodes[level];
228         mid = &mid_buf->node;
229         if (level < MAX_LEVEL - 1)
230                 parent_buf = path->nodes[level + 1];
231         pslot = path->slots[level + 1];
232
233         if (!parent_buf) {
234                 struct tree_buffer *child;
235                 u64 blocknr = mid_buf->blocknr;
236
237                 if (mid->header.nritems != 1)
238                         return 0;
239
240                 /* promote the child to a root */
241                 child = read_node_slot(root, mid_buf, 0);
242                 BUG_ON(!child);
243                 root->node = child;
244                 path->nodes[level] = NULL;
245                 /* once for the path */
246                 tree_block_release(root, mid_buf);
247                 /* once for the root ptr */
248                 tree_block_release(root, mid_buf);
249                 return free_extent(root, blocknr, 1);
250         }
251         parent = &parent_buf->node;
252
253         if (mid->header.nritems > NODEPTRS_PER_BLOCK / 4)
254                 return 0;
255
256         // print_tree(root, root->node);
257         left_buf = read_node_slot(root, parent_buf, pslot - 1);
258         right_buf = read_node_slot(root, parent_buf, pslot + 1);
259         if (right_buf) {
260                 right = &right_buf->node;
261                 used = right->header.nritems;
262                 count = 1;
263         }
264         if (left_buf) {
265                 left = &left_buf->node;
266                 used += left->header.nritems;
267                 orig_slot += left->header.nritems;
268                 count++;
269         }
270         if (left_buf)
271                 push_node_left(root, left_buf, mid_buf);
272         if (right_buf) {
273                 push_node_left(root, mid_buf, right_buf);
274                 if (right->header.nritems == 0) {
275                         u64 blocknr = right_buf->blocknr;
276                         tree_block_release(root, right_buf);
277                         right_buf = NULL;
278                         right = NULL;
279                         wret = del_ptr(root, path, level + 1, pslot + 1);
280                         if (wret)
281                                 ret = wret;
282                         wret = free_extent(root, blocknr, 1);
283                         if (wret)
284                                 ret = wret;
285                 } else {
286                         memcpy(parent->keys + pslot + 1, right->keys,
287                                 sizeof(struct key));
288                 }
289         }
290         if (mid->header.nritems == 0) {
291                 u64 blocknr = mid_buf->blocknr;
292                 tree_block_release(root, mid_buf);
293                 mid_buf = NULL;
294                 mid = NULL;
295                 wret = del_ptr(root, path, level + 1, pslot);
296                 if (wret)
297                         ret = wret;
298                 wret = free_extent(root, blocknr, 1);
299                 if (wret)
300                         ret = wret;
301         } else
302                 memcpy(parent->keys + pslot, mid->keys, sizeof(struct key));
303
304         if (left_buf) {
305                 if (left->header.nritems >= orig_slot) {
306                         left_buf->count++; // released below
307                         path->nodes[level] = left_buf;
308                         path->slots[level + 1] -= 1;
309                         path->slots[level] = orig_slot;
310                         if (mid_buf)
311                                 tree_block_release(root, mid_buf);
312                 } else {
313                         orig_slot -= left->header.nritems;
314                         path->slots[level] = orig_slot;
315                 }
316         }
317
318         if (right_buf)
319                 tree_block_release(root, right_buf);
320         if (left_buf)
321                 tree_block_release(root, left_buf);
322
323         return ret;
324 }
325
326 /*
327  * look for key in the tree.  path is filled in with nodes along the way
328  * if key is found, we return zero and you can find the item in the leaf
329  * level of the path (level 0)
330  *
331  * If the key isn't found, the path points to the slot where it should
332  * be inserted, and 1 is returned.  If there are other errors during the
333  * search a negative error number is returned.
334  *
335  * if ins_len > 0, nodes and leaves will be split as we walk down the
336  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
337  * possible)
338  */
339 int search_slot(struct ctree_root *root, struct key *key,
340                 struct ctree_path *p, int ins_len)
341 {
342         struct tree_buffer *b;
343         struct node *c;
344         int slot;
345         int ret;
346         int level;
347
348 again:
349         b = root->node;
350         b->count++;
351         while (b) {
352                 c = &b->node;
353                 level = node_level(c->header.flags);
354                 p->nodes[level] = b;
355                 ret = check_block(p, level);
356                 if (ret)
357                         return -1;
358                 ret = bin_search(c, key, &slot);
359                 if (!is_leaf(c->header.flags)) {
360                         if (ret && slot > 0)
361                                 slot -= 1;
362                         p->slots[level] = slot;
363                         if (ins_len > 0 &&
364                             c->header.nritems == NODEPTRS_PER_BLOCK) {
365                                 int sret = split_node(root, p, level);
366                                 BUG_ON(sret > 0);
367                                 if (sret)
368                                         return sret;
369                                 b = p->nodes[level];
370                                 c = &b->node;
371                                 slot = p->slots[level];
372                         } else if (ins_len < 0) {
373                                 int sret = balance_level(root, p, level);
374                                 if (sret)
375                                         return sret;
376                                 b = p->nodes[level];
377                                 if (!b)
378                                         goto again;
379                                 c = &b->node;
380                                 slot = p->slots[level];
381                         }
382                         b = read_tree_block(root, c->blockptrs[slot]);
383                 } else {
384                         struct leaf *l = (struct leaf *)c;
385                         p->slots[level] = slot;
386                         if (ins_len > 0 && leaf_free_space(l) <
387                             sizeof(struct item) + ins_len) {
388                                 int sret = split_leaf(root, p, ins_len);
389                                 BUG_ON(sret > 0);
390                                 if (sret)
391                                         return sret;
392                         }
393                         BUG_ON(root->node->count == 1);
394                         return ret;
395                 }
396         }
397         BUG_ON(root->node->count == 1);
398         return 1;
399 }
400
401 /*
402  * adjust the pointers going up the tree, starting at level
403  * making sure the right key of each node is points to 'key'.
404  * This is used after shifting pointers to the left, so it stops
405  * fixing up pointers when a given leaf/node is not in slot 0 of the
406  * higher levels
407  *
408  * If this fails to write a tree block, it returns -1, but continues
409  * fixing up the blocks in ram so the tree is consistent.
410  */
411 static int fixup_low_keys(struct ctree_root *root,
412                            struct ctree_path *path, struct key *key,
413                            int level)
414 {
415         int i;
416         int ret = 0;
417         int wret;
418         for (i = level; i < MAX_LEVEL; i++) {
419                 struct node *t;
420                 int tslot = path->slots[i];
421                 if (!path->nodes[i])
422                         break;
423                 t = &path->nodes[i]->node;
424                 memcpy(t->keys + tslot, key, sizeof(*key));
425                 wret = write_tree_block(root, path->nodes[i]);
426                 if (wret)
427                         ret = wret;
428                 if (tslot != 0)
429                         break;
430         }
431         return ret;
432 }
433
434 /*
435  * try to push data from one node into the next node left in the
436  * tree.  The src node is found at specified level in the path.
437  * If some bytes were pushed, return 0, otherwise return 1.
438  *
439  * Lower nodes/leaves in the path are not touched, higher nodes may
440  * be modified to reflect the push.
441  *
442  * The path is altered to reflect the push.
443  *
444  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
445  * error, and > 0 if there was no room in the left hand block.
446  */
447 static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
448                           struct tree_buffer *src_buf)
449 {
450         struct node *src = &src_buf->node;
451         struct node *dst = &dst_buf->node;
452         int push_items = 0;
453         int src_nritems;
454         int dst_nritems;
455         int ret = 0;
456         int wret;
457
458         src_nritems = src->header.nritems;
459         dst_nritems = dst->header.nritems;
460         push_items = NODEPTRS_PER_BLOCK - dst_nritems;
461         if (push_items <= 0) {
462                 return 1;
463         }
464
465         if (src_nritems < push_items)
466                 push_items =src_nritems;
467         memcpy(dst->keys + dst_nritems, src->keys,
468                 push_items * sizeof(struct key));
469         memcpy(dst->blockptrs + dst_nritems, src->blockptrs,
470                 push_items * sizeof(u64));
471         if (push_items < src_nritems) {
472                 memmove(src->keys, src->keys + push_items,
473                         (src_nritems - push_items) * sizeof(struct key));
474                 memmove(src->blockptrs, src->blockptrs + push_items,
475                         (src_nritems - push_items) * sizeof(u64));
476         }
477         src->header.nritems -= push_items;
478         dst->header.nritems += push_items;
479
480         wret = write_tree_block(root, src_buf);
481         if (wret < 0)
482                 ret = wret;
483
484         wret = write_tree_block(root, dst_buf);
485         if (wret < 0)
486                 ret = wret;
487         return ret;
488 }
489
490 /*
491  * helper function to insert a new root level in the tree.
492  * A new node is allocated, and a single item is inserted to
493  * point to the existing root
494  *
495  * returns zero on success or < 0 on failure.
496  */
497 static int insert_new_root(struct ctree_root *root,
498                            struct ctree_path *path, int level)
499 {
500         struct tree_buffer *t;
501         struct node *lower;
502         struct node *c;
503         struct key *lower_key;
504
505         BUG_ON(path->nodes[level]);
506         BUG_ON(path->nodes[level-1] != root->node);
507
508         t = alloc_free_block(root);
509         c = &t->node;
510         memset(c, 0, sizeof(c));
511         c->header.nritems = 1;
512         c->header.flags = node_level(level);
513         c->header.blocknr = t->blocknr;
514         c->header.parentid = root->node->node.header.parentid;
515         lower = &path->nodes[level-1]->node;
516         if (is_leaf(lower->header.flags))
517                 lower_key = &((struct leaf *)lower)->items[0].key;
518         else
519                 lower_key = lower->keys;
520         memcpy(c->keys, lower_key, sizeof(struct key));
521         c->blockptrs[0] = path->nodes[level-1]->blocknr;
522         /* the super has an extra ref to root->node */
523         tree_block_release(root, root->node);
524         root->node = t;
525         t->count++;
526         write_tree_block(root, t);
527         path->nodes[level] = t;
528         path->slots[level] = 0;
529         return 0;
530 }
531
532 /*
533  * worker function to insert a single pointer in a node.
534  * the node should have enough room for the pointer already
535  *
536  * slot and level indicate where you want the key to go, and
537  * blocknr is the block the key points to.
538  *
539  * returns zero on success and < 0 on any error
540  */
541 static int insert_ptr(struct ctree_root *root,
542                 struct ctree_path *path, struct key *key,
543                 u64 blocknr, int slot, int level)
544 {
545         struct node *lower;
546         int nritems;
547
548         BUG_ON(!path->nodes[level]);
549         lower = &path->nodes[level]->node;
550         nritems = lower->header.nritems;
551         if (slot > nritems)
552                 BUG();
553         if (nritems == NODEPTRS_PER_BLOCK)
554                 BUG();
555         if (slot != nritems) {
556                 memmove(lower->keys + slot + 1, lower->keys + slot,
557                         (nritems - slot) * sizeof(struct key));
558                 memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
559                         (nritems - slot) * sizeof(u64));
560         }
561         memcpy(lower->keys + slot, key, sizeof(struct key));
562         lower->blockptrs[slot] = blocknr;
563         lower->header.nritems++;
564         if (lower->keys[1].objectid == 0)
565                         BUG();
566         write_tree_block(root, path->nodes[level]);
567         return 0;
568 }
569
570 /*
571  * split the node at the specified level in path in two.
572  * The path is corrected to point to the appropriate node after the split
573  *
574  * Before splitting this tries to make some room in the node by pushing
575  * left and right, if either one works, it returns right away.
576  *
577  * returns 0 on success and < 0 on failure
578  */
579 static int split_node(struct ctree_root *root, struct ctree_path *path,
580                       int level)
581 {
582         struct tree_buffer *t;
583         struct node *c;
584         struct tree_buffer *split_buffer;
585         struct node *split;
586         int mid;
587         int ret;
588         int wret;
589
590         t = path->nodes[level];
591         c = &t->node;
592         if (t == root->node) {
593                 /* trying to split the root, lets make a new one */
594                 ret = insert_new_root(root, path, level + 1);
595                 if (ret)
596                         return ret;
597         }
598         split_buffer = alloc_free_block(root);
599         split = &split_buffer->node;
600         split->header.flags = c->header.flags;
601         split->header.blocknr = split_buffer->blocknr;
602         split->header.parentid = root->node->node.header.parentid;
603         mid = (c->header.nritems + 1) / 2;
604         memcpy(split->keys, c->keys + mid,
605                 (c->header.nritems - mid) * sizeof(struct key));
606         memcpy(split->blockptrs, c->blockptrs + mid,
607                 (c->header.nritems - mid) * sizeof(u64));
608         split->header.nritems = c->header.nritems - mid;
609         c->header.nritems = mid;
610         ret = 0;
611
612         wret = write_tree_block(root, t);
613         if (wret)
614                 ret = wret;
615         wret = write_tree_block(root, split_buffer);
616         if (wret)
617                 ret = wret;
618         wret = insert_ptr(root, path, split->keys, split_buffer->blocknr,
619                           path->slots[level + 1] + 1, level + 1);
620         if (wret)
621                 ret = wret;
622
623         if (path->slots[level] >= mid) {
624                 path->slots[level] -= mid;
625                 tree_block_release(root, t);
626                 path->nodes[level] = split_buffer;
627                 path->slots[level + 1] += 1;
628         } else {
629                 tree_block_release(root, split_buffer);
630         }
631         return ret;
632 }
633
634 /*
635  * how many bytes are required to store the items in a leaf.  start
636  * and nr indicate which items in the leaf to check.  This totals up the
637  * space used both by the item structs and the item data
638  */
639 static int leaf_space_used(struct leaf *l, int start, int nr)
640 {
641         int data_len;
642         int end = start + nr - 1;
643
644         if (!nr)
645                 return 0;
646         data_len = l->items[start].offset + l->items[start].size;
647         data_len = data_len - l->items[end].offset;
648         data_len += sizeof(struct item) * nr;
649         return data_len;
650 }
651
652 /*
653  * push some data in the path leaf to the right, trying to free up at
654  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
655  *
656  * returns 1 if the push failed because the other node didn't have enough
657  * room, 0 if everything worked out and < 0 if there were major errors.
658  */
659 static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
660                            int data_size)
661 {
662         struct tree_buffer *left_buf = path->nodes[0];
663         struct leaf *left = &left_buf->leaf;
664         struct leaf *right;
665         struct tree_buffer *right_buf;
666         struct tree_buffer *upper;
667         int slot;
668         int i;
669         int free_space;
670         int push_space = 0;
671         int push_items = 0;
672         struct item *item;
673
674         slot = path->slots[1];
675         if (!path->nodes[1]) {
676                 return 1;
677         }
678         upper = path->nodes[1];
679         if (slot >= upper->node.header.nritems - 1) {
680                 return 1;
681         }
682         right_buf = read_tree_block(root, upper->node.blockptrs[slot + 1]);
683         right = &right_buf->leaf;
684         free_space = leaf_free_space(right);
685         if (free_space < data_size + sizeof(struct item)) {
686                 tree_block_release(root, right_buf);
687                 return 1;
688         }
689         for (i = left->header.nritems - 1; i >= 0; i--) {
690                 item = left->items + i;
691                 if (path->slots[0] == i)
692                         push_space += data_size + sizeof(*item);
693                 if (item->size + sizeof(*item) + push_space > free_space)
694                         break;
695                 push_items++;
696                 push_space += item->size + sizeof(*item);
697         }
698         if (push_items == 0) {
699                 tree_block_release(root, right_buf);
700                 return 1;
701         }
702         /* push left to right */
703         push_space = left->items[left->header.nritems - push_items].offset +
704                      left->items[left->header.nritems - push_items].size;
705         push_space -= leaf_data_end(left);
706         /* make room in the right data area */
707         memmove(right->data + leaf_data_end(right) - push_space,
708                 right->data + leaf_data_end(right),
709                 LEAF_DATA_SIZE - leaf_data_end(right));
710         /* copy from the left data area */
711         memcpy(right->data + LEAF_DATA_SIZE - push_space,
712                 left->data + leaf_data_end(left),
713                 push_space);
714         memmove(right->items + push_items, right->items,
715                 right->header.nritems * sizeof(struct item));
716         /* copy the items from left to right */
717         memcpy(right->items, left->items + left->header.nritems - push_items,
718                 push_items * sizeof(struct item));
719
720         /* update the item pointers */
721         right->header.nritems += push_items;
722         push_space = LEAF_DATA_SIZE;
723         for (i = 0; i < right->header.nritems; i++) {
724                 right->items[i].offset = push_space - right->items[i].size;
725                 push_space = right->items[i].offset;
726         }
727         left->header.nritems -= push_items;
728
729         write_tree_block(root, left_buf);
730         write_tree_block(root, right_buf);
731         memcpy(upper->node.keys + slot + 1,
732                 &right->items[0].key, sizeof(struct key));
733         write_tree_block(root, upper);
734         /* then fixup the leaf pointer in the path */
735         if (path->slots[0] >= left->header.nritems) {
736                 path->slots[0] -= left->header.nritems;
737                 tree_block_release(root, path->nodes[0]);
738                 path->nodes[0] = right_buf;
739                 path->slots[1] += 1;
740         } else {
741                 tree_block_release(root, right_buf);
742         }
743         return 0;
744 }
745 /*
746  * push some data in the path leaf to the left, trying to free up at
747  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
748  */
749 static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
750                           int data_size)
751 {
752         struct tree_buffer *right_buf = path->nodes[0];
753         struct leaf *right = &right_buf->leaf;
754         struct tree_buffer *t;
755         struct leaf *left;
756         int slot;
757         int i;
758         int free_space;
759         int push_space = 0;
760         int push_items = 0;
761         struct item *item;
762         int old_left_nritems;
763         int ret = 0;
764         int wret;
765
766         slot = path->slots[1];
767         if (slot == 0) {
768                 return 1;
769         }
770         if (!path->nodes[1]) {
771                 return 1;
772         }
773         t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
774         left = &t->leaf;
775         free_space = leaf_free_space(left);
776         if (free_space < data_size + sizeof(struct item)) {
777                 tree_block_release(root, t);
778                 return 1;
779         }
780         for (i = 0; i < right->header.nritems; i++) {
781                 item = right->items + i;
782                 if (path->slots[0] == i)
783                         push_space += data_size + sizeof(*item);
784                 if (item->size + sizeof(*item) + push_space > free_space)
785                         break;
786                 push_items++;
787                 push_space += item->size + sizeof(*item);
788         }
789         if (push_items == 0) {
790                 tree_block_release(root, t);
791                 return 1;
792         }
793         /* push data from right to left */
794         memcpy(left->items + left->header.nritems,
795                 right->items, push_items * sizeof(struct item));
796         push_space = LEAF_DATA_SIZE - right->items[push_items -1].offset;
797         memcpy(left->data + leaf_data_end(left) - push_space,
798                 right->data + right->items[push_items - 1].offset,
799                 push_space);
800         old_left_nritems = left->header.nritems;
801         BUG_ON(old_left_nritems < 0);
802
803         for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
804                 left->items[i].offset -= LEAF_DATA_SIZE -
805                         left->items[old_left_nritems -1].offset;
806         }
807         left->header.nritems += push_items;
808
809         /* fixup right node */
810         push_space = right->items[push_items-1].offset - leaf_data_end(right);
811         memmove(right->data + LEAF_DATA_SIZE - push_space, right->data +
812                 leaf_data_end(right), push_space);
813         memmove(right->items, right->items + push_items,
814                 (right->header.nritems - push_items) * sizeof(struct item));
815         right->header.nritems -= push_items;
816         push_space = LEAF_DATA_SIZE;
817
818         for (i = 0; i < right->header.nritems; i++) {
819                 right->items[i].offset = push_space - right->items[i].size;
820                 push_space = right->items[i].offset;
821         }
822
823         wret = write_tree_block(root, t);
824         if (wret)
825                 ret = wret;
826         wret = write_tree_block(root, right_buf);
827         if (wret)
828                 ret = wret;
829
830         wret = fixup_low_keys(root, path, &right->items[0].key, 1);
831         if (wret)
832                 ret = wret;
833
834         /* then fixup the leaf pointer in the path */
835         if (path->slots[0] < push_items) {
836                 path->slots[0] += old_left_nritems;
837                 tree_block_release(root, path->nodes[0]);
838                 path->nodes[0] = t;
839                 path->slots[1] -= 1;
840         } else {
841                 tree_block_release(root, t);
842                 path->slots[0] -= push_items;
843         }
844         BUG_ON(path->slots[0] < 0);
845         return ret;
846 }
847
848 /*
849  * split the path's leaf in two, making sure there is at least data_size
850  * available for the resulting leaf level of the path.
851  *
852  * returns 0 if all went well and < 0 on failure.
853  */
854 static int split_leaf(struct ctree_root *root, struct ctree_path *path,
855                       int data_size)
856 {
857         struct tree_buffer *l_buf;
858         struct leaf *l;
859         int nritems;
860         int mid;
861         int slot;
862         struct leaf *right;
863         struct tree_buffer *right_buffer;
864         int space_needed = data_size + sizeof(struct item);
865         int data_copy_size;
866         int rt_data_off;
867         int i;
868         int ret;
869         int wret;
870
871         wret = push_leaf_left(root, path, data_size);
872         if (wret < 0)
873                 return wret;
874         if (wret) {
875                 wret = push_leaf_right(root, path, data_size);
876                 if (wret < 0)
877                         return wret;
878         }
879         l_buf = path->nodes[0];
880         l = &l_buf->leaf;
881
882         /* did the pushes work? */
883         if (leaf_free_space(l) >= sizeof(struct item) + data_size)
884                 return 0;
885
886         if (!path->nodes[1]) {
887                 ret = insert_new_root(root, path, 1);
888                 if (ret)
889                         return ret;
890         }
891         slot = path->slots[0];
892         nritems = l->header.nritems;
893         mid = (nritems + 1)/ 2;
894
895         right_buffer = alloc_free_block(root);
896         BUG_ON(!right_buffer);
897         BUG_ON(mid == nritems);
898         right = &right_buffer->leaf;
899         memset(right, 0, sizeof(*right));
900         if (mid <= slot) {
901                 /* FIXME, just alloc a new leaf here */
902                 if (leaf_space_used(l, mid, nritems - mid) + space_needed >
903                         LEAF_DATA_SIZE)
904                         BUG();
905         } else {
906                 /* FIXME, just alloc a new leaf here */
907                 if (leaf_space_used(l, 0, mid + 1) + space_needed >
908                         LEAF_DATA_SIZE)
909                         BUG();
910         }
911         right->header.nritems = nritems - mid;
912         right->header.blocknr = right_buffer->blocknr;
913         right->header.flags = node_level(0);
914         right->header.parentid = root->node->node.header.parentid;
915         data_copy_size = l->items[mid].offset + l->items[mid].size -
916                          leaf_data_end(l);
917         memcpy(right->items, l->items + mid,
918                (nritems - mid) * sizeof(struct item));
919         memcpy(right->data + LEAF_DATA_SIZE - data_copy_size,
920                l->data + leaf_data_end(l), data_copy_size);
921         rt_data_off = LEAF_DATA_SIZE -
922                      (l->items[mid].offset + l->items[mid].size);
923
924         for (i = 0; i < right->header.nritems; i++)
925                 right->items[i].offset += rt_data_off;
926
927         l->header.nritems = mid;
928         ret = 0;
929         wret = insert_ptr(root, path, &right->items[0].key,
930                           right_buffer->blocknr, path->slots[1] + 1, 1);
931         if (wret)
932                 ret = wret;
933         wret = write_tree_block(root, right_buffer);
934         if (wret)
935                 ret = wret;
936         wret = write_tree_block(root, l_buf);
937         if (wret)
938                 ret = wret;
939
940         BUG_ON(path->slots[0] != slot);
941         if (mid <= slot) {
942                 tree_block_release(root, path->nodes[0]);
943                 path->nodes[0] = right_buffer;
944                 path->slots[0] -= mid;
945                 path->slots[1] += 1;
946         } else
947                 tree_block_release(root, right_buffer);
948         BUG_ON(path->slots[0] < 0);
949         return ret;
950 }
951
952 /*
953  * Given a key and some data, insert an item into the tree.
954  * This does all the path init required, making room in the tree if needed.
955  */
956 int insert_item(struct ctree_root *root, struct key *key,
957                           void *data, int data_size)
958 {
959         int ret = 0;
960         int wret;
961         int slot;
962         int slot_orig;
963         struct leaf *leaf;
964         struct tree_buffer *leaf_buf;
965         unsigned int nritems;
966         unsigned int data_end;
967         struct ctree_path path;
968
969         /* create a root if there isn't one */
970         if (!root->node)
971                 BUG();
972         init_path(&path);
973         ret = search_slot(root, key, &path, data_size);
974         if (ret == 0) {
975                 release_path(root, &path);
976                 return -EEXIST;
977         }
978         if (ret < 0) {
979                 release_path(root, &path);
980                 return ret;
981         }
982
983         slot_orig = path.slots[0];
984         leaf_buf = path.nodes[0];
985         leaf = &leaf_buf->leaf;
986
987         nritems = leaf->header.nritems;
988         data_end = leaf_data_end(leaf);
989
990         if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
991                 BUG();
992
993         slot = path.slots[0];
994         BUG_ON(slot < 0);
995         if (slot != nritems) {
996                 int i;
997                 unsigned int old_data = leaf->items[slot].offset +
998                                         leaf->items[slot].size;
999
1000                 /*
1001                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
1002                  */
1003                 /* first correct the data pointers */
1004                 for (i = slot; i < nritems; i++)
1005                         leaf->items[i].offset -= data_size;
1006
1007                 /* shift the items */
1008                 memmove(leaf->items + slot + 1, leaf->items + slot,
1009                         (nritems - slot) * sizeof(struct item));
1010
1011                 /* shift the data */
1012                 memmove(leaf->data + data_end - data_size, leaf->data +
1013                         data_end, old_data - data_end);
1014                 data_end = old_data;
1015         }
1016         /* copy the new data in */
1017         memcpy(&leaf->items[slot].key, key, sizeof(struct key));
1018         leaf->items[slot].offset = data_end - data_size;
1019         leaf->items[slot].size = data_size;
1020         memcpy(leaf->data + data_end - data_size, data, data_size);
1021         leaf->header.nritems += 1;
1022
1023         ret = 0;
1024         if (slot == 0)
1025                 ret = fixup_low_keys(root, &path, key, 1);
1026
1027         wret = write_tree_block(root, leaf_buf);
1028         if (wret)
1029                 ret = wret;
1030
1031         if (leaf_free_space(leaf) < 0)
1032                 BUG();
1033         check_leaf(&path, 0);
1034         release_path(root, &path);
1035         return ret;
1036 }
1037
1038 /*
1039  * delete the pointer from a given node.
1040  *
1041  * If the delete empties a node, the node is removed from the tree,
1042  * continuing all the way the root if required.  The root is converted into
1043  * a leaf if all the nodes are emptied.
1044  */
1045 static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
1046                    int slot)
1047 {
1048         struct node *node;
1049         struct tree_buffer *parent = path->nodes[level];
1050         int nritems;
1051         int ret = 0;
1052         int wret;
1053
1054         node = &parent->node;
1055         nritems = node->header.nritems;
1056
1057         if (slot != nritems -1) {
1058                 memmove(node->keys + slot, node->keys + slot + 1,
1059                         sizeof(struct key) * (nritems - slot - 1));
1060                 memmove(node->blockptrs + slot,
1061                         node->blockptrs + slot + 1,
1062                         sizeof(u64) * (nritems - slot - 1));
1063         }
1064         node->header.nritems--;
1065         if (node->header.nritems == 0 && parent == root->node) {
1066                 BUG_ON(node_level(root->node->node.header.flags) != 1);
1067                 /* just turn the root into a leaf and break */
1068                 root->node->node.header.flags = node_level(0);
1069         } else if (slot == 0) {
1070                 wret = fixup_low_keys(root, path, node->keys, level + 1);
1071                 if (wret)
1072                         ret = wret;
1073         }
1074         wret = write_tree_block(root, parent);
1075         if (wret)
1076                 ret = wret;
1077         return ret;
1078 }
1079
1080 /*
1081  * delete the item at the leaf level in path.  If that empties
1082  * the leaf, remove it from the tree
1083  */
1084 int del_item(struct ctree_root *root, struct ctree_path *path)
1085 {
1086         int slot;
1087         struct leaf *leaf;
1088         struct tree_buffer *leaf_buf;
1089         int doff;
1090         int dsize;
1091         int ret = 0;
1092         int wret;
1093
1094         leaf_buf = path->nodes[0];
1095         leaf = &leaf_buf->leaf;
1096         slot = path->slots[0];
1097         doff = leaf->items[slot].offset;
1098         dsize = leaf->items[slot].size;
1099
1100         if (slot != leaf->header.nritems - 1) {
1101                 int i;
1102                 int data_end = leaf_data_end(leaf);
1103                 memmove(leaf->data + data_end + dsize,
1104                         leaf->data + data_end,
1105                         doff - data_end);
1106                 for (i = slot + 1; i < leaf->header.nritems; i++)
1107                         leaf->items[i].offset += dsize;
1108                 memmove(leaf->items + slot, leaf->items + slot + 1,
1109                         sizeof(struct item) *
1110                         (leaf->header.nritems - slot - 1));
1111         }
1112         leaf->header.nritems -= 1;
1113         /* delete the leaf if we've emptied it */
1114         if (leaf->header.nritems == 0) {
1115                 if (leaf_buf == root->node) {
1116                         leaf->header.flags = node_level(0);
1117                         write_tree_block(root, leaf_buf);
1118                 } else {
1119                         wret = del_ptr(root, path, 1, path->slots[1]);
1120                         if (wret)
1121                                 ret = wret;
1122                         wret = free_extent(root, leaf_buf->blocknr, 1);
1123                         if (wret)
1124                                 ret = wret;
1125                 }
1126         } else {
1127                 int used = leaf_space_used(leaf, 0, leaf->header.nritems);
1128                 if (slot == 0) {
1129                         wret = fixup_low_keys(root, path,
1130                                                    &leaf->items[0].key, 1);
1131                         if (wret)
1132                                 ret = wret;
1133                 }
1134                 wret = write_tree_block(root, leaf_buf);
1135                 if (wret)
1136                         ret = wret;
1137
1138                 /* delete the leaf if it is mostly empty */
1139                 if (used < LEAF_DATA_SIZE / 3) {
1140                         /* push_leaf_left fixes the path.
1141                          * make sure the path still points to our leaf
1142                          * for possible call to del_ptr below
1143                          */
1144                         slot = path->slots[1];
1145                         leaf_buf->count++;
1146                         wret = push_leaf_left(root, path, 1);
1147                         if (wret < 0)
1148                                 ret = wret;
1149                         if (leaf->header.nritems) {
1150                                 wret = push_leaf_right(root, path, 1);
1151                                 if (wret < 0)
1152                                         ret = wret;
1153                         }
1154                         if (leaf->header.nritems == 0) {
1155                                 u64 blocknr = leaf_buf->blocknr;
1156                                 wret = del_ptr(root, path, 1, slot);
1157                                 if (wret)
1158                                         ret = wret;
1159                                 tree_block_release(root, leaf_buf);
1160                                 wret = free_extent(root, blocknr, 1);
1161                                 if (wret)
1162                                         ret = wret;
1163                         } else {
1164                                 tree_block_release(root, leaf_buf);
1165                         }
1166                 }
1167         }
1168         return ret;
1169 }
1170
1171 /*
1172  * walk up the tree as far as required to find the next leaf.
1173  * returns 0 if it found something or 1 if there are no greater leaves.
1174  * returns < 0 on io errors.
1175  */
1176 int next_leaf(struct ctree_root *root, struct ctree_path *path)
1177 {
1178         int slot;
1179         int level = 1;
1180         u64 blocknr;
1181         struct tree_buffer *c;
1182         struct tree_buffer *next = NULL;
1183
1184         while(level < MAX_LEVEL) {
1185                 if (!path->nodes[level])
1186                         return 1;
1187                 slot = path->slots[level] + 1;
1188                 c = path->nodes[level];
1189                 if (slot >= c->node.header.nritems) {
1190                         level++;
1191                         continue;
1192                 }
1193                 blocknr = c->node.blockptrs[slot];
1194                 if (next)
1195                         tree_block_release(root, next);
1196                 next = read_tree_block(root, blocknr);
1197                 break;
1198         }
1199         path->slots[level] = slot;
1200         while(1) {
1201                 level--;
1202                 c = path->nodes[level];
1203                 tree_block_release(root, c);
1204                 path->nodes[level] = next;
1205                 path->slots[level] = 0;
1206                 if (!level)
1207                         break;
1208                 next = read_tree_block(root, next->node.blockptrs[0]);
1209         }
1210         return 0;
1211 }
1212