Btrfs: add a io_ctl struct and helpers for dealing with the space cache
fs/btrfs/free-space-cache.c
1 /*
2  * Copyright (C) 2008 Red Hat.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/pagemap.h>
20 #include <linux/sched.h>
21 #include <linux/slab.h>
22 #include <linux/math64.h>
23 #include <linux/ratelimit.h>
24 #include "ctree.h"
25 #include "free-space-cache.h"
26 #include "transaction.h"
27 #include "disk-io.h"
28 #include "extent_io.h"
29 #include "inode-map.h"
30
31 #define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
32 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
33
34 static int link_free_space(struct btrfs_free_space_ctl *ctl,
35                            struct btrfs_free_space *info);
36
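/*
 * The free space header for a block group lives in the tree root, keyed by
 * (BTRFS_FREE_SPACE_OBJECTID, 0, block group start offset).  The lookup
 * helpers below find that item and then iget() the inode it points at.
 */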
37 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
38                                                struct btrfs_path *path,
39                                                u64 offset)
40 {
41         struct btrfs_key key;
42         struct btrfs_key location;
43         struct btrfs_disk_key disk_key;
44         struct btrfs_free_space_header *header;
45         struct extent_buffer *leaf;
46         struct inode *inode = NULL;
47         int ret;
48
49         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
50         key.offset = offset;
51         key.type = 0;
52
53         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
54         if (ret < 0)
55                 return ERR_PTR(ret);
56         if (ret > 0) {
57                 btrfs_release_path(path);
58                 return ERR_PTR(-ENOENT);
59         }
60
61         leaf = path->nodes[0];
62         header = btrfs_item_ptr(leaf, path->slots[0],
63                                 struct btrfs_free_space_header);
64         btrfs_free_space_key(leaf, header, &disk_key);
65         btrfs_disk_key_to_cpu(&location, &disk_key);
66         btrfs_release_path(path);
67
68         inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
69         if (!inode)
70                 return ERR_PTR(-ENOENT);
71         if (IS_ERR(inode))
72                 return inode;
73         if (is_bad_inode(inode)) {
74                 iput(inode);
75                 return ERR_PTR(-ENOENT);
76         }
77
78         inode->i_mapping->flags &= ~__GFP_FS;
79
80         return inode;
81 }
82
83 struct inode *lookup_free_space_inode(struct btrfs_root *root,
84                                       struct btrfs_block_group_cache
85                                       *block_group, struct btrfs_path *path)
86 {
87         struct inode *inode = NULL;
88
89         spin_lock(&block_group->lock);
90         if (block_group->inode)
91                 inode = igrab(block_group->inode);
92         spin_unlock(&block_group->lock);
93         if (inode)
94                 return inode;
95
96         inode = __lookup_free_space_inode(root, path,
97                                           block_group->key.objectid);
98         if (IS_ERR(inode))
99                 return inode;
100
101         spin_lock(&block_group->lock);
102         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) {
103                 printk(KERN_INFO "Old style space inode found, converting.\n");
104                 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM;
105                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
106         }
107
108         if (!block_group->iref) {
109                 block_group->inode = igrab(inode);
110                 block_group->iref = 1;
111         }
112         spin_unlock(&block_group->lock);
113
114         return inode;
115 }
116
117 int __create_free_space_inode(struct btrfs_root *root,
118                               struct btrfs_trans_handle *trans,
119                               struct btrfs_path *path, u64 ino, u64 offset)
120 {
121         struct btrfs_key key;
122         struct btrfs_disk_key disk_key;
123         struct btrfs_free_space_header *header;
124         struct btrfs_inode_item *inode_item;
125         struct extent_buffer *leaf;
126         int ret;
127
128         ret = btrfs_insert_empty_inode(trans, root, path, ino);
129         if (ret)
130                 return ret;
131
132         leaf = path->nodes[0];
133         inode_item = btrfs_item_ptr(leaf, path->slots[0],
134                                     struct btrfs_inode_item);
135         btrfs_item_key(leaf, &disk_key, path->slots[0]);
136         memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
137                              sizeof(*inode_item));
138         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
139         btrfs_set_inode_size(leaf, inode_item, 0);
140         btrfs_set_inode_nbytes(leaf, inode_item, 0);
141         btrfs_set_inode_uid(leaf, inode_item, 0);
142         btrfs_set_inode_gid(leaf, inode_item, 0);
143         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
144         btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
145                               BTRFS_INODE_PREALLOC);
146         btrfs_set_inode_nlink(leaf, inode_item, 1);
147         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
148         btrfs_set_inode_block_group(leaf, inode_item, offset);
149         btrfs_mark_buffer_dirty(leaf);
150         btrfs_release_path(path);
151
152         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
153         key.offset = offset;
154         key.type = 0;
155
156         ret = btrfs_insert_empty_item(trans, root, path, &key,
157                                       sizeof(struct btrfs_free_space_header));
158         if (ret < 0) {
159                 btrfs_release_path(path);
160                 return ret;
161         }
162         leaf = path->nodes[0];
163         header = btrfs_item_ptr(leaf, path->slots[0],
164                                 struct btrfs_free_space_header);
165         memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
166         btrfs_set_free_space_key(leaf, header, &disk_key);
167         btrfs_mark_buffer_dirty(leaf);
168         btrfs_release_path(path);
169
170         return 0;
171 }
172
173 int create_free_space_inode(struct btrfs_root *root,
174                             struct btrfs_trans_handle *trans,
175                             struct btrfs_block_group_cache *block_group,
176                             struct btrfs_path *path)
177 {
178         int ret;
179         u64 ino;
180
181         ret = btrfs_find_free_objectid(root, &ino);
182         if (ret < 0)
183                 return ret;
184
185         return __create_free_space_inode(root, trans, path, ino,
186                                          block_group->key.objectid);
187 }
188
189 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
190                                     struct btrfs_trans_handle *trans,
191                                     struct btrfs_path *path,
192                                     struct inode *inode)
193 {
194         struct btrfs_block_rsv *rsv;
195         loff_t oldsize;
196         int ret = 0;
197
198         rsv = trans->block_rsv;
199         trans->block_rsv = root->orphan_block_rsv;
200         ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0);
201         if (ret)
202                 return ret;
203
204         oldsize = i_size_read(inode);
205         btrfs_i_size_write(inode, 0);
206         truncate_pagecache(inode, oldsize, 0);
207
208         /*
209          * We don't need an orphan item because truncating the free space cache
210          * will never be split across transactions.
211          */
212         ret = btrfs_truncate_inode_items(trans, root, inode,
213                                          0, BTRFS_EXTENT_DATA_KEY);
214
215         trans->block_rsv = rsv;
216         if (ret) {
217                 WARN_ON(1);
218                 return ret;
219         }
220
221         ret = btrfs_update_inode(trans, root, inode);
222         return ret;
223 }
224
225 static int readahead_cache(struct inode *inode)
226 {
227         struct file_ra_state *ra;
228         unsigned long last_index;
229
230         ra = kzalloc(sizeof(*ra), GFP_NOFS);
231         if (!ra)
232                 return -ENOMEM;
233
234         file_ra_state_init(ra, inode->i_mapping);
235         last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
236
237         page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
238
239         kfree(ra);
240
241         return 0;
242 }
243
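/*
 * io_ctl is a small cursor for walking the space cache inode's pages while
 * reading or writing the cache: 'pages' holds every page of the file,
 * 'index' is the next page to map, 'cur'/'orig' track the current position
 * within the page that is presently kmap()'d, and 'size' counts the bytes
 * still usable on that page.
 */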
244 struct io_ctl {
245         void *cur, *orig;
246         struct page *page;
247         struct page **pages;
248         struct btrfs_root *root;
249         unsigned long size;
250         int index;
251         int num_pages;
252 };
253
254 static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
255                        struct btrfs_root *root)
256 {
257         memset(io_ctl, 0, sizeof(struct io_ctl));
258         io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
259                 PAGE_CACHE_SHIFT;
260         io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
261                                 GFP_NOFS);
262         if (!io_ctl->pages)
263                 return -ENOMEM;
264         io_ctl->root = root;
265         return 0;
266 }
267
268 static void io_ctl_free(struct io_ctl *io_ctl)
269 {
270         kfree(io_ctl->pages);
271 }
272
273 static void io_ctl_unmap_page(struct io_ctl *io_ctl)
274 {
275         if (io_ctl->cur) {
276                 kunmap(io_ctl->page);
277                 io_ctl->cur = NULL;
278                 io_ctl->orig = NULL;
279         }
280 }
281
282 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
283 {
284         WARN_ON(io_ctl->cur);
285         BUG_ON(io_ctl->index >= io_ctl->num_pages);
286         io_ctl->page = io_ctl->pages[io_ctl->index++];
287         io_ctl->cur = kmap(io_ctl->page);
288         io_ctl->orig = io_ctl->cur;
289         io_ctl->size = PAGE_CACHE_SIZE;
290         if (clear)
291                 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
292 }
293
294 static void io_ctl_drop_pages(struct io_ctl *io_ctl)
295 {
296         int i;
297
298         io_ctl_unmap_page(io_ctl);
299
300         for (i = 0; i < io_ctl->num_pages; i++) {
301                 ClearPageChecked(io_ctl->pages[i]);
302                 unlock_page(io_ctl->pages[i]);
303                 page_cache_release(io_ctl->pages[i]);
304         }
305 }
306
307 static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
308                                 int uptodate)
309 {
310         struct page *page;
311         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
312         int i;
313
314         for (i = 0; i < io_ctl->num_pages; i++) {
315                 page = find_or_create_page(inode->i_mapping, i, mask);
316                 if (!page) {
317                         io_ctl_drop_pages(io_ctl);
318                         return -ENOMEM;
319                 }
320                 io_ctl->pages[i] = page;
321                 if (uptodate && !PageUptodate(page)) {
322                         btrfs_readpage(NULL, page);
323                         lock_page(page);
324                         if (!PageUptodate(page)) {
325                                 printk(KERN_ERR "btrfs: error reading free "
326                                        "space cache\n");
327                                 io_ctl_drop_pages(io_ctl);
328                                 return -EIO;
329                         }
330                 }
331         }
332
333         return 0;
334 }
335
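/*
 * Page 0 of the cache starts with two u64s: a crc slot that io_ctl_add_entry
 * deliberately fills with a bad value so old kernels will not trust the
 * cache, followed by the generation of the transaction that wrote it.  The
 * two helpers below write and verify that header.
 */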
336 static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
337 {
338         u64 *val;
339
340         io_ctl_map_page(io_ctl, 1);
341
342         /*
343          * Skip the first 64 bits to make sure there's a bogus crc for old
344          * kernels
345          */
346         io_ctl->cur += sizeof(u64);
347
348         val = io_ctl->cur;
349         *val = cpu_to_le64(generation);
350         io_ctl->cur += sizeof(u64);
351         io_ctl->size -= sizeof(u64) * 2;
352 }
353
354 static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
355 {
356         u64 *gen;
357
358         io_ctl_map_page(io_ctl, 0);
359
360         /* Skip the bogus crc area */
361         io_ctl->cur += sizeof(u64);
362         gen = io_ctl->cur;
363         if (le64_to_cpu(*gen) != generation) {
364                 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
365                                    "(%Lu) does not match inode (%Lu)\n", *gen,
366                                    generation);
367                 io_ctl_unmap_page(io_ctl);
368                 return -EIO;
369         }
370         io_ctl->cur += sizeof(u64);
371         io_ctl->size -= sizeof(u64) * 2;
372         return 0;
373 }
374
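/*
 * Rough capacity sketch (assuming 4KiB pages and the packed 17 byte
 * btrfs_free_space_entry): page 0 loses 16 bytes to the header above, so it
 * holds 240 entries; once fewer than sizeof(entry) bytes remain we move on
 * to the next page.  Bitmaps are not packed in with the entries, each one
 * takes a whole page of its own (see io_ctl_add_bitmap below).
 */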
375 static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
376                             void *bitmap)
377 {
378         struct btrfs_free_space_entry *entry;
379
380         if (!io_ctl->cur)
381                 return -ENOSPC;
382
383         entry = io_ctl->cur;
384         entry->offset = cpu_to_le64(offset);
385         entry->bytes = cpu_to_le64(bytes);
386         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
387                 BTRFS_FREE_SPACE_EXTENT;
388         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
389         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
390
391         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
392                 return 0;
393
394         /*
395          * index == 1 means the current page is 0, we need to generate a bogus
396          * crc for older kernels.
397          */
398         if (io_ctl->index == 1) {
399                 u32 *tmp;
400                 u32 crc = ~(u32)0;
401
402                 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64),
403                                       crc, PAGE_CACHE_SIZE - sizeof(u64));
404                 btrfs_csum_final(crc, (char *)&crc);
405                 crc++;
406                 tmp = io_ctl->orig;
407                 *tmp = crc;
408         }
409         io_ctl_unmap_page(io_ctl);
410
411         /* No more pages to map */
412         if (io_ctl->index >= io_ctl->num_pages)
413                 return 0;
414
415         /* map the next page */
416         io_ctl_map_page(io_ctl, 1);
417         return 0;
418 }
419
420 static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
421 {
422         if (!io_ctl->cur)
423                 return -ENOSPC;
424
425         /*
426          * If we aren't at the start of the current page, unmap this one and
427          * map the next one if there is any left.
428          */
429         if (io_ctl->cur != io_ctl->orig) {
430                 io_ctl_unmap_page(io_ctl);
431                 if (io_ctl->index >= io_ctl->num_pages)
432                         return -ENOSPC;
433                 io_ctl_map_page(io_ctl, 0);
434         }
435
436         memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
437         io_ctl_unmap_page(io_ctl);
438         if (io_ctl->index < io_ctl->num_pages)
439                 io_ctl_map_page(io_ctl, 0);
440         return 0;
441 }
442
443 static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
444 {
445         io_ctl_unmap_page(io_ctl);
446
447         while (io_ctl->index < io_ctl->num_pages) {
448                 io_ctl_map_page(io_ctl, 1);
449                 io_ctl_unmap_page(io_ctl);
450         }
451 }
452
453 static u8 io_ctl_read_entry(struct io_ctl *io_ctl,
454                             struct btrfs_free_space *entry)
455 {
456         struct btrfs_free_space_entry *e;
457         u8 type;
458
459         e = io_ctl->cur;
460         entry->offset = le64_to_cpu(e->offset);
461         entry->bytes = le64_to_cpu(e->bytes);
462         type = e->type;
463         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
464         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
465
466         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
467                 return type;
468
469         io_ctl_unmap_page(io_ctl);
470
471         if (io_ctl->index >= io_ctl->num_pages)
472                 return type;
473
474         io_ctl_map_page(io_ctl, 0);
475         return type;
476 }
477
478 static void io_ctl_read_bitmap(struct io_ctl *io_ctl,
479                                struct btrfs_free_space *entry)
480 {
481         BUG_ON(!io_ctl->cur);
482         if (io_ctl->cur != io_ctl->orig) {
483                 io_ctl_unmap_page(io_ctl);
484                 io_ctl_map_page(io_ctl, 0);
485         }
486         memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
487         io_ctl_unmap_page(io_ctl);
488         if (io_ctl->index < io_ctl->num_pages)
489                 io_ctl_map_page(io_ctl, 0);
490 }
491
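/*
 * Load the cache for one block group: read the free space header item to get
 * the entry/bitmap counts and generation, bail out if the generation does not
 * match the inode, then walk the pages reading extent and bitmap entries.
 * Bitmap payloads are stored after all of the entries, so they are read last.
 */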
492 int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
493                             struct btrfs_free_space_ctl *ctl,
494                             struct btrfs_path *path, u64 offset)
495 {
496         struct btrfs_free_space_header *header;
497         struct extent_buffer *leaf;
498         struct io_ctl io_ctl;
499         struct btrfs_key key;
500         struct btrfs_free_space *e, *n;
501         struct list_head bitmaps;
502         u64 num_entries;
503         u64 num_bitmaps;
504         u64 generation;
505         u8 type;
506         int ret = 0;
507
508         INIT_LIST_HEAD(&bitmaps);
509
510         /* Nothing in the space cache, goodbye */
511         if (!i_size_read(inode))
512                 return 0;
513
514         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
515         key.offset = offset;
516         key.type = 0;
517
518         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
519         if (ret < 0)
520                 return 0;
521         else if (ret > 0) {
522                 btrfs_release_path(path);
523                 return 0;
524         }
525
526         ret = -1;
527
528         leaf = path->nodes[0];
529         header = btrfs_item_ptr(leaf, path->slots[0],
530                                 struct btrfs_free_space_header);
531         num_entries = btrfs_free_space_entries(leaf, header);
532         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
533         generation = btrfs_free_space_generation(leaf, header);
534         btrfs_release_path(path);
535
536         if (BTRFS_I(inode)->generation != generation) {
537                 printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
538                        " not match free space cache generation (%llu)\n",
539                        (unsigned long long)BTRFS_I(inode)->generation,
540                        (unsigned long long)generation);
541                 return 0;
542         }
543
544         if (!num_entries)
545                 return 0;
546
547         io_ctl_init(&io_ctl, inode, root);
548         ret = readahead_cache(inode);
549         if (ret)
550                 goto out;
551
552         ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
553         if (ret)
554                 goto out;
555
556         ret = io_ctl_check_generation(&io_ctl, generation);
557         if (ret)
558                 goto free_cache;
559
560         while (num_entries) {
561                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
562                                       GFP_NOFS);
563                 if (!e)
564                         goto free_cache;
565
566                 type = io_ctl_read_entry(&io_ctl, e);
567                 if (!e->bytes) {
568                         kmem_cache_free(btrfs_free_space_cachep, e);
569                         goto free_cache;
570                 }
571
572                 if (type == BTRFS_FREE_SPACE_EXTENT) {
573                         spin_lock(&ctl->tree_lock);
574                         ret = link_free_space(ctl, e);
575                         spin_unlock(&ctl->tree_lock);
576                         if (ret) {
577                                 printk(KERN_ERR "Duplicate entries in "
578                                        "free space cache, dumping\n");
579                                 kmem_cache_free(btrfs_free_space_cachep, e);
580                                 goto free_cache;
581                         }
582                 } else {
583                         BUG_ON(!num_bitmaps);
584                         num_bitmaps--;
585                         e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
586                         if (!e->bitmap) {
587                                 kmem_cache_free(
588                                         btrfs_free_space_cachep, e);
589                                 goto free_cache;
590                         }
591                         spin_lock(&ctl->tree_lock);
592                         ret = link_free_space(ctl, e);
593                         ctl->total_bitmaps++;
594                         ctl->op->recalc_thresholds(ctl);
595                         spin_unlock(&ctl->tree_lock);
596                         if (ret) {
597                                 printk(KERN_ERR "Duplicate entries in "
598                                        "free space cache, dumping\n");
599                                 kmem_cache_free(btrfs_free_space_cachep, e);
600                                 goto free_cache;
601                         }
602                         list_add_tail(&e->list, &bitmaps);
603                 }
604
605                 num_entries--;
606         }
607
608         /*
609          * The bitmap payloads are stored after all of the entries, so read
610          * them in now that every bitmap entry has been linked into the cache.
611          */
612         list_for_each_entry_safe(e, n, &bitmaps, list) {
613                 list_del_init(&e->list);
614                 io_ctl_read_bitmap(&io_ctl, e);
615         }
616
617         io_ctl_drop_pages(&io_ctl);
618         ret = 1;
619 out:
620         io_ctl_free(&io_ctl);
621         return ret;
622 free_cache:
623         io_ctl_drop_pages(&io_ctl);
624         __btrfs_remove_free_space_cache(ctl);
625         goto out;
626 }
627
628 int load_free_space_cache(struct btrfs_fs_info *fs_info,
629                           struct btrfs_block_group_cache *block_group)
630 {
631         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
632         struct btrfs_root *root = fs_info->tree_root;
633         struct inode *inode;
634         struct btrfs_path *path;
635         int ret;
636         bool matched;
637         u64 used = btrfs_block_group_used(&block_group->item);
638
639         /*
640          * If we're unmounting then just return, since this does a search on the
641          * normal root and not the commit root and we could deadlock.
642          */
643         if (btrfs_fs_closing(fs_info))
644                 return 0;
645
646         /*
647          * If this block group has been marked to be cleared for one reason or
648          * another then we can't trust the on disk cache, so just return.
649          */
650         spin_lock(&block_group->lock);
651         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
652                 spin_unlock(&block_group->lock);
653                 return 0;
654         }
655         spin_unlock(&block_group->lock);
656
657         path = btrfs_alloc_path();
658         if (!path)
659                 return 0;
660
661         inode = lookup_free_space_inode(root, block_group, path);
662         if (IS_ERR(inode)) {
663                 btrfs_free_path(path);
664                 return 0;
665         }
666
667         ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
668                                       path, block_group->key.objectid);
669         btrfs_free_path(path);
670         if (ret <= 0)
671                 goto out;
672
673         spin_lock(&ctl->tree_lock);
674         matched = (ctl->free_space == (block_group->key.offset - used -
675                                        block_group->bytes_super));
676         spin_unlock(&ctl->tree_lock);
677
678         if (!matched) {
679                 __btrfs_remove_free_space_cache(ctl);
680                 printk(KERN_ERR "block group %llu has the wrong amount of free "
681                        "space\n", block_group->key.objectid);
682                 ret = -1;
683         }
684 out:
685         if (ret < 0) {
686                 /* This cache is bogus, make sure it gets cleared */
687                 spin_lock(&block_group->lock);
688                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
689                 spin_unlock(&block_group->lock);
690                 ret = 0;
691
692                 printk(KERN_ERR "btrfs: failed to load free space cache "
693                        "for block group %llu\n", block_group->key.objectid);
694         }
695
696         iput(inode);
697         return ret;
698 }
699
700 /**
701  * __btrfs_write_out_cache - write out cached info to an inode
702  * @root - the root the inode belongs to
703  * @ctl - the free space cache we are going to write out
704  * @block_group - the block_group for this cache if it belongs to a block_group
705  * @trans - the trans handle
706  * @path - the path to use
707  * @offset - the offset for the key we'll insert
708  *
709  * This function writes out a free space cache struct to disk for quick recovery
710  * on mount.  This will return 0 if it was successful in writing the cache out,
711  * and -1 if it was not.
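 *
 * On disk the cache is written in the order produced below: the generation
 * header, the extent entries (including any still-pinned extents in this
 * block group so that space is not leaked), then one full page per bitmap,
 * with any remaining pages zeroed out.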
712  */
713 int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
714                             struct btrfs_free_space_ctl *ctl,
715                             struct btrfs_block_group_cache *block_group,
716                             struct btrfs_trans_handle *trans,
717                             struct btrfs_path *path, u64 offset)
718 {
719         struct btrfs_free_space_header *header;
720         struct extent_buffer *leaf;
721         struct rb_node *node;
722         struct list_head *pos, *n;
723         struct extent_state *cached_state = NULL;
724         struct btrfs_free_cluster *cluster = NULL;
725         struct extent_io_tree *unpin = NULL;
726         struct io_ctl io_ctl;
727         struct list_head bitmap_list;
728         struct btrfs_key key;
729         u64 start, end, len;
730         int entries = 0;
731         int bitmaps = 0;
732         int ret;
733         int err = -1;
734
735         INIT_LIST_HEAD(&bitmap_list);
736
737         if (!i_size_read(inode))
738                 return -1;
739
740         filemap_write_and_wait(inode->i_mapping);
741         btrfs_wait_ordered_range(inode, inode->i_size &
742                                  ~(root->sectorsize - 1), (u64)-1);
743
744         io_ctl_init(&io_ctl, inode, root);
745
746         /* Get the cluster for this block_group if it exists */
747         if (block_group && !list_empty(&block_group->cluster_list))
748                 cluster = list_entry(block_group->cluster_list.next,
749                                      struct btrfs_free_cluster,
750                                      block_group_list);
751
752         /*
753          * We shouldn't have switched the pinned extents yet so this is the
754          * right one
755          */
756         unpin = root->fs_info->pinned_extents;
757
758         /* Lock all pages first so we can lock the extent safely. */
759         io_ctl_prepare_pages(&io_ctl, inode, 0);
760
761         lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
762                          0, &cached_state, GFP_NOFS);
763
764         /*
765          * When searching for pinned extents, we need to start at our start
766          * offset.
767          */
768         if (block_group)
769                 start = block_group->key.objectid;
770
771         node = rb_first(&ctl->free_space_offset);
772         if (!node && cluster) {
773                 node = rb_first(&cluster->root);
774                 cluster = NULL;
775         }
776
777         io_ctl_set_generation(&io_ctl, trans->transid);
778
779         /* Write out the extent entries */
780         while (node) {
781                 struct btrfs_free_space *e;
782
783                 e = rb_entry(node, struct btrfs_free_space, offset_index);
784                 entries++;
785
786                 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
787                                        e->bitmap);
788                 if (ret)
789                         goto out_nospc;
790
791                 if (e->bitmap) {
792                         list_add_tail(&e->list, &bitmap_list);
793                         bitmaps++;
794                 }
795                 node = rb_next(node);
796                 if (!node && cluster) {
797                         node = rb_first(&cluster->root);
798                         cluster = NULL;
799                 }
800         }
801
802         /*
803          * We want to add any pinned extents to our free space cache
804          * so we don't leak the space
805          */
806         while (block_group && (start < block_group->key.objectid +
807                                block_group->key.offset)) {
808                 ret = find_first_extent_bit(unpin, start, &start, &end,
809                                             EXTENT_DIRTY);
810                 if (ret) {
811                         ret = 0;
812                         break;
813                 }
814
815                 /* This pinned extent is out of our range */
816                 if (start >= block_group->key.objectid +
817                     block_group->key.offset)
818                         break;
819
820                 len = block_group->key.objectid +
821                         block_group->key.offset - start;
822                 len = min(len, end + 1 - start);
823
824                 entries++;
825                 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
826                 if (ret)
827                         goto out_nospc;
828
829                 start = end + 1;
830         }
831
832         /* Write out the bitmaps */
833         list_for_each_safe(pos, n, &bitmap_list) {
834                 struct btrfs_free_space *entry =
835                         list_entry(pos, struct btrfs_free_space, list);
836
837                 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
838                 if (ret)
839                         goto out_nospc;
840                 list_del_init(&entry->list);
841         }
842
843         /* Zero out the rest of the pages just to make sure */
844         io_ctl_zero_remaining_pages(&io_ctl);
845
846         ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
847                                 0, i_size_read(inode), &cached_state);
848         io_ctl_drop_pages(&io_ctl);
849         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
850                              i_size_read(inode) - 1, &cached_state, GFP_NOFS);
851
852         if (ret)
853                 goto out;
854
855         BTRFS_I(inode)->generation = trans->transid;
856
857         filemap_write_and_wait(inode->i_mapping);
858
859         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
860         key.offset = offset;
861         key.type = 0;
862
863         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
864         if (ret < 0) {
865                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
866                                  EXTENT_DIRTY | EXTENT_DELALLOC |
867                                  EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
868                 goto out;
869         }
870         leaf = path->nodes[0];
871         if (ret > 0) {
872                 struct btrfs_key found_key;
873                 BUG_ON(!path->slots[0]);
874                 path->slots[0]--;
875                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
876                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
877                     found_key.offset != offset) {
878                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
879                                          inode->i_size - 1,
880                                          EXTENT_DIRTY | EXTENT_DELALLOC |
881                                          EXTENT_DO_ACCOUNTING, 0, 0, NULL,
882                                          GFP_NOFS);
883                         btrfs_release_path(path);
884                         goto out;
885                 }
886         }
887         header = btrfs_item_ptr(leaf, path->slots[0],
888                                 struct btrfs_free_space_header);
889         btrfs_set_free_space_entries(leaf, header, entries);
890         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
891         btrfs_set_free_space_generation(leaf, header, trans->transid);
892         btrfs_mark_buffer_dirty(leaf);
893         btrfs_release_path(path);
894
895         err = 0;
896 out:
897         io_ctl_free(&io_ctl);
898         if (err) {
899                 invalidate_inode_pages2(inode->i_mapping);
900                 BTRFS_I(inode)->generation = 0;
901         }
902         btrfs_update_inode(trans, root, inode);
903         return err;
904
905 out_nospc:
906         list_for_each_safe(pos, n, &bitmap_list) {
907                 struct btrfs_free_space *entry =
908                         list_entry(pos, struct btrfs_free_space, list);
909                 list_del_init(&entry->list);
910         }
911         io_ctl_drop_pages(&io_ctl);
912         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
913                              i_size_read(inode) - 1, &cached_state, GFP_NOFS);
914         goto out;
915 }
916
917 int btrfs_write_out_cache(struct btrfs_root *root,
918                           struct btrfs_trans_handle *trans,
919                           struct btrfs_block_group_cache *block_group,
920                           struct btrfs_path *path)
921 {
922         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
923         struct inode *inode;
924         int ret = 0;
925
926         root = root->fs_info->tree_root;
927
928         spin_lock(&block_group->lock);
929         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
930                 spin_unlock(&block_group->lock);
931                 return 0;
932         }
933         spin_unlock(&block_group->lock);
934
935         inode = lookup_free_space_inode(root, block_group, path);
936         if (IS_ERR(inode))
937                 return 0;
938
939         ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
940                                       path, block_group->key.objectid);
941         if (ret) {
942                 btrfs_delalloc_release_metadata(inode, inode->i_size);
943                 spin_lock(&block_group->lock);
944                 block_group->disk_cache_state = BTRFS_DC_ERROR;
945                 spin_unlock(&block_group->lock);
946                 ret = 0;
947 #ifdef DEBUG
948                 printk(KERN_ERR "btrfs: failed to write free space cache "
949                        "for block group %llu\n", block_group->key.objectid);
950 #endif
951         }
952
953         iput(inode);
954         return ret;
955 }
956
957 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
958                                           u64 offset)
959 {
960         BUG_ON(offset < bitmap_start);
961         offset -= bitmap_start;
962         return (unsigned long)(div_u64(offset, unit));
963 }
964
965 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
966 {
967         return (unsigned long)(div_u64(bytes, unit));
968 }
969
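/*
 * Round an offset down to the start of the bitmap that would cover it.  Each
 * bitmap spans BITS_PER_BITMAP * ctl->unit bytes; with 4KiB pages and a 4KiB
 * unit that is 32768 bits * 4KiB = 128MiB per bitmap, so, for example, an
 * offset 200MiB into the block group maps to the bitmap starting at
 * ctl->start + 128MiB.
 */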
970 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
971                                    u64 offset)
972 {
973         u64 bitmap_start;
974         u64 bytes_per_bitmap;
975
976         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
977         bitmap_start = offset - ctl->start;
978         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
979         bitmap_start *= bytes_per_bitmap;
980         bitmap_start += ctl->start;
981
982         return bitmap_start;
983 }
984
985 static int tree_insert_offset(struct rb_root *root, u64 offset,
986                               struct rb_node *node, int bitmap)
987 {
988         struct rb_node **p = &root->rb_node;
989         struct rb_node *parent = NULL;
990         struct btrfs_free_space *info;
991
992         while (*p) {
993                 parent = *p;
994                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
995
996                 if (offset < info->offset) {
997                         p = &(*p)->rb_left;
998                 } else if (offset > info->offset) {
999                         p = &(*p)->rb_right;
1000                 } else {
1001                         /*
1002                          * we could have a bitmap entry and an extent entry
1003                          * share the same offset.  If this is the case, we want
1004                          * the extent entry to always be found first if we do a
1005                          * linear search through the tree, since we want to have
1006                          * the quickest allocation time, and allocating from an
1007                          * extent is faster than allocating from a bitmap.  So
1008                          * if we're inserting a bitmap and we find an entry at
1009                          * this offset, we want to go right, or after this entry
1010                          * logically.  If we are inserting an extent and we've
1011                          * found a bitmap, we want to go left, or before
1012                          * logically.
1013                          */
1014                         if (bitmap) {
1015                                 if (info->bitmap) {
1016                                         WARN_ON_ONCE(1);
1017                                         return -EEXIST;
1018                                 }
1019                                 p = &(*p)->rb_right;
1020                         } else {
1021                                 if (!info->bitmap) {
1022                                         WARN_ON_ONCE(1);
1023                                         return -EEXIST;
1024                                 }
1025                                 p = &(*p)->rb_left;
1026                         }
1027                 }
1028         }
1029
1030         rb_link_node(node, parent, p);
1031         rb_insert_color(node, root);
1032
1033         return 0;
1034 }
1035
1036 /*
1037  * searches the tree for the given offset.
1038  *
1039  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1040  * want a section that is at least 'bytes' in size and comes at or after the given
1041  * offset.
1042  */
1043 static struct btrfs_free_space *
1044 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1045                    u64 offset, int bitmap_only, int fuzzy)
1046 {
1047         struct rb_node *n = ctl->free_space_offset.rb_node;
1048         struct btrfs_free_space *entry, *prev = NULL;
1049
1050         /* find entry that is closest to the 'offset' */
1051         while (1) {
1052                 if (!n) {
1053                         entry = NULL;
1054                         break;
1055                 }
1056
1057                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1058                 prev = entry;
1059
1060                 if (offset < entry->offset)
1061                         n = n->rb_left;
1062                 else if (offset > entry->offset)
1063                         n = n->rb_right;
1064                 else
1065                         break;
1066         }
1067
1068         if (bitmap_only) {
1069                 if (!entry)
1070                         return NULL;
1071                 if (entry->bitmap)
1072                         return entry;
1073
1074                 /*
1075                  * bitmap entry and extent entry may share same offset,
1076                  * in that case, bitmap entry comes after extent entry.
1077                  */
1078                 n = rb_next(n);
1079                 if (!n)
1080                         return NULL;
1081                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1082                 if (entry->offset != offset)
1083                         return NULL;
1084
1085                 WARN_ON(!entry->bitmap);
1086                 return entry;
1087         } else if (entry) {
1088                 if (entry->bitmap) {
1089                         /*
1090                          * if previous extent entry covers the offset,
1091                          * we should return it instead of the bitmap entry
1092                          */
1093                         n = &entry->offset_index;
1094                         while (1) {
1095                                 n = rb_prev(n);
1096                                 if (!n)
1097                                         break;
1098                                 prev = rb_entry(n, struct btrfs_free_space,
1099                                                 offset_index);
1100                                 if (!prev->bitmap) {
1101                                         if (prev->offset + prev->bytes > offset)
1102                                                 entry = prev;
1103                                         break;
1104                                 }
1105                         }
1106                 }
1107                 return entry;
1108         }
1109
1110         if (!prev)
1111                 return NULL;
1112
1113         /* find last entry before the 'offset' */
1114         entry = prev;
1115         if (entry->offset > offset) {
1116                 n = rb_prev(&entry->offset_index);
1117                 if (n) {
1118                         entry = rb_entry(n, struct btrfs_free_space,
1119                                         offset_index);
1120                         BUG_ON(entry->offset > offset);
1121                 } else {
1122                         if (fuzzy)
1123                                 return entry;
1124                         else
1125                                 return NULL;
1126                 }
1127         }
1128
1129         if (entry->bitmap) {
1130                 n = &entry->offset_index;
1131                 while (1) {
1132                         n = rb_prev(n);
1133                         if (!n)
1134                                 break;
1135                         prev = rb_entry(n, struct btrfs_free_space,
1136                                         offset_index);
1137                         if (!prev->bitmap) {
1138                                 if (prev->offset + prev->bytes > offset)
1139                                         return prev;
1140                                 break;
1141                         }
1142                 }
1143                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1144                         return entry;
1145         } else if (entry->offset + entry->bytes > offset)
1146                 return entry;
1147
1148         if (!fuzzy)
1149                 return NULL;
1150
1151         while (1) {
1152                 if (entry->bitmap) {
1153                         if (entry->offset + BITS_PER_BITMAP *
1154                             ctl->unit > offset)
1155                                 break;
1156                 } else {
1157                         if (entry->offset + entry->bytes > offset)
1158                                 break;
1159                 }
1160
1161                 n = rb_next(&entry->offset_index);
1162                 if (!n)
1163                         return NULL;
1164                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1165         }
1166         return entry;
1167 }
1168
1169 static inline void
1170 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1171                     struct btrfs_free_space *info)
1172 {
1173         rb_erase(&info->offset_index, &ctl->free_space_offset);
1174         ctl->free_extents--;
1175 }
1176
1177 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1178                               struct btrfs_free_space *info)
1179 {
1180         __unlink_free_space(ctl, info);
1181         ctl->free_space -= info->bytes;
1182 }
1183
1184 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1185                            struct btrfs_free_space *info)
1186 {
1187         int ret = 0;
1188
1189         BUG_ON(!info->bitmap && !info->bytes);
1190         ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1191                                  &info->offset_index, (info->bitmap != NULL));
1192         if (ret)
1193                 return ret;
1194
1195         ctl->free_space += info->bytes;
1196         ctl->free_extents++;
1197         return ret;
1198 }
1199
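/*
 * Worked example (assuming 4KiB pages, a 1GiB block group and a single bitmap
 * already in use): max_bytes is 32KiB, bitmap_bytes = (1 + 1) * 4KiB = 8KiB,
 * which leaves 24KiB for extent entries but is capped at max_bytes / 2 =
 * 16KiB, so extents_thresh ends up as 16KiB / sizeof(struct btrfs_free_space).
 */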
1200 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1201 {
1202         struct btrfs_block_group_cache *block_group = ctl->private;
1203         u64 max_bytes;
1204         u64 bitmap_bytes;
1205         u64 extent_bytes;
1206         u64 size = block_group->key.offset;
1207         u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
1208         int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1209
1210         BUG_ON(ctl->total_bitmaps > max_bitmaps);
1211
1212         /*
1213          * The goal is to keep the total amount of memory used per 1gb of space
1214          * at or below 32k, so we need to adjust how much memory we allow to be
1215          * used by extent based free space tracking
1216          */
1217         if (size < 1024 * 1024 * 1024)
1218                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1219         else
1220                 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1221                         div64_u64(size, 1024 * 1024 * 1024);
1222
1223         /*
1224          * we want to account for 1 more bitmap than what we have so we can make
1225          * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1226          * we add more bitmaps.
1227          */
1228         bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
1229
1230         if (bitmap_bytes >= max_bytes) {
1231                 ctl->extents_thresh = 0;
1232                 return;
1233         }
1234
1235         /*
1236          * we want the extent entry threshold to always be at most 1/2 the max
1237          * bytes we can have, or whatever is less than that.
1238          */
1239         extent_bytes = max_bytes - bitmap_bytes;
1240         extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
1241
1242         ctl->extents_thresh =
1243                 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
1244 }
1245
1246 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1247                                        struct btrfs_free_space *info,
1248                                        u64 offset, u64 bytes)
1249 {
1250         unsigned long start, count;
1251
1252         start = offset_to_bit(info->offset, ctl->unit, offset);
1253         count = bytes_to_bits(bytes, ctl->unit);
1254         BUG_ON(start + count > BITS_PER_BITMAP);
1255
1256         bitmap_clear(info->bitmap, start, count);
1257
1258         info->bytes -= bytes;
1259 }
1260
1261 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1262                               struct btrfs_free_space *info, u64 offset,
1263                               u64 bytes)
1264 {
1265         __bitmap_clear_bits(ctl, info, offset, bytes);
1266         ctl->free_space -= bytes;
1267 }
1268
1269 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1270                             struct btrfs_free_space *info, u64 offset,
1271                             u64 bytes)
1272 {
1273         unsigned long start, count;
1274
1275         start = offset_to_bit(info->offset, ctl->unit, offset);
1276         count = bytes_to_bits(bytes, ctl->unit);
1277         BUG_ON(start + count > BITS_PER_BITMAP);
1278
1279         bitmap_set(info->bitmap, start, count);
1280
1281         info->bytes += bytes;
1282         ctl->free_space += bytes;
1283 }
1284
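/*
 * Scan a bitmap entry for a run of set bits at least *bytes long, starting at
 * or after *offset.  On success the start and length of the run found are
 * returned through *offset and *bytes; -1 means no such run exists in this
 * bitmap.
 */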
1285 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1286                          struct btrfs_free_space *bitmap_info, u64 *offset,
1287                          u64 *bytes)
1288 {
1289         unsigned long found_bits = 0;
1290         unsigned long bits, i;
1291         unsigned long next_zero;
1292
1293         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1294                           max_t(u64, *offset, bitmap_info->offset));
1295         bits = bytes_to_bits(*bytes, ctl->unit);
1296
1297         for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
1298              i < BITS_PER_BITMAP;
1299              i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
1300                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1301                                                BITS_PER_BITMAP, i);
1302                 if ((next_zero - i) >= bits) {
1303                         found_bits = next_zero - i;
1304                         break;
1305                 }
1306                 i = next_zero;
1307         }
1308
1309         if (found_bits) {
1310                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1311                 *bytes = (u64)(found_bits) * ctl->unit;
1312                 return 0;
1313         }
1314
1315         return -1;
1316 }
1317
1318 static struct btrfs_free_space *
1319 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
1320 {
1321         struct btrfs_free_space *entry;
1322         struct rb_node *node;
1323         int ret;
1324
1325         if (!ctl->free_space_offset.rb_node)
1326                 return NULL;
1327
1328         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1329         if (!entry)
1330                 return NULL;
1331
1332         for (node = &entry->offset_index; node; node = rb_next(node)) {
1333                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1334                 if (entry->bytes < *bytes)
1335                         continue;
1336
1337                 if (entry->bitmap) {
1338                         ret = search_bitmap(ctl, entry, offset, bytes);
1339                         if (!ret)
1340                                 return entry;
1341                         continue;
1342                 }
1343
1344                 *offset = entry->offset;
1345                 *bytes = entry->bytes;
1346                 return entry;
1347         }
1348
1349         return NULL;
1350 }
1351
1352 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1353                            struct btrfs_free_space *info, u64 offset)
1354 {
1355         info->offset = offset_to_bitmap(ctl, offset);
1356         info->bytes = 0;
1357         link_free_space(ctl, info);
1358         ctl->total_bitmaps++;
1359
1360         ctl->op->recalc_thresholds(ctl);
1361 }
1362
1363 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1364                         struct btrfs_free_space *bitmap_info)
1365 {
1366         unlink_free_space(ctl, bitmap_info);
1367         kfree(bitmap_info->bitmap);
1368         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1369         ctl->total_bitmaps--;
1370         ctl->op->recalc_thresholds(ctl);
1371 }
1372
1373 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1374                               struct btrfs_free_space *bitmap_info,
1375                               u64 *offset, u64 *bytes)
1376 {
1377         u64 end;
1378         u64 search_start, search_bytes;
1379         int ret;
1380
1381 again:
1382         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1383
1384         /*
1385          * XXX - this can go away after a few releases.
1386          *
1387          * since the only user of btrfs_remove_free_space is the tree logging
1388          * stuff, and the only way to test that is under crash conditions, we
1389          * want to have this debug stuff here just in case something's not
1390          * working.  Search the bitmap for the space we are trying to use to
1391          * make sure it's actually there.  If it's not there then we need to stop
1392          * because something has gone wrong.
1393          */
1394         search_start = *offset;
1395         search_bytes = *bytes;
1396         search_bytes = min(search_bytes, end - search_start + 1);
1397         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
1398         BUG_ON(ret < 0 || search_start != *offset);
1399
1400         if (*offset > bitmap_info->offset && *offset + *bytes > end) {
1401                 bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
1402                 *bytes -= end - *offset + 1;
1403                 *offset = end + 1;
1404         } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
1405                 bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
1406                 *bytes = 0;
1407         }
1408
1409         if (*bytes) {
1410                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1411                 if (!bitmap_info->bytes)
1412                         free_bitmap(ctl, bitmap_info);
1413
1414                 /*
1415                  * no entry after this bitmap, but we still have bytes to
1416                  * remove, so something has gone wrong.
1417                  */
1418                 if (!next)
1419                         return -EINVAL;
1420
1421                 bitmap_info = rb_entry(next, struct btrfs_free_space,
1422                                        offset_index);
1423
1424                 /*
1425                  * if the next entry isn't a bitmap we need to return to let the
1426                  * extent stuff do its work.
1427                  */
1428                 if (!bitmap_info->bitmap)
1429                         return -EAGAIN;
1430
1431                 /*
1432                  * Ok the next item is a bitmap, but it may not actually hold
1433                  * the information for the rest of this free space stuff, so
1434                  * look for it, and if we don't find it return so we can try
1435                  * everything over again.
1436                  */
1437                 search_start = *offset;
1438                 search_bytes = *bytes;
1439                 ret = search_bitmap(ctl, bitmap_info, &search_start,
1440                                     &search_bytes);
1441                 if (ret < 0 || search_start != *offset)
1442                         return -EAGAIN;
1443
1444                 goto again;
1445         } else if (!bitmap_info->bytes)
1446                 free_bitmap(ctl, bitmap_info);
1447
1448         return 0;
1449 }
1450
1451 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1452                                struct btrfs_free_space *info, u64 offset,
1453                                u64 bytes)
1454 {
1455         u64 bytes_to_set = 0;
1456         u64 end;
1457
1458         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1459
1460         bytes_to_set = min(end - offset, bytes);
1461
1462         bitmap_set_bits(ctl, info, offset, bytes_to_set);
1463
1464         return bytes_to_set;
1465
1466 }
1467
1468 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1469                       struct btrfs_free_space *info)
1470 {
1471         struct btrfs_block_group_cache *block_group = ctl->private;
1472
1473         /*
1474          * If we are below the extents threshold then we can add this as an
1475          * extent, and don't have to deal with the bitmap
1476          */
1477         if (ctl->free_extents < ctl->extents_thresh) {
1478                 /*
1479                  * If this block group has some small extents we don't want to
1480                  * use up all of our free slots in the cache with them, we want
1481                  * to reserve them for larger extents.  However, if we have plenty
1482                  * of cache left then go ahead and add them; no sense in adding
1483                  * the overhead of a bitmap if we don't have to.
1484                  */
1485                 if (info->bytes <= block_group->sectorsize * 4) {
1486                         if (ctl->free_extents * 2 <= ctl->extents_thresh)
1487                                 return false;
1488                 } else {
1489                         return false;
1490                 }
1491         }
1492
1493         /*
1494          * some block groups are so tiny they can't be enveloped by a bitmap, so
1495          * don't even bother to create a bitmap for this
1496          */
1497         if (BITS_PER_BITMAP * block_group->sectorsize >
1498             block_group->key.offset)
1499                 return false;
1500
1501         return true;
1502 }
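
/*
 * For illustration, assume a 4KiB sectorsize and suppose extents_thresh
 * works out to 256 entries (the exact value depends on
 * sizeof(struct btrfs_free_space)):
 *
 *   - an extent of 16KiB or less stays a plain extent entry while fewer
 *     than 128 (extents_thresh / 2) slots are in use, and is pushed into
 *     a bitmap once the cache is more than half full;
 *   - a larger extent is always kept as a plain extent entry while we
 *     are under extents_thresh;
 *   - a block group smaller than BITS_PER_BITMAP * sectorsize (128MiB
 *     with 4KiB pages) never uses bitmaps at all.
 */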
1503
1504 static struct btrfs_free_space_op free_space_op = {
1505         .recalc_thresholds      = recalculate_thresholds,
1506         .use_bitmap             = use_bitmap,
1507 };
1508
1509 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
1510                               struct btrfs_free_space *info)
1511 {
1512         struct btrfs_free_space *bitmap_info;
1513         struct btrfs_block_group_cache *block_group = NULL;
1514         int added = 0;
1515         u64 bytes, offset, bytes_added;
1516         int ret;
1517
1518         bytes = info->bytes;
1519         offset = info->offset;
1520
1521         if (!ctl->op->use_bitmap(ctl, info))
1522                 return 0;
1523
1524         if (ctl->op == &free_space_op)
1525                 block_group = ctl->private;
1526 again:
1527         /*
1528          * Since we link bitmaps right into the cluster we need to see if we
1529          * have a cluster here, and if so and it has our bitmap we need to add
1530          * the free space to that bitmap.
1531          */
1532         if (block_group && !list_empty(&block_group->cluster_list)) {
1533                 struct btrfs_free_cluster *cluster;
1534                 struct rb_node *node;
1535                 struct btrfs_free_space *entry;
1536
1537                 cluster = list_entry(block_group->cluster_list.next,
1538                                      struct btrfs_free_cluster,
1539                                      block_group_list);
1540                 spin_lock(&cluster->lock);
1541                 node = rb_first(&cluster->root);
1542                 if (!node) {
1543                         spin_unlock(&cluster->lock);
1544                         goto no_cluster_bitmap;
1545                 }
1546
1547                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1548                 if (!entry->bitmap) {
1549                         spin_unlock(&cluster->lock);
1550                         goto no_cluster_bitmap;
1551                 }
1552
1553                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
1554                         bytes_added = add_bytes_to_bitmap(ctl, entry,
1555                                                           offset, bytes);
1556                         bytes -= bytes_added;
1557                         offset += bytes_added;
1558                 }
1559                 spin_unlock(&cluster->lock);
1560                 if (!bytes) {
1561                         ret = 1;
1562                         goto out;
1563                 }
1564         }
1565
1566 no_cluster_bitmap:
1567         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1568                                          1, 0);
1569         if (!bitmap_info) {
1570                 BUG_ON(added);
1571                 goto new_bitmap;
1572         }
1573
1574         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
1575         bytes -= bytes_added;
1576         offset += bytes_added;
1577         added = 0;
1578
1579         if (!bytes) {
1580                 ret = 1;
1581                 goto out;
1582         } else
1583                 goto again;
1584
1585 new_bitmap:
1586         if (info && info->bitmap) {
1587                 add_new_bitmap(ctl, info, offset);
1588                 added = 1;
1589                 info = NULL;
1590                 goto again;
1591         } else {
1592                 spin_unlock(&ctl->tree_lock);
1593
1594                 /* no pre-allocated info, allocate a new one */
1595                 if (!info) {
1596                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
1597                                                  GFP_NOFS);
1598                         if (!info) {
1599                                 spin_lock(&ctl->tree_lock);
1600                                 ret = -ENOMEM;
1601                                 goto out;
1602                         }
1603                 }
1604
1605                 /* allocate the bitmap */
1606                 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
1607                 spin_lock(&ctl->tree_lock);
1608                 if (!info->bitmap) {
1609                         ret = -ENOMEM;
1610                         goto out;
1611                 }
1612                 goto again;
1613         }
1614
1615 out:
1616         if (info) {
1617                 if (info->bitmap)
1618                         kfree(info->bitmap);
1619                 kmem_cache_free(btrfs_free_space_cachep, info);
1620         }
1621
1622         return ret;
1623 }
1624
1625 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
1626                           struct btrfs_free_space *info, bool update_stat)
1627 {
1628         struct btrfs_free_space *left_info;
1629         struct btrfs_free_space *right_info;
1630         bool merged = false;
1631         u64 offset = info->offset;
1632         u64 bytes = info->bytes;
1633
1634         /*
1635          * first we want to see if there is free space adjacent to the range we
1636          * are adding; if there is, remove that entry and fold it into this one
1637          * so a single entry covers the entire range
1638          */
1639         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
1640         if (right_info && rb_prev(&right_info->offset_index))
1641                 left_info = rb_entry(rb_prev(&right_info->offset_index),
1642                                      struct btrfs_free_space, offset_index);
1643         else
1644                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
1645
1646         if (right_info && !right_info->bitmap) {
1647                 if (update_stat)
1648                         unlink_free_space(ctl, right_info);
1649                 else
1650                         __unlink_free_space(ctl, right_info);
1651                 info->bytes += right_info->bytes;
1652                 kmem_cache_free(btrfs_free_space_cachep, right_info);
1653                 merged = true;
1654         }
1655
1656         if (left_info && !left_info->bitmap &&
1657             left_info->offset + left_info->bytes == offset) {
1658                 if (update_stat)
1659                         unlink_free_space(ctl, left_info);
1660                 else
1661                         __unlink_free_space(ctl, left_info);
1662                 info->offset = left_info->offset;
1663                 info->bytes += left_info->bytes;
1664                 kmem_cache_free(btrfs_free_space_cachep, left_info);
1665                 merged = true;
1666         }
1667
1668         return merged;
1669 }
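
/*
 * Merge example: suppose the tree already holds extent entries
 * [7MiB, +1MiB) and [9MiB, +1MiB) and we are adding [8MiB, +1MiB).
 * right_info is found by the exact lookup at offset + bytes == 9MiB and
 * left_info ends exactly at offset == 8MiB, so both neighbours are
 * unlinked and folded into the new entry, which becomes [7MiB, +3MiB)
 * before the caller links it.  Bitmap entries are never merged this way.
 */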
1670
1671 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
1672                            u64 offset, u64 bytes)
1673 {
1674         struct btrfs_free_space *info;
1675         int ret = 0;
1676
1677         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
1678         if (!info)
1679                 return -ENOMEM;
1680
1681         info->offset = offset;
1682         info->bytes = bytes;
1683
1684         spin_lock(&ctl->tree_lock);
1685
1686         if (try_merge_free_space(ctl, info, true))
1687                 goto link;
1688
1689         /*
1690          * There was no extent directly to the left or right of this new
1691          * extent, so we know we're going to have to allocate a new extent;
1692          * before we do that, see if we need to drop this into a bitmap
1693          */
1694         ret = insert_into_bitmap(ctl, info);
1695         if (ret < 0) {
1696                 goto out;
1697         } else if (ret) {
1698                 ret = 0;
1699                 goto out;
1700         }
1701 link:
1702         ret = link_free_space(ctl, info);
1703         if (ret)
1704                 kmem_cache_free(btrfs_free_space_cachep, info);
1705 out:
1706         spin_unlock(&ctl->tree_lock);
1707
1708         if (ret) {
1709                 printk(KERN_CRIT "btrfs: unable to add free space: %d\n", ret);
1710                 BUG_ON(ret == -EEXIST);
1711         }
1712
1713         return ret;
1714 }
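
/*
 * Minimal usage sketch (hypothetical helper, assuming a valid block
 * group in kernel context): returning an extent to a block group's cache
 * just means feeding its [offset, +bytes) range to the ctl;
 * insert_into_bitmap() decides internally whether it becomes an extent
 * entry or bitmap bits.
 */
static inline int example_return_extent(struct btrfs_block_group_cache *bg,
                                        u64 offset, u64 bytes)
{
        return __btrfs_add_free_space(bg->free_space_ctl, offset, bytes);
}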
1715
1716 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
1717                             u64 offset, u64 bytes)
1718 {
1719         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1720         struct btrfs_free_space *info;
1721         struct btrfs_free_space *next_info = NULL;
1722         int ret = 0;
1723
1724         spin_lock(&ctl->tree_lock);
1725
1726 again:
1727         info = tree_search_offset(ctl, offset, 0, 0);
1728         if (!info) {
1729                 /*
1730                  * oops, we didn't find an extent that matched the space we wanted
1731                  * to remove; look for a bitmap instead
1732                  */
1733                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1734                                           1, 0);
1735                 if (!info) {
1736                         WARN_ON(1);
1737                         goto out_lock;
1738                 }
1739         }
1740
1741         if (info->bytes < bytes && rb_next(&info->offset_index)) {
1742                 u64 end;
1743                 next_info = rb_entry(rb_next(&info->offset_index),
1744                                              struct btrfs_free_space,
1745                                              offset_index);
1746
1747                 if (next_info->bitmap)
1748                         end = next_info->offset +
1749                               BITS_PER_BITMAP * ctl->unit - 1;
1750                 else
1751                         end = next_info->offset + next_info->bytes;
1752
1753                 if (next_info->bytes < bytes ||
1754                     next_info->offset > offset || offset > end) {
1755                         printk(KERN_CRIT "Found free space at %llu, size %llu,"
1756                               " trying to use %llu\n",
1757                               (unsigned long long)info->offset,
1758                               (unsigned long long)info->bytes,
1759                               (unsigned long long)bytes);
1760                         WARN_ON(1);
1761                         ret = -EINVAL;
1762                         goto out_lock;
1763                 }
1764
1765                 info = next_info;
1766         }
1767
1768         if (info->bytes == bytes) {
1769                 unlink_free_space(ctl, info);
1770                 if (info->bitmap) {
1771                         kfree(info->bitmap);
1772                         ctl->total_bitmaps--;
1773                 }
1774                 kmem_cache_free(btrfs_free_space_cachep, info);
1775                 goto out_lock;
1776         }
1777
1778         if (!info->bitmap && info->offset == offset) {
1779                 unlink_free_space(ctl, info);
1780                 info->offset += bytes;
1781                 info->bytes -= bytes;
1782                 link_free_space(ctl, info);
1783                 goto out_lock;
1784         }
1785
1786         if (!info->bitmap && info->offset <= offset &&
1787             info->offset + info->bytes >= offset + bytes) {
1788                 u64 old_start = info->offset;
1789                 /*
1790                  * we're freeing space in the middle of the info,
1791                  * this can happen during tree log replay
1792                  *
1793                  * first unlink the old info and then
1794                  * insert it again after the hole we're creating
1795                  */
1796                 unlink_free_space(ctl, info);
1797                 if (offset + bytes < info->offset + info->bytes) {
1798                         u64 old_end = info->offset + info->bytes;
1799
1800                         info->offset = offset + bytes;
1801                         info->bytes = old_end - info->offset;
1802                         ret = link_free_space(ctl, info);
1803                         WARN_ON(ret);
1804                         if (ret)
1805                                 goto out_lock;
1806                 } else {
1807                         /* the hole we're creating ends at the end
1808                          * of the info struct, just free the info
1809                          */
1810                         kmem_cache_free(btrfs_free_space_cachep, info);
1811                 }
1812                 spin_unlock(&ctl->tree_lock);
1813
1814                 /* step two, insert a new info struct to cover
1815                  * anything before the hole
1816                  */
1817                 ret = btrfs_add_free_space(block_group, old_start,
1818                                            offset - old_start);
1819                 WARN_ON(ret);
1820                 goto out;
1821         }
1822
1823         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
1824         if (ret == -EAGAIN)
1825                 goto again;
1826         BUG_ON(ret);
1827 out_lock:
1828         spin_unlock(&ctl->tree_lock);
1829 out:
1830         return ret;
1831 }
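
/*
 * Hole-carving example: during tree-log replay we can remove a range
 * from the middle of an existing extent entry.  If the cache holds
 * [X, +64KiB) and we remove [X + 16KiB, +8KiB), the branch above
 * re-links a tail entry [X + 24KiB, +40KiB) directly and then re-adds
 * the head [X, +16KiB) via btrfs_add_free_space() after dropping the
 * tree lock.
 */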
1832
1833 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
1834                            u64 bytes)
1835 {
1836         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1837         struct btrfs_free_space *info;
1838         struct rb_node *n;
1839         int count = 0;
1840
1841         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
1842                 info = rb_entry(n, struct btrfs_free_space, offset_index);
1843                 if (info->bytes >= bytes)
1844                         count++;
1845                 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
1846                        (unsigned long long)info->offset,
1847                        (unsigned long long)info->bytes,
1848                        (info->bitmap) ? "yes" : "no");
1849         }
1850         printk(KERN_INFO "block group has cluster?: %s\n",
1851                list_empty(&block_group->cluster_list) ? "no" : "yes");
1852         printk(KERN_INFO "%d blocks of free space at or bigger than the"
1853                " requested size\n", count);
1854 }
1855
1856 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
1857 {
1858         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1859
1860         spin_lock_init(&ctl->tree_lock);
1861         ctl->unit = block_group->sectorsize;
1862         ctl->start = block_group->key.objectid;
1863         ctl->private = block_group;
1864         ctl->op = &free_space_op;
1865
1866         /*
1867          * we only want to have 32k of ram per block group for keeping
1868          * track of free space, and if we pass 1/2 of that we want to
1869          * start converting things over to using bitmaps
1870          */
1871         ctl->extents_thresh = ((1024 * 32) / 2) /
1872                                 sizeof(struct btrfs_free_space);
1873 }
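
/*
 * Threshold arithmetic: with the 32KiB budget above,
 *
 *     extents_thresh = (32768 / 2) / sizeof(struct btrfs_free_space)
 *
 * so with, say, a 64-byte entry the cache starts out allowing 256
 * extent entries before use_bitmap() begins steering small extents into
 * bitmaps (the exact figure depends on the struct size).
 */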
1874
1875 /*
1876  * for a given cluster, put all of its extents back into the free
1877  * space cache.  If the block group passed doesn't match the block group
1878  * pointed to by the cluster, someone else raced in and freed the
1879  * cluster already.  In that case, we just return without changing anything
1880  */
1881 static int
1882 __btrfs_return_cluster_to_free_space(
1883                              struct btrfs_block_group_cache *block_group,
1884                              struct btrfs_free_cluster *cluster)
1885 {
1886         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1887         struct btrfs_free_space *entry;
1888         struct rb_node *node;
1889
1890         spin_lock(&cluster->lock);
1891         if (cluster->block_group != block_group)
1892                 goto out;
1893
1894         cluster->block_group = NULL;
1895         cluster->window_start = 0;
1896         list_del_init(&cluster->block_group_list);
1897
1898         node = rb_first(&cluster->root);
1899         while (node) {
1900                 bool bitmap;
1901
1902                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1903                 node = rb_next(&entry->offset_index);
1904                 rb_erase(&entry->offset_index, &cluster->root);
1905
1906                 bitmap = (entry->bitmap != NULL);
1907                 if (!bitmap)
1908                         try_merge_free_space(ctl, entry, false);
1909                 tree_insert_offset(&ctl->free_space_offset,
1910                                    entry->offset, &entry->offset_index, bitmap);
1911         }
1912         cluster->root = RB_ROOT;
1913
1914 out:
1915         spin_unlock(&cluster->lock);
1916         btrfs_put_block_group(block_group);
1917         return 0;
1918 }
1919
1920 void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
1921 {
1922         struct btrfs_free_space *info;
1923         struct rb_node *node;
1924
1925         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
1926                 info = rb_entry(node, struct btrfs_free_space, offset_index);
1927                 if (!info->bitmap) {
1928                         unlink_free_space(ctl, info);
1929                         kmem_cache_free(btrfs_free_space_cachep, info);
1930                 } else {
1931                         free_bitmap(ctl, info);
1932                 }
1933                 if (need_resched()) {
1934                         spin_unlock(&ctl->tree_lock);
1935                         cond_resched();
1936                         spin_lock(&ctl->tree_lock);
1937                 }
1938         }
1939 }
1940
1941 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
1942 {
1943         spin_lock(&ctl->tree_lock);
1944         __btrfs_remove_free_space_cache_locked(ctl);
1945         spin_unlock(&ctl->tree_lock);
1946 }
1947
1948 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
1949 {
1950         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1951         struct btrfs_free_cluster *cluster;
1952         struct list_head *head;
1953
1954         spin_lock(&ctl->tree_lock);
1955         while ((head = block_group->cluster_list.next) !=
1956                &block_group->cluster_list) {
1957                 cluster = list_entry(head, struct btrfs_free_cluster,
1958                                      block_group_list);
1959
1960                 WARN_ON(cluster->block_group != block_group);
1961                 __btrfs_return_cluster_to_free_space(block_group, cluster);
1962                 if (need_resched()) {
1963                         spin_unlock(&ctl->tree_lock);
1964                         cond_resched();
1965                         spin_lock(&ctl->tree_lock);
1966                 }
1967         }
1968         __btrfs_remove_free_space_cache_locked(ctl);
1969         spin_unlock(&ctl->tree_lock);
1970
1971 }
1972
1973 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
1974                                u64 offset, u64 bytes, u64 empty_size)
1975 {
1976         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1977         struct btrfs_free_space *entry = NULL;
1978         u64 bytes_search = bytes + empty_size;
1979         u64 ret = 0;
1980
1981         spin_lock(&ctl->tree_lock);
1982         entry = find_free_space(ctl, &offset, &bytes_search);
1983         if (!entry)
1984                 goto out;
1985
1986         ret = offset;
1987         if (entry->bitmap) {
1988                 bitmap_clear_bits(ctl, entry, offset, bytes);
1989                 if (!entry->bytes)
1990                         free_bitmap(ctl, entry);
1991         } else {
1992                 unlink_free_space(ctl, entry);
1993                 entry->offset += bytes;
1994                 entry->bytes -= bytes;
1995                 if (!entry->bytes)
1996                         kmem_cache_free(btrfs_free_space_cachep, entry);
1997                 else
1998                         link_free_space(ctl, entry);
1999         }
2000
2001 out:
2002         spin_unlock(&ctl->tree_lock);
2003
2004         return ret;
2005 }
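
/*
 * Allocation sketch (hypothetical helper, assuming kernel context and a
 * valid block group): the extent allocator asks the cache for a range
 * and treats a return of 0 as "nothing suitable in this block group".
 */
static inline u64 example_alloc_from_cache(struct btrfs_block_group_cache *bg,
                                           u64 search_start, u64 num_bytes)
{
        u64 found;

        found = btrfs_find_space_for_alloc(bg, search_start, num_bytes, 0);
        if (!found)
                return 0;       /* caller moves on to another block group */
        return found;           /* logical start of the carved-out range */
}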
2006
2007 /*
2008  * given a cluster, put all of its extents back into the free space
2009  * cache.  If a block group is passed, this function will only free
2010  * a cluster that belongs to the passed block group.
2011  *
2012  * Otherwise, it'll get a reference on the block group pointed to by the
2013  * cluster and remove the cluster from it.
2014  */
2015 int btrfs_return_cluster_to_free_space(
2016                                struct btrfs_block_group_cache *block_group,
2017                                struct btrfs_free_cluster *cluster)
2018 {
2019         struct btrfs_free_space_ctl *ctl;
2020         int ret;
2021
2022         /* first, get a safe pointer to the block group */
2023         spin_lock(&cluster->lock);
2024         if (!block_group) {
2025                 block_group = cluster->block_group;
2026                 if (!block_group) {
2027                         spin_unlock(&cluster->lock);
2028                         return 0;
2029                 }
2030         } else if (cluster->block_group != block_group) {
2031                 /* someone else has already freed it, don't redo their work */
2032                 spin_unlock(&cluster->lock);
2033                 return 0;
2034         }
2035         atomic_inc(&block_group->count);
2036         spin_unlock(&cluster->lock);
2037
2038         ctl = block_group->free_space_ctl;
2039
2040         /* now return any extents the cluster had on it */
2041         spin_lock(&ctl->tree_lock);
2042         ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2043         spin_unlock(&ctl->tree_lock);
2044
2045         /* finally drop our ref */
2046         btrfs_put_block_group(block_group);
2047         return ret;
2048 }
2049
2050 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2051                                    struct btrfs_free_cluster *cluster,
2052                                    struct btrfs_free_space *entry,
2053                                    u64 bytes, u64 min_start)
2054 {
2055         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2056         int err;
2057         u64 search_start;
2058         u64 search_bytes;
2059         u64 ret = 0;
2060
2061         search_start = min_start;
2062         search_bytes = bytes;
2063
2064         err = search_bitmap(ctl, entry, &search_start, &search_bytes);
2065         if (err)
2066                 return 0;
2067
2068         ret = search_start;
2069         __bitmap_clear_bits(ctl, entry, ret, bytes);
2070
2071         return ret;
2072 }
2073
2074 /*
2075  * given a cluster, try to allocate 'bytes' from it, returns 0
2076  * if it couldn't find anything suitably large, or a logical disk offset
2077  * if things worked out
2078  */
2079 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2080                              struct btrfs_free_cluster *cluster, u64 bytes,
2081                              u64 min_start)
2082 {
2083         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2084         struct btrfs_free_space *entry = NULL;
2085         struct rb_node *node;
2086         u64 ret = 0;
2087
2088         spin_lock(&cluster->lock);
2089         if (bytes > cluster->max_size)
2090                 goto out;
2091
2092         if (cluster->block_group != block_group)
2093                 goto out;
2094
2095         node = rb_first(&cluster->root);
2096         if (!node)
2097                 goto out;
2098
2099         entry = rb_entry(node, struct btrfs_free_space, offset_index);
2100         while (1) {
2101                 if (entry->bytes < bytes ||
2102                     (!entry->bitmap && entry->offset < min_start)) {
2103                         node = rb_next(&entry->offset_index);
2104                         if (!node)
2105                                 break;
2106                         entry = rb_entry(node, struct btrfs_free_space,
2107                                          offset_index);
2108                         continue;
2109                 }
2110
2111                 if (entry->bitmap) {
2112                         ret = btrfs_alloc_from_bitmap(block_group,
2113                                                       cluster, entry, bytes,
2114                                                       min_start);
2115                         if (ret == 0) {
2116                                 node = rb_next(&entry->offset_index);
2117                                 if (!node)
2118                                         break;
2119                                 entry = rb_entry(node, struct btrfs_free_space,
2120                                                  offset_index);
2121                                 continue;
2122                         }
2123                 } else {
2124                         ret = entry->offset;
2125
2126                         entry->offset += bytes;
2127                         entry->bytes -= bytes;
2128                 }
2129
2130                 if (entry->bytes == 0)
2131                         rb_erase(&entry->offset_index, &cluster->root);
2132                 break;
2133         }
2134 out:
2135         spin_unlock(&cluster->lock);
2136
2137         if (!ret)
2138                 return 0;
2139
2140         spin_lock(&ctl->tree_lock);
2141
2142         ctl->free_space -= bytes;
2143         if (entry->bytes == 0) {
2144                 ctl->free_extents--;
2145                 if (entry->bitmap) {
2146                         kfree(entry->bitmap);
2147                         ctl->total_bitmaps--;
2148                         ctl->op->recalc_thresholds(ctl);
2149                 }
2150                 kmem_cache_free(btrfs_free_space_cachep, entry);
2151         }
2152
2153         spin_unlock(&ctl->tree_lock);
2154
2155         return ret;
2156 }
2157
2158 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2159                                 struct btrfs_free_space *entry,
2160                                 struct btrfs_free_cluster *cluster,
2161                                 u64 offset, u64 bytes, u64 min_bytes)
2162 {
2163         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2164         unsigned long next_zero;
2165         unsigned long i;
2166         unsigned long search_bits;
2167         unsigned long total_bits;
2168         unsigned long found_bits;
2169         unsigned long start = 0;
2170         unsigned long total_found = 0;
2171         int ret;
2172         bool found = false;
2173
2174         i = offset_to_bit(entry->offset, block_group->sectorsize,
2175                           max_t(u64, offset, entry->offset));
2176         search_bits = bytes_to_bits(bytes, block_group->sectorsize);
2177         total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
2178
2179 again:
2180         found_bits = 0;
2181         for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
2182              i < BITS_PER_BITMAP;
2183              i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2184                 next_zero = find_next_zero_bit(entry->bitmap,
2185                                                BITS_PER_BITMAP, i);
2186                 if (next_zero - i >= search_bits) {
2187                         found_bits = next_zero - i;
2188                         break;
2189                 }
2190                 i = next_zero;
2191         }
2192
2193         if (!found_bits)
2194                 return -ENOSPC;
2195
2196         if (!found) {
2197                 start = i;
2198                 found = true;
2199         }
2200
2201         total_found += found_bits;
2202
2203         if (cluster->max_size < found_bits * block_group->sectorsize)
2204                 cluster->max_size = found_bits * block_group->sectorsize;
2205
2206         if (total_found < total_bits) {
2207                 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
2208                 if (i - start > total_bits * 2) {
2209                         total_found = 0;
2210                         cluster->max_size = 0;
2211                         found = false;
2212                 }
2213                 goto again;
2214         }
2215
2216         cluster->window_start = start * block_group->sectorsize +
2217                 entry->offset;
2218         rb_erase(&entry->offset_index, &ctl->free_space_offset);
2219         ret = tree_insert_offset(&cluster->root, entry->offset,
2220                                  &entry->offset_index, 1);
2221         BUG_ON(ret);
2222
2223         return 0;
2224 }
2225
2226 /*
2227  * This searches the block group for just extents to fill the cluster with.
2228  */
2229 static noinline int
2230 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2231                         struct btrfs_free_cluster *cluster,
2232                         struct list_head *bitmaps, u64 offset, u64 bytes,
2233                         u64 min_bytes)
2234 {
2235         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2236         struct btrfs_free_space *first = NULL;
2237         struct btrfs_free_space *entry = NULL;
2238         struct btrfs_free_space *prev = NULL;
2239         struct btrfs_free_space *last;
2240         struct rb_node *node;
2241         u64 window_start;
2242         u64 window_free;
2243         u64 max_extent;
2244         u64 max_gap = 128 * 1024;
2245
2246         entry = tree_search_offset(ctl, offset, 0, 1);
2247         if (!entry)
2248                 return -ENOSPC;
2249
2250         /*
2251          * We don't want bitmaps, so just move along until we find a normal
2252          * extent entry.
2253          */
2254         while (entry->bitmap) {
2255                 if (list_empty(&entry->list))
2256                         list_add_tail(&entry->list, bitmaps);
2257                 node = rb_next(&entry->offset_index);
2258                 if (!node)
2259                         return -ENOSPC;
2260                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2261         }
2262
2263         window_start = entry->offset;
2264         window_free = entry->bytes;
2265         max_extent = entry->bytes;
2266         first = entry;
2267         last = entry;
2268         prev = entry;
2269
2270         while (window_free <= min_bytes) {
2271                 node = rb_next(&entry->offset_index);
2272                 if (!node)
2273                         return -ENOSPC;
2274                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2275
2276                 if (entry->bitmap) {
2277                         if (list_empty(&entry->list))
2278                                 list_add_tail(&entry->list, bitmaps);
2279                         continue;
2280                 }
2281
2282                 /*
2283                  * we haven't filled the empty size and the window is
2284                  * very large.  reset and try again
2285                  */
2286                 if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
2287                     entry->offset - window_start > (min_bytes * 2)) {
2288                         first = entry;
2289                         window_start = entry->offset;
2290                         window_free = entry->bytes;
2291                         last = entry;
2292                         max_extent = entry->bytes;
2293                 } else {
2294                         last = entry;
2295                         window_free += entry->bytes;
2296                         if (entry->bytes > max_extent)
2297                                 max_extent = entry->bytes;
2298                 }
2299                 prev = entry;
2300         }
2301
2302         cluster->window_start = first->offset;
2303
2304         node = &first->offset_index;
2305
2306         /*
2307          * now we've found our entries, pull them out of the free space
2308          * cache and put them into the cluster rbtree
2309          */
2310         do {
2311                 int ret;
2312
2313                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2314                 node = rb_next(&entry->offset_index);
2315                 if (entry->bitmap)
2316                         continue;
2317
2318                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2319                 ret = tree_insert_offset(&cluster->root, entry->offset,
2320                                          &entry->offset_index, 0);
2321                 BUG_ON(ret);
2322         } while (node && entry != last);
2323
2324         cluster->max_size = max_extent;
2325
2326         return 0;
2327 }
2328
2329 /*
2330  * This specifically looks for bitmaps that may work in the cluster; we assume
2331  * that we have already failed to find extents that will work.
2332  */
2333 static noinline int
2334 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2335                      struct btrfs_free_cluster *cluster,
2336                      struct list_head *bitmaps, u64 offset, u64 bytes,
2337                      u64 min_bytes)
2338 {
2339         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2340         struct btrfs_free_space *entry;
2341         struct rb_node *node;
2342         int ret = -ENOSPC;
2343
2344         if (ctl->total_bitmaps == 0)
2345                 return -ENOSPC;
2346
2347         /*
2348          * First check our cached list of bitmaps and see if there is an entry
2349          * here that will work.
2350          */
2351         list_for_each_entry(entry, bitmaps, list) {
2352                 if (entry->bytes < min_bytes)
2353                         continue;
2354                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2355                                            bytes, min_bytes);
2356                 if (!ret)
2357                         return 0;
2358         }
2359
2360         /*
2361          * If we do have entries on our list and we are here then we didn't find
2362          * anything, so go ahead and get the next entry after the last entry in
2363          * this list and start the search from there.
2364          */
2365         if (!list_empty(bitmaps)) {
2366                 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2367                                    list);
2368                 node = rb_next(&entry->offset_index);
2369                 if (!node)
2370                         return -ENOSPC;
2371                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2372                 goto search;
2373         }
2374
2375         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2376         if (!entry)
2377                 return -ENOSPC;
2378
2379 search:
2380         node = &entry->offset_index;
2381         do {
2382                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2383                 node = rb_next(&entry->offset_index);
2384                 if (!entry->bitmap)
2385                         continue;
2386                 if (entry->bytes < min_bytes)
2387                         continue;
2388                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2389                                            bytes, min_bytes);
2390         } while (ret && node);
2391
2392         return ret;
2393 }
2394
2395 /*
2396  * here we try to find a cluster of blocks in a block group.  The goal
2397  * is to find at least bytes free and up to empty_size + bytes free.
2398  * We might not find them all in one contiguous area.
2399  *
2400  * returns zero and sets up cluster if things worked out, otherwise
2401  * it returns -ENOSPC
2402  */
2403 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2404                              struct btrfs_root *root,
2405                              struct btrfs_block_group_cache *block_group,
2406                              struct btrfs_free_cluster *cluster,
2407                              u64 offset, u64 bytes, u64 empty_size)
2408 {
2409         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2410         struct list_head bitmaps;
2411         struct btrfs_free_space *entry, *tmp;
2412         u64 min_bytes;
2413         int ret;
2414
2415         /* for metadata, allow allocations with more holes */
2416         if (btrfs_test_opt(root, SSD_SPREAD)) {
2417                 min_bytes = bytes + empty_size;
2418         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2419                 /*
2420                  * we want to do larger allocations when we are
2421                  * flushing out the delayed refs; it helps prevent
2422                  * making more work as we go along.
2423                  */
2424                 if (trans->transaction->delayed_refs.flushing)
2425                         min_bytes = max(bytes, (bytes + empty_size) >> 1);
2426                 else
2427                         min_bytes = max(bytes, (bytes + empty_size) >> 4);
2428         } else
2429                 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2430
2431         spin_lock(&ctl->tree_lock);
2432
2433         /*
2434          * If we know we don't have enough space to make a cluster don't even
2435          * bother doing all the work to try and find one.
2436          */
2437         if (ctl->free_space < min_bytes) {
2438                 spin_unlock(&ctl->tree_lock);
2439                 return -ENOSPC;
2440         }
2441
2442         spin_lock(&cluster->lock);
2443
2444         /* someone already found a cluster, hooray */
2445         if (cluster->block_group) {
2446                 ret = 0;
2447                 goto out;
2448         }
2449
2450         INIT_LIST_HEAD(&bitmaps);
2451         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2452                                       bytes, min_bytes);
2453         if (ret)
2454                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2455                                            offset, bytes, min_bytes);
2456
2457         /* Clear our temporary list */
2458         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
2459                 list_del_init(&entry->list);
2460
2461         if (!ret) {
2462                 atomic_inc(&block_group->count);
2463                 list_add_tail(&cluster->block_group_list,
2464                               &block_group->cluster_list);
2465                 cluster->block_group = block_group;
2466         }
2467 out:
2468         spin_unlock(&cluster->lock);
2469         spin_unlock(&ctl->tree_lock);
2470
2471         return ret;
2472 }
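
/*
 * Cluster usage sketch (hypothetical helper; the in-kernel extent
 * allocator follows a similar pattern): refill the cluster under
 * refill_lock, carve the allocation out of it, and hand anything left
 * back if it didn't work out.
 */
static inline u64 example_cluster_alloc(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
                                        struct btrfs_block_group_cache *bg,
                                        struct btrfs_free_cluster *cluster,
                                        u64 offset, u64 bytes, u64 empty_size)
{
        u64 start = 0;

        spin_lock(&cluster->refill_lock);
        if (!btrfs_find_space_cluster(trans, root, bg, cluster,
                                      offset, bytes, empty_size))
                start = btrfs_alloc_from_cluster(bg, cluster, bytes, offset);
        spin_unlock(&cluster->refill_lock);

        if (!start)
                btrfs_return_cluster_to_free_space(bg, cluster);
        return start;
}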
2473
2474 /*
2475  * simple code to zero out a cluster
2476  */
2477 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2478 {
2479         spin_lock_init(&cluster->lock);
2480         spin_lock_init(&cluster->refill_lock);
2481         cluster->root = RB_ROOT;
2482         cluster->max_size = 0;
2483         INIT_LIST_HEAD(&cluster->block_group_list);
2484         cluster->block_group = NULL;
2485 }
2486
2487 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2488                            u64 *trimmed, u64 start, u64 end, u64 minlen)
2489 {
2490         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2491         struct btrfs_free_space *entry = NULL;
2492         struct btrfs_fs_info *fs_info = block_group->fs_info;
2493         u64 bytes = 0;
2494         u64 actually_trimmed;
2495         int ret = 0;
2496
2497         *trimmed = 0;
2498
2499         while (start < end) {
2500                 spin_lock(&ctl->tree_lock);
2501
2502                 if (ctl->free_space < minlen) {
2503                         spin_unlock(&ctl->tree_lock);
2504                         break;
2505                 }
2506
2507                 entry = tree_search_offset(ctl, start, 0, 1);
2508                 if (!entry)
2509                         entry = tree_search_offset(ctl,
2510                                                    offset_to_bitmap(ctl, start),
2511                                                    1, 1);
2512
2513                 if (!entry || entry->offset >= end) {
2514                         spin_unlock(&ctl->tree_lock);
2515                         break;
2516                 }
2517
2518                 if (entry->bitmap) {
2519                         ret = search_bitmap(ctl, entry, &start, &bytes);
2520                         if (!ret) {
2521                                 if (start >= end) {
2522                                         spin_unlock(&ctl->tree_lock);
2523                                         break;
2524                                 }
2525                                 bytes = min(bytes, end - start);
2526                                 bitmap_clear_bits(ctl, entry, start, bytes);
2527                                 if (entry->bytes == 0)
2528                                         free_bitmap(ctl, entry);
2529                         } else {
2530                                 start = entry->offset + BITS_PER_BITMAP *
2531                                         block_group->sectorsize;
2532                                 spin_unlock(&ctl->tree_lock);
2533                                 ret = 0;
2534                                 continue;
2535                         }
2536                 } else {
2537                         start = entry->offset;
2538                         bytes = min(entry->bytes, end - start);
2539                         unlink_free_space(ctl, entry);
2540                         kmem_cache_free(btrfs_free_space_cachep, entry);
2541                 }
2542
2543                 spin_unlock(&ctl->tree_lock);
2544
2545                 if (bytes >= minlen) {
2546                         struct btrfs_space_info *space_info;
2547                         int update = 0;
2548
2549                         space_info = block_group->space_info;
2550                         spin_lock(&space_info->lock);
2551                         spin_lock(&block_group->lock);
2552                         if (!block_group->ro) {
2553                                 block_group->reserved += bytes;
2554                                 space_info->bytes_reserved += bytes;
2555                                 update = 1;
2556                         }
2557                         spin_unlock(&block_group->lock);
2558                         spin_unlock(&space_info->lock);
2559
2560                         ret = btrfs_error_discard_extent(fs_info->extent_root,
2561                                                          start,
2562                                                          bytes,
2563                                                          &actually_trimmed);
2564
2565                         btrfs_add_free_space(block_group, start, bytes);
2566                         if (update) {
2567                                 spin_lock(&space_info->lock);
2568                                 spin_lock(&block_group->lock);
2569                                 if (block_group->ro)
2570                                         space_info->bytes_readonly += bytes;
2571                                 block_group->reserved -= bytes;
2572                                 space_info->bytes_reserved -= bytes;
2573                                 spin_unlock(&space_info->lock);
2574                                 spin_unlock(&block_group->lock);
2575                         }
2576
2577                         if (ret)
2578                                 break;
2579                         *trimmed += actually_trimmed;
2580                 }
2581                 start += bytes;
2582                 bytes = 0;
2583
2584                 if (fatal_signal_pending(current)) {
2585                         ret = -ERESTARTSYS;
2586                         break;
2587                 }
2588
2589                 cond_resched();
2590         }
2591
2592         return ret;
2593 }
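
/*
 * Trim usage sketch (hypothetical helper): a FITRIM-style caller walks
 * the block group's logical range and accumulates what was actually
 * discarded.
 */
static inline int example_trim_group(struct btrfs_block_group_cache *bg,
                                     u64 minlen, u64 *total_trimmed)
{
        u64 trimmed = 0;
        int ret;

        ret = btrfs_trim_block_group(bg, &trimmed, bg->key.objectid,
                                     bg->key.objectid + bg->key.offset,
                                     minlen);
        *total_trimmed += trimmed;
        return ret;     /* -ERESTARTSYS if a fatal signal interrupted us */
}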
2594
2595 /*
2596  * Find the left-most item in the cache tree, and then return the
2597  * smallest inode number in the item.
2598  *
2599  * Note: the returned inode number may not be the smallest one in
2600  * the tree, if the left-most item is a bitmap.
2601  */
2602 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
2603 {
2604         struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
2605         struct btrfs_free_space *entry = NULL;
2606         u64 ino = 0;
2607
2608         spin_lock(&ctl->tree_lock);
2609
2610         if (RB_EMPTY_ROOT(&ctl->free_space_offset))
2611                 goto out;
2612
2613         entry = rb_entry(rb_first(&ctl->free_space_offset),
2614                          struct btrfs_free_space, offset_index);
2615
2616         if (!entry->bitmap) {
2617                 ino = entry->offset;
2618
2619                 unlink_free_space(ctl, entry);
2620                 entry->offset++;
2621                 entry->bytes--;
2622                 if (!entry->bytes)
2623                         kmem_cache_free(btrfs_free_space_cachep, entry);
2624                 else
2625                         link_free_space(ctl, entry);
2626         } else {
2627                 u64 offset = 0;
2628                 u64 count = 1;
2629                 int ret;
2630
2631                 ret = search_bitmap(ctl, entry, &offset, &count);
2632                 BUG_ON(ret);
2633
2634                 ino = offset;
2635                 bitmap_clear_bits(ctl, entry, offset, 1);
2636                 if (entry->bytes == 0)
2637                         free_bitmap(ctl, entry);
2638         }
2639 out:
2640         spin_unlock(&ctl->tree_lock);
2641
2642         return ino;
2643 }
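
/*
 * Example of the caveat above: if the free-ino tree holds a bitmap entry
 * at offset 0 whose first set bit is ino 1000 plus a plain extent entry
 * [300, +10), the left-most item is the bitmap, so this returns 1000
 * even though ino 300 is also free.  Callers only need "some free inode
 * number", so that is fine.
 */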
2644
2645 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
2646                                     struct btrfs_path *path)
2647 {
2648         struct inode *inode = NULL;
2649
2650         spin_lock(&root->cache_lock);
2651         if (root->cache_inode)
2652                 inode = igrab(root->cache_inode);
2653         spin_unlock(&root->cache_lock);
2654         if (inode)
2655                 return inode;
2656
2657         inode = __lookup_free_space_inode(root, path, 0);
2658         if (IS_ERR(inode))
2659                 return inode;
2660
2661         spin_lock(&root->cache_lock);
2662         if (!btrfs_fs_closing(root->fs_info))
2663                 root->cache_inode = igrab(inode);
2664         spin_unlock(&root->cache_lock);
2665
2666         return inode;
2667 }
2668
2669 int create_free_ino_inode(struct btrfs_root *root,
2670                           struct btrfs_trans_handle *trans,
2671                           struct btrfs_path *path)
2672 {
2673         return __create_free_space_inode(root, trans, path,
2674                                          BTRFS_FREE_INO_OBJECTID, 0);
2675 }
2676
2677 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2678 {
2679         struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
2680         struct btrfs_path *path;
2681         struct inode *inode;
2682         int ret = 0;
2683         u64 root_gen = btrfs_root_generation(&root->root_item);
2684
2685         if (!btrfs_test_opt(root, INODE_MAP_CACHE))
2686                 return 0;
2687
2688         /*
2689          * If we're unmounting then just return, since this does a search on the
2690          * normal root and not the commit root and we could deadlock.
2691          */
2692         if (btrfs_fs_closing(fs_info))
2693                 return 0;
2694
2695         path = btrfs_alloc_path();
2696         if (!path)
2697                 return 0;
2698
2699         inode = lookup_free_ino_inode(root, path);
2700         if (IS_ERR(inode))
2701                 goto out;
2702
2703         if (root_gen != BTRFS_I(inode)->generation)
2704                 goto out_put;
2705
2706         ret = __load_free_space_cache(root, inode, ctl, path, 0);
2707
2708         if (ret < 0)
2709                 printk(KERN_ERR "btrfs: failed to load free ino cache for "
2710                        "root %llu\n", root->root_key.objectid);
2711 out_put:
2712         iput(inode);
2713 out:
2714         btrfs_free_path(path);
2715         return ret;
2716 }
2717
2718 int btrfs_write_out_ino_cache(struct btrfs_root *root,
2719                               struct btrfs_trans_handle *trans,
2720                               struct btrfs_path *path)
2721 {
2722         struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
2723         struct inode *inode;
2724         int ret;
2725
2726         if (!btrfs_test_opt(root, INODE_MAP_CACHE))
2727                 return 0;
2728
2729         inode = lookup_free_ino_inode(root, path);
2730         if (IS_ERR(inode))
2731                 return 0;
2732
2733         ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2734         if (ret) {
2735                 btrfs_delalloc_release_metadata(inode, inode->i_size);
2736 #ifdef DEBUG
2737                 printk(KERN_ERR "btrfs: failed to write free ino cache "
2738                        "for root %llu\n", root->root_key.objectid);
2739 #endif
2740         }
2741
2742         iput(inode);
2743         return ret;
2744 }