/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "inode-map.h"
#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
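/*
 * Worked example (assuming 4K pages and a 4K sectorsize): one bitmap is a
 * single page, so BITS_PER_BITMAP = 4096 * 8 = 32768 bits, and one page of
 * bitmap memory covers 32768 * 4096 bytes = 128MB of the block group.
 */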
static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
	if (!inode)
		return ERR_PTR(-ENOENT);
	if (is_bad_inode(inode)) {
		iput(inode);
		return ERR_PTR(-ENOENT);
	}

	/* Never recurse into the fs for reclaim against this mapping. */
	inode->i_mapping->flags &= ~__GFP_FS;

	return inode;
}
struct inode *lookup_free_space_inode(struct btrfs_root *root,
				      struct btrfs_block_group_cache
				      *block_group, struct btrfs_path *path)
{
	struct inode *inode = NULL;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(root, path,
					  block_group->key.objectid);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) {
		printk(KERN_INFO "Old style space inode found, converting.\n");
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!block_group->iref) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}
int __create_free_space_inode(struct btrfs_root *root,
			      struct btrfs_trans_handle *trans,
			      struct btrfs_path *path, u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
			     sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
			      BTRFS_INODE_PREALLOC);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}
	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;
}
int create_free_space_inode(struct btrfs_root *root,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group_cache *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_find_free_objectid(root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(root, trans, path, ino,
					 block_group->key.objectid);
}
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
				    struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct inode *inode)
{
	struct btrfs_block_rsv *rsv;
	loff_t oldsize;
	int ret = 0;

	rsv = trans->block_rsv;
	trans->block_rsv = root->orphan_block_rsv;
	ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0);
	if (ret)
		return ret;

	oldsize = i_size_read(inode);
	btrfs_i_size_write(inode, 0);
	truncate_pagecache(inode, oldsize, 0);

	/*
	 * We don't need an orphan item because truncating the free space cache
	 * will never be split across transactions.
	 */
	ret = btrfs_truncate_inode_items(trans, root, inode,
					 0, BTRFS_EXTENT_DATA_KEY);

	trans->block_rsv = rsv;
	if (ret)
		return ret;

	return btrfs_update_inode(trans, root, inode);
}
static int readahead_cache(struct inode *inode)
{
	struct file_ra_state *ra;
	unsigned long last_index;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -ENOMEM;
	file_ra_state_init(ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
	kfree(ra);
	return 0;
}
struct io_ctl {
	void *cur, *orig;
	struct page *page;
	struct page **pages;
	struct btrfs_root *root;
	unsigned long size;
	int index;
	int num_pages;
};

static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
		       struct btrfs_root *root)
{
	memset(io_ctl, 0, sizeof(struct io_ctl));
	io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
		PAGE_CACHE_SHIFT;
	io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
				GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;
	io_ctl->root = root;
	return 0;
}
static void io_ctl_free(struct io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
}

static void io_ctl_unmap_page(struct io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		kunmap(io_ctl->page);
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}
static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
{
	WARN_ON(io_ctl->cur);
	BUG_ON(io_ctl->index >= io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = kmap(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_CACHE_SIZE;
	if (clear)
		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
}
static void io_ctl_drop_pages(struct io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			ClearPageChecked(io_ctl->pages[i]);
			unlock_page(io_ctl->pages[i]);
			page_cache_release(io_ctl->pages[i]);
		}
	}
}
static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
				int uptodate)
{
	struct page *page;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}
		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (!PageUptodate(page)) {
				printk(KERN_ERR "btrfs: error reading free "
				       "space cache\n");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	return 0;
}
static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
{
	__le64 *val;

	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the first 64 bits to make sure there's a bogus crc for old
	 * kernels.
	 */
	io_ctl->cur += sizeof(u64);

	val = io_ctl->cur;
	*val = cpu_to_le64(generation);
	io_ctl->cur += sizeof(u64);
	io_ctl->size -= sizeof(u64) * 2;
}
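/*
 * Sketch of the resulting first-page layout, as implied above: a u64 of crc
 * space (left bogus here for old kernels), then the u64 cache generation,
 * then the packed btrfs_free_space_entry records fill the rest of the page.
 */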
static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
{
	__le64 *gen;

	io_ctl_map_page(io_ctl, 0);

	/* Skip the bogus crc area */
	io_ctl->cur += sizeof(u64);

	gen = io_ctl->cur;
	if (le64_to_cpu(*gen) != generation) {
		printk_ratelimited(KERN_ERR "btrfs: space cache generation "
				   "(%Lu) does not match inode (%Lu)\n", *gen,
				   generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	io_ctl->size -= sizeof(u64) * 2;
	return 0;
}
static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	entry->offset = cpu_to_le64(offset);
	entry->bytes = cpu_to_le64(bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	/*
	 * index == 1 means the current page is 0, we need to generate a bogus
	 * crc for older kernels.
	 */
	if (io_ctl->index == 1) {
		u32 *tmp;
		u32 crc = ~(u32)0;

		crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64),
				      crc, PAGE_CACHE_SIZE - sizeof(u64));
		btrfs_csum_final(crc, (char *)&crc);
		tmp = io_ctl->orig;
		*tmp = crc;
	}

	io_ctl_unmap_page(io_ctl);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}
static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_unmap_page(io_ctl);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
	io_ctl_unmap_page(io_ctl);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}
static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
{
	io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_unmap_page(io_ctl);
	}
}
static u8 io_ctl_read_entry(struct io_ctl *io_ctl,
			    struct btrfs_free_space *entry)
{
	struct btrfs_free_space_entry *e;
	u8 type;

	e = io_ctl->cur;
	entry->offset = le64_to_cpu(e->offset);
	entry->bytes = le64_to_cpu(e->bytes);
	type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return type;

	io_ctl_unmap_page(io_ctl);

	if (io_ctl->index >= io_ctl->num_pages)
		return type;

	io_ctl_map_page(io_ctl, 0);
	return type;
}
static void io_ctl_read_bitmap(struct io_ctl *io_ctl,
			       struct btrfs_free_space *entry)
{
	BUG_ON(!io_ctl->cur);
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_unmap_page(io_ctl);
		io_ctl_map_page(io_ctl, 0);
	}
	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
	io_ctl_unmap_page(io_ctl);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
}
int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
			    struct btrfs_free_space_ctl *ctl,
			    struct btrfs_path *path, u64 offset)
{
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct io_ctl io_ctl;
	struct btrfs_key key;
	struct btrfs_free_space *e, *n;
	struct list_head bitmaps;
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u8 type;
	int ret = 0;

	INIT_LIST_HEAD(&bitmaps);

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode))
		return 0;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return 0;
	else if (ret > 0) {
		btrfs_release_path(path);
		return 0;
	}

	ret = -1;

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_release_path(path);

	if (BTRFS_I(inode)->generation != generation) {
		printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
		       " not match free space cache generation (%llu)\n",
		       (unsigned long long)BTRFS_I(inode)->generation,
		       (unsigned long long)generation);
		return 0;
	}

	if (!num_entries)
		return 0;

	io_ctl_init(&io_ctl, inode, root);
	ret = readahead_cache(inode);
	if (ret)
		goto out;

	ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
	if (ret)
		goto out;

	ret = io_ctl_check_generation(&io_ctl, generation);
	if (ret)
		goto free_cache;

	while (num_entries) {
		e = kmem_cache_zalloc(btrfs_free_space_cachep,
				      GFP_NOFS);
		if (!e)
			goto free_cache;

		type = io_ctl_read_entry(&io_ctl, e);
		if (!e->bytes) {
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (type == BTRFS_FREE_SPACE_EXTENT) {
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				printk(KERN_ERR "Duplicate entries in "
				       "free space cache, dumping\n");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
		} else {
			BUG_ON(!num_bitmaps);
			num_bitmaps--;
			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
			if (!e->bitmap) {
				kmem_cache_free(
					btrfs_free_space_cachep, e);
				goto free_cache;
			}
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			ctl->total_bitmaps++;
			ctl->op->recalc_thresholds(ctl);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				printk(KERN_ERR "Duplicate entries in "
				       "free space cache, dumping\n");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
			list_add_tail(&e->list, &bitmaps);
		}

		num_entries--;
	}

	/*
	 * The bitmap payloads are stored after all the extent entries, so
	 * read them back in the order the bitmap entries were added to the
	 * cache.
	 */
	list_for_each_entry_safe(e, n, &bitmaps, list) {
		list_del_init(&e->list);
		io_ctl_read_bitmap(&io_ctl, e);
	}

	io_ctl_drop_pages(&io_ctl);
	ret = 1;
out:
	io_ctl_free(&io_ctl);
	return ret;
free_cache:
	io_ctl_drop_pages(&io_ctl);
	__btrfs_remove_free_space_cache(ctl);
	goto out;
}
int load_free_space_cache(struct btrfs_fs_info *fs_info,
			  struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_root *root = fs_info->tree_root;
	struct inode *inode;
	struct btrfs_path *path;
	int ret = 0;
	bool matched;
	u64 used = btrfs_block_group_used(&block_group->item);

	/*
	 * If we're unmounting then just return, since this does a search on
	 * the normal root and not the commit root and we could deadlock.
	 */
	if (btrfs_fs_closing(fs_info))
		return 0;

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	path = btrfs_alloc_path();
	if (!path)
		return 0;

	inode = lookup_free_space_inode(root, block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
				      path, block_group->key.objectid);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	spin_lock(&ctl->tree_lock);
	matched = (ctl->free_space == (block_group->key.offset - used -
				       block_group->bytes_super));
	spin_unlock(&ctl->tree_lock);

	if (!matched) {
		__btrfs_remove_free_space_cache(ctl);
		printk(KERN_ERR "block group %llu has the wrong amount of free "
		       "space\n", block_group->key.objectid);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		ret = 0;

		printk(KERN_ERR "btrfs: failed to load free space cache "
		       "for block group %llu\n", block_group->key.objectid);
	}

	iput(inode);
	return ret;
}
/*
 * __btrfs_write_out_cache - write out cached info to an inode
 * @root - the root the inode belongs to
 * @ctl - the free space cache we are going to write out
 * @block_group - the block_group for this cache if it belongs to a block_group
 * @trans - the trans handle
 * @path - the path to use
 * @offset - the offset for the key we'll insert
 *
 * This function writes out a free space cache struct to disk for quick
 * recovery on mount.  This will return 0 if it was successful in writing the
 * cache out, and -1 if it was not.
 */
int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
			    struct btrfs_free_space_ctl *ctl,
			    struct btrfs_block_group_cache *block_group,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_path *path, u64 offset)
{
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct rb_node *node;
	struct list_head *pos, *n;
	struct extent_state *cached_state = NULL;
	struct btrfs_free_cluster *cluster = NULL;
	struct extent_io_tree *unpin = NULL;
	struct io_ctl io_ctl;
	struct list_head bitmap_list;
	struct btrfs_key key;
	u64 start, end, len;
	int entries = 0;
	int bitmaps = 0;
	int ret;
	int err = -1;

	INIT_LIST_HEAD(&bitmap_list);

	if (!i_size_read(inode))
		return -1;

	filemap_write_and_wait(inode->i_mapping);
	btrfs_wait_ordered_range(inode, inode->i_size &
				 ~(root->sectorsize - 1), (u64)-1);

	io_ctl_init(&io_ctl, inode, root);

	/* Get the cluster for this block_group if it exists */
	if (block_group && !list_empty(&block_group->cluster_list))
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);

	/*
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one.
	 */
	unpin = root->fs_info->pinned_extents;

	/* Lock all pages first so we can lock the extent safely. */
	io_ctl_prepare_pages(&io_ctl, inode, 0);

	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
			 0, &cached_state, GFP_NOFS);

	/*
	 * When searching for pinned extents, we need to start at our start
	 * offset.
	 */
	if (block_group)
		start = block_group->key.objectid;

	node = rb_first(&ctl->free_space_offset);
	if (!node && cluster) {
		node = rb_first(&cluster->root);
		cluster = NULL;
	}

	io_ctl_set_generation(&io_ctl, trans->transid);

	/* Write out the extent entries */
	while (node) {
		struct btrfs_free_space *e;

		e = rb_entry(node, struct btrfs_free_space, offset_index);
		entries++;

		ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
				       e->bitmap);
		if (ret)
			goto out_nospc;

		if (e->bitmap) {
			list_add_tail(&e->list, &bitmap_list);
			bitmaps++;
		}
		node = rb_next(node);
		if (!node && cluster) {
			node = rb_first(&cluster->root);
			cluster = NULL;
		}
	}

	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space.
	 */
	while (block_group && (start < block_group->key.objectid +
			       block_group->key.offset)) {
		ret = find_first_extent_bit(unpin, start, &start, &end,
					    EXTENT_DIRTY);
		if (ret) {
			ret = 0;
			break;
		}

		/* This pinned extent is out of our range */
		if (start >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		len = block_group->key.objectid +
			block_group->key.offset - start;
		len = min(len, end + 1 - start);

		entries++;
		ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
		if (ret)
			goto out_nospc;

		start = end + 1;
	}

	/* Write out the bitmaps */
	list_for_each_safe(pos, n, &bitmap_list) {
		struct btrfs_free_space *entry =
			list_entry(pos, struct btrfs_free_space, list);

		ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
		if (ret)
			goto out_nospc;
		list_del_init(&entry->list);
	}

	/* Zero out the rest of the pages just to make sure */
	io_ctl_zero_remaining_pages(&io_ctl);

	ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
				0, i_size_read(inode), &cached_state);
	io_ctl_drop_pages(&io_ctl);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);

	if (ret)
		goto out;

	BTRFS_I(inode)->generation = trans->transid;

	filemap_write_and_wait(inode->i_mapping);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
		goto out;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		BUG_ON(!path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != offset) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
					 inode->i_size - 1,
					 EXTENT_DIRTY | EXTENT_DELALLOC |
					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
					 GFP_NOFS);
			btrfs_release_path(path);
			goto out;
		}
	}
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	err = 0;
out:
	io_ctl_free(&io_ctl);
	if (err) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
	btrfs_update_inode(trans, root, inode);
	return err;

out_nospc:
	list_for_each_safe(pos, n, &bitmap_list) {
		struct btrfs_free_space *entry =
			list_entry(pos, struct btrfs_free_space, list);
		list_del_init(&entry->list);
	}
	io_ctl_drop_pages(&io_ctl);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
	goto out;
}
int btrfs_write_out_cache(struct btrfs_root *root,
			  struct btrfs_trans_handle *trans,
			  struct btrfs_block_group_cache *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	root = root->fs_info->tree_root;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(root, block_group, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
				      path, block_group->key.objectid);
	if (ret) {
		btrfs_delalloc_release_metadata(inode, inode->i_size);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);
		ret = 0;

		printk(KERN_ERR "btrfs: failed to write free space cache "
		       "for block group %llu\n", block_group->key.objectid);
	}

	iput(inode);
	return ret;
}
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	BUG_ON(offset < bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}
static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}
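/*
 * Worked example (assuming 4K pages, ctl->unit = 4096 and ctl->start = 1GB,
 * so bytes_per_bitmap = 128MB): an offset of 1GB + 200MB rounds down to the
 * bitmap that starts at 1GB + 128MB.
 */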
static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset.  If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap.  So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically.  If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (bitmap) {
				if (info->bitmap)
					return -EEXIST;
				p = &(*p)->rb_right;
			} else {
				if (!info->bitmap)
					return -EEXIST;
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}
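/*
 * Sketch of the resulting order: if an extent entry and a bitmap entry both
 * start at offset X, an in-order walk of the tree visits the extent entry
 * first and the bitmap entry second, so extent-based allocation wins when
 * the offsets tie.
 */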
/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
		   u64 offset, int bitmap_only, int fuzzy)
{
	struct rb_node *n = ctl->free_space_offset.rb_node;
	struct btrfs_free_space *entry, *prev = NULL;

	/* find entry that is closest to the 'offset' */
	while (1) {
		if (!n) {
			entry = NULL;
			break;
		}

		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		prev = entry;

		if (offset < entry->offset)
			n = n->rb_left;
		else if (offset > entry->offset)
			n = n->rb_right;
		else
			break;
	}

	if (bitmap_only) {
		if (!entry)
			return NULL;
		if (entry->bitmap)
			return entry;

		/*
		 * bitmap entry and extent entry may share same offset,
		 * in that case, bitmap entry comes after extent entry.
		 */
		n = rb_next(n);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->offset != offset)
			return NULL;

		WARN_ON(!entry->bitmap);
		return entry;
	} else if (entry) {
		if (entry->bitmap) {
			/*
			 * if previous extent entry covers the offset,
			 * we should return it instead of the bitmap entry
			 */
			n = &entry->offset_index;
			while (1) {
				n = rb_prev(n);
				if (!n)
					break;
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
				if (!prev->bitmap) {
					if (prev->offset + prev->bytes > offset)
						entry = prev;
					break;
				}
			}
		}
		return entry;
	}

	if (!prev)
		return NULL;

	/* find last entry before the 'offset' */
	entry = prev;
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			entry = rb_entry(n, struct btrfs_free_space,
					 offset_index);
			BUG_ON(entry->offset > offset);
		} else {
			if (fuzzy)
				return entry;
			else
				return NULL;
		}
	}

	if (entry->bitmap) {
		n = &entry->offset_index;
		while (1) {
			n = rb_prev(n);
			if (!n)
				break;
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
			if (!prev->bitmap) {
				if (prev->offset + prev->bytes > offset)
					return prev;
				break;
			}
		}
		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
			return entry;
	} else if (entry->offset + entry->bytes > offset)
		return entry;

	if (!fuzzy)
		return NULL;

	while (1) {
		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
			    ctl->unit > offset)
				break;
		} else {
			if (entry->offset + entry->bytes > offset)
				break;
		}

		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
	}
	return entry;
}
static void
__unlink_free_space(struct btrfs_free_space_ctl *ctl,
		    struct btrfs_free_space *info)
{
	rb_erase(&info->offset_index, &ctl->free_space_offset);
	ctl->free_extents--;
}

static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	__unlink_free_space(ctl, info);
	ctl->free_space -= info->bytes;
}
static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	BUG_ON(!info->bitmap && !info->bytes);
	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
				 &info->offset_index, (info->bitmap != NULL));
	if (ret)
		return ret;

	ctl->free_space += info->bytes;
	ctl->free_extents++;
	return ret;
}
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group_cache *block_group = ctl->private;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->key.offset;
	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	BUG_ON(ctl->total_bitmaps > max_bitmaps);

	/*
	 * The goal is to keep the total amount of memory used per 1gb of space
	 * at or below 32k, so we need to adjust how much memory we allow to be
	 * used by extent based free space tracking
	 */
	if (size < 1024 * 1024 * 1024)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG *
			div64_u64(size, 1024 * 1024 * 1024);

	/*
	 * we want to account for 1 more bitmap than what we have so we can make
	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
	 * we add more bitmaps.
	 */
	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;

	if (bitmap_bytes >= max_bytes) {
		ctl->extents_thresh = 0;
		return;
	}

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));

	ctl->extents_thresh =
		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}
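/*
 * Worked example: for a 4GB block group, max_bytes = 4 * 32K = 128K.  With
 * 3 bitmaps in use (and 4K pages assumed), bitmap_bytes = 4 pages = 16K,
 * leaving extent_bytes = 112K, which is then capped at 64K (half of
 * max_bytes); the extent threshold becomes 64K divided by
 * sizeof(struct btrfs_free_space).
 */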
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       u64 offset, u64 bytes)
{
	unsigned long start, count;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	BUG_ON(start + count > BITS_PER_BITMAP);

	bitmap_clear(info->bitmap, start, count);

	info->bytes -= bytes;
}

static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
{
	__bitmap_clear_bits(ctl, info, offset, bytes);
	ctl->free_space -= bytes;
}
static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
{
	unsigned long start, count;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	BUG_ON(start + count > BITS_PER_BITMAP);

	bitmap_set(info->bitmap, start, count);

	info->bytes += bytes;
	ctl->free_space += bytes;
}
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes)
{
	unsigned long found_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;

	i = offset_to_bit(bitmap_info->offset, ctl->unit,
			  max_t(u64, *offset, bitmap_info->offset));
	bits = bytes_to_bits(*bytes, ctl->unit);

	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(bitmap_info->bitmap,
					       BITS_PER_BITMAP, i);
		if ((next_zero - i) >= bits) {
			found_bits = next_zero - i;
			break;
		}
		i = next_zero;
	}

	if (found_bits) {
		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
		*bytes = (u64)(found_bits) * ctl->unit;
		return 0;
	}

	return -1;
}
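/*
 * Example of the in/out contract (a sketch): with ctl->unit == 4096 and a
 * bitmap whose first run of set (free) bits is bits 10-19, a caller passing
 * *bytes = 16384 (4 bits) gets back 0, with *offset advanced to
 * bitmap_info->offset + 10 * 4096 and *bytes widened to the whole 10-bit
 * run, 40960.
 */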
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret;

	if (!ctl->free_space_offset.rb_node)
		return NULL;

	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
	if (!entry)
		return NULL;

	for (node = &entry->offset_index; node; node = rb_next(node)) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (entry->bytes < *bytes)
			continue;

		if (entry->bitmap) {
			ret = search_bitmap(ctl, entry, offset, bytes);
			if (!ret)
				return entry;
			continue;
		}

		*offset = entry->offset;
		*bytes = entry->bytes;
		return entry;
	}

	return NULL;
}
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info, u64 offset)
{
	info->offset = offset_to_bitmap(ctl, offset);
	info->bytes = 0;
	link_free_space(ctl, info);
	ctl->total_bitmaps++;

	ctl->op->recalc_thresholds(ctl);
}
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info)
{
	unlink_free_space(ctl, bitmap_info);
	kfree(bitmap_info->bitmap);
	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
	ctl->total_bitmaps--;
	ctl->op->recalc_thresholds(ctl);
}
static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *bitmap_info,
				       u64 *offset, u64 *bytes)
{
	u64 end;
	u64 search_start, search_bytes;
	int ret;

again:
	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;

	/*
	 * XXX - this can go away after a few releases.
	 *
	 * since the only user of btrfs_remove_free_space is the tree logging
	 * stuff, and the only way to test that is under crash conditions, we
	 * want to have this debug stuff here just in case something's not
	 * working.  Search the bitmap for the space we are trying to use to
	 * make sure it's actually there.  If it's not there then we need to
	 * stop because something has gone wrong.
	 */
	search_start = *offset;
	search_bytes = *bytes;
	search_bytes = min(search_bytes, end - search_start + 1);
	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
	BUG_ON(ret < 0 || search_start != *offset);

	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
		bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
		*bytes -= end - *offset + 1;
		*offset = end + 1;
	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
		bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
		*bytes = 0;
	}

	if (*bytes) {
		struct rb_node *next = rb_next(&bitmap_info->offset_index);
		if (!bitmap_info->bytes)
			free_bitmap(ctl, bitmap_info);

		/*
		 * no entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */
		if (!next)
			return -EINVAL;

		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * if the next entry isn't a bitmap we need to return to let the
		 * extent stuff do its work.
		 */
		if (!bitmap_info->bitmap)
			return -EAGAIN;

		/*
		 * Ok the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space stuff, so
		 * look for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
		search_bytes = *bytes;
		ret = search_bitmap(ctl, bitmap_info, &search_start,
				    &search_bytes);
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

		goto again;
	} else if (!bitmap_info->bytes)
		free_bitmap(ctl, bitmap_info);

	return 0;
}
static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
			       struct btrfs_free_space *info, u64 offset,
			       u64 bytes)
{
	u64 bytes_to_set = 0;
	u64 end;

	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);

	bytes_to_set = min(end - offset, bytes);

	bitmap_set_bits(ctl, info, offset, bytes_to_set);

	return bytes_to_set;
}
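/*
 * Example (4K pages and a 4K unit assumed, so one bitmap spans 128MB): if a
 * bitmap covers [X, X + 128MB) and a caller adds 8MB at X + 124MB, only the
 * 4MB up to the bitmap's end is set here; the caller loops and puts the
 * remaining 4MB into the next bitmap.
 */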
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		       struct btrfs_free_space *info)
{
	struct btrfs_block_group_cache *block_group = ctl->private;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap
	 */
	if (ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
		if (info->bytes <= block_group->sectorsize * 4) {
			if (ctl->free_extents * 2 <= ctl->extents_thresh)
				return false;
		} else {
			return false;
		}
	}

	/*
	 * some block groups are so tiny they can't be enveloped by a bitmap, so
	 * don't even bother to create a bitmap for this
	 */
	if (BITS_PER_BITMAP * block_group->sectorsize >
	    block_group->key.offset)
		return false;

	return true;
}
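/*
 * Worked example (4K pages and a 4K sectorsize assumed):
 * BITS_PER_BITMAP * 4096 = 128MB, so a block group smaller than 128MB never
 * gets bitmaps; and an extent of at most 16K (4 sectors) still goes in as an
 * extent entry while fewer than half of the extent slots are in use.
 */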
static struct btrfs_free_space_op free_space_op = {
	.recalc_thresholds	= recalculate_thresholds,
	.use_bitmap		= use_bitmap,
};
static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	struct btrfs_block_group_cache *block_group = NULL;
	int added = 0;
	u64 bytes, offset, bytes_added;
	int ret;

	bytes = info->bytes;
	offset = info->offset;

	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

	if (ctl->op == &free_space_op)
		block_group = ctl->private;
again:
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
	 */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		struct btrfs_free_cluster *cluster;
		struct rb_node *node;
		struct btrfs_free_space *entry;

		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
		spin_lock(&cluster->lock);
		node = rb_first(&cluster->root);
		if (!node) {
			spin_unlock(&cluster->lock);
			goto no_cluster_bitmap;
		}

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!entry->bitmap) {
			spin_unlock(&cluster->lock);
			goto no_cluster_bitmap;
		}

		if (entry->offset == offset_to_bitmap(ctl, offset)) {
			bytes_added = add_bytes_to_bitmap(ctl, entry,
							  offset, bytes);
			bytes -= bytes_added;
			offset += bytes_added;
		}
		spin_unlock(&cluster->lock);
		if (!bytes) {
			ret = 1;
			goto out;
		}
	}

no_cluster_bitmap:
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		BUG_ON(added);
		goto new_bitmap;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
	bytes -= bytes_added;
	offset += bytes_added;
	added = 0;

	if (!bytes) {
		ret = 1;
		goto out;
	}
	goto again;

new_bitmap:
	if (info && info->bitmap) {
		add_new_bitmap(ctl, info, offset);
		added = 1;
		info = NULL;
		goto again;
	} else {
		spin_unlock(&ctl->tree_lock);

		/* no pre-allocated info, allocate a new one */
		if (!info) {
			info = kmem_cache_zalloc(btrfs_free_space_cachep,
						 GFP_NOFS);
			if (!info) {
				spin_lock(&ctl->tree_lock);
				ret = -ENOMEM;
				goto out;
			}
		}

		/* allocate the bitmap */
		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
		spin_lock(&ctl->tree_lock);
		if (!info->bitmap) {
			ret = -ENOMEM;
			goto out;
		}
		goto again;
	}

out:
	if (info) {
		if (info->bitmap)
			kfree(info->bitmap);
		kmem_cache_free(btrfs_free_space_cachep, info);
	}

	return ret;
}
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
				 struct btrfs_free_space *info, bool update_stat)
{
	struct btrfs_free_space *left_info;
	struct btrfs_free_space *right_info;
	bool merged = false;
	u64 offset = info->offset;
	u64 bytes = info->bytes;

	/*
	 * first we want to see if there is free space adjacent to the range we
	 * are adding, if there is remove that struct and add a new one to
	 * cover the entire range
	 */
	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
	if (right_info && rb_prev(&right_info->offset_index))
		left_info = rb_entry(rb_prev(&right_info->offset_index),
				     struct btrfs_free_space, offset_index);
	else
		left_info = tree_search_offset(ctl, offset - 1, 0, 0);

	if (right_info && !right_info->bitmap) {
		if (update_stat)
			unlink_free_space(ctl, right_info);
		else
			__unlink_free_space(ctl, right_info);
		info->bytes += right_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, right_info);
		merged = true;
	}

	if (left_info && !left_info->bitmap &&
	    left_info->offset + left_info->bytes == offset) {
		if (update_stat)
			unlink_free_space(ctl, left_info);
		else
			__unlink_free_space(ctl, left_info);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kmem_cache_free(btrfs_free_space_cachep, left_info);
		merged = true;
	}

	return merged;
}
int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
			   u64 offset, u64 bytes)
{
	struct btrfs_free_space *info;
	int ret = 0;

	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;

	spin_lock(&ctl->tree_lock);

	if (try_merge_free_space(ctl, info, true))
		goto link;

	/*
	 * There was no extent directly to the left or right of this new
	 * extent then we know we're going to have to allocate a new extent, so
	 * before we do that see if we need to drop this into a bitmap
	 */
	ret = insert_into_bitmap(ctl, info);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
	ret = link_free_space(ctl, info);
	if (ret)
		kmem_cache_free(btrfs_free_space_cachep, info);
out:
	spin_unlock(&ctl->tree_lock);

	if (ret) {
		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
		BUG_ON(ret == -EEXIST);
	}

	return ret;
}
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
			    u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct btrfs_free_space *next_info = NULL;
	int ret = 0;

	spin_lock(&ctl->tree_lock);

again:
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		/*
		 * oops didn't find an extent that matched the space we wanted
		 * to remove, look for a bitmap instead
		 */
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info) {
			WARN_ON(1);
			goto out_lock;
		}
	}

	if (info->bytes < bytes && rb_next(&info->offset_index)) {
		u64 end;
		next_info = rb_entry(rb_next(&info->offset_index),
				     struct btrfs_free_space,
				     offset_index);

		if (next_info->bitmap)
			end = next_info->offset +
			      BITS_PER_BITMAP * ctl->unit - 1;
		else
			end = next_info->offset + next_info->bytes;

		if (next_info->bytes < bytes ||
		    next_info->offset > offset || offset > end) {
			printk(KERN_CRIT "Found free space at %llu, size %llu,"
			       " trying to use %llu\n",
			       (unsigned long long)info->offset,
			       (unsigned long long)info->bytes,
			       (unsigned long long)bytes);
			WARN_ON(1);
			ret = -EINVAL;
			goto out_lock;
		}

		info = next_info;
	}

	if (info->bytes == bytes) {
		unlink_free_space(ctl, info);
		if (info->bitmap) {
			kfree(info->bitmap);
			ctl->total_bitmaps--;
		}
		kmem_cache_free(btrfs_free_space_cachep, info);
		goto out_lock;
	}

	if (!info->bitmap && info->offset == offset) {
		unlink_free_space(ctl, info);
		info->offset += bytes;
		info->bytes -= bytes;
		link_free_space(ctl, info);
		goto out_lock;
	}

	if (!info->bitmap && info->offset <= offset &&
	    info->offset + info->bytes >= offset + bytes) {
		u64 old_start = info->offset;
		/*
		 * we're freeing space in the middle of the info,
		 * this can happen during tree log replay
		 *
		 * first unlink the old info and then
		 * insert it again after the hole we're creating
		 */
		unlink_free_space(ctl, info);
		if (offset + bytes < info->offset + info->bytes) {
			u64 old_end = info->offset + info->bytes;

			info->offset = offset + bytes;
			info->bytes = old_end - info->offset;
			ret = link_free_space(ctl, info);
			WARN_ON(ret);
			if (ret)
				goto out_lock;
		} else {
			/* the hole we're creating ends at the end
			 * of the info struct, just free the info
			 */
			kmem_cache_free(btrfs_free_space_cachep, info);
		}
		spin_unlock(&ctl->tree_lock);

		/* step two, insert a new info struct to cover
		 * anything before the hole
		 */
		ret = btrfs_add_free_space(block_group, old_start,
					   offset - old_start);
		WARN_ON(ret);
		goto out;
	}

	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
	if (ret == -EAGAIN)
		goto again;
	BUG_ON(ret);
out_lock:
	spin_unlock(&ctl->tree_lock);
out:
	return ret;
}
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
			   u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (info->bytes >= bytes)
			count++;
		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
		       (unsigned long long)info->offset,
		       (unsigned long long)info->bytes,
		       (info->bitmap) ? "yes" : "no");
	}
	printk(KERN_INFO "block group has cluster?: %s\n",
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
	       "\n", count);
}
void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;

	spin_lock_init(&ctl->tree_lock);
	ctl->unit = block_group->sectorsize;
	ctl->start = block_group->key.objectid;
	ctl->private = block_group;
	ctl->op = &free_space_op;

	/*
	 * we only want to have 32k of ram per block group for keeping
	 * track of free space, and if we pass 1/2 of that we want to
	 * start converting things over to using bitmaps
	 */
	ctl->extents_thresh = ((1024 * 32) / 2) /
			      sizeof(struct btrfs_free_space);
}
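/*
 * Worked example (an assumption, since sizeof(struct btrfs_free_space)
 * varies by arch): at 64 bytes per entry on 64-bit, the initial threshold is
 * 16384 / 64 = 256 extent entries before bitmap conversion kicks in.
 */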
/*
 * for a given cluster, put all of its extents back into the free
 * space cache.  If the block group passed doesn't match the block group
 * pointed to by the cluster, someone else raced in and freed the
 * cluster already.  In that case, we just return without changing anything
 */
static int
__btrfs_return_cluster_to_free_space(
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	struct rb_node *node;

	spin_lock(&cluster->lock);
	if (cluster->block_group != block_group)
		goto out;

	cluster->block_group = NULL;
	cluster->window_start = 0;
	list_del_init(&cluster->block_group_list);

	node = rb_first(&cluster->root);
	while (node) {
		bool bitmap;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);

		bitmap = (entry->bitmap != NULL);
		if (!bitmap)
			try_merge_free_space(ctl, entry, false);
		tree_insert_offset(&ctl->free_space_offset,
				   entry->offset, &entry->offset_index, bitmap);
	}
	cluster->root = RB_ROOT;

out:
	spin_unlock(&cluster->lock);
	btrfs_put_block_group(block_group);
	return 0;
}
void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}
		if (need_resched()) {
			spin_unlock(&ctl->tree_lock);
			cond_resched();
			spin_lock(&ctl->tree_lock);
		}
	}
}
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	spin_lock(&ctl->tree_lock);
	__btrfs_remove_free_space_cache_locked(ctl);
	spin_unlock(&ctl->tree_lock);
}
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_cluster *cluster;
	struct list_head *head;

	spin_lock(&ctl->tree_lock);
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);
		if (need_resched()) {
			spin_unlock(&ctl->tree_lock);
			cond_resched();
			spin_lock(&ctl->tree_lock);
		}
	}
	__btrfs_remove_free_space_cache_locked(ctl);
	spin_unlock(&ctl->tree_lock);
}
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
			       u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry = NULL;
	u64 bytes_search = bytes + empty_size;
	u64 ret = 0;

	spin_lock(&ctl->tree_lock);
	entry = find_free_space(ctl, &offset, &bytes_search);
	if (!entry)
		goto out;

	ret = offset;
	if (entry->bitmap) {
		bitmap_clear_bits(ctl, entry, offset, bytes);
		if (!entry->bytes)
			free_bitmap(ctl, entry);
	} else {
		unlink_free_space(ctl, entry);
		entry->offset += bytes;
		entry->bytes -= bytes;
		if (!entry->bytes)
			kmem_cache_free(btrfs_free_space_cachep, entry);
		else
			link_free_space(ctl, entry);
	}

out:
	spin_unlock(&ctl->tree_lock);

	return ret;
}
/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
int btrfs_return_cluster_to_free_space(
			       struct btrfs_block_group_cache *block_group,
			       struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space_ctl *ctl;
	int ret;

	/* first, get a safe pointer to the block group */
	spin_lock(&cluster->lock);
	if (!block_group) {
		block_group = cluster->block_group;
		if (!block_group) {
			spin_unlock(&cluster->lock);
			return 0;
		}
	} else if (cluster->block_group != block_group) {
		/* someone else has already freed it don't redo their work */
		spin_unlock(&cluster->lock);
		return 0;
	}
	atomic_inc(&block_group->count);
	spin_unlock(&cluster->lock);

	ctl = block_group->free_space_ctl;

	/* now return any extents the cluster had on it */
	spin_lock(&ctl->tree_lock);
	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&ctl->tree_lock);

	/* finally drop our ref */
	btrfs_put_block_group(block_group);
	return ret;
}
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
				   struct btrfs_free_cluster *cluster,
				   struct btrfs_free_space *entry,
				   u64 bytes, u64 min_start)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int err;
	u64 search_start = cluster->window_start;
	u64 search_bytes = bytes;
	u64 ret = 0;

	search_start = min_start;
	search_bytes = bytes;

	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
	if (err)
		return 0;

	ret = search_start;
	__bitmap_clear_bits(ctl, entry, ret, bytes);

	return ret;
}
/*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out
 */
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster, u64 bytes,
			     u64 min_start)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	u64 ret = 0;

	spin_lock(&cluster->lock);
	if (bytes > cluster->max_size)
		goto out;

	if (cluster->block_group != block_group)
		goto out;

	node = rb_first(&cluster->root);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_free_space, offset_index);
	while (1) {
		if (entry->bytes < bytes ||
		    (!entry->bitmap && entry->offset < min_start)) {
			node = rb_next(&entry->offset_index);
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			continue;
		}

		if (entry->bitmap) {
			ret = btrfs_alloc_from_bitmap(block_group,
						      cluster, entry, bytes,
						      min_start);
			if (ret == 0) {
				node = rb_next(&entry->offset_index);
				if (!node)
					break;
				entry = rb_entry(node, struct btrfs_free_space,
						 offset_index);
				continue;
			}
		} else {
			ret = entry->offset;

			entry->offset += bytes;
			entry->bytes -= bytes;
		}

		if (entry->bytes == 0)
			rb_erase(&entry->offset_index, &cluster->root);
		break;
	}
out:
	spin_unlock(&cluster->lock);

	if (!ret)
		return 0;

	spin_lock(&ctl->tree_lock);

	ctl->free_space -= bytes;
	if (entry->bytes == 0) {
		ctl->free_extents--;
		if (entry->bitmap) {
			kfree(entry->bitmap);
			ctl->total_bitmaps--;
			ctl->op->recalc_thresholds(ctl);
		}
		kmem_cache_free(btrfs_free_space_cachep, entry);
	}

	spin_unlock(&ctl->tree_lock);

	return ret;
}
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
				struct btrfs_free_space *entry,
				struct btrfs_free_cluster *cluster,
				u64 offset, u64 bytes, u64 min_bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	unsigned long next_zero;
	unsigned long i;
	unsigned long search_bits;
	unsigned long total_bits;
	unsigned long found_bits;
	unsigned long start = 0;
	unsigned long total_found = 0;
	int ret;
	bool found = false;

	i = offset_to_bit(entry->offset, block_group->sectorsize,
			  max_t(u64, offset, entry->offset));
	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);

again:
	found_bits = 0;
	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(entry->bitmap,
					       BITS_PER_BITMAP, i);
		if (next_zero - i >= search_bits) {
			found_bits = next_zero - i;
			break;
		}
		i = next_zero;
	}

	if (!found_bits)
		return -ENOSPC;

	if (!found) {
		start = i;
		found = true;
	}

	total_found += found_bits;

	if (cluster->max_size < found_bits * block_group->sectorsize)
		cluster->max_size = found_bits * block_group->sectorsize;

	if (total_found < total_bits) {
		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
		if (i - start > total_bits * 2) {
			total_found = 0;
			cluster->max_size = 0;
			found = false;
		}
		goto again;
	}

	cluster->window_start = start * block_group->sectorsize +
		entry->offset;
	rb_erase(&entry->offset_index, &ctl->free_space_offset);
	ret = tree_insert_offset(&cluster->root, entry->offset,
				 &entry->offset_index, 1);
	BUG_ON(ret);

	return 0;
}
/*
 * This searches the block group for just extents to fill the cluster with.
 */
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
			struct btrfs_free_cluster *cluster,
			struct list_head *bitmaps, u64 offset, u64 bytes,
			u64 min_bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *first = NULL;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_free_space *prev = NULL;
	struct btrfs_free_space *last;
	struct rb_node *node;
	u64 window_start;
	u64 window_free;
	u64 max_extent;
	u64 max_gap = 128 * 1024;

	entry = tree_search_offset(ctl, offset, 0, 1);
	if (!entry)
		return -ENOSPC;

	/*
	 * We don't want bitmaps, so just move along until we find a normal
	 * extent entry.
	 */
	while (entry->bitmap) {
		if (list_empty(&entry->list))
			list_add_tail(&entry->list, bitmaps);
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}

	window_start = entry->offset;
	window_free = entry->bytes;
	max_extent = entry->bytes;
	first = entry;
	last = entry;
	prev = entry;

	while (window_free <= min_bytes) {
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);

		if (entry->bitmap) {
			if (list_empty(&entry->list))
				list_add_tail(&entry->list, bitmaps);
			continue;
		}

		/*
		 * we haven't filled the empty size and the window is
		 * very large.  reset and try again
		 */
		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
		    entry->offset - window_start > (min_bytes * 2)) {
			first = entry;
			window_start = entry->offset;
			window_free = entry->bytes;
			last = entry;
			max_extent = entry->bytes;
		} else {
			last = entry;
			window_free += entry->bytes;
			if (entry->bytes > max_extent)
				max_extent = entry->bytes;
		}
		prev = entry;
	}

	cluster->window_start = first->offset;

	node = &first->offset_index;

	/*
	 * now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree
	 */
	do {
		int ret;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		if (entry->bitmap)
			continue;

		rb_erase(&entry->offset_index, &ctl->free_space_offset);
		ret = tree_insert_offset(&cluster->root, entry->offset,
					 &entry->offset_index, 0);
		BUG_ON(ret);
	} while (node && entry != last);

	cluster->max_size = max_extent;

	return 0;
}
/*
 * This specifically looks for bitmaps that may work in the cluster, we assume
 * that we have already failed to find extents that will work.
 */
static noinline int
setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
		     struct btrfs_free_cluster *cluster,
		     struct list_head *bitmaps, u64 offset, u64 bytes,
		     u64 min_bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret = -ENOSPC;

	if (ctl->total_bitmaps == 0)
		return -ENOSPC;

	/*
	 * First check our cached list of bitmaps and see if there is an entry
	 * here that will work.
	 */
	list_for_each_entry(entry, bitmaps, list) {
		if (entry->bytes < min_bytes)
			continue;
		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
					   bytes, min_bytes);
		if (!ret)
			return 0;
	}

	/*
	 * If we do have entries on our list and we are here then we didn't find
	 * anything, so go ahead and get the next entry after the last entry in
	 * this list and start the search from there.
	 */
	if (!list_empty(bitmaps)) {
		entry = list_entry(bitmaps->prev, struct btrfs_free_space,
				   list);
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		goto search;
	}

	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
	if (!entry)
		return -ENOSPC;

search:
	node = &entry->offset_index;
	do {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		if (!entry->bitmap)
			continue;
		if (entry->bytes < min_bytes)
			continue;
		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
					   bytes, min_bytes);
	} while (ret && node);

	return ret;
}
/*
 * here we try to find a cluster of blocks in a block group.  The goal
 * is to find at least bytes free and up to empty_size + bytes free.
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -enospc
 */
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct list_head bitmaps;
	struct btrfs_free_space *entry, *tmp;
	u64 min_bytes;
	int ret;

	/* for metadata, allow allocations with more holes */
	if (btrfs_test_opt(root, SSD_SPREAD)) {
		min_bytes = bytes + empty_size;
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * we want to do larger allocations when we are
		 * flushing out the delayed refs, it helps prevent
		 * making more work as we go along.
		 */
		if (trans->transaction->delayed_refs.flushing)
			min_bytes = max(bytes, (bytes + empty_size) >> 1);
		else
			min_bytes = max(bytes, (bytes + empty_size) >> 4);
	} else
		min_bytes = max(bytes, (bytes + empty_size) >> 2);
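	/*
	 * Worked example: asking for bytes = 1MB with empty_size = 3MB gives
	 * min_bytes = 4MB under ssd_spread, 2MB for metadata while delayed
	 * refs are flushing, 1MB (max of 1MB and 256K) for other metadata,
	 * and 1MB (max of 1MB and 1MB) for data.
	 */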
	spin_lock(&ctl->tree_lock);

	/*
	 * If we know we don't have enough space to make a cluster don't even
	 * bother doing all the work to try and find one.
	 */
	if (ctl->free_space < min_bytes) {
		spin_unlock(&ctl->tree_lock);
		return -ENOSPC;
	}

	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {
		ret = 0;
		goto out;
	}

	INIT_LIST_HEAD(&bitmaps);
	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
				      bytes, min_bytes);
	if (ret)
		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
					   offset, bytes, min_bytes);

	/* Clear our temporary list */
	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
		list_del_init(&entry->list);

	if (!ret) {
		atomic_inc(&block_group->count);
		list_add_tail(&cluster->block_group_list,
			      &block_group->cluster_list);
		cluster->block_group = block_group;
	}
out:
	spin_unlock(&cluster->lock);
	spin_unlock(&ctl->tree_lock);

	return ret;
}
/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
	cluster->root = RB_ROOT;
	cluster->max_size = 0;
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 bytes = 0;
	u64 actually_trimmed;
	int ret = 0;

	*trimmed = 0;

	while (start < end) {
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			spin_unlock(&ctl->tree_lock);
			break;
		}

		entry = tree_search_offset(ctl, start, 0, 1);
		if (!entry)
			entry = tree_search_offset(ctl,
						   offset_to_bitmap(ctl, start),
						   1, 1);

		if (!entry || entry->offset >= end) {
			spin_unlock(&ctl->tree_lock);
			break;
		}

		if (entry->bitmap) {
			ret = search_bitmap(ctl, entry, &start, &bytes);
			if (!ret) {
				if (start >= end) {
					spin_unlock(&ctl->tree_lock);
					break;
				}
				bytes = min(bytes, end - start);
				bitmap_clear_bits(ctl, entry, start, bytes);
				if (entry->bytes == 0)
					free_bitmap(ctl, entry);
			} else {
				start = entry->offset + BITS_PER_BITMAP *
					block_group->sectorsize;
				spin_unlock(&ctl->tree_lock);
				ret = 0;
				continue;
			}
		} else {
			start = entry->offset;
			bytes = min(entry->bytes, end - start);
			unlink_free_space(ctl, entry);
			kmem_cache_free(btrfs_free_space_cachep, entry);
		}

		spin_unlock(&ctl->tree_lock);

		if (bytes >= minlen) {
			struct btrfs_space_info *space_info;
			int update = 0;

			space_info = block_group->space_info;
			spin_lock(&space_info->lock);
			spin_lock(&block_group->lock);
			if (!block_group->ro) {
				block_group->reserved += bytes;
				space_info->bytes_reserved += bytes;
				update = 1;
			}
			spin_unlock(&block_group->lock);
			spin_unlock(&space_info->lock);

			ret = btrfs_error_discard_extent(fs_info->extent_root,
							 start, bytes,
							 &actually_trimmed);

			btrfs_add_free_space(block_group, start, bytes);
			if (update) {
				spin_lock(&space_info->lock);
				spin_lock(&block_group->lock);
				if (block_group->ro)
					space_info->bytes_readonly += bytes;
				block_group->reserved -= bytes;
				space_info->bytes_reserved -= bytes;
				spin_unlock(&space_info->lock);
				spin_unlock(&block_group->lock);
			}

			if (ret)
				break;
			*trimmed += actually_trimmed;
		}
		start += bytes;
		bytes = 0;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}
/*
 * Find the left-most item in the cache tree, and then return the
 * smallest inode number in the item.
 *
 * Note: the returned inode number may not be the smallest one in
 * the tree, if the left-most item is a bitmap.
 */
u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
{
	struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
	struct btrfs_free_space *entry = NULL;
	u64 ino = 0;

	spin_lock(&ctl->tree_lock);

	if (RB_EMPTY_ROOT(&ctl->free_space_offset))
		goto out;

	entry = rb_entry(rb_first(&ctl->free_space_offset),
			 struct btrfs_free_space, offset_index);

	if (!entry->bitmap) {
		ino = entry->offset;

		unlink_free_space(ctl, entry);
		entry->offset++;
		entry->bytes--;
		if (!entry->bytes)
			kmem_cache_free(btrfs_free_space_cachep, entry);
		else
			link_free_space(ctl, entry);
	} else {
		u64 offset = 0;
		u64 count = 1;
		int ret;

		ret = search_bitmap(ctl, entry, &offset, &count);
		BUG_ON(ret);

		ino = offset;
		bitmap_clear_bits(ctl, entry, offset, 1);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);
	}
out:
	spin_unlock(&ctl->tree_lock);

	return ino;
}
struct inode *lookup_free_ino_inode(struct btrfs_root *root,
				    struct btrfs_path *path)
{
	struct inode *inode = NULL;

	spin_lock(&root->cache_lock);
	if (root->cache_inode)
		inode = igrab(root->cache_inode);
	spin_unlock(&root->cache_lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(root, path, 0);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&root->cache_lock);
	if (!btrfs_fs_closing(root->fs_info))
		root->cache_inode = igrab(inode);
	spin_unlock(&root->cache_lock);

	return inode;
}
int create_free_ino_inode(struct btrfs_root *root,
			  struct btrfs_trans_handle *trans,
			  struct btrfs_path *path)
{
	return __create_free_space_inode(root, trans, path,
					 BTRFS_FREE_INO_OBJECTID, 0);
}
int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_path *path;
	struct inode *inode;
	int ret = 0;
	u64 root_gen = btrfs_root_generation(&root->root_item);

	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
		return 0;

	/*
	 * If we're unmounting then just return, since this does a search on
	 * the normal root and not the commit root and we could deadlock.
	 */
	if (btrfs_fs_closing(fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return 0;

	inode = lookup_free_ino_inode(root, path);
	if (IS_ERR(inode))
		goto out;

	if (root_gen != BTRFS_I(inode)->generation)
		goto out_put;

	ret = __load_free_space_cache(root, inode, ctl, path, 0);

	if (ret < 0)
		printk(KERN_ERR "btrfs: failed to load free ino cache for "
		       "root %llu\n", root->root_key.objectid);
out_put:
	iput(inode);
out:
	btrfs_free_path(path);
	return ret;
}
int btrfs_write_out_ino_cache(struct btrfs_root *root,
			      struct btrfs_trans_handle *trans,
			      struct btrfs_path *path)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct inode *inode;
	int ret;

	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
		return 0;

	inode = lookup_free_ino_inode(root, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
	if (ret) {
		btrfs_delalloc_release_metadata(inode, inode->i_size);
		printk(KERN_ERR "btrfs: failed to write free ino cache "
		       "for root %llu\n", root->root_key.objectid);
	}

	iput(inode);
	return ret;
}