/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the SSD.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>
/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Check for bad keys in replay
 *
 * Refcount journal entries in journal_replay
 *
 * Finish incremental gc
 *
 * Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Make sure all allocations get charged to the root cgroup
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Also lookup by cgroup in get_open_bucket()
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */
enum {
	BTREE_INSERT_STATUS_INSERT,
	BTREE_INSERT_STATUS_BACK_MERGE,
	BTREE_INSERT_STATUS_OVERWROTE,
	BTREE_INSERT_STATUS_FRONT_MERGE,
};
#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
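/*
 * Note: PTR_HASH() derives the hash-table key for a cached btree node from
 * the node key's first pointer and that pointer's generation. Since the
 * generation changes when a bucket is reused, a stale pointer and the
 * bucket's current contents hash to different values.
 */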
static struct workqueue_struct *btree_io_wq;

static inline bool should_split(struct btree *b)
{
	struct bset *i = write_block(b);
	return b->written >= btree_blocks(b) ||
	       (b->written + __set_blocks(i, i->keys + 15, b->c)
		> btree_blocks(b));
}

#define insert_lock(s, b)	((b)->level <= (s)->lock)
/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */
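/*
 * For example (see bch_btree_insert_node() below): when btree_split() would
 * need write locks higher up the tree than op->lock allows, the insert path
 * sets op->lock to the root's level + 1 and returns -EINTR, so btree_root()
 * restarts the traversal holding a write lock at every level it touches.
 */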
/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
	if (!IS_ERR(_child)) {						\
		_child->parent = (b);					\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})
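/*
 * Typical use, as in bch_btree_map_keys_recurse() below:
 *
 *	ret = btree(map_keys_recurse, k, b, op, from, fn, flags);
 *
 * fetches and locks the child node that k points to, calls
 * bch_btree_map_keys_recurse() on it, and unlocks it again.
 */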
/**
 * btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the child node
 * @c:		cache set
 * @op:		pointer to struct btree_op
 */
157 #define btree_root(fn, c, op, ...) \
161 struct btree *_b = (c)->root; \
162 bool _w = insert_lock(op, _b); \
163 rw_lock(_w, _b, _b->level); \
164 if (_b == (c)->root && \
165 _w == insert_lock(op, _b)) { \
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
172 bch_cannibalize_unlock(c); \
173 if (_r == -ENOSPC) { \
174 wait_event((c)->try_wait, \
178 } while (_r == -EINTR); \
180 finish_wait(&(c)->bucket_wait, &(op)->wait); \
/* Btree key manipulation */

void bkey_put(struct cache_set *c, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i))
			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}
static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = end(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}
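/*
 * Note: the checksum above is seeded with the node's first pointer (so the
 * same data in a different bucket checksums differently) and skips the first
 * 8 bytes of the bset, which hold the on-disk csum field itself.
 */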
206 void bch_btree_node_read_done(struct btree *b)
208 const char *err = "bad btree header";
209 struct bset *i = b->sets[0].data;
210 struct btree_iter *iter;
212 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
213 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
216 #ifdef CONFIG_BCACHE_DEBUG
224 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
225 i = write_block(b)) {
226 err = "unsupported bset version";
227 if (i->version > BCACHE_BSET_VERSION)
230 err = "bad btree header";
231 if (b->written + set_blocks(i, b->c) > btree_blocks(b))
235 if (i->magic != bset_magic(&b->c->sb))
238 err = "bad checksum";
239 switch (i->version) {
241 if (i->csum != csum_set(i))
244 case BCACHE_BSET_VERSION:
245 if (i->csum != btree_csum_set(b, i))
251 if (i != b->sets[0].data && !i->keys)
254 bch_btree_iter_push(iter, i->start, end(i));
256 b->written += set_blocks(i, b->c);
259 err = "corrupted btree";
260 for (i = write_block(b);
261 bset_sector_offset(b, i) < KEY_SIZE(&b->key);
262 i = ((void *) i) + block_bytes(b->c))
263 if (i->seq == b->sets[0].data->seq)
266 bch_btree_sort_and_fix_extents(b, iter);
269 err = "short btree key";
270 if (b->sets[0].size &&
271 bkey_cmp(&b->key, &b->sets[0].end) < 0)
274 if (b->written < btree_blocks(b))
275 bch_bset_init_next(b);
277 mempool_free(iter, b->c->fill_iter);
280 set_btree_node_io_error(b);
281 bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
282 err, PTR_BUCKET_NR(b->c, &b->key, 0),
283 bset_block_offset(b, i), i->keys);
287 static void btree_node_read_endio(struct bio *bio, int error)
289 struct closure *cl = bio->bi_private;
293 static void bch_btree_node_read(struct btree *b)
295 uint64_t start_time = local_clock();
299 trace_bcache_btree_read(b);
301 closure_init_stack(&cl);
303 bio = bch_bbio_alloc(b->c);
304 bio->bi_rw = REQ_META|READ_SYNC;
305 bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
306 bio->bi_end_io = btree_node_read_endio;
307 bio->bi_private = &cl;
309 bch_bio_map(bio, b->sets[0].data);
311 bch_submit_bbio(bio, b->c, &b->key, 0);
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
315 set_btree_node_io_error(b);
317 bch_bbio_free(bio, b->c);
319 if (btree_node_io_error(b))
322 bch_btree_node_read_done(b);
323 bch_time_stats_update(&b->c->btree_read_time, start_time);
327 bch_cache_set_error(b->c, "io error reading bucket %zu",
328 PTR_BUCKET_NR(b->c, &b->key, 0));
331 static void btree_complete_write(struct btree *b, struct btree_write *w)
333 if (w->prio_blocked &&
334 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
335 wake_up_allocators(b->c);
338 atomic_dec_bug(w->journal);
339 __closure_wake_up(&b->c->journal.wait);
346 static void btree_node_write_unlock(struct closure *cl)
348 struct btree *b = container_of(cl, struct btree, io);
353 static void __btree_node_write_done(struct closure *cl)
355 struct btree *b = container_of(cl, struct btree, io);
356 struct btree_write *w = btree_prev_write(b);
358 bch_bbio_free(b->bio, b->c);
360 btree_complete_write(b, w);
362 if (btree_node_dirty(b))
363 queue_delayed_work(btree_io_wq, &b->work,
364 msecs_to_jiffies(30000));
366 closure_return_with_destructor(cl, btree_node_write_unlock);
369 static void btree_node_write_done(struct closure *cl)
371 struct btree *b = container_of(cl, struct btree, io);
375 bio_for_each_segment_all(bv, b->bio, n)
376 __free_page(bv->bv_page);
378 __btree_node_write_done(cl);
381 static void btree_node_write_endio(struct bio *bio, int error)
383 struct closure *cl = bio->bi_private;
384 struct btree *b = container_of(cl, struct btree, io);
387 set_btree_node_io_error(b);
389 bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
393 static void do_btree_node_write(struct btree *b)
395 struct closure *cl = &b->io;
396 struct bset *i = b->sets[b->nsets].data;
399 i->version = BCACHE_BSET_VERSION;
400 i->csum = btree_csum_set(b, i);
403 b->bio = bch_bbio_alloc(b->c);
405 b->bio->bi_end_io = btree_node_write_endio;
406 b->bio->bi_private = cl;
407 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
408 b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c);
409 bch_bio_map(b->bio, i);
412 * If we're appending to a leaf node, we don't technically need FUA -
413 * this write just needs to be persisted before the next journal write,
414 * which will be marked FLUSH|FUA.
416 * Similarly if we're writing a new btree root - the pointer is going to
417 * be in the next journal entry.
419 * But if we're writing a new btree node (that isn't a root) or
420 * appending to a non leaf btree node, we need either FUA or a flush
421 * when we write the parent with the new pointer. FUA is cheaper than a
422 * flush, and writes appending to leaf nodes aren't blocking anything so
423 * just make all btree node writes FUA to keep things sane.
426 bkey_copy(&k.key, &b->key);
427 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
429 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
432 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
434 bio_for_each_segment_all(bv, b->bio, j)
435 memcpy(page_address(bv->bv_page),
436 base + j * PAGE_SIZE, PAGE_SIZE);
438 bch_submit_bbio(b->bio, b->c, &k.key, 0);
440 continue_at(cl, btree_node_write_done, NULL);
443 bch_bio_map(b->bio, i);
445 bch_submit_bbio(b->bio, b->c, &k.key, 0);
448 continue_at_nobarrier(cl, __btree_node_write_done, NULL);
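/*
 * Note on the two write paths above: when bounce pages can be allocated, the
 * bset is copied into them and the write completes asynchronously via
 * btree_node_write_done(), which frees those pages. Otherwise the bio maps
 * the bset memory directly and completion goes through
 * __btree_node_write_done() instead.
 */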
452 void bch_btree_node_write(struct btree *b, struct closure *parent)
454 struct bset *i = b->sets[b->nsets].data;
456 trace_bcache_btree_write(b);
458 BUG_ON(current->bio_list);
459 BUG_ON(b->written >= btree_blocks(b));
460 BUG_ON(b->written && !i->keys);
461 BUG_ON(b->sets->data->seq != i->seq);
462 bch_check_keys(b, "writing");
464 cancel_delayed_work(&b->work);
466 /* If caller isn't waiting for write, parent refcount is cache set */
468 closure_init(&b->io, parent ?: &b->c->cl);
470 clear_bit(BTREE_NODE_dirty, &b->flags);
471 change_bit(BTREE_NODE_write_idx, &b->flags);
473 do_btree_node_write(b);
475 b->written += set_blocks(i, b->c);
476 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
477 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
479 bch_btree_sort_lazy(b);
482 * do verify if there was more than one set initially (i.e. we did a
483 * sort) and we sorted down to a single set:
485 if (i != b->sets->data && !b->nsets)
488 if (b->written < btree_blocks(b))
489 bch_bset_init_next(b);
492 static void bch_btree_node_write_sync(struct btree *b)
496 closure_init_stack(&cl);
497 bch_btree_node_write(b, &cl);
501 static void btree_node_write_work(struct work_struct *w)
503 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
505 rw_lock(true, b, b->level);
507 if (btree_node_dirty(b))
508 bch_btree_node_write(b, NULL);
512 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
514 struct bset *i = b->sets[b->nsets].data;
515 struct btree_write *w = btree_current_write(b);
520 if (!btree_node_dirty(b))
521 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
523 set_btree_node_dirty(b);
527 journal_pin_cmp(b->c, w->journal, journal_ref)) {
528 atomic_dec_bug(w->journal);
533 w->journal = journal_ref;
534 atomic_inc(w->journal);
538 /* Force write if set is too big */
539 if (set_bytes(i) > PAGE_SIZE - 48 &&
541 bch_btree_node_write(b, NULL);
/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */
549 static void mca_reinit(struct btree *b)
557 for (i = 0; i < MAX_BSETS; i++)
560 * Second loop starts at 1 because b->sets[0]->data is the memory we
563 for (i = 1; i < MAX_BSETS; i++)
564 b->sets[i].data = NULL;
567 #define mca_reserve(c) (((c->root && c->root->level) \
568 ? c->root->level : 1) * 8 + 16)
569 #define mca_can_free(c) \
570 max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
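/*
 * The reserve above scales with the depth of the btree: roughly eight nodes
 * per level plus a fixed cushion. mca_can_free() subtracts it, so the
 * shrinker never reclaims below the reserve; that is what lets mca_alloc()
 * keep succeeding and the insert path keep making forward progress - see the
 * comment in bch_mca_scan().
 */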
572 static void mca_data_free(struct btree *b)
574 struct bset_tree *t = b->sets;
576 BUG_ON(b->io_mutex.count != 1);
578 if (bset_prev_bytes(b) < PAGE_SIZE)
581 free_pages((unsigned long) t->prev,
582 get_order(bset_prev_bytes(b)));
584 if (bset_tree_bytes(b) < PAGE_SIZE)
587 free_pages((unsigned long) t->tree,
588 get_order(bset_tree_bytes(b)));
590 free_pages((unsigned long) t->data, b->page_order);
595 list_move(&b->list, &b->c->btree_cache_freed);
596 b->c->bucket_cache_used--;
599 static void mca_bucket_free(struct btree *b)
601 BUG_ON(btree_node_dirty(b));
604 hlist_del_init_rcu(&b->hash);
605 list_move(&b->list, &b->c->btree_cache_freeable);
608 static unsigned btree_order(struct bkey *k)
610 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
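/*
 * btree_order() above converts a btree node key's size from sectors to a page
 * allocation order (log2 of the number of pages), with a floor of order 0 for
 * nodes smaller than a page.
 */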
613 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
615 struct bset_tree *t = b->sets;
618 b->page_order = max_t(unsigned,
619 ilog2(b->c->btree_pages),
622 t->data = (void *) __get_free_pages(gfp, b->page_order);
626 t->tree = bset_tree_bytes(b) < PAGE_SIZE
627 ? kmalloc(bset_tree_bytes(b), gfp)
628 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
632 t->prev = bset_prev_bytes(b) < PAGE_SIZE
633 ? kmalloc(bset_prev_bytes(b), gfp)
634 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
638 list_move(&b->list, &b->c->btree_cache);
639 b->c->bucket_cache_used++;
645 static struct btree *mca_bucket_alloc(struct cache_set *c,
646 struct bkey *k, gfp_t gfp)
648 struct btree *b = kzalloc(sizeof(struct btree), gfp);
652 init_rwsem(&b->lock);
653 lockdep_set_novalidate_class(&b->lock);
654 INIT_LIST_HEAD(&b->list);
655 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
657 sema_init(&b->io_mutex, 1);
659 mca_data_alloc(b, k, gfp);
663 static int mca_reap(struct btree *b, unsigned min_order, bool flush)
667 closure_init_stack(&cl);
668 lockdep_assert_held(&b->c->bucket_lock);
670 if (!down_write_trylock(&b->lock))
673 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
675 if (b->page_order < min_order)
679 if (btree_node_dirty(b))
682 if (down_trylock(&b->io_mutex))
687 if (btree_node_dirty(b))
688 bch_btree_node_write_sync(b);
690 /* wait for any in flight btree write */
700 static unsigned long bch_mca_scan(struct shrinker *shrink,
701 struct shrink_control *sc)
703 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
705 unsigned long i, nr = sc->nr_to_scan;
706 unsigned long freed = 0;
708 if (c->shrinker_disabled)
714 /* Return -1 if we can't do anything right now */
715 if (sc->gfp_mask & __GFP_IO)
716 mutex_lock(&c->bucket_lock);
717 else if (!mutex_trylock(&c->bucket_lock))
721 * It's _really_ critical that we don't free too many btree nodes - we
722 * have to always leave ourselves a reserve. The reserve is how we
723 * guarantee that allocating memory for a new btree node can always
724 * succeed, so that inserting keys into the btree can always succeed and
725 * IO can always make forward progress:
727 nr /= c->btree_pages;
728 nr = min_t(unsigned long, nr, mca_can_free(c));
731 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
736 !mca_reap(b, 0, false)) {
743 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
744 if (list_empty(&c->btree_cache))
747 b = list_first_entry(&c->btree_cache, struct btree, list);
748 list_rotate_left(&c->btree_cache);
751 !mca_reap(b, 0, false)) {
760 mutex_unlock(&c->bucket_lock);
764 static unsigned long bch_mca_count(struct shrinker *shrink,
765 struct shrink_control *sc)
767 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
769 if (c->shrinker_disabled)
775 return mca_can_free(c) * c->btree_pages;
778 void bch_btree_cache_free(struct cache_set *c)
782 closure_init_stack(&cl);
784 if (c->shrink.list.next)
785 unregister_shrinker(&c->shrink);
787 mutex_lock(&c->bucket_lock);
789 #ifdef CONFIG_BCACHE_DEBUG
791 list_move(&c->verify_data->list, &c->btree_cache);
793 free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
796 list_splice(&c->btree_cache_freeable,
799 while (!list_empty(&c->btree_cache)) {
800 b = list_first_entry(&c->btree_cache, struct btree, list);
802 if (btree_node_dirty(b))
803 btree_complete_write(b, btree_current_write(b));
804 clear_bit(BTREE_NODE_dirty, &b->flags);
809 while (!list_empty(&c->btree_cache_freed)) {
810 b = list_first_entry(&c->btree_cache_freed,
813 cancel_delayed_work_sync(&b->work);
817 mutex_unlock(&c->bucket_lock);
820 int bch_btree_cache_alloc(struct cache_set *c)
824 for (i = 0; i < mca_reserve(c); i++)
825 if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
828 list_splice_init(&c->btree_cache,
829 &c->btree_cache_freeable);
831 #ifdef CONFIG_BCACHE_DEBUG
832 mutex_init(&c->verify_lock);
834 c->verify_ondisk = (void *)
835 __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
837 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
839 if (c->verify_data &&
840 c->verify_data->sets[0].data)
841 list_del_init(&c->verify_data->list);
843 c->verify_data = NULL;
846 c->shrink.count_objects = bch_mca_count;
847 c->shrink.scan_objects = bch_mca_scan;
849 c->shrink.batch = c->btree_pages * 2;
850 register_shrinker(&c->shrink);
855 /* Btree in memory cache - hash table */
857 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
859 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
862 static struct btree *mca_find(struct cache_set *c, struct bkey *k)
867 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
868 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
876 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
880 trace_bcache_btree_cache_cannibalize(c);
882 if (!c->try_harder) {
883 c->try_harder = current;
884 c->try_harder_start = local_clock();
885 } else if (c->try_harder != current)
886 return ERR_PTR(-ENOSPC);
888 list_for_each_entry_reverse(b, &c->btree_cache, list)
889 if (!mca_reap(b, btree_order(k), false))
892 list_for_each_entry_reverse(b, &c->btree_cache, list)
893 if (!mca_reap(b, btree_order(k), true))
896 return ERR_PTR(-ENOMEM);
/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
905 static void bch_cannibalize_unlock(struct cache_set *c)
907 if (c->try_harder == current) {
908 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
909 c->try_harder = NULL;
910 wake_up(&c->try_wait);
914 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
918 BUG_ON(current->bio_list);
920 lockdep_assert_held(&c->bucket_lock);
925 /* btree_free() doesn't free memory; it sticks the node on the end of
926 * the list. Check if there's any freed nodes there:
928 list_for_each_entry(b, &c->btree_cache_freeable, list)
929 if (!mca_reap(b, btree_order(k), false))
932 /* We never free struct btree itself, just the memory that holds the on
933 * disk node. Check the freed list before allocating a new one:
935 list_for_each_entry(b, &c->btree_cache_freed, list)
936 if (!mca_reap(b, 0, false)) {
937 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
938 if (!b->sets[0].data)
944 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
948 BUG_ON(!down_write_trylock(&b->lock));
952 BUG_ON(b->io_mutex.count != 1);
954 bkey_copy(&b->key, k);
955 list_move(&b->list, &c->btree_cache);
956 hlist_del_init_rcu(&b->hash);
957 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
959 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
961 b->parent = (void *) ~0UL;
970 b = mca_cannibalize(c, k);
/**
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
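/*
 * Two points worth noting for callers: a context running under
 * generic_make_request() can't block on the read, so it gets -EAGAIN and is
 * expected to retry from somewhere it may sleep; and after taking the node's
 * lock the code below re-checks PTR_HASH(), since the node may have been
 * evicted and its bucket reused while we waited for the lock.
 */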
986 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
987 int level, bool write)
997 if (current->bio_list)
998 return ERR_PTR(-EAGAIN);
1000 mutex_lock(&c->bucket_lock);
1001 b = mca_alloc(c, k, level);
1002 mutex_unlock(&c->bucket_lock);
1009 bch_btree_node_read(b);
1012 downgrade_write(&b->lock);
1014 rw_lock(write, b, level);
1015 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
1016 rw_unlock(write, b);
1019 BUG_ON(b->level != level);
1024 for (; i <= b->nsets && b->sets[i].size; i++) {
1025 prefetch(b->sets[i].tree);
1026 prefetch(b->sets[i].data);
1029 for (; i <= b->nsets; i++)
1030 prefetch(b->sets[i].data);
1032 if (btree_node_io_error(b)) {
1033 rw_unlock(write, b);
1034 return ERR_PTR(-EIO);
1037 BUG_ON(!b->written);
1042 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
1046 mutex_lock(&c->bucket_lock);
1047 b = mca_alloc(c, k, level);
1048 mutex_unlock(&c->bucket_lock);
1050 if (!IS_ERR_OR_NULL(b)) {
1051 bch_btree_node_read(b);
1058 static void btree_node_free(struct btree *b)
1062 trace_bcache_btree_node_free(b);
1064 BUG_ON(b == b->c->root);
1066 if (btree_node_dirty(b))
1067 btree_complete_write(b, btree_current_write(b));
1068 clear_bit(BTREE_NODE_dirty, &b->flags);
1070 cancel_delayed_work(&b->work);
1072 mutex_lock(&b->c->bucket_lock);
1074 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1075 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
1077 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1078 PTR_BUCKET(b->c, &b->key, i));
1081 bch_bucket_free(b->c, &b->key);
1083 mutex_unlock(&b->c->bucket_lock);
1086 struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1089 struct btree *b = ERR_PTR(-EAGAIN);
1091 mutex_lock(&c->bucket_lock);
1093 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1096 bkey_put(c, &k.key);
1097 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1099 b = mca_alloc(c, &k.key, level);
1105 "Tried to allocate bucket that was in btree cache");
1110 bch_bset_init_next(b);
1112 mutex_unlock(&c->bucket_lock);
1114 trace_bcache_btree_node_alloc(b);
1117 bch_bucket_free(c, &k.key);
1119 mutex_unlock(&c->bucket_lock);
1121 trace_bcache_btree_node_alloc_fail(b);
1125 static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1127 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1128 if (!IS_ERR_OR_NULL(n))
1129 bch_btree_sort_into(b, n);
1134 static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1138 bkey_copy(k, &b->key);
1139 bkey_copy_key(k, &ZERO_KEY);
1141 for (i = 0; i < KEY_PTRS(k); i++) {
1142 uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
1144 SET_PTR_GEN(k, i, g);
1147 atomic_inc(&b->c->prio_blocked);
1150 static int btree_check_reserve(struct btree *b, struct btree_op *op)
1152 struct cache_set *c = b->c;
1154 unsigned i, reserve = c->root->level * 2 + 1;
1157 mutex_lock(&c->bucket_lock);
1159 for_each_cache(ca, c, i)
1160 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1162 prepare_to_wait(&c->bucket_wait, &op->wait,
1163 TASK_UNINTERRUPTIBLE);
1168 mutex_unlock(&c->bucket_lock);
1172 /* Garbage collection */
1174 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want to mark their pointers here
	 */
1185 if (!bkey_cmp(k, &ZERO_KEY))
1188 for (i = 0; i < KEY_PTRS(k); i++) {
1189 if (!ptr_available(c, k, i))
1192 g = PTR_BUCKET(c, k, i);
1194 if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1195 g->gc_gen = PTR_GEN(k, i);
1197 if (ptr_stale(c, k, i)) {
1198 stale = max(stale, ptr_stale(c, k, i));
1202 cache_bug_on(GC_MARK(g) &&
1203 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1204 c, "inconsistent ptrs: mark = %llu, level = %i",
1208 SET_GC_MARK(g, GC_MARK_METADATA);
1209 else if (KEY_DIRTY(k))
1210 SET_GC_MARK(g, GC_MARK_DIRTY);
1212 /* guard against overflow */
1213 SET_GC_SECTORS_USED(g, min_t(unsigned,
1214 GC_SECTORS_USED(g) + KEY_SIZE(k),
1217 BUG_ON(!GC_SECTORS_USED(g));
1223 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1225 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1228 unsigned keys = 0, good_keys = 0;
1230 struct btree_iter iter;
1231 struct bset_tree *t;
1235 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1236 stale = max(stale, btree_mark_key(b, k));
1239 if (bch_ptr_bad(b, k))
1242 gc->key_bytes += bkey_u64s(k);
1246 gc->data += KEY_SIZE(k);
1249 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1250 btree_bug_on(t->size &&
1251 bset_written(b, t) &&
1252 bkey_cmp(&b->key, &t->end) < 0,
1253 b, "found short btree key in gc");
1255 if (b->c->gc_always_rewrite)
1261 if ((keys - good_keys) * 2 > keys)
1267 #define GC_MERGE_NODES 4U
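/*
 * During gc we look at a window of up to GC_MERGE_NODES adjacent nodes at a
 * time; if their live keys would fit into fewer nodes, btree_gc_coalesce()
 * rewrites them into new nodes and frees the old ones, so the tree doesn't
 * accumulate mostly-empty nodes.
 */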
1269 struct gc_merge_info {
1274 static int bch_btree_insert_node(struct btree *, struct btree_op *,
1275 struct keylist *, atomic_t *, struct bkey *);
1277 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1278 struct keylist *keylist, struct gc_stat *gc,
1279 struct gc_merge_info *r)
1281 unsigned i, nodes = 0, keys = 0, blocks;
1282 struct btree *new_nodes[GC_MERGE_NODES];
1286 memset(new_nodes, 0, sizeof(new_nodes));
1287 closure_init_stack(&cl);
1289 while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1290 keys += r[nodes++].keys;
1292 blocks = btree_default_blocks(b->c) * 2 / 3;
1295 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1298 for (i = 0; i < nodes; i++) {
1299 new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
1300 if (IS_ERR_OR_NULL(new_nodes[i]))
1301 goto out_nocoalesce;
1304 for (i = nodes - 1; i > 0; --i) {
1305 struct bset *n1 = new_nodes[i]->sets->data;
1306 struct bset *n2 = new_nodes[i - 1]->sets->data;
1307 struct bkey *k, *last = NULL;
1315 if (__set_blocks(n1, n1->keys + keys +
1316 bkey_u64s(k), b->c) > blocks)
1320 keys += bkey_u64s(k);
1324 * Last node we're not getting rid of - we're getting
1325 * rid of the node at r[0]. Have to try and fit all of
1326 * the remaining keys into this node; we can't ensure
1327 * they will always fit due to rounding and variable
1328 * length keys (shouldn't be possible in practice,
1331 if (__set_blocks(n1, n1->keys + n2->keys,
1332 b->c) > btree_blocks(new_nodes[i]))
1333 goto out_nocoalesce;
1336 /* Take the key of the node we're getting rid of */
1340 BUG_ON(__set_blocks(n1, n1->keys + keys,
1341 b->c) > btree_blocks(new_nodes[i]));
1344 bkey_copy_key(&new_nodes[i]->key, last);
1348 (void *) node(n2, keys) - (void *) n2->start);
1351 r[i].keys = n1->keys;
1355 (void *) end(n2) - (void *) node(n2, keys));
1359 if (__bch_keylist_realloc(keylist,
1360 bkey_u64s(&new_nodes[i]->key)))
1361 goto out_nocoalesce;
1363 bch_btree_node_write(new_nodes[i], &cl);
1364 bch_keylist_add(keylist, &new_nodes[i]->key);
1367 for (i = 0; i < nodes; i++) {
1368 if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
1369 goto out_nocoalesce;
1371 make_btree_freeing_key(r[i].b, keylist->top);
1372 bch_keylist_push(keylist);
1375 /* We emptied out this node */
1376 BUG_ON(new_nodes[0]->sets->data->keys);
1377 btree_node_free(new_nodes[0]);
1378 rw_unlock(true, new_nodes[0]);
1382 for (i = 0; i < nodes; i++) {
1383 btree_node_free(r[i].b);
1384 rw_unlock(true, r[i].b);
1386 r[i].b = new_nodes[i];
1389 bch_btree_insert_node(b, op, keylist, NULL, NULL);
1390 BUG_ON(!bch_keylist_empty(keylist));
1392 memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1393 r[nodes - 1].b = ERR_PTR(-EINTR);
1395 trace_bcache_btree_gc_coalesce(nodes);
1398 /* Invalidated our iterator */
1404 while ((k = bch_keylist_pop(keylist)))
1405 if (!bkey_cmp(k, &ZERO_KEY))
1406 atomic_dec(&b->c->prio_blocked);
1408 for (i = 0; i < nodes; i++)
1409 if (!IS_ERR_OR_NULL(new_nodes[i])) {
1410 btree_node_free(new_nodes[i]);
1411 rw_unlock(true, new_nodes[i]);
1416 static unsigned btree_gc_count_keys(struct btree *b)
1419 struct btree_iter iter;
1422 for_each_key_filter(b, k, &iter, bch_ptr_bad)
1423 ret += bkey_u64s(k);
1428 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1429 struct closure *writes, struct gc_stat *gc)
1433 bool should_rewrite;
1436 struct keylist keys;
1437 struct btree_iter iter;
1438 struct gc_merge_info r[GC_MERGE_NODES];
1439 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1441 bch_keylist_init(&keys);
1442 bch_btree_iter_init(b, &iter, &b->c->gc_done);
1444 for (i = 0; i < GC_MERGE_NODES; i++)
1445 r[i].b = ERR_PTR(-EINTR);
1448 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1450 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1452 ret = PTR_ERR(r->b);
1456 r->keys = btree_gc_count_keys(r->b);
1458 ret = btree_gc_coalesce(b, op, &keys, gc, r);
1466 if (!IS_ERR(last->b)) {
1467 should_rewrite = btree_gc_mark_node(last->b, gc);
1468 if (should_rewrite &&
1469 !btree_check_reserve(b, NULL)) {
1470 n = btree_node_alloc_replacement(last->b,
1473 if (!IS_ERR_OR_NULL(n)) {
1474 bch_btree_node_write_sync(n);
1475 bch_keylist_add(&keys, &n->key);
1477 make_btree_freeing_key(last->b,
1479 bch_keylist_push(&keys);
1481 btree_node_free(last->b);
1483 bch_btree_insert_node(b, op, &keys,
1485 BUG_ON(!bch_keylist_empty(&keys));
1487 rw_unlock(true, last->b);
1490 /* Invalidated our iterator */
1496 if (last->b->level) {
1497 ret = btree_gc_recurse(last->b, op, writes, gc);
1502 bkey_copy_key(&b->c->gc_done, &last->b->key);
1505 * Must flush leaf nodes before gc ends, since replace
1506 * operations aren't journalled
1508 if (btree_node_dirty(last->b))
1509 bch_btree_node_write(last->b, writes);
1510 rw_unlock(true, last->b);
1513 memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1516 if (need_resched()) {
1522 for (i = 0; i < GC_MERGE_NODES; i++)
1523 if (!IS_ERR_OR_NULL(r[i].b)) {
1524 if (btree_node_dirty(r[i].b))
1525 bch_btree_node_write(r[i].b, writes);
1526 rw_unlock(true, r[i].b);
1529 bch_keylist_free(&keys);
1534 static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1535 struct closure *writes, struct gc_stat *gc)
1537 struct btree *n = NULL;
1539 bool should_rewrite;
1541 should_rewrite = btree_gc_mark_node(b, gc);
1542 if (should_rewrite) {
1543 n = btree_node_alloc_replacement(b, false);
1545 if (!IS_ERR_OR_NULL(n)) {
1546 bch_btree_node_write_sync(n);
1547 bch_btree_set_root(n);
1556 ret = btree_gc_recurse(b, op, writes, gc);
1561 bkey_copy_key(&b->c->gc_done, &b->key);
1566 static void btree_gc_start(struct cache_set *c)
1572 if (!c->gc_mark_valid)
1575 mutex_lock(&c->bucket_lock);
1577 c->gc_mark_valid = 0;
1578 c->gc_done = ZERO_KEY;
1580 for_each_cache(ca, c, i)
1581 for_each_bucket(b, ca) {
1583 if (!atomic_read(&b->pin)) {
1584 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1585 SET_GC_SECTORS_USED(b, 0);
1589 mutex_unlock(&c->bucket_lock);
1592 size_t bch_btree_gc_finish(struct cache_set *c)
1594 size_t available = 0;
1599 mutex_lock(&c->bucket_lock);
1602 c->gc_mark_valid = 1;
1606 for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1607 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1610 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1611 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1614 /* don't reclaim buckets to which writeback keys point */
1616 for (i = 0; i < c->nr_uuids; i++) {
1617 struct bcache_device *d = c->devices[i];
1618 struct cached_dev *dc;
1619 struct keybuf_key *w, *n;
1622 if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
1624 dc = container_of(d, struct cached_dev, disk);
1626 spin_lock(&dc->writeback_keys.lock);
1627 rbtree_postorder_for_each_entry_safe(w, n,
1628 &dc->writeback_keys.keys, node)
1629 for (j = 0; j < KEY_PTRS(&w->key); j++)
1630 SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
1632 spin_unlock(&dc->writeback_keys.lock);
1636 for_each_cache(ca, c, i) {
1639 ca->invalidate_needs_gc = 0;
1641 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1642 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1644 for (i = ca->prio_buckets;
1645 i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1646 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1648 for_each_bucket(b, ca) {
1649 b->last_gc = b->gc_gen;
1650 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1652 if (!atomic_read(&b->pin) &&
1653 GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1655 if (!GC_SECTORS_USED(b))
1656 bch_bucket_add_unused(ca, b);
1661 mutex_unlock(&c->bucket_lock);
1665 static void bch_btree_gc(struct cache_set *c)
1668 unsigned long available;
1669 struct gc_stat stats;
1670 struct closure writes;
1672 uint64_t start_time = local_clock();
1674 trace_bcache_gc_start(c);
1676 memset(&stats, 0, sizeof(struct gc_stat));
1677 closure_init_stack(&writes);
1678 bch_btree_op_init(&op, SHRT_MAX);
1683 ret = btree_root(gc_root, c, &op, &writes, &stats);
1684 closure_sync(&writes);
1686 if (ret && ret != -EAGAIN)
1687 pr_warn("gc failed!");
1690 available = bch_btree_gc_finish(c);
1691 wake_up_allocators(c);
1693 bch_time_stats_update(&c->btree_gc_time, start_time);
1695 stats.key_bytes *= sizeof(uint64_t);
1697 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1698 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1700 trace_bcache_gc_end(c);
1705 static int bch_gc_thread(void *arg)
1707 struct cache_set *c = arg;
1715 set_current_state(TASK_INTERRUPTIBLE);
1716 if (kthread_should_stop())
1719 mutex_lock(&c->bucket_lock);
1721 for_each_cache(ca, c, i)
1722 if (ca->invalidate_needs_gc) {
1723 mutex_unlock(&c->bucket_lock);
1724 set_current_state(TASK_RUNNING);
1728 mutex_unlock(&c->bucket_lock);
1737 int bch_gc_thread_start(struct cache_set *c)
1739 c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
1740 if (IS_ERR(c->gc_thread))
1741 return PTR_ERR(c->gc_thread);
1743 set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
1747 /* Initial partial gc */
1749 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1750 unsigned long **seen)
1754 struct bkey *k, *p = NULL;
1756 struct btree_iter iter;
1758 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1759 for (i = 0; i < KEY_PTRS(k); i++) {
1760 if (!ptr_available(b->c, k, i))
1763 g = PTR_BUCKET(b->c, k, i);
1765 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1766 seen[PTR_DEV(k, i)]) ||
1767 !ptr_stale(b->c, k, i)) {
1768 g->gen = PTR_GEN(k, i);
1771 g->prio = BTREE_PRIO;
1772 else if (g->prio == BTREE_PRIO)
1773 g->prio = INITIAL_PRIO;
1777 btree_mark_key(b, k);
1781 bch_btree_iter_init(b, &iter, NULL);
1784 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1786 btree_node_prefetch(b->c, k, b->level - 1);
1789 ret = btree(check_recurse, p, b, op, seen);
1792 } while (p && !ret);
1798 int bch_btree_check(struct cache_set *c)
1802 unsigned long *seen[MAX_CACHES_PER_SET];
1805 memset(seen, 0, sizeof(seen));
1806 bch_btree_op_init(&op, SHRT_MAX);
1808 for (i = 0; c->cache[i]; i++) {
1809 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1810 seen[i] = kmalloc(n, GFP_KERNEL);
1814 /* Disables the seen array until prio_read() uses it too */
1815 memset(seen[i], 0xFF, n);
1818 ret = btree_root(check_recurse, c, &op, seen);
1820 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1825 /* Btree insertion */
1827 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1829 struct bset *i = b->sets[b->nsets].data;
1831 memmove((uint64_t *) where + bkey_u64s(insert),
1833 (void *) end(i) - (void *) where);
1835 i->keys += bkey_u64s(insert);
1836 bkey_copy(where, insert);
1837 bch_bset_fix_lookup_table(b, where);
1840 static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1841 struct btree_iter *iter,
1842 struct bkey *replace_key)
1844 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1847 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1851 uint64_t old_offset;
1852 unsigned old_size, sectors_found = 0;
1855 struct bkey *k = bch_btree_iter_next(iter);
1857 bkey_cmp(&START_KEY(k), insert) >= 0)
1860 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1863 old_offset = KEY_START(k);
1864 old_size = KEY_SIZE(k);
1867 * We might overlap with 0 size extents; we can't skip these
1868 * because if they're in the set we're inserting to we have to
1869 * adjust them so they don't overlap with the key we're
1870 * inserting. But we don't want to check them for replace
1874 if (replace_key && KEY_SIZE(k)) {
1876 * k might have been split since we inserted/found the
1877 * key we're replacing
1880 uint64_t offset = KEY_START(k) -
1881 KEY_START(replace_key);
1883 /* But it must be a subset of the replace key */
1884 if (KEY_START(k) < KEY_START(replace_key) ||
1885 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1888 /* We didn't find a key that we were supposed to */
1889 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1892 if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
1893 KEY_DIRTY(k) != KEY_DIRTY(replace_key))
1899 BUG_ON(!KEY_PTRS(replace_key));
1901 for (i = 0; i < KEY_PTRS(replace_key); i++)
1902 if (k->ptr[i] != replace_key->ptr[i] + offset)
1905 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1908 if (bkey_cmp(insert, k) < 0 &&
1909 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1911 * We overlapped in the middle of an existing key: that
1912 * means we have to split the old key. But we have to do
1913 * slightly different things depending on whether the
1914 * old key has been written out yet.
1919 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1921 if (bkey_written(b, k)) {
1923 * We insert a new key to cover the top of the
1924 * old key, and the old key is modified in place
1925 * to represent the bottom split.
1927 * It's completely arbitrary whether the new key
1928 * is the top or the bottom, but it has to match
1929 * up with what btree_sort_fixup() does - it
1930 * doesn't check for this kind of overlap, it
1931 * depends on us inserting a new key for the top
1934 top = bch_bset_search(b, &b->sets[b->nsets],
1936 shift_keys(b, top, k);
1938 BKEY_PADDED(key) temp;
1939 bkey_copy(&temp.key, k);
1940 shift_keys(b, k, &temp.key);
1944 bch_cut_front(insert, top);
1945 bch_cut_back(&START_KEY(insert), k);
1946 bch_bset_fix_invalidated_key(b, k);
1950 if (bkey_cmp(insert, k) < 0) {
1951 bch_cut_front(insert, k);
1953 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1954 old_offset = KEY_START(insert);
1956 if (bkey_written(b, k) &&
1957 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1959 * Completely overwrote, so we don't have to
1960 * invalidate the binary search tree
1962 bch_cut_front(k, k);
1964 __bch_cut_back(&START_KEY(insert), k);
1965 bch_bset_fix_invalidated_key(b, k);
1969 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1974 if (!sectors_found) {
1976 } else if (sectors_found < KEY_SIZE(insert)) {
1977 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1978 (KEY_SIZE(insert) - sectors_found));
1979 SET_KEY_SIZE(insert, sectors_found);
1986 static bool btree_insert_key(struct btree *b, struct btree_op *op,
1987 struct bkey *k, struct bkey *replace_key)
1989 struct bset *i = b->sets[b->nsets].data;
1990 struct bkey *m, *prev;
1991 unsigned status = BTREE_INSERT_STATUS_INSERT;
1993 BUG_ON(bkey_cmp(k, &b->key) > 0);
1994 BUG_ON(b->level && !KEY_PTRS(k));
1995 BUG_ON(!b->level && !KEY_OFFSET(k));
1998 struct btree_iter iter;
2001 * bset_search() returns the first key that is strictly greater
2002 * than the search key - but for back merging, we want to find
2006 m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
2008 if (fix_overlapping_extents(b, k, &iter, replace_key)) {
2009 op->insert_collision = true;
2014 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
2015 KEY_START(k), KEY_SIZE(k));
2017 while (m != end(i) &&
2018 bkey_cmp(k, &START_KEY(m)) > 0)
2019 prev = m, m = bkey_next(m);
2021 if (key_merging_disabled(b->c))
2024 /* prev is in the tree, if we merge we're done */
2025 status = BTREE_INSERT_STATUS_BACK_MERGE;
2027 bch_bkey_try_merge(b, prev, k))
2030 status = BTREE_INSERT_STATUS_OVERWROTE;
2032 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
2035 status = BTREE_INSERT_STATUS_FRONT_MERGE;
2037 bch_bkey_try_merge(b, k, m))
2040 BUG_ON(replace_key);
2041 m = bch_bset_search(b, &b->sets[b->nsets], k);
2044 insert: shift_keys(b, m, k);
2045 copy: bkey_copy(m, k);
2047 bch_check_keys(b, "%u for %s", status,
2048 replace_key ? "replace" : "insert");
2050 if (b->level && !KEY_OFFSET(k))
2051 btree_current_write(b)->prio_blocked++;
2053 trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
2058 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2059 struct keylist *insert_keys,
2060 struct bkey *replace_key)
2063 int oldsize = bch_count_data(b);
2065 while (!bch_keylist_empty(insert_keys)) {
2066 struct bset *i = write_block(b);
2067 struct bkey *k = insert_keys->keys;
2069 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
2073 if (bkey_cmp(k, &b->key) <= 0) {
2077 ret |= btree_insert_key(b, op, k, replace_key);
2078 bch_keylist_pop_front(insert_keys);
2079 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2080 BKEY_PADDED(key) temp;
2081 bkey_copy(&temp.key, insert_keys->keys);
2083 bch_cut_back(&b->key, &temp.key);
2084 bch_cut_front(&b->key, insert_keys->keys);
2086 ret |= btree_insert_key(b, op, &temp.key, replace_key);
2093 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2095 BUG_ON(bch_count_data(b) < oldsize);
2099 static int btree_split(struct btree *b, struct btree_op *op,
2100 struct keylist *insert_keys,
2101 struct bkey *replace_key)
2104 struct btree *n1, *n2 = NULL, *n3 = NULL;
2105 uint64_t start_time = local_clock();
2107 struct keylist parent_keys;
2109 closure_init_stack(&cl);
2110 bch_keylist_init(&parent_keys);
2113 btree_check_reserve(b, op))
2116 n1 = btree_node_alloc_replacement(b, true);
2120 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
2125 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
2127 n2 = bch_btree_node_alloc(b->c, b->level, true);
2132 n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
2137 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2140 * Has to be a linear search because we don't have an auxiliary
2144 while (keys < (n1->sets[0].data->keys * 3) / 5)
2145 keys += bkey_u64s(node(n1->sets[0].data, keys));
2147 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
2148 keys += bkey_u64s(node(n1->sets[0].data, keys));
2150 n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
2151 n1->sets[0].data->keys = keys;
2153 memcpy(n2->sets[0].data->start,
2154 end(n1->sets[0].data),
2155 n2->sets[0].data->keys * sizeof(uint64_t));
2157 bkey_copy_key(&n2->key, &b->key);
2159 bch_keylist_add(&parent_keys, &n2->key);
2160 bch_btree_node_write(n2, &cl);
2161 rw_unlock(true, n2);
2163 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
2165 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2168 bch_keylist_add(&parent_keys, &n1->key);
2169 bch_btree_node_write(n1, &cl);
2172 /* Depth increases, make a new root */
2173 bkey_copy_key(&n3->key, &MAX_KEY);
2174 bch_btree_insert_keys(n3, op, &parent_keys, NULL);
2175 bch_btree_node_write(n3, &cl);
2178 bch_btree_set_root(n3);
2179 rw_unlock(true, n3);
2182 } else if (!b->parent) {
2183 /* Root filled up but didn't need to be split */
2185 bch_btree_set_root(n1);
2189 /* Split a non root node */
2191 make_btree_freeing_key(b, parent_keys.top);
2192 bch_keylist_push(&parent_keys);
2196 bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
2197 BUG_ON(!bch_keylist_empty(&parent_keys));
2200 rw_unlock(true, n1);
2202 bch_time_stats_update(&b->c->btree_split_time, start_time);
2206 bkey_put(b->c, &n2->key);
2207 btree_node_free(n2);
2208 rw_unlock(true, n2);
2210 bkey_put(b->c, &n1->key);
2211 btree_node_free(n1);
2212 rw_unlock(true, n1);
2214 WARN(1, "bcache: btree split failed");
2216 if (n3 == ERR_PTR(-EAGAIN) ||
2217 n2 == ERR_PTR(-EAGAIN) ||
2218 n1 == ERR_PTR(-EAGAIN))
2224 static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2225 struct keylist *insert_keys,
2226 atomic_t *journal_ref,
2227 struct bkey *replace_key)
2229 BUG_ON(b->level && replace_key);
2231 if (should_split(b)) {
2232 if (current->bio_list) {
2233 op->lock = b->c->root->level + 1;
2235 } else if (op->lock <= b->c->root->level) {
2236 op->lock = b->c->root->level + 1;
2239 /* Invalidated all iterators */
2240 return btree_split(b, op, insert_keys, replace_key) ?:
2244 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2246 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2248 bch_btree_leaf_dirty(b, journal_ref);
2250 bch_btree_node_write_sync(b);
2257 int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2258 struct bkey *check_key)
2261 uint64_t btree_ptr = b->key.ptr[0];
2262 unsigned long seq = b->seq;
2263 struct keylist insert;
2264 bool upgrade = op->lock == -1;
2266 bch_keylist_init(&insert);
2269 rw_unlock(false, b);
2270 rw_lock(true, b, b->level);
2272 if (b->key.ptr[0] != btree_ptr ||
2277 SET_KEY_PTRS(check_key, 1);
2278 get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2280 SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2282 bch_keylist_add(&insert, check_key);
2284 ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
2286 BUG_ON(!ret && !bch_keylist_empty(&insert));
2289 downgrade_write(&b->lock);
2293 struct btree_insert_op {
2295 struct keylist *keys;
2296 atomic_t *journal_ref;
2297 struct bkey *replace_key;
2300 static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2302 struct btree_insert_op *op = container_of(b_op,
2303 struct btree_insert_op, op);
2305 int ret = bch_btree_insert_node(b, &op->op, op->keys,
2306 op->journal_ref, op->replace_key);
2307 if (ret && !bch_keylist_empty(op->keys))
2313 int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2314 atomic_t *journal_ref, struct bkey *replace_key)
2316 struct btree_insert_op op;
2319 BUG_ON(current->bio_list);
2320 BUG_ON(bch_keylist_empty(keys));
2322 bch_btree_op_init(&op.op, 0);
2324 op.journal_ref = journal_ref;
2325 op.replace_key = replace_key;
2327 while (!ret && !bch_keylist_empty(keys)) {
2329 ret = bch_btree_map_leaf_nodes(&op.op, c,
2330 &START_KEY(keys->keys),
2337 pr_err("error %i", ret);
2339 while ((k = bch_keylist_pop(keys)))
2341 } else if (op.op.insert_collision)
2347 void bch_btree_set_root(struct btree *b)
2352 closure_init_stack(&cl);
2354 trace_bcache_btree_set_root(b);
2356 BUG_ON(!b->written);
2358 for (i = 0; i < KEY_PTRS(&b->key); i++)
2359 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2361 mutex_lock(&b->c->bucket_lock);
2362 list_del_init(&b->list);
2363 mutex_unlock(&b->c->bucket_lock);
2367 bch_journal_meta(b->c, &cl);
2371 /* Map across nodes or keys */
2373 static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2375 btree_map_nodes_fn *fn, int flags)
2377 int ret = MAP_CONTINUE;
2381 struct btree_iter iter;
2383 bch_btree_iter_init(b, &iter, from);
2385 while ((k = bch_btree_iter_next_filter(&iter, b,
2387 ret = btree(map_nodes_recurse, k, b,
2388 op, from, fn, flags);
2391 if (ret != MAP_CONTINUE)
2396 if (!b->level || flags == MAP_ALL_NODES)
2402 int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
2403 struct bkey *from, btree_map_nodes_fn *fn, int flags)
2405 return btree_root(map_nodes_recurse, c, op, from, fn, flags);
2408 static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2409 struct bkey *from, btree_map_keys_fn *fn,
2412 int ret = MAP_CONTINUE;
2414 struct btree_iter iter;
2416 bch_btree_iter_init(b, &iter, from);
2418 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
2421 : btree(map_keys_recurse, k, b, op, from, fn, flags);
2424 if (ret != MAP_CONTINUE)
2428 if (!b->level && (flags & MAP_END_KEY))
2429 ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2430 KEY_OFFSET(&b->key), 0));
2435 int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
2436 struct bkey *from, btree_map_keys_fn *fn, int flags)
2438 return btree_root(map_keys_recurse, c, op, from, fn, flags);
2443 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2445 /* Overlapping keys compare equal */
2446 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2448 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2453 static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2454 struct keybuf_key *r)
2456 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
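/*
 * The two comparison functions above give the keybuf rbtree its semantics:
 * keybuf_cmp() treats overlapping keys as equal, so RB_INSERT() refuses to add
 * a key that overlaps one already in the buffer, while
 * keybuf_nonoverlapping_cmp() imposes a strict ordering for walking keys that
 * are known not to overlap (as in bch_keybuf_check_overlapping()).
 */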
2464 keybuf_pred_fn *pred;
2467 static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
2470 struct refill *refill = container_of(op, struct refill, op);
2471 struct keybuf *buf = refill->buf;
2472 int ret = MAP_CONTINUE;
2474 if (bkey_cmp(k, refill->end) >= 0) {
2479 if (!KEY_SIZE(k)) /* end key */
2482 if (refill->pred(buf, k)) {
2483 struct keybuf_key *w;
2485 spin_lock(&buf->lock);
2487 w = array_alloc(&buf->freelist);
2489 spin_unlock(&buf->lock);
2494 bkey_copy(&w->key, k);
2496 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2497 array_free(&buf->freelist, w);
2501 if (array_freelist_empty(&buf->freelist))
2504 spin_unlock(&buf->lock);
2507 buf->last_scanned = *k;
2511 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2512 struct bkey *end, keybuf_pred_fn *pred)
2514 struct bkey start = buf->last_scanned;
2515 struct refill refill;
2519 bch_btree_op_init(&refill.op, -1);
2520 refill.nr_found = 0;
2525 bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
2526 refill_keybuf_fn, MAP_END_KEY);
2528 trace_bcache_keyscan(refill.nr_found,
2529 KEY_INODE(&start), KEY_OFFSET(&start),
2530 KEY_INODE(&buf->last_scanned),
2531 KEY_OFFSET(&buf->last_scanned));
2533 spin_lock(&buf->lock);
2535 if (!RB_EMPTY_ROOT(&buf->keys)) {
2536 struct keybuf_key *w;
2537 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2538 buf->start = START_KEY(&w->key);
2540 w = RB_LAST(&buf->keys, struct keybuf_key, node);
2543 buf->start = MAX_KEY;
2547 spin_unlock(&buf->lock);
2550 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2552 rb_erase(&w->node, &buf->keys);
2553 array_free(&buf->freelist, w);
2556 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2558 spin_lock(&buf->lock);
2559 __bch_keybuf_del(buf, w);
2560 spin_unlock(&buf->lock);
2563 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2567 struct keybuf_key *p, *w, s;
2570 if (bkey_cmp(end, &buf->start) <= 0 ||
2571 bkey_cmp(start, &buf->end) >= 0)
2574 spin_lock(&buf->lock);
2575 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2577 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2579 w = RB_NEXT(w, node);
2584 __bch_keybuf_del(buf, p);
2587 spin_unlock(&buf->lock);
2591 struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2593 struct keybuf_key *w;
2594 spin_lock(&buf->lock);
2596 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2598 while (w && w->private)
2599 w = RB_NEXT(w, node);
2602 w->private = ERR_PTR(-EINTR);
2604 spin_unlock(&buf->lock);
2608 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2611 keybuf_pred_fn *pred)
2613 struct keybuf_key *ret;
2616 ret = bch_keybuf_next(buf);
2620 if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2621 pr_debug("scan finished");
2625 bch_refill_keybuf(c, buf, end, pred);
2631 void bch_keybuf_init(struct keybuf *buf)
2633 buf->last_scanned = MAX_KEY;
2634 buf->keys = RB_ROOT;
2636 spin_lock_init(&buf->lock);
2637 array_allocator_init(&buf->freelist);
2640 void bch_btree_exit(void)
2643 destroy_workqueue(btree_io_wq);
2646 int __init bch_btree_init(void)
2648 btree_io_wq = create_singlethread_workqueue("bch_btree_io");