Merge branch 'bcache-for-3.15' of git://evilpiepirate.org/~kent/linux-bcache into...
author Jens Axboe <axboe@fb.com>
Tue, 18 Mar 2014 19:57:01 +0000 (13:57 -0600)
committer Jens Axboe <axboe@fb.com>
Tue, 18 Mar 2014 19:57:01 +0000 (13:57 -0600)
Kent writes:

Jens, here are the bcache changes for 3.15. Lots of bugfixes, and some
refactoring and cleanups.

18 files changed:
drivers/md/bcache/Kconfig
drivers/md/bcache/alloc.c
drivers/md/bcache/bcache.h
drivers/md/bcache/bset.c
drivers/md/bcache/bset.h
drivers/md/bcache/btree.c
drivers/md/bcache/btree.h
drivers/md/bcache/extents.c
drivers/md/bcache/journal.c
drivers/md/bcache/journal.h
drivers/md/bcache/movinggc.c
drivers/md/bcache/request.c
drivers/md/bcache/request.h
drivers/md/bcache/stats.c
drivers/md/bcache/super.c
drivers/md/bcache/sysfs.c
drivers/md/bcache/trace.c
include/trace/events/bcache.h

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 2638417b19aa74d1207de209b93bbb1dfc2689e8..4d200883c505b1162d9c3527ddfe5692a37ae8fb 100644
@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG
        Keeps all active closures in a linked list and provides a debugfs
        interface to list them, which makes it possible to see asynchronous
        operations that get stuck.
-
-# cgroup code needs to be updated:
-#
-#config CGROUP_BCACHE
-#      bool "Cgroup controls for bcache"
-#      depends on BCACHE && BLK_CGROUP
-#      ---help---
-#      TODO
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index c0d37d0824439f2daff4c049ab82dc6b3c6e646e..443d03fbac4705bd97f5acf77f622e0562b7fc55 100644
@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
        ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
        WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
 
-       if (CACHE_SYNC(&ca->set->sb)) {
-               ca->need_save_prio = max(ca->need_save_prio,
-                                        bucket_disk_gen(b));
-               WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
-       }
-
        return ret;
 }
 
@@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
        mutex_unlock(&c->bucket_lock);
 }
 
-/* Allocation */
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
 
 static inline bool can_inc_bucket_gen(struct bucket *b)
 {
-       return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
-               bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+       return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
 }
 
-bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-       BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
-
-       if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
-               unsigned i;
-
-               for (i = 0; i < RESERVE_NONE; i++)
-                       if (!fifo_full(&ca->free[i]))
-                               goto add;
+       BUG_ON(!ca->set->gc_mark_valid);
 
-               return false;
-       }
-add:
-       b->prio = 0;
-
-       if (can_inc_bucket_gen(b) &&
-           fifo_push(&ca->unused, b - ca->buckets)) {
-               atomic_inc(&b->pin);
-               return true;
-       }
-
-       return false;
-}
-
-static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
-{
-       return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
+       return (!GC_MARK(b) ||
+               GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
                !atomic_read(&b->pin) &&
                can_inc_bucket_gen(b);
 }
 
-static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 {
+       lockdep_assert_held(&ca->set->bucket_lock);
+       BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
+
+       if (GC_SECTORS_USED(b))
+               trace_bcache_invalidate(ca, b - ca->buckets);
+
        bch_inc_gen(ca, b);
        b->prio = INITIAL_PRIO;
        atomic_inc(&b->pin);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+       __bch_invalidate_one_bucket(ca, b);
+
        fifo_push(&ca->free_inc, b - ca->buckets);
 }
 
@@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
        ca->heap.used = 0;
 
        for_each_bucket(b, ca) {
-               /*
-                * If we fill up the unused list, if we then return before
-                * adding anything to the free_inc list we'll skip writing
-                * prios/gens and just go back to allocating from the unused
-                * list:
-                */
-               if (fifo_full(&ca->unused))
-                       return;
-
-               if (!can_invalidate_bucket(ca, b))
-                       continue;
-
-               if (!GC_SECTORS_USED(b) &&
-                   bch_bucket_add_unused(ca, b))
+               if (!bch_can_invalidate_bucket(ca, b))
                        continue;
 
                if (!heap_full(&ca->heap))
@@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
                        return;
                }
 
-               invalidate_one_bucket(ca, b);
+               bch_invalidate_one_bucket(ca, b);
        }
 }
 
@@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
 
                b = ca->buckets + ca->fifo_last_bucket++;
 
-               if (can_invalidate_bucket(ca, b))
-                       invalidate_one_bucket(ca, b);
+               if (bch_can_invalidate_bucket(ca, b))
+                       bch_invalidate_one_bucket(ca, b);
 
                if (++checked >= ca->sb.nbuckets) {
                        ca->invalidate_needs_gc = 1;
@@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
 
                b = ca->buckets + n;
 
-               if (can_invalidate_bucket(ca, b))
-                       invalidate_one_bucket(ca, b);
+               if (bch_can_invalidate_bucket(ca, b))
+                       bch_invalidate_one_bucket(ca, b);
 
                if (++checked >= ca->sb.nbuckets / 2) {
                        ca->invalidate_needs_gc = 1;
@@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
 
 static void invalidate_buckets(struct cache *ca)
 {
-       if (ca->invalidate_needs_gc)
-               return;
+       BUG_ON(ca->invalidate_needs_gc);
 
        switch (CACHE_REPLACEMENT(&ca->sb)) {
        case CACHE_REPLACEMENT_LRU:
@@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca)
                invalidate_buckets_random(ca);
                break;
        }
-
-       trace_bcache_alloc_invalidate(ca);
 }
 
 #define allocator_wait(ca, cond)                                       \
@@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg)
                 * possibly issue discards to them, then we add the bucket to
                 * the free list:
                 */
-               while (1) {
+               while (!fifo_empty(&ca->free_inc)) {
                        long bucket;
 
-                       if ((!atomic_read(&ca->set->prio_blocked) ||
-                            !CACHE_SYNC(&ca->set->sb)) &&
-                           !fifo_empty(&ca->unused))
-                               fifo_pop(&ca->unused, bucket);
-                       else if (!fifo_empty(&ca->free_inc))
-                               fifo_pop(&ca->free_inc, bucket);
-                       else
-                               break;
+                       fifo_pop(&ca->free_inc, bucket);
 
                        if (ca->discard) {
                                mutex_unlock(&ca->set->bucket_lock);
@@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg)
                        }
 
                        allocator_wait(ca, bch_allocator_push(ca, bucket));
+                       wake_up(&ca->set->btree_cache_wait);
                        wake_up(&ca->set->bucket_wait);
                }
 
@@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg)
                 * them to the free_inc list:
                 */
 
+retry_invalidate:
                allocator_wait(ca, ca->set->gc_mark_valid &&
-                              (ca->need_save_prio > 64 ||
-                               !ca->invalidate_needs_gc));
+                              !ca->invalidate_needs_gc);
                invalidate_buckets(ca);
 
                /*
@@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg)
                 * new stuff to them:
                 */
                allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
-               if (CACHE_SYNC(&ca->set->sb) &&
-                   (!fifo_empty(&ca->free_inc) ||
-                    ca->need_save_prio > 64))
+               if (CACHE_SYNC(&ca->set->sb)) {
+                       /*
+                        * This could deadlock if an allocation with a btree
+                        * node locked ever blocked - having the btree node
+                        * locked would block garbage collection, but here we're
+                        * waiting on garbage collection before we invalidate
+                        * and free anything.
+                        *
+                        * But this should be safe since the btree code always
+                        * uses btree_check_reserve() before allocating now, and
+                        * if it fails it blocks without btree nodes locked.
+                        */
+                       if (!fifo_full(&ca->free_inc))
+                               goto retry_invalidate;
+
                        bch_prio_write(ca);
+               }
        }
 }
 
+/* Allocation */
+
 long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 {
        DEFINE_WAIT(w);
@@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
            fifo_pop(&ca->free[reserve], r))
                goto out;
 
-       if (!wait)
+       if (!wait) {
+               trace_bcache_alloc_fail(ca, reserve);
                return -1;
+       }
 
        do {
                prepare_to_wait(&ca->set->bucket_wait, &w,
@@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 out:
        wake_up_process(ca->alloc_thread);
 
+       trace_bcache_alloc(ca, reserve);
+
        if (expensive_debug_checks(ca->set)) {
                size_t iter;
                long i;
@@ -438,8 +423,6 @@ out:
                                BUG_ON(i == r);
                fifo_for_each(i, &ca->free_inc, iter)
                        BUG_ON(i == r);
-               fifo_for_each(i, &ca->unused, iter)
-                       BUG_ON(i == r);
        }
 
        b = ca->buckets + r;
@@ -461,17 +444,19 @@ out:
        return r;
 }
 
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+       SET_GC_MARK(b, 0);
+       SET_GC_SECTORS_USED(b, 0);
+}
+
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
        unsigned i;
 
-       for (i = 0; i < KEY_PTRS(k); i++) {
-               struct bucket *b = PTR_BUCKET(c, k, i);
-
-               SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
-               SET_GC_SECTORS_USED(b, 0);
-               bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
-       }
+       for (i = 0; i < KEY_PTRS(k); i++)
+               __bch_bucket_free(PTR_CACHE(c, k, i),
+                                 PTR_BUCKET(c, k, i));
 }
 
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
@@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca)
        ca->alloc_thread = k;
        return 0;
 }
-
-int bch_cache_allocator_init(struct cache *ca)
-{
-       /*
-        * Reserve:
-        * Prio/gen writes first
-        * Then 8 for btree allocations
-        * Then half for the moving garbage collector
-        */
-#if 0
-       ca->watermark[WATERMARK_PRIO] = 0;
-
-       ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
-
-       ca->watermark[WATERMARK_MOVINGGC] = 8 +
-               ca->watermark[WATERMARK_METADATA];
-
-       ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
-               ca->watermark[WATERMARK_MOVINGGC];
-#endif
-       return 0;
-}
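
[Annotation, not part of the patch] The new comment in alloc.c describes the allocator pipeline: invalidate buckets, rewrite prios/gens, optionally discard, then refill the freelists. As a rough illustration only, here is a self-contained user-space sketch of that flow; the FIFO helpers and bucket fields are invented for the sketch, while the kernel itself uses the fifo_push()/fifo_pop() macros on ca->free_inc and ca->free[] plus bch_prio_write().

/*
 * Illustrative user-space model of the allocator pipeline
 * (invalidate -> free_inc -> prio write -> discard -> free).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NBUCKETS 8
#define FIFO_SZ  8

struct fifo { long data[FIFO_SZ]; unsigned head, tail; };

static bool fifo_push(struct fifo *f, long v)
{
	if (f->head - f->tail == FIFO_SZ)
		return false;
	f->data[f->head++ % FIFO_SZ] = v;
	return true;
}

static bool fifo_pop(struct fifo *f, long *v)
{
	if (f->head == f->tail)
		return false;
	*v = f->data[f->tail++ % FIFO_SZ];
	return true;
}

struct bucket { uint8_t gen; uint16_t prio; };

int main(void)
{
	struct bucket buckets[NBUCKETS] = { 0 };
	struct fifo free_inc = { 0 }, free_list = { 0 };
	long b;

	/* Invalidate: bump the gen so old pointers go stale, queue on free_inc */
	for (long i = 0; i < NBUCKETS; i++) {
		buckets[i].gen++;
		fifo_push(&free_inc, i);
	}

	/* The prio/gen write would happen here, persisting the new gens */

	/* Move invalidated buckets to the free list (discards would go here) */
	while (fifo_pop(&free_inc, &b))
		fifo_push(&free_list, b);

	while (fifo_pop(&free_list, &b))
		printf("bucket %ld ready for allocation, gen %u\n",
		       b, (unsigned) buckets[b].gen);
	return 0;
}
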
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index a4c7306ff43de4f1a928962251ffaea384dcb287..82c9c5d35251f643b325b7e2075f03d66532833c 100644
@@ -195,9 +195,7 @@ struct bucket {
        atomic_t        pin;
        uint16_t        prio;
        uint8_t         gen;
-       uint8_t         disk_gen;
        uint8_t         last_gc; /* Most out of date gen in the btree */
-       uint8_t         gc_gen;
        uint16_t        gc_mark; /* Bitfield used by GC. See below for field */
 };
 
@@ -207,9 +205,9 @@ struct bucket {
  */
 
 BITMASK(GC_MARK,        struct bucket, gc_mark, 0, 2);
-#define GC_MARK_RECLAIMABLE    0
-#define GC_MARK_DIRTY          1
-#define GC_MARK_METADATA       2
+#define GC_MARK_RECLAIMABLE    1
+#define GC_MARK_DIRTY          2
+#define GC_MARK_METADATA       3
 #define GC_SECTORS_USED_SIZE   13
 #define MAX_GC_SECTORS_USED    (~(~0ULL << GC_SECTORS_USED_SIZE))
 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
@@ -426,14 +424,9 @@ struct cache {
         * their new gen to disk. After prio_write() finishes writing the new
         * gens/prios, they'll be moved to the free list (and possibly discarded
         * in the process)
-        *
-        * unused: GC found nothing pointing into these buckets (possibly
-        * because all the data they contained was overwritten), so we only
-        * need to discard them before they can be moved to the free list.
         */
        DECLARE_FIFO(long, free)[RESERVE_NR];
        DECLARE_FIFO(long, free_inc);
-       DECLARE_FIFO(long, unused);
 
        size_t                  fifo_last_bucket;
 
@@ -442,12 +435,6 @@ struct cache {
 
        DECLARE_HEAP(struct bucket *, heap);
 
-       /*
-        * max(gen - disk_gen) for all buckets. When it gets too big we have to
-        * call prio_write() to keep gens from wrapping.
-        */
-       uint8_t                 need_save_prio;
-
        /*
         * If nonzero, we know we aren't going to find any buckets to invalidate
         * until a gc finishes - otherwise we could pointlessly burn a ton of
@@ -562,19 +549,16 @@ struct cache_set {
        struct list_head        btree_cache_freed;
 
        /* Number of elements in btree_cache + btree_cache_freeable lists */
-       unsigned                bucket_cache_used;
+       unsigned                btree_cache_used;
 
        /*
         * If we need to allocate memory for a new btree node and that
         * allocation fails, we can cannibalize another node in the btree cache
-        * to satisfy the allocation. However, only one thread can be doing this
-        * at a time, for obvious reasons - try_harder and try_wait are
-        * basically a lock for this that we can wait on asynchronously. The
-        * btree_root() macro releases the lock when it returns.
+        * to satisfy the allocation - lock to guarantee only one thread does
+        * this at a time:
         */
-       struct task_struct      *try_harder;
-       wait_queue_head_t       try_wait;
-       uint64_t                try_harder_start;
+       wait_queue_head_t       btree_cache_wait;
+       struct task_struct      *btree_cache_alloc_lock;
 
        /*
         * When we free a btree node, we increment the gen of the bucket the
@@ -603,7 +587,7 @@ struct cache_set {
        uint16_t                min_prio;
 
        /*
-        * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
+        * max(gen - last_gc) for all buckets. When it gets too big we have to gc
         * to keep gens from wrapping around.
         */
        uint8_t                 need_gc;
@@ -628,6 +612,8 @@ struct cache_set {
        /* Number of moving GC bios in flight */
        struct semaphore        moving_in_flight;
 
+       struct workqueue_struct *moving_gc_wq;
+
        struct btree            *root;
 
 #ifdef CONFIG_BCACHE_DEBUG
@@ -667,7 +653,6 @@ struct cache_set {
        struct time_stats       btree_gc_time;
        struct time_stats       btree_split_time;
        struct time_stats       btree_read_time;
-       struct time_stats       try_harder_time;
 
        atomic_long_t           cache_read_races;
        atomic_long_t           writeback_keys_done;
@@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc)
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree (last_gc).
- *
- * bucket_disk_gen() returns the difference between the current gen and the gen
- * on disk; they're both used to make sure gens don't wrap around.
  */
 
 static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
        return b->gen - b->last_gc;
 }
 
-static inline uint8_t bucket_disk_gen(struct bucket *b)
-{
-       return b->gen - b->disk_gen;
-}
-
 #define BUCKET_GC_GEN_MAX      96U
-#define BUCKET_DISK_GEN_MAX    64U
 
 #define kobj_attribute_write(n, fn)                                    \
        static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
@@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
 
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
-bool bch_bucket_add_unused(struct cache *, struct bucket *);
 
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
 
+long bch_bucket_alloc(struct cache *, unsigned, bool);
 int __bch_bucket_alloc_set(struct cache_set *, unsigned,
                           struct bkey *, int, bool);
 int bch_bucket_alloc_set(struct cache_set *, unsigned,
@@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *);
 void bch_open_buckets_free(struct cache_set *);
 
 int bch_cache_allocator_start(struct cache *ca);
-int bch_cache_allocator_init(struct cache *ca);
 
 void bch_debug_exit(void);
 int bch_debug_init(struct kobject *);
 void bch_request_exit(void);
 int bch_request_init(void);
-void bch_btree_exit(void);
-int bch_btree_init(void);
 
 #endif /* _BCACHE_H */
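
[Annotation, not part of the patch] With disk_gen and need_save_prio gone, each bucket tracks only gen and last_gc, and bucket_gc_gen() returns b->gen - b->last_gc as a uint8_t: unsigned 8-bit subtraction keeps the distance correct even when gen wraps past 255, and gc is forced before it can exceed BUCKET_GC_GEN_MAX (96). A minimal standalone check of that wraparound arithmetic (a sketch, not kernel code):

/*
 * Demonstrates why bucket_gc_gen() stays correct across gen wraparound:
 * uint8_t subtraction is modulo 256, and the BUCKET_GC_GEN_MAX limit keeps
 * the distance well below 256.
 */
#include <assert.h>
#include <stdint.h>

#define BUCKET_GC_GEN_MAX 96U

static uint8_t bucket_gc_gen(uint8_t gen, uint8_t last_gc)
{
	return gen - last_gc;	/* wraps modulo 256, like the kernel helper */
}

int main(void)
{
	/* gen has wrapped past 255 while last_gc has not */
	uint8_t last_gc = 250;
	uint8_t gen = (uint8_t)(250 + 40);	/* 290 mod 256 == 34 */

	assert(bucket_gc_gen(gen, last_gc) == 40);
	assert(bucket_gc_gen(gen, last_gc) <= BUCKET_GC_GEN_MAX);
	return 0;
}
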
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 3f74b4b0747b9fec3fcb7ad02cfed9ff1baa2f63..54541641530569c442f7113b687428fad4bb18d6 100644
@@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
        for (k = i->start; k < bset_bkey_last(i); k = next) {
                next = bkey_next(k);
 
-               printk(KERN_ERR "block %u key %li/%u: ", set,
-                      (uint64_t *) k - i->d, i->keys);
+               printk(KERN_ERR "block %u key %u/%u: ", set,
+                      (unsigned) ((u64 *) k - i->d), i->keys);
 
                if (b->ops->key_dump)
                        b->ops->key_dump(b, k);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 003260f4ddf6e725417956531e64c9cb33022ae4..5f6728d5d4ddb0f406b0b7296025f412f193793b 100644
@@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l)
        l->top_p = l->keys_p = l->inline_keys;
 }
 
+static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
+{
+       l->keys = k;
+       l->top = bkey_next(k);
+}
+
 static inline void bch_keylist_push(struct keylist *l)
 {
        l->top = bkey_next(l->top);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5f9c2a665ca5079bd372646f70de2117a77b995a..7347b61009615089e307fba6b5964fcb83ad2040 100644
  * alloc_bucket() cannot fail. This should be true but is not completely
  * obvious.
  *
- * Make sure all allocations get charged to the root cgroup
- *
  * Plugging?
  *
  * If data write is less than hard sector size of ssd, round up offset in open
  * bucket to the next whole sector
  *
- * Also lookup by cgroup in get_open_bucket()
- *
  * Superblock needs to be fleshed out for multiple cache devices
  *
  * Add a sysfs tunable for the number of writeback IOs in flight
@@ -97,8 +93,6 @@
 #define PTR_HASH(c, k)                                                 \
        (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
 
-static struct workqueue_struct *btree_io_wq;
-
 #define insert_lock(s, b)      ((b)->level <= (s)->lock)
 
 /*
@@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq;
 ({                                                                     \
        int _r, l = (b)->level - 1;                                     \
        bool _w = l <= (op)->lock;                                      \
-       struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);  \
+       struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
        if (!IS_ERR(_child)) {                                          \
                _child->parent = (b);                                   \
                _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);       \
@@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq;
                        _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);   \
                }                                                       \
                rw_unlock(_w, _b);                                      \
+               bch_cannibalize_unlock(c);                              \
                if (_r == -EINTR)                                       \
                        schedule();                                     \
-               bch_cannibalize_unlock(c);                              \
-               if (_r == -ENOSPC) {                                    \
-                       wait_event((c)->try_wait,                       \
-                                  !(c)->try_harder);                   \
-                       _r = -EINTR;                                    \
-               }                                                       \
        } while (_r == -EINTR);                                         \
                                                                        \
-       finish_wait(&(c)->bucket_wait, &(op)->wait);                    \
+       finish_wait(&(c)->btree_cache_wait, &(op)->wait);               \
        _r;                                                             \
 })
 
@@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b)
        return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
 }
 
+static void bch_btree_init_next(struct btree *b)
+{
+       /* If not a leaf node, always sort */
+       if (b->level && b->keys.nsets)
+               bch_btree_sort(&b->keys, &b->c->sort);
+       else
+               bch_btree_sort_lazy(&b->keys, &b->c->sort);
+
+       if (b->written < btree_blocks(b))
+               bch_bset_init_next(&b->keys, write_block(b),
+                                  bset_magic(&b->c->sb));
+
+}
+
 /* Btree key manipulation */
 
 void bkey_put(struct cache_set *c, struct bkey *k)
@@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl)
        btree_complete_write(b, w);
 
        if (btree_node_dirty(b))
-               queue_delayed_work(btree_io_wq, &b->work,
-                                  msecs_to_jiffies(30000));
+               schedule_delayed_work(&b->work, 30 * HZ);
 
        closure_return_with_destructor(cl, btree_node_write_unlock);
 }
@@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b)
        }
 }
 
-void bch_btree_node_write(struct btree *b, struct closure *parent)
+void __bch_btree_node_write(struct btree *b, struct closure *parent)
 {
        struct bset *i = btree_bset_last(b);
 
+       lockdep_assert_held(&b->write_lock);
+
        trace_bcache_btree_write(b);
 
        BUG_ON(current->bio_list);
@@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
                        &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 
        b->written += set_blocks(i, block_bytes(b->c));
+}
 
-       /* If not a leaf node, always sort */
-       if (b->level && b->keys.nsets)
-               bch_btree_sort(&b->keys, &b->c->sort);
-       else
-               bch_btree_sort_lazy(&b->keys, &b->c->sort);
+void bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+       unsigned nsets = b->keys.nsets;
+
+       lockdep_assert_held(&b->lock);
+
+       __bch_btree_node_write(b, parent);
 
        /*
         * do verify if there was more than one set initially (i.e. we did a
         * sort) and we sorted down to a single set:
         */
-       if (i != b->keys.set->data && !b->keys.nsets)
+       if (nsets && !b->keys.nsets)
                bch_btree_verify(b);
 
-       if (b->written < btree_blocks(b))
-               bch_bset_init_next(&b->keys, write_block(b),
-                                  bset_magic(&b->c->sb));
+       bch_btree_init_next(b);
 }
 
 static void bch_btree_node_write_sync(struct btree *b)
@@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b)
        struct closure cl;
 
        closure_init_stack(&cl);
+
+       mutex_lock(&b->write_lock);
        bch_btree_node_write(b, &cl);
+       mutex_unlock(&b->write_lock);
+
        closure_sync(&cl);
 }
 
@@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w)
 {
        struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 
-       rw_lock(true, b, b->level);
-
+       mutex_lock(&b->write_lock);
        if (btree_node_dirty(b))
-               bch_btree_node_write(b, NULL);
-       rw_unlock(true, b);
+               __bch_btree_node_write(b, NULL);
+       mutex_unlock(&b->write_lock);
 }
 
 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
@@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
        struct bset *i = btree_bset_last(b);
        struct btree_write *w = btree_current_write(b);
 
+       lockdep_assert_held(&b->write_lock);
+
        BUG_ON(!b->written);
        BUG_ON(!i->keys);
 
        if (!btree_node_dirty(b))
-               queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
+               schedule_delayed_work(&b->work, 30 * HZ);
 
        set_btree_node_dirty(b);
 
@@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 #define mca_reserve(c) (((c->root && c->root->level)           \
                          ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)                                                \
-       max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+       max_t(int, 0, c->btree_cache_used - mca_reserve(c))
 
 static void mca_data_free(struct btree *b)
 {
@@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b)
 
        bch_btree_keys_free(&b->keys);
 
-       b->c->bucket_cache_used--;
+       b->c->btree_cache_used--;
        list_move(&b->list, &b->c->btree_cache_freed);
 }
 
@@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
                                        ilog2(b->c->btree_pages),
                                        btree_order(k)),
                                  gfp)) {
-               b->c->bucket_cache_used++;
+               b->c->btree_cache_used++;
                list_move(&b->list, &b->c->btree_cache);
        } else {
                list_move(&b->list, &b->c->btree_cache_freed);
@@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 
        init_rwsem(&b->lock);
        lockdep_set_novalidate_class(&b->lock);
+       mutex_init(&b->write_lock);
+       lockdep_set_novalidate_class(&b->write_lock);
        INIT_LIST_HEAD(&b->list);
        INIT_DELAYED_WORK(&b->work, btree_node_write_work);
        b->c = c;
@@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
                up(&b->io_mutex);
        }
 
+       mutex_lock(&b->write_lock);
        if (btree_node_dirty(b))
-               bch_btree_node_write_sync(b);
+               __bch_btree_node_write(b, &cl);
+       mutex_unlock(&b->write_lock);
+
+       closure_sync(&cl);
 
        /* wait for any in flight btree write */
        down(&b->io_mutex);
@@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
        if (c->shrinker_disabled)
                return SHRINK_STOP;
 
-       if (c->try_harder)
+       if (c->btree_cache_alloc_lock)
                return SHRINK_STOP;
 
        /* Return -1 if we can't do anything right now */
@@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
                }
        }
 
-       for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
+       for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
                if (list_empty(&c->btree_cache))
                        goto out;
 
@@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
        if (c->shrinker_disabled)
                return 0;
 
-       if (c->try_harder)
+       if (c->btree_cache_alloc_lock)
                return 0;
 
        return mca_can_free(c) * c->btree_pages;
@@ -819,17 +835,30 @@ out:
        return b;
 }
 
-static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+       struct task_struct *old;
+
+       old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+       if (old && old != current) {
+               if (op)
+                       prepare_to_wait(&c->btree_cache_wait, &op->wait,
+                                       TASK_UNINTERRUPTIBLE);
+               return -EINTR;
+       }
+
+       return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+                                    struct bkey *k)
 {
        struct btree *b;
 
        trace_bcache_btree_cache_cannibalize(c);
 
-       if (!c->try_harder) {
-               c->try_harder = current;
-               c->try_harder_start = local_clock();
-       } else if (c->try_harder != current)
-               return ERR_PTR(-ENOSPC);
+       if (mca_cannibalize_lock(c, op))
+               return ERR_PTR(-EINTR);
 
        list_for_each_entry_reverse(b, &c->btree_cache, list)
                if (!mca_reap(b, btree_order(k), false))
@@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
                if (!mca_reap(b, btree_order(k), true))
                        return b;
 
+       WARN(1, "btree cache cannibalize failed\n");
        return ERR_PTR(-ENOMEM);
 }
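
[Annotation, not part of the patch] The old try_harder/try_wait pair becomes btree_cache_alloc_lock plus btree_cache_wait: mca_cannibalize_lock() claims ownership with cmpxchg(&c->btree_cache_alloc_lock, NULL, current), stays re-entrant for the current owner, and otherwise parks the caller on the wait queue and returns -EINTR. A user-space sketch of the same claim-or-back-off pattern, using C11 atomics and a thread handle in place of task_struct; the wait-queue hand-off is omitted and all names here are invented:

/*
 * One "owner" slot claimed with compare-and-swap; losers back off instead
 * of blocking (the kernel parks them on btree_cache_wait and retries).
 */
#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic(pthread_t *) cannibalize_owner;	/* NULL == unlocked */

static int cannibalize_trylock(pthread_t *me)
{
	pthread_t *expected = NULL;

	if (atomic_compare_exchange_strong(&cannibalize_owner, &expected, me))
		return 0;		/* we took the slot */
	if (expected == me)
		return 0;		/* already ours: re-entrant */
	return -EINTR;			/* someone else owns it: back off */
}

static void cannibalize_unlock(pthread_t *me)
{
	pthread_t *expected = me;

	/* only the owner may release; waking waiters is omitted here */
	atomic_compare_exchange_strong(&cannibalize_owner, &expected, NULL);
}

int main(void)
{
	pthread_t self = pthread_self();

	if (!cannibalize_trylock(&self)) {
		printf("claimed cannibalize lock\n");
		cannibalize_unlock(&self);
	}
	return 0;
}
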
 
@@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
  */
 static void bch_cannibalize_unlock(struct cache_set *c)
 {
-       if (c->try_harder == current) {
-               bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
-               c->try_harder = NULL;
-               wake_up(&c->try_wait);
+       if (c->btree_cache_alloc_lock == current) {
+               c->btree_cache_alloc_lock = NULL;
+               wake_up(&c->btree_cache_wait);
        }
 }
 
-static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+                              struct bkey *k, int level)
 {
        struct btree *b;
 
@@ -920,7 +950,7 @@ err:
        if (b)
                rw_unlock(true, b);
 
-       b = mca_cannibalize(c, k);
+       b = mca_cannibalize(c, op, k);
        if (!IS_ERR(b))
                goto out;
 
@@ -936,8 +966,8 @@ err:
  * The btree node will have either a read or a write lock held, depending on
  * level and op->lock.
  */
-struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
-                                int level, bool write)
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+                                struct bkey *k, int level, bool write)
 {
        int i = 0;
        struct btree *b;
@@ -951,7 +981,7 @@ retry:
                        return ERR_PTR(-EAGAIN);
 
                mutex_lock(&c->bucket_lock);
-               b = mca_alloc(c, k, level);
+               b = mca_alloc(c, op, k, level);
                mutex_unlock(&c->bucket_lock);
 
                if (!b)
@@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
        struct btree *b;
 
        mutex_lock(&c->bucket_lock);
-       b = mca_alloc(c, k, level);
+       b = mca_alloc(c, NULL, k, level);
        mutex_unlock(&c->bucket_lock);
 
        if (!IS_ERR_OR_NULL(b)) {
@@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 
 static void btree_node_free(struct btree *b)
 {
-       unsigned i;
-
        trace_bcache_btree_node_free(b);
 
        BUG_ON(b == b->c->root);
 
+       mutex_lock(&b->write_lock);
+
        if (btree_node_dirty(b))
                btree_complete_write(b, btree_current_write(b));
        clear_bit(BTREE_NODE_dirty, &b->flags);
 
+       mutex_unlock(&b->write_lock);
+
        cancel_delayed_work(&b->work);
 
        mutex_lock(&b->c->bucket_lock);
-
-       for (i = 0; i < KEY_PTRS(&b->key); i++) {
-               BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
-
-               bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
-                           PTR_BUCKET(b->c, &b->key, i));
-       }
-
        bch_bucket_free(b->c, &b->key);
        mca_bucket_free(b);
        mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
+struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+                                  int level)
 {
        BKEY_PADDED(key) k;
        struct btree *b = ERR_PTR(-EAGAIN);
 
        mutex_lock(&c->bucket_lock);
 retry:
-       if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+       if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
                goto err;
 
        bkey_put(c, &k.key);
        SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
 
-       b = mca_alloc(c, &k.key, level);
+       b = mca_alloc(c, op, &k.key, level);
        if (IS_ERR(b))
                goto err_free;
 
@@ -1075,12 +1100,15 @@ err:
        return b;
 }
 
-static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+                                                 struct btree_op *op)
 {
-       struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
+       struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
        if (!IS_ERR_OR_NULL(n)) {
+               mutex_lock(&n->write_lock);
                bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
                bkey_copy_key(&n->key, &b->key);
+               mutex_unlock(&n->write_lock);
        }
 
        return n;
@@ -1090,43 +1118,47 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
 {
        unsigned i;
 
+       mutex_lock(&b->c->bucket_lock);
+
+       atomic_inc(&b->c->prio_blocked);
+
        bkey_copy(k, &b->key);
        bkey_copy_key(k, &ZERO_KEY);
 
-       for (i = 0; i < KEY_PTRS(k); i++) {
-               uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
-
-               SET_PTR_GEN(k, i, g);
-       }
+       for (i = 0; i < KEY_PTRS(k); i++)
+               SET_PTR_GEN(k, i,
+                           bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
+                                       PTR_BUCKET(b->c, &b->key, i)));
 
-       atomic_inc(&b->c->prio_blocked);
+       mutex_unlock(&b->c->bucket_lock);
 }
 
 static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
        struct cache_set *c = b->c;
        struct cache *ca;
-       unsigned i, reserve = c->root->level * 2 + 1;
-       int ret = 0;
+       unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
 
        mutex_lock(&c->bucket_lock);
 
        for_each_cache(ca, c, i)
                if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
                        if (op)
-                               prepare_to_wait(&c->bucket_wait, &op->wait,
+                               prepare_to_wait(&c->btree_cache_wait, &op->wait,
                                                TASK_UNINTERRUPTIBLE);
-                       ret = -EINTR;
-                       break;
+                       mutex_unlock(&c->bucket_lock);
+                       return -EINTR;
                }
 
        mutex_unlock(&c->bucket_lock);
-       return ret;
+
+       return mca_cannibalize_lock(b->c, op);
 }
 
 /* Garbage collection */
 
-uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
+static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
+                                   struct bkey *k)
 {
        uint8_t stale = 0;
        unsigned i;
@@ -1146,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 
                g = PTR_BUCKET(c, k, i);
 
-               if (gen_after(g->gc_gen, PTR_GEN(k, i)))
-                       g->gc_gen = PTR_GEN(k, i);
+               if (gen_after(g->last_gc, PTR_GEN(k, i)))
+                       g->last_gc = PTR_GEN(k, i);
 
                if (ptr_stale(c, k, i)) {
                        stale = max(stale, ptr_stale(c, k, i));
@@ -1163,6 +1195,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
                        SET_GC_MARK(g, GC_MARK_METADATA);
                else if (KEY_DIRTY(k))
                        SET_GC_MARK(g, GC_MARK_DIRTY);
+               else if (!GC_MARK(g))
+                       SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
 
                /* guard against overflow */
                SET_GC_SECTORS_USED(g, min_t(unsigned,
@@ -1177,6 +1211,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 
 #define btree_mark_key(b, k)   __bch_btree_mark_key(b->c, b->level, k)
 
+void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
+{
+       unsigned i;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (ptr_available(c, k, i) &&
+                   !ptr_stale(c, k, i)) {
+                       struct bucket *b = PTR_BUCKET(c, k, i);
+
+                       b->gen = PTR_GEN(k, i);
+
+                       if (level && bkey_cmp(k, &ZERO_KEY))
+                               b->prio = BTREE_PRIO;
+                       else if (!level && b->prio == BTREE_PRIO)
+                               b->prio = INITIAL_PRIO;
+               }
+
+       __bch_btree_mark_key(c, level, k);
+}
+
 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
 {
        uint8_t stale = 0;
@@ -1230,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *,
                                 struct keylist *, atomic_t *, struct bkey *);
 
 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
-                            struct keylist *keylist, struct gc_stat *gc,
-                            struct gc_merge_info *r)
+                            struct gc_stat *gc, struct gc_merge_info *r)
 {
        unsigned i, nodes = 0, keys = 0, blocks;
        struct btree *new_nodes[GC_MERGE_NODES];
+       struct keylist keylist;
        struct closure cl;
        struct bkey *k;
 
+       bch_keylist_init(&keylist);
+
+       if (btree_check_reserve(b, NULL))
+               return 0;
+
        memset(new_nodes, 0, sizeof(new_nodes));
        closure_init_stack(&cl);
 
@@ -1252,11 +1311,23 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
                return 0;
 
        for (i = 0; i < nodes; i++) {
-               new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
+               new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
                if (IS_ERR_OR_NULL(new_nodes[i]))
                        goto out_nocoalesce;
        }
 
+       /*
+        * We have to check the reserve here, after we've allocated our new
+        * nodes, to make sure the insert below will succeed - we also check
+        * before as an optimization to potentially avoid a bunch of expensive
+        * allocs/sorts
+        */
+       if (btree_check_reserve(b, NULL))
+               goto out_nocoalesce;
+
+       for (i = 0; i < nodes; i++)
+               mutex_lock(&new_nodes[i]->write_lock);
+
        for (i = nodes - 1; i > 0; --i) {
                struct bset *n1 = btree_bset_first(new_nodes[i]);
                struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
@@ -1315,28 +1386,34 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 
                n2->keys -= keys;
 
-               if (__bch_keylist_realloc(keylist,
+               if (__bch_keylist_realloc(&keylist,
                                          bkey_u64s(&new_nodes[i]->key)))
                        goto out_nocoalesce;
 
                bch_btree_node_write(new_nodes[i], &cl);
-               bch_keylist_add(keylist, &new_nodes[i]->key);
+               bch_keylist_add(&keylist, &new_nodes[i]->key);
        }
 
-       for (i = 0; i < nodes; i++) {
-               if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
-                       goto out_nocoalesce;
+       for (i = 0; i < nodes; i++)
+               mutex_unlock(&new_nodes[i]->write_lock);
 
-               make_btree_freeing_key(r[i].b, keylist->top);
-               bch_keylist_push(keylist);
-       }
+       closure_sync(&cl);
 
        /* We emptied out this node */
        BUG_ON(btree_bset_first(new_nodes[0])->keys);
        btree_node_free(new_nodes[0]);
        rw_unlock(true, new_nodes[0]);
 
-       closure_sync(&cl);
+       for (i = 0; i < nodes; i++) {
+               if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
+                       goto out_nocoalesce;
+
+               make_btree_freeing_key(r[i].b, keylist.top);
+               bch_keylist_push(&keylist);
+       }
+
+       bch_btree_insert_node(b, op, &keylist, NULL, NULL);
+       BUG_ON(!bch_keylist_empty(&keylist));
 
        for (i = 0; i < nodes; i++) {
                btree_node_free(r[i].b);
@@ -1345,22 +1422,22 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
                r[i].b = new_nodes[i];
        }
 
-       bch_btree_insert_node(b, op, keylist, NULL, NULL);
-       BUG_ON(!bch_keylist_empty(keylist));
-
        memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
        r[nodes - 1].b = ERR_PTR(-EINTR);
 
        trace_bcache_btree_gc_coalesce(nodes);
        gc->nodes--;
 
+       bch_keylist_free(&keylist);
+
        /* Invalidated our iterator */
        return -EINTR;
 
 out_nocoalesce:
        closure_sync(&cl);
+       bch_keylist_free(&keylist);
 
-       while ((k = bch_keylist_pop(keylist)))
+       while ((k = bch_keylist_pop(&keylist)))
                if (!bkey_cmp(k, &ZERO_KEY))
                        atomic_dec(&b->c->prio_blocked);
 
@@ -1372,6 +1449,42 @@ out_nocoalesce:
        return 0;
 }
 
+static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
+                                struct btree *replace)
+{
+       struct keylist keys;
+       struct btree *n;
+
+       if (btree_check_reserve(b, NULL))
+               return 0;
+
+       n = btree_node_alloc_replacement(replace, NULL);
+
+       /* recheck reserve after allocating replacement node */
+       if (btree_check_reserve(b, NULL)) {
+               btree_node_free(n);
+               rw_unlock(true, n);
+               return 0;
+       }
+
+       bch_btree_node_write_sync(n);
+
+       bch_keylist_init(&keys);
+       bch_keylist_add(&keys, &n->key);
+
+       make_btree_freeing_key(replace, keys.top);
+       bch_keylist_push(&keys);
+
+       bch_btree_insert_node(b, op, &keys, NULL, NULL);
+       BUG_ON(!bch_keylist_empty(&keys));
+
+       btree_node_free(replace);
+       rw_unlock(true, n);
+
+       /* Invalidated our iterator */
+       return -EINTR;
+}
+
 static unsigned btree_gc_count_keys(struct btree *b)
 {
        struct bkey *k;
@@ -1387,26 +1500,23 @@ static unsigned btree_gc_count_keys(struct btree *b)
 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
                            struct closure *writes, struct gc_stat *gc)
 {
-       unsigned i;
        int ret = 0;
        bool should_rewrite;
-       struct btree *n;
        struct bkey *k;
-       struct keylist keys;
        struct btree_iter iter;
        struct gc_merge_info r[GC_MERGE_NODES];
-       struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
+       struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
 
-       bch_keylist_init(&keys);
        bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
 
-       for (i = 0; i < GC_MERGE_NODES; i++)
-               r[i].b = ERR_PTR(-EINTR);
+       for (i = r; i < r + ARRAY_SIZE(r); i++)
+               i->b = ERR_PTR(-EINTR);
 
        while (1) {
                k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
                if (k) {
-                       r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+                       r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
+                                                 true);
                        if (IS_ERR(r->b)) {
                                ret = PTR_ERR(r->b);
                                break;
@@ -1414,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
                        r->keys = btree_gc_count_keys(r->b);
 
-                       ret = btree_gc_coalesce(b, op, &keys, gc, r);
+                       ret = btree_gc_coalesce(b, op, gc, r);
                        if (ret)
                                break;
                }
@@ -1424,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
                if (!IS_ERR(last->b)) {
                        should_rewrite = btree_gc_mark_node(last->b, gc);
-                       if (should_rewrite &&
-                           !btree_check_reserve(b, NULL)) {
-                               n = btree_node_alloc_replacement(last->b,
-                                                                false);
-
-                               if (!IS_ERR_OR_NULL(n)) {
-                                       bch_btree_node_write_sync(n);
-                                       bch_keylist_add(&keys, &n->key);
-
-                                       make_btree_freeing_key(last->b,
-                                                              keys.top);
-                                       bch_keylist_push(&keys);
-
-                                       btree_node_free(last->b);
-
-                                       bch_btree_insert_node(b, op, &keys,
-                                                             NULL, NULL);
-                                       BUG_ON(!bch_keylist_empty(&keys));
-
-                                       rw_unlock(true, last->b);
-                                       last->b = n;
-
-                                       /* Invalidated our iterator */
-                                       ret = -EINTR;
+                       if (should_rewrite) {
+                               ret = btree_gc_rewrite_node(b, op, last->b);
+                               if (ret)
                                        break;
-                               }
                        }
 
                        if (last->b->level) {
@@ -1464,8 +1552,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
                         * Must flush leaf nodes before gc ends, since replace
                         * operations aren't journalled
                         */
+                       mutex_lock(&last->b->write_lock);
                        if (btree_node_dirty(last->b))
                                bch_btree_node_write(last->b, writes);
+                       mutex_unlock(&last->b->write_lock);
                        rw_unlock(true, last->b);
                }
 
@@ -1478,15 +1568,15 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
                }
        }
 
-       for (i = 0; i < GC_MERGE_NODES; i++)
-               if (!IS_ERR_OR_NULL(r[i].b)) {
-                       if (btree_node_dirty(r[i].b))
-                               bch_btree_node_write(r[i].b, writes);
-                       rw_unlock(true, r[i].b);
+       for (i = r; i < r + ARRAY_SIZE(r); i++)
+               if (!IS_ERR_OR_NULL(i->b)) {
+                       mutex_lock(&i->b->write_lock);
+                       if (btree_node_dirty(i->b))
+                               bch_btree_node_write(i->b, writes);
+                       mutex_unlock(&i->b->write_lock);
+                       rw_unlock(true, i->b);
                }
 
-       bch_keylist_free(&keys);
-
        return ret;
 }
 
@@ -1499,10 +1589,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 
        should_rewrite = btree_gc_mark_node(b, gc);
        if (should_rewrite) {
-               n = btree_node_alloc_replacement(b, false);
+               n = btree_node_alloc_replacement(b, NULL);
 
                if (!IS_ERR_OR_NULL(n)) {
                        bch_btree_node_write_sync(n);
+
                        bch_btree_set_root(n);
                        btree_node_free(b);
                        rw_unlock(true, n);
@@ -1511,6 +1602,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
                }
        }
 
+       __bch_btree_mark_key(b->c, b->level + 1, &b->key);
+
        if (b->level) {
                ret = btree_gc_recurse(b, op, writes, gc);
                if (ret)
@@ -1538,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c)
 
        for_each_cache(ca, c, i)
                for_each_bucket(b, ca) {
-                       b->gc_gen = b->gen;
+                       b->last_gc = b->gen;
                        if (!atomic_read(&b->pin)) {
-                               SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+                               SET_GC_MARK(b, 0);
                                SET_GC_SECTORS_USED(b, 0);
                        }
                }
@@ -1548,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c)
        mutex_unlock(&c->bucket_lock);
 }
 
-size_t bch_btree_gc_finish(struct cache_set *c)
+static size_t bch_btree_gc_finish(struct cache_set *c)
 {
        size_t available = 0;
        struct bucket *b;
@@ -1561,11 +1654,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
        c->gc_mark_valid = 1;
        c->need_gc      = 0;
 
-       if (c->root)
-               for (i = 0; i < KEY_PTRS(&c->root->key); i++)
-                       SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
-                                   GC_MARK_METADATA);
-
        for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
                SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
                            GC_MARK_METADATA);
@@ -1605,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c)
                        SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
 
                for_each_bucket(b, ca) {
-                       b->last_gc      = b->gc_gen;
                        c->need_gc      = max(c->need_gc, bucket_gc_gen(b));
 
-                       if (!atomic_read(&b->pin) &&
-                           GC_MARK(b) == GC_MARK_RECLAIMABLE) {
+                       if (atomic_read(&b->pin))
+                               continue;
+
+                       BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+
+                       if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
                                available++;
-                               if (!GC_SECTORS_USED(b))
-                                       bch_bucket_add_unused(ca, b);
-                       }
                }
        }
 
@@ -1705,36 +1793,16 @@ int bch_gc_thread_start(struct cache_set *c)
 
 /* Initial partial gc */
 
-static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
-                                  unsigned long **seen)
+static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
 {
        int ret = 0;
-       unsigned i;
        struct bkey *k, *p = NULL;
-       struct bucket *g;
        struct btree_iter iter;
 
-       for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
-               for (i = 0; i < KEY_PTRS(k); i++) {
-                       if (!ptr_available(b->c, k, i))
-                               continue;
-
-                       g = PTR_BUCKET(b->c, k, i);
+       for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
+               bch_initial_mark_key(b->c, b->level, k);
 
-                       if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
-                                               seen[PTR_DEV(k, i)]) ||
-                           !ptr_stale(b->c, k, i)) {
-                               g->gen = PTR_GEN(k, i);
-
-                               if (b->level)
-                                       g->prio = BTREE_PRIO;
-                               else if (g->prio == BTREE_PRIO)
-                                       g->prio = INITIAL_PRIO;
-                       }
-               }
-
-               btree_mark_key(b, k);
-       }
+       bch_initial_mark_key(b->c, b->level + 1, &b->key);
 
        if (b->level) {
                bch_btree_iter_init(&b->keys, &iter, NULL);
@@ -1746,40 +1814,58 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
                                btree_node_prefetch(b->c, k, b->level - 1);
 
                        if (p)
-                               ret = btree(check_recurse, p, b, op, seen);
+                               ret = btree(check_recurse, p, b, op);
 
                        p = k;
                } while (p && !ret);
        }
 
-       return 0;
+       return ret;
 }
 
 int bch_btree_check(struct cache_set *c)
 {
-       int ret = -ENOMEM;
-       unsigned i;
-       unsigned long *seen[MAX_CACHES_PER_SET];
        struct btree_op op;
 
-       memset(seen, 0, sizeof(seen));
        bch_btree_op_init(&op, SHRT_MAX);
 
-       for (i = 0; c->cache[i]; i++) {
-               size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
-               seen[i] = kmalloc(n, GFP_KERNEL);
-               if (!seen[i])
-                       goto err;
+       return btree_root(check_recurse, c, &op);
+}
+
+void bch_initial_gc_finish(struct cache_set *c)
+{
+       struct cache *ca;
+       struct bucket *b;
+       unsigned i;
+
+       bch_btree_gc_finish(c);
 
-               /* Disables the seen array until prio_read() uses it too */
-               memset(seen[i], 0xFF, n);
+       mutex_lock(&c->bucket_lock);
+
+       /*
+        * We need to put some unused buckets directly on the prio freelist in
+        * order to get the allocator thread started - it needs freed buckets in
+        * order to rewrite the prios and gens, and it needs to rewrite prios
+        * and gens in order to free buckets.
+        *
+        * This is only safe for buckets that have no live data in them, which
+        * there should always be some of.
+        */
+       for_each_cache(ca, c, i) {
+               for_each_bucket(b, ca) {
+                       if (fifo_full(&ca->free[RESERVE_PRIO]))
+                               break;
+
+                       if (bch_can_invalidate_bucket(ca, b) &&
+                           !GC_MARK(b)) {
+                               __bch_invalidate_one_bucket(ca, b);
+                               fifo_push(&ca->free[RESERVE_PRIO],
+                                         b - ca->buckets);
+                       }
+               }
        }
 
-       ret = btree_root(check_recurse, c, &op, seen);
-err:
-       for (i = 0; i < MAX_CACHES_PER_SET; i++)
-               kfree(seen[i]);
-       return ret;
+       mutex_unlock(&c->bucket_lock);
 }
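
[Annotation, not part of the patch] bch_initial_gc_finish() breaks a bootstrapping cycle: the allocator needs free buckets before it can rewrite prios/gens, but buckets normally become free only after prios/gens are written, so buckets that gc found empty are pushed straight onto the RESERVE_PRIO freelist. A toy model of that seeding step; the fields, sizes and reserve array are invented for the sketch, and the kernel uses ca->free[RESERVE_PRIO] with GC marks instead:

/*
 * Seed a small "prio" reserve with buckets that provably hold no live
 * data, so the (modelled) prio write can run at all.
 */
#include <stdio.h>

#define NBUCKETS     16
#define PRIO_RESERVE 4

struct bucket { unsigned char gen; unsigned sectors_used; };

int main(void)
{
	struct bucket buckets[NBUCKETS] = { 0 };
	long reserve[PRIO_RESERVE];
	unsigned nr = 0;

	buckets[3].sectors_used = 128;	/* pretend gc found live data here */

	for (long i = 0; i < NBUCKETS && nr < PRIO_RESERVE; i++) {
		if (buckets[i].sectors_used)
			continue;	/* never reuse a bucket with live data */

		buckets[i].gen++;	/* invalidate: stale out old pointers */
		reserve[nr++] = i;
	}

	printf("seeded %u buckets; prio write can now proceed\n", nr);
	return 0;
}
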
 
 /* Btree insertion */
@@ -1871,11 +1957,14 @@ static int btree_split(struct btree *b, struct btree_op *op,
        closure_init_stack(&cl);
        bch_keylist_init(&parent_keys);
 
-       if (!b->level &&
-           btree_check_reserve(b, op))
-               return -EINTR;
+       if (btree_check_reserve(b, op)) {
+               if (!b->level)
+                       return -EINTR;
+               else
+                       WARN(1, "insufficient reserve for split\n");
+       }
 
-       n1 = btree_node_alloc_replacement(b, true);
+       n1 = btree_node_alloc_replacement(b, op);
        if (IS_ERR(n1))
                goto err;
 
@@ -1887,16 +1976,19 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
                trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-               n2 = bch_btree_node_alloc(b->c, b->level, true);
+               n2 = bch_btree_node_alloc(b->c, op, b->level);
                if (IS_ERR(n2))
                        goto err_free1;
 
                if (!b->parent) {
-                       n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
+                       n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
                        if (IS_ERR(n3))
                                goto err_free2;
                }
 
+               mutex_lock(&n1->write_lock);
+               mutex_lock(&n2->write_lock);
+
                bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 
                /*
@@ -1923,45 +2015,45 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
                bch_keylist_add(&parent_keys, &n2->key);
                bch_btree_node_write(n2, &cl);
+               mutex_unlock(&n2->write_lock);
                rw_unlock(true, n2);
        } else {
                trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
 
+               mutex_lock(&n1->write_lock);
                bch_btree_insert_keys(n1, op, insert_keys, replace_key);
        }
 
        bch_keylist_add(&parent_keys, &n1->key);
        bch_btree_node_write(n1, &cl);
+       mutex_unlock(&n1->write_lock);
 
        if (n3) {
                /* Depth increases, make a new root */
+               mutex_lock(&n3->write_lock);
                bkey_copy_key(&n3->key, &MAX_KEY);
                bch_btree_insert_keys(n3, op, &parent_keys, NULL);
                bch_btree_node_write(n3, &cl);
+               mutex_unlock(&n3->write_lock);
 
                closure_sync(&cl);
                bch_btree_set_root(n3);
                rw_unlock(true, n3);
-
-               btree_node_free(b);
        } else if (!b->parent) {
                /* Root filled up but didn't need to be split */
                closure_sync(&cl);
                bch_btree_set_root(n1);
-
-               btree_node_free(b);
        } else {
                /* Split a non root node */
                closure_sync(&cl);
                make_btree_freeing_key(b, parent_keys.top);
                bch_keylist_push(&parent_keys);
 
-               btree_node_free(b);
-
                bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
                BUG_ON(!bch_keylist_empty(&parent_keys));
        }
 
+       btree_node_free(b);
        rw_unlock(true, n1);
 
        bch_time_stats_update(&b->c->btree_split_time, start_time);
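
btree_split() now takes each new node's write_lock only around filling and writing that node, and btree_node_free(b) on the node being replaced is deferred until every path has written its replacements. A minimal sketch of that ordering, with pthread mutexes standing in for the btree write_lock (struct node, insert_keys() and node_write() are invented stand-ins):

/* Sketch only: struct node, insert_keys(), node_write() are stand-ins. */
#include <pthread.h>
#include <stdio.h>

struct node {
	pthread_mutex_t	write_lock;
	int		nkeys;
};

static void insert_keys(struct node *n, int keys) { n->nkeys += keys; }
static void node_write(struct node *n) { printf("wrote node, %d keys\n", n->nkeys); }

int main(void)
{
	struct node old = { PTHREAD_MUTEX_INITIALIZER, 20 };
	struct node n1  = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct node n2  = { PTHREAD_MUTEX_INITIALIZER, 0 };

	/* Fill and write each new half only while holding its write_lock. */
	pthread_mutex_lock(&n1.write_lock);
	pthread_mutex_lock(&n2.write_lock);
	insert_keys(&n1, old.nkeys / 2);
	insert_keys(&n2, old.nkeys - n1.nkeys);
	node_write(&n2);
	pthread_mutex_unlock(&n2.write_lock);
	node_write(&n1);
	pthread_mutex_unlock(&n1.write_lock);

	/* Only now, with the replacements written, free the node being split. */
	printf("freeing old node (%d keys)\n", old.nkeys);
	return 0;
}
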
@@ -1976,7 +2068,7 @@ err_free1:
        btree_node_free(n1);
        rw_unlock(true, n1);
 err:
-       WARN(1, "bcache: btree split failed");
+       WARN(1, "bcache: btree split failed (level %u)", b->level);
 
        if (n3 == ERR_PTR(-EAGAIN) ||
            n2 == ERR_PTR(-EAGAIN) ||
@@ -1991,33 +2083,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
                                 atomic_t *journal_ref,
                                 struct bkey *replace_key)
 {
+       struct closure cl;
+
        BUG_ON(b->level && replace_key);
 
+       closure_init_stack(&cl);
+
+       mutex_lock(&b->write_lock);
+
+       if (write_block(b) != btree_bset_last(b) &&
+           b->keys.last_set_unwritten)
+               bch_btree_init_next(b); /* just wrote a set */
+
        if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
-               if (current->bio_list) {
-                       op->lock = b->c->root->level + 1;
-                       return -EAGAIN;
-               } else if (op->lock <= b->c->root->level) {
-                       op->lock = b->c->root->level + 1;
-                       return -EINTR;
-               } else {
-                       /* Invalidated all iterators */
-                       int ret = btree_split(b, op, insert_keys, replace_key);
+               mutex_unlock(&b->write_lock);
+               goto split;
+       }
 
-                       return bch_keylist_empty(insert_keys) ?
-                               0 : ret ?: -EINTR;
-               }
-       } else {
-               BUG_ON(write_block(b) != btree_bset_last(b));
+       BUG_ON(write_block(b) != btree_bset_last(b));
 
-               if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
-                       if (!b->level)
-                               bch_btree_leaf_dirty(b, journal_ref);
-                       else
-                               bch_btree_node_write_sync(b);
-               }
+       if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
+               if (!b->level)
+                       bch_btree_leaf_dirty(b, journal_ref);
+               else
+                       bch_btree_node_write(b, &cl);
+       }
 
-               return 0;
+       mutex_unlock(&b->write_lock);
+
+       /* wait for btree node write if necessary, after unlock */
+       closure_sync(&cl);
+
+       return 0;
+split:
+       if (current->bio_list) {
+               op->lock = b->c->root->level + 1;
+               return -EAGAIN;
+       } else if (op->lock <= b->c->root->level) {
+               op->lock = b->c->root->level + 1;
+               return -EINTR;
+       } else {
+               /* Invalidated all iterators */
+               int ret = btree_split(b, op, insert_keys, replace_key);
+
+               if (bch_keylist_empty(insert_keys))
+                       return 0;
+               else if (!ret)
+                       return -EINTR;
+               return ret;
        }
 }
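
The split: path above encodes the retry contract for a full node: punt with -EAGAIN when called while bios are being submitted, back off with -EINTR when a higher-level write lock is needed, otherwise split and report whether a retry is still required. A simplified sketch of that decision; handle_full_node() and its parameters are invented for illustration:

/* Simplified sketch; handle_full_node() and its parameters are invented. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int handle_full_node(bool in_bio_submission, int held_lock_level,
			    int root_level, bool keys_all_inserted,
			    int split_ret)
{
	if (in_bio_submission)
		return -EAGAIN;		/* punt to a context that may block */
	if (held_lock_level <= root_level)
		return -EINTR;		/* retry holding a lock higher up the tree */
	if (keys_all_inserted)
		return 0;		/* the split consumed every key */
	return split_ret ? split_ret : -EINTR;	/* retry the traversal */
}

int main(void)
{
	/* Called from a normal context, holding only a leaf lock: -EINTR. */
	printf("%d\n", handle_full_node(false, 1, 2, false, 0));
	return 0;
}
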
 
@@ -2403,18 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf)
        spin_lock_init(&buf->lock);
        array_allocator_init(&buf->freelist);
 }
-
-void bch_btree_exit(void)
-{
-       if (btree_io_wq)
-               destroy_workqueue(btree_io_wq);
-}
-
-int __init bch_btree_init(void)
-{
-       btree_io_wq = create_singlethread_workqueue("bch_btree_io");
-       if (!btree_io_wq)
-               return -ENOMEM;
-
-       return 0;
-}
index af065e97e55c4186782422db0bae20498fd3cdb8..91dfa5e696857ded36b3de7b66a8f862d9540cea 100644 (file)
@@ -127,6 +127,8 @@ struct btree {
        struct cache_set        *c;
        struct btree            *parent;
 
+       struct mutex            write_lock;
+
        unsigned long           flags;
        uint16_t                written;        /* would be nice to kill */
        uint8_t                 level;
@@ -236,11 +238,13 @@ static inline void rw_unlock(bool w, struct btree *b)
 }
 
 void bch_btree_node_read_done(struct btree *);
+void __bch_btree_node_write(struct btree *, struct closure *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
-struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
+struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
+                                struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
                               struct bkey *);
@@ -248,10 +252,10 @@ int bch_btree_insert(struct cache_set *, struct keylist *,
                     atomic_t *, struct bkey *);
 
 int bch_gc_thread_start(struct cache_set *);
-size_t bch_btree_gc_finish(struct cache_set *);
+void bch_initial_gc_finish(struct cache_set *);
 void bch_moving_gc(struct cache_set *);
 int bch_btree_check(struct cache_set *);
-uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
+void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
 
 static inline void wake_up_gc(struct cache_set *c)
 {
index 416d1a3e028e03a34a9d03e31b9e6fa583c9b357..3a0de4cf9771031e9d4707fc49f356aee48b50e7 100644 (file)
@@ -194,9 +194,9 @@ err:
        mutex_unlock(&b->c->bucket_lock);
        bch_extent_to_text(buf, sizeof(buf), k);
        btree_bug(b,
-"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
                  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
-                 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+                 g->prio, g->gen, g->last_gc, GC_MARK(g));
        return true;
 }
 
@@ -308,6 +308,16 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
        return NULL;
 }
 
+static void bch_subtract_dirty(struct bkey *k,
+                          struct cache_set *c,
+                          uint64_t offset,
+                          int sectors)
+{
+       if (KEY_DIRTY(k))
+               bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
+                                            offset, -sectors);
+}
+
 static bool bch_extent_insert_fixup(struct btree_keys *b,
                                    struct bkey *insert,
                                    struct btree_iter *iter,
@@ -315,13 +325,6 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
 {
        struct cache_set *c = container_of(b, struct btree, keys)->c;
 
-       void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
-       {
-               if (KEY_DIRTY(k))
-                       bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
-                                                    offset, -sectors);
-       }
-
        uint64_t old_offset;
        unsigned old_size, sectors_found = 0;
 
@@ -398,7 +401,8 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
 
                        struct bkey *top;
 
-                       subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
+                       bch_subtract_dirty(k, c, KEY_START(insert),
+                                      KEY_SIZE(insert));
 
                        if (bkey_written(b, k)) {
                                /*
@@ -448,7 +452,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
                        }
                }
 
-               subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
+               bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
        }
 
 check_failed:
@@ -499,9 +503,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
 
        if (mutex_trylock(&b->c->bucket_lock)) {
                if (b->c->gc_mark_valid &&
-                   ((GC_MARK(g) != GC_MARK_DIRTY &&
-                     KEY_DIRTY(k)) ||
-                    GC_MARK(g) == GC_MARK_METADATA))
+                   (!GC_MARK(g) ||
+                    GC_MARK(g) == GC_MARK_METADATA ||
+                    (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
                        goto err;
 
                if (g->prio == BTREE_PRIO)
@@ -515,9 +519,9 @@ err:
        mutex_unlock(&b->c->bucket_lock);
        bch_extent_to_text(buf, sizeof(buf), k);
        btree_bug(b,
-"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
                  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
-                 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+                 g->prio, g->gen, g->last_gc, GC_MARK(g));
        return true;
 }
 
index 18039affc306b539e187b9b57ba3f1c3c3b95e3d..59e82021b5bb320d9c03606e35e0d5347417f1ff 100644 (file)
@@ -237,8 +237,14 @@ bsearch:
                for (i = 0; i < ca->sb.njournal_buckets; i++)
                        if (ja->seq[i] > seq) {
                                seq = ja->seq[i];
-                               ja->cur_idx = ja->discard_idx =
-                                       ja->last_idx = i;
+                               /*
+                                * When journal_reclaim() goes to allocate for
+                                * the first time, it'll use the bucket after
+                                * ja->cur_idx
+                                */
+                               ja->cur_idx = i;
+                               ja->last_idx = ja->discard_idx = (i + 1) %
+                                       ca->sb.njournal_buckets;
 
                        }
        }
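
The reworked index setup above leaves ja->cur_idx at the newest journal bucket found during replay and starts last_idx/discard_idx one past it, so journal_reclaim() allocates the bucket after cur_idx first. A small sketch of that ring arithmetic with made-up values:

/* Sketch of the journal-bucket ring indexing, with made-up values. */
#include <stdio.h>

int main(void)
{
	unsigned njournal_buckets = 8;
	unsigned newest = 7;	/* bucket holding the highest journal seq found */

	unsigned cur_idx     = newest;
	unsigned last_idx    = (newest + 1) % njournal_buckets;
	unsigned discard_idx = last_idx;

	/* The first allocation after replay uses the bucket after cur_idx. */
	unsigned first_alloc = (cur_idx + 1) % njournal_buckets;

	printf("cur=%u last=%u discard=%u first_alloc=%u\n",
	       cur_idx, last_idx, discard_idx, first_alloc);
	return 0;
}
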
@@ -288,16 +294,11 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
                     k = bkey_next(k)) {
                        unsigned j;
 
-                       for (j = 0; j < KEY_PTRS(k); j++) {
-                               struct bucket *g = PTR_BUCKET(c, k, j);
-                               atomic_inc(&g->pin);
+                       for (j = 0; j < KEY_PTRS(k); j++)
+                               if (ptr_available(c, k, j))
+                                       atomic_inc(&PTR_BUCKET(c, k, j)->pin);
 
-                               if (g->prio == BTREE_PRIO &&
-                                   !ptr_stale(c, k, j))
-                                       g->prio = INITIAL_PRIO;
-                       }
-
-                       __bch_btree_mark_key(c, 0, k);
+                       bch_initial_mark_key(c, 0, k);
                }
        }
 }
@@ -312,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
        uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
        struct keylist keylist;
 
-       bch_keylist_init(&keylist);
-
        list_for_each_entry(i, list, list) {
                BUG_ON(i->pin && atomic_read(i->pin) != 1);
 
@@ -326,8 +325,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
                     k = bkey_next(k)) {
                        trace_bcache_journal_replay_key(k);
 
-                       bkey_copy(keylist.top, k);
-                       bch_keylist_push(&keylist);
+                       bch_keylist_init_single(&keylist, k);
 
                        ret = bch_btree_insert(s, &keylist, i->pin, NULL);
                        if (ret)
@@ -383,16 +381,15 @@ retry:
 
        b = best;
        if (b) {
-               rw_lock(true, b, b->level);
-
+               mutex_lock(&b->write_lock);
                if (!btree_current_write(b)->journal) {
-                       rw_unlock(true, b);
+                       mutex_unlock(&b->write_lock);
                        /* We raced */
                        goto retry;
                }
 
-               bch_btree_node_write(b, NULL);
-               rw_unlock(true, b);
+               __bch_btree_node_write(b, NULL);
+               mutex_unlock(&b->write_lock);
        }
 }
 
@@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j)
        atomic_set(&fifo_back(&j->pin), 1);
 
        j->cur->data->seq       = ++j->seq;
+       j->cur->dirty           = false;
        j->cur->need_write      = false;
        j->cur->data->keys      = 0;
 
@@ -731,7 +729,10 @@ static void journal_write_work(struct work_struct *work)
                                           struct cache_set,
                                           journal.work);
        spin_lock(&c->journal.lock);
-       journal_try_write(c);
+       if (c->journal.cur->dirty)
+               journal_try_write(c);
+       else
+               spin_unlock(&c->journal.lock);
 }
 
 /*
@@ -761,7 +762,8 @@ atomic_t *bch_journal(struct cache_set *c,
        if (parent) {
                closure_wait(&w->wait, parent);
                journal_try_write(c);
-       } else if (!w->need_write) {
+       } else if (!w->dirty) {
+               w->dirty = true;
                schedule_delayed_work(&c->journal.work,
                                      msecs_to_jiffies(c->journal_delay_ms));
                spin_unlock(&c->journal.lock);
index 9180c44650759b61b843ea8fe763a38b62277cd4..e3c39457afbb19a7a47b9c23eb9b127520223523 100644 (file)
@@ -95,6 +95,7 @@ struct journal_write {
 
        struct cache_set        *c;
        struct closure_waitlist wait;
+       bool                    dirty;
        bool                    need_write;
 };
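
Together with the journal.c hunks above, the new dirty flag means journal_write_work() only issues a write when keys were actually added since the delayed work was scheduled; otherwise it just drops the lock. A minimal userspace sketch of that pattern (struct journal_write_sketch and its helpers are stand-ins, not the kernel types):

/* Sketch of the dirty-flag / delayed-flush pattern (simplified, userspace). */
#include <stdbool.h>
#include <stdio.h>

struct journal_write_sketch {
	bool dirty;		/* keys were added since the last write */
	bool need_write;
};

/* bch_journal()-like path: mark dirty and arm a delayed flush only once. */
static bool journal_add_keys(struct journal_write_sketch *w)
{
	if (!w->dirty) {
		w->dirty = true;
		return true;	/* caller schedules the delayed work */
	}
	return false;		/* a flush is already pending */
}

/* Delayed work: only write out if something is still dirty. */
static void journal_write_work(struct journal_write_sketch *w)
{
	if (w->dirty) {
		printf("flushing journal entry\n");
		w->dirty = false;
	}
}

int main(void)
{
	struct journal_write_sketch w = { false, false };

	printf("schedule=%d\n", journal_add_keys(&w));	/* 1: arm the timer */
	printf("schedule=%d\n", journal_add_keys(&w));	/* 0: already pending */
	journal_write_work(&w);				/* flushes once */
	journal_write_work(&w);				/* nothing to do */
	return 0;
}
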
 
index 9eb60d102de84532e3a662390b1ba2934b744673..cd7490311e518b9139db7d95818111685d52f0fa 100644 (file)
@@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
                                           moving_gc_keys);
        unsigned i;
 
-       for (i = 0; i < KEY_PTRS(k); i++) {
-               struct bucket *g = PTR_BUCKET(c, k, i);
-
-               if (GC_MOVE(g))
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (ptr_available(c, k, i) &&
+                   GC_MOVE(PTR_BUCKET(c, k, i)))
                        return true;
-       }
 
        return false;
 }
@@ -115,7 +113,7 @@ static void write_moving(struct closure *cl)
                closure_call(&op->cl, bch_data_insert, NULL, cl);
        }
 
-       continue_at(cl, write_moving_finish, system_wq);
+       continue_at(cl, write_moving_finish, op->wq);
 }
 
 static void read_moving_submit(struct closure *cl)
@@ -125,7 +123,7 @@ static void read_moving_submit(struct closure *cl)
 
        bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
 
-       continue_at(cl, write_moving, system_wq);
+       continue_at(cl, write_moving, io->op.wq);
 }
 
 static void read_moving(struct cache_set *c)
@@ -160,6 +158,7 @@ static void read_moving(struct cache_set *c)
                io->w           = w;
                io->op.inode    = KEY_INODE(&w->key);
                io->op.c        = c;
+               io->op.wq       = c->moving_gc_wq;
 
                moving_init(io);
                bio = &io->bio.bio;
@@ -216,7 +215,10 @@ void bch_moving_gc(struct cache_set *c)
                ca->heap.used = 0;
 
                for_each_bucket(b, ca) {
-                       if (!GC_SECTORS_USED(b))
+                       if (GC_MARK(b) == GC_MARK_METADATA ||
+                           !GC_SECTORS_USED(b) ||
+                           GC_SECTORS_USED(b) == ca->sb.bucket_size ||
+                           atomic_read(&b->pin))
                                continue;
 
                        if (!heap_full(&ca->heap)) {
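
The extra conditions above restrict moving GC to buckets that are actually worth copying: not metadata, not empty, not completely full, and not pinned. A small sketch of the same filter using invented stand-in fields instead of the real GC_MARK()/GC_SECTORS_USED() macros:

/* Sketch only: struct gc_bucket and its fields are stand-ins. */
#include <stdbool.h>
#include <stdio.h>

struct gc_bucket {
	bool     metadata;
	unsigned sectors_used;
	int      pin;
};

static bool moving_gc_candidate(const struct gc_bucket *b, unsigned bucket_size)
{
	if (b->metadata)			/* never move btree/metadata buckets */
		return false;
	if (!b->sectors_used)			/* empty: nothing to move */
		return false;
	if (b->sectors_used == bucket_size)	/* full: moving it frees nothing */
		return false;
	if (b->pin)				/* in use right now */
		return false;
	return true;
}

int main(void)
{
	struct gc_bucket b = { .metadata = false, .sectors_used = 300, .pin = 0 };

	printf("candidate=%d\n", moving_gc_candidate(&b, 1024));
	return 0;
}
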
index 5d5d031cf3813247adb89653f2fe5fcd227ce764..15fff4f68a7ce75f441a1e429d961eac2d2b0d6e 100644 (file)
 #include "request.h"
 #include "writeback.h"
 
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/hash.h>
 #include <linux/random.h>
-#include "blk-cgroup.h"
 
 #include <trace/events/bcache.h>
 
@@ -27,171 +25,13 @@ struct kmem_cache *bch_search_cache;
 
 static void bch_data_insert_start(struct closure *);
 
-/* Cgroup interface */
-
-#ifdef CONFIG_CGROUP_BCACHE
-static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
-
-static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
-{
-       struct cgroup_subsys_state *css;
-       return cgroup &&
-               (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
-               ? container_of(css, struct bch_cgroup, css)
-               : &bcache_default_cgroup;
-}
-
-struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
-{
-       struct cgroup_subsys_state *css = bio->bi_css
-               ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
-               : task_subsys_state(current, bcache_subsys_id);
-
-       return css
-               ? container_of(css, struct bch_cgroup, css)
-               : &bcache_default_cgroup;
-}
-
-static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
-                       struct file *file,
-                       char __user *buf, size_t nbytes, loff_t *ppos)
-{
-       char tmp[1024];
-       int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
-                                         cgroup_to_bcache(cgrp)->cache_mode + 1);
-
-       if (len < 0)
-               return len;
-
-       return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
-}
-
-static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
-                           const char *buf)
-{
-       int v = bch_read_string_list(buf, bch_cache_modes);
-       if (v < 0)
-               return v;
-
-       cgroup_to_bcache(cgrp)->cache_mode = v - 1;
-       return 0;
-}
-
-static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
-{
-       return cgroup_to_bcache(cgrp)->verify;
-}
-
-static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
-{
-       cgroup_to_bcache(cgrp)->verify = val;
-       return 0;
-}
-
-static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
-{
-       struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
-       return atomic_read(&bcachecg->stats.cache_hits);
-}
-
-static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
-{
-       struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
-       return atomic_read(&bcachecg->stats.cache_misses);
-}
-
-static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
-                                        struct cftype *cft)
-{
-       struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
-       return atomic_read(&bcachecg->stats.cache_bypass_hits);
-}
-
-static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
-                                          struct cftype *cft)
-{
-       struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
-       return atomic_read(&bcachecg->stats.cache_bypass_misses);
-}
-
-static struct cftype bch_files[] = {
-       {
-               .name           = "cache_mode",
-               .read           = cache_mode_read,
-               .write_string   = cache_mode_write,
-       },
-       {
-               .name           = "verify",
-               .read_u64       = bch_verify_read,
-               .write_u64      = bch_verify_write,
-       },
-       {
-               .name           = "cache_hits",
-               .read_u64       = bch_cache_hits_read,
-       },
-       {
-               .name           = "cache_misses",
-               .read_u64       = bch_cache_misses_read,
-       },
-       {
-               .name           = "cache_bypass_hits",
-               .read_u64       = bch_cache_bypass_hits_read,
-       },
-       {
-               .name           = "cache_bypass_misses",
-               .read_u64       = bch_cache_bypass_misses_read,
-       },
-       { }     /* terminate */
-};
-
-static void init_bch_cgroup(struct bch_cgroup *cg)
-{
-       cg->cache_mode = -1;
-}
-
-static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
-{
-       struct bch_cgroup *cg;
-
-       cg = kzalloc(sizeof(*cg), GFP_KERNEL);
-       if (!cg)
-               return ERR_PTR(-ENOMEM);
-       init_bch_cgroup(cg);
-       return &cg->css;
-}
-
-static void bcachecg_destroy(struct cgroup *cgroup)
-{
-       struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
-       kfree(cg);
-}
-
-struct cgroup_subsys bcache_subsys = {
-       .create         = bcachecg_create,
-       .destroy        = bcachecg_destroy,
-       .subsys_id      = bcache_subsys_id,
-       .name           = "bcache",
-       .module         = THIS_MODULE,
-};
-EXPORT_SYMBOL_GPL(bcache_subsys);
-#endif
-
 static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
 {
-#ifdef CONFIG_CGROUP_BCACHE
-       int r = bch_bio_to_cgroup(bio)->cache_mode;
-       if (r >= 0)
-               return r;
-#endif
        return BDEV_CACHE_MODE(&dc->sb);
 }
 
 static bool verify(struct cached_dev *dc, struct bio *bio)
 {
-#ifdef CONFIG_CGROUP_BCACHE
-       if (bch_bio_to_cgroup(bio)->verify)
-               return true;
-#endif
        return dc->verify;
 }
 
@@ -248,7 +88,7 @@ static void bch_data_insert_keys(struct closure *cl)
                atomic_dec_bug(journal_ref);
 
        if (!op->insert_data_done)
-               continue_at(cl, bch_data_insert_start, bcache_wq);
+               continue_at(cl, bch_data_insert_start, op->wq);
 
        bch_keylist_free(&op->insert_keys);
        closure_return(cl);
@@ -297,7 +137,7 @@ static void bch_data_invalidate(struct closure *cl)
        op->insert_data_done = true;
        bio_put(bio);
 out:
-       continue_at(cl, bch_data_insert_keys, bcache_wq);
+       continue_at(cl, bch_data_insert_keys, op->wq);
 }
 
 static void bch_data_insert_error(struct closure *cl)
@@ -340,7 +180,7 @@ static void bch_data_insert_endio(struct bio *bio, int error)
                if (op->writeback)
                        op->error = error;
                else if (!op->replace)
-                       set_closure_fn(cl, bch_data_insert_error, bcache_wq);
+                       set_closure_fn(cl, bch_data_insert_error, op->wq);
                else
                        set_closure_fn(cl, NULL, NULL);
        }
@@ -376,7 +216,7 @@ static void bch_data_insert_start(struct closure *cl)
                if (bch_keylist_realloc(&op->insert_keys,
                                        3 + (op->csum ? 1 : 0),
                                        op->c))
-                       continue_at(cl, bch_data_insert_keys, bcache_wq);
+                       continue_at(cl, bch_data_insert_keys, op->wq);
 
                k = op->insert_keys.top;
                bkey_init(k);
@@ -413,7 +253,7 @@ static void bch_data_insert_start(struct closure *cl)
        } while (n != bio);
 
        op->insert_data_done = true;
-       continue_at(cl, bch_data_insert_keys, bcache_wq);
+       continue_at(cl, bch_data_insert_keys, op->wq);
 err:
        /* bch_alloc_sectors() blocks if s->writeback = true */
        BUG_ON(op->writeback);
@@ -442,7 +282,7 @@ err:
                bio_put(bio);
 
                if (!bch_keylist_empty(&op->insert_keys))
-                       continue_at(cl, bch_data_insert_keys, bcache_wq);
+                       continue_at(cl, bch_data_insert_keys, op->wq);
                else
                        closure_return(cl);
        }
@@ -824,6 +664,7 @@ static inline struct search *search_alloc(struct bio *bio,
        s->iop.error            = 0;
        s->iop.flags            = 0;
        s->iop.flush_journal    = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+       s->iop.wq               = bcache_wq;
 
        return s;
 }
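
With the cgroup code gone, data_insert_op now carries the workqueue its continuations should run on: regular requests keep using bcache_wq, while moving GC points op->wq at the dedicated c->moving_gc_wq. A rough userspace analogy of that per-op queue selection (struct workqueue and continue_on() here are stand-ins, not the kernel API):

/* Rough analogy only: struct workqueue and continue_on() are stand-ins. */
#include <stdio.h>

struct workqueue { const char *name; };

struct data_insert_op_sketch {
	struct workqueue *wq;	/* where the op's next steps get queued */
};

static void continue_on(struct data_insert_op_sketch *op, const char *step)
{
	printf("queue %s on %s\n", step, op->wq->name);
}

int main(void)
{
	struct workqueue generic   = { "bcache" };
	struct workqueue moving_gc = { "bcache_gc" };

	struct data_insert_op_sketch user_io = { &generic };	/* search_alloc() default */
	struct data_insert_op_sketch gc_io   = { &moving_gc };	/* set by read_moving() */

	continue_on(&user_io, "bch_data_insert_keys");
	continue_on(&gc_io,   "bch_data_insert_keys");
	return 0;
}
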
@@ -1203,22 +1044,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
 static int flash_dev_cache_miss(struct btree *b, struct search *s,
                                struct bio *bio, unsigned sectors)
 {
-       struct bio_vec bv;
-       struct bvec_iter iter;
-
-       /* Zero fill bio */
+       unsigned bytes = min(sectors, bio_sectors(bio)) << 9;
 
-       bio_for_each_segment(bv, bio, iter) {
-               unsigned j = min(bv.bv_len >> 9, sectors);
-
-               void *p = kmap(bv.bv_page);
-               memset(p + bv.bv_offset, 0, j << 9);
-               kunmap(bv.bv_page);
-
-               sectors -= j;
-       }
+       swap(bio->bi_iter.bi_size, bytes);
+       zero_fill_bio(bio);
+       swap(bio->bi_iter.bi_size, bytes);
 
-       bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));
+       bio_advance(bio, bytes);
 
        if (!bio->bi_iter.bi_size)
                return MAP_DONE;
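
The rewritten flash_dev_cache_miss() zero-fills only the missed range by temporarily swapping the bio's remaining size down to `bytes`, calling zero_fill_bio(), swapping it back, and then advancing past the zeroed region. A userspace sketch of that shrink/zero/restore/advance idiom on a plain buffer (struct fake_bio and swap_sz() are stand-ins):

/* Userspace sketch of the "shrink, zero, restore, advance" idiom. */
#include <stdio.h>
#include <string.h>

struct fake_bio {
	char   *data;
	size_t	size;		/* bytes remaining, like bi_iter.bi_size */
};

static void swap_sz(size_t *a, size_t *b) { size_t t = *a; *a = *b; *b = t; }
static void zero_fill(struct fake_bio *bio) { memset(bio->data, 0, bio->size); }

int main(void)
{
	char buf[16];
	struct fake_bio bio = { buf, sizeof(buf) };
	size_t bytes = 8;			/* portion covered by the miss */

	memset(buf, 'x', sizeof(buf));

	/* Temporarily clamp the bio to the miss region, zero it, restore. */
	swap_sz(&bio.size, &bytes);
	zero_fill(&bio);
	swap_sz(&bio.size, &bytes);

	/* Advance past the region we just zeroed. */
	bio.data += bytes;
	bio.size -= bytes;

	printf("zeroed %zu bytes, %zu bytes left\n", bytes, bio.size);
	return 0;
}
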
@@ -1313,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d)
 
 void bch_request_exit(void)
 {
-#ifdef CONFIG_CGROUP_BCACHE
-       cgroup_unload_subsys(&bcache_subsys);
-#endif
        if (bch_search_cache)
                kmem_cache_destroy(bch_search_cache);
 }
@@ -1326,11 +1155,5 @@ int __init bch_request_init(void)
        if (!bch_search_cache)
                return -ENOMEM;
 
-#ifdef CONFIG_CGROUP_BCACHE
-       cgroup_load_subsys(&bcache_subsys);
-       init_bch_cgroup(&bcache_default_cgroup);
-
-       cgroup_add_cftypes(&bcache_subsys, bch_files);
-#endif
        return 0;
 }
index 39f21dbedc38b43ec7bf4818cfca319d27d27433..1ff36875c2b30bcb29e650290b194c52cc57bc09 100644 (file)
@@ -1,12 +1,11 @@
 #ifndef _BCACHE_REQUEST_H_
 #define _BCACHE_REQUEST_H_
 
-#include <linux/cgroup.h>
-
 struct data_insert_op {
        struct closure          cl;
        struct cache_set        *c;
        struct bio              *bio;
+       struct workqueue_struct *wq;
 
        unsigned                inode;
        uint16_t                write_point;
@@ -41,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d);
 
 extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
 
-struct bch_cgroup {
-#ifdef CONFIG_CGROUP_BCACHE
-       struct cgroup_subsys_state      css;
-#endif
-       /*
-        * We subtract one from the index into bch_cache_modes[], so that
-        * default == -1; this makes it so the rest match up with d->cache_mode,
-        * and we use d->cache_mode if cgrp->cache_mode < 0
-        */
-       short                           cache_mode;
-       bool                            verify;
-       struct cache_stat_collector     stats;
-};
-
-struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
-
 #endif /* _BCACHE_REQUEST_H_ */
index 84d0782f702eac3e09d790077a4e353fa6a265e7..0ca072c20d0d35dacce85e29d1ad8e8762854344 100644 (file)
@@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
        struct cached_dev *dc = container_of(d, struct cached_dev, disk);
        mark_cache_stats(&dc->accounting.collector, hit, bypass);
        mark_cache_stats(&c->accounting.collector, hit, bypass);
-#ifdef CONFIG_CGROUP_BCACHE
-       mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
-#endif
 }
 
 void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
index 24a3a1546caa7a4c8e9c0aa70c9461574f87a591..926ded8ccbf58c39788a471dffb4ed7f876028bc 100644 (file)
@@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
        closure_sync(cl);
 }
 
-#define buckets_free(c)        "free %zu, free_inc %zu, unused %zu",           \
-       fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
-
 void bch_prio_write(struct cache *ca)
 {
        int i;
@@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca)
 
        lockdep_assert_held(&ca->set->bucket_lock);
 
-       for (b = ca->buckets;
-            b < ca->buckets + ca->sb.nbuckets; b++)
-               b->disk_gen = b->gen;
-
        ca->disk_buckets->seq++;
 
        atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
@@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca)
 
        mutex_lock(&ca->set->bucket_lock);
 
-       ca->need_save_prio = 0;
-
        /*
         * Don't want the old priorities to get garbage collected until after we
         * finish writing the new ones, and they're journalled
         */
-       for (i = 0; i < prio_buckets(ca); i++)
+       for (i = 0; i < prio_buckets(ca); i++) {
+               if (ca->prio_last_buckets[i])
+                       __bch_bucket_free(ca,
+                               &ca->buckets[ca->prio_last_buckets[i]]);
+
                ca->prio_last_buckets[i] = ca->prio_buckets[i];
+       }
 }
 
 static void prio_read(struct cache *ca, uint64_t bucket)
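
bch_prio_write() now frees the previous generation of prio buckets only after the new ones have been written and journalled, then records the fresh buckets as the new fallback set. A small sketch of that handoff with made-up bucket numbers:

/* Sketch with made-up bucket numbers; 0 means "no previous bucket". */
#include <stdio.h>

#define PRIO_BUCKETS 2

int main(void)
{
	unsigned prio_buckets[PRIO_BUCKETS]      = { 10, 11 };	/* just written */
	unsigned prio_last_buckets[PRIO_BUCKETS] = { 4, 5 };	/* previous set */
	unsigned i;

	for (i = 0; i < PRIO_BUCKETS; i++) {
		if (prio_last_buckets[i])
			printf("free old prio bucket %u\n", prio_last_buckets[i]);
		prio_last_buckets[i] = prio_buckets[i];	/* new fallback */
	}
	return 0;
}
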
@@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
                }
 
                b->prio = le16_to_cpu(d->prio);
-               b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
+               b->gen = b->last_gc = d->gen;
        }
 }
 
@@ -843,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
        q->limits.max_segment_size      = UINT_MAX;
        q->limits.max_segments          = BIO_MAX_PAGES;
        q->limits.max_discard_sectors   = UINT_MAX;
+       q->limits.discard_granularity   = 512;
        q->limits.io_min                = block_size;
        q->limits.logical_block_size    = block_size;
        q->limits.physical_block_size   = block_size;
@@ -1355,6 +1352,8 @@ static void cache_set_free(struct closure *cl)
        bch_bset_sort_state_free(&c->sort);
        free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
 
+       if (c->moving_gc_wq)
+               destroy_workqueue(c->moving_gc_wq);
        if (c->bio_split)
                bioset_free(c->bio_split);
        if (c->fill_iter)
@@ -1395,14 +1394,21 @@ static void cache_set_flush(struct closure *cl)
                list_add(&c->root->list, &c->btree_cache);
 
        /* Should skip this if we're unregistering because of an error */
-       list_for_each_entry(b, &c->btree_cache, list)
+       list_for_each_entry(b, &c->btree_cache, list) {
+               mutex_lock(&b->write_lock);
                if (btree_node_dirty(b))
-                       bch_btree_node_write(b, NULL);
+                       __bch_btree_node_write(b, NULL);
+               mutex_unlock(&b->write_lock);
+       }
 
        for_each_cache(ca, c, i)
                if (ca->alloc_thread)
                        kthread_stop(ca->alloc_thread);
 
+       cancel_delayed_work_sync(&c->journal.work);
+       /* flush last journal entry if needed */
+       c->journal.work.work.func(&c->journal.work.work);
+
        closure_return(cl);
 }
 
@@ -1485,14 +1491,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
        sema_init(&c->sb_write_mutex, 1);
        mutex_init(&c->bucket_lock);
-       init_waitqueue_head(&c->try_wait);
+       init_waitqueue_head(&c->btree_cache_wait);
        init_waitqueue_head(&c->bucket_wait);
        sema_init(&c->uuid_write_mutex, 1);
 
        spin_lock_init(&c->btree_gc_time.lock);
        spin_lock_init(&c->btree_split_time.lock);
        spin_lock_init(&c->btree_read_time.lock);
-       spin_lock_init(&c->try_harder_time.lock);
 
        bch_moving_init_cache_set(c);
 
@@ -1517,6 +1522,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
            !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
            !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
            !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
+           !(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
            bch_journal_alloc(c) ||
            bch_btree_cache_alloc(c) ||
            bch_open_buckets_alloc(c) ||
@@ -1580,7 +1586,7 @@ static void run_cache_set(struct cache_set *c)
                        goto err;
 
                err = "error reading btree root";
-               c->root = bch_btree_node_get(c, k, j->btree_level, true);
+               c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
                if (IS_ERR_OR_NULL(c->root))
                        goto err;
 
@@ -1596,7 +1602,7 @@ static void run_cache_set(struct cache_set *c)
                        goto err;
 
                bch_journal_mark(c, &journal);
-               bch_btree_gc_finish(c);
+               bch_initial_gc_finish(c);
                pr_debug("btree_check() done");
 
                /*
@@ -1638,7 +1644,7 @@ static void run_cache_set(struct cache_set *c)
                                ca->sb.d[j] = ca->sb.first_bucket + j;
                }
 
-               bch_btree_gc_finish(c);
+               bch_initial_gc_finish(c);
 
                err = "error starting allocator thread";
                for_each_cache(ca, c, i)
@@ -1655,12 +1661,14 @@ static void run_cache_set(struct cache_set *c)
                        goto err;
 
                err = "cannot allocate new btree root";
-               c->root = bch_btree_node_alloc(c, 0, true);
+               c->root = bch_btree_node_alloc(c, NULL, 0);
                if (IS_ERR_OR_NULL(c->root))
                        goto err;
 
+               mutex_lock(&c->root->write_lock);
                bkey_copy_key(&c->root->key, &MAX_KEY);
                bch_btree_node_write(c->root, &cl);
+               mutex_unlock(&c->root->write_lock);
 
                bch_btree_set_root(c->root);
                rw_unlock(true, c->root);
@@ -1782,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj)
        vfree(ca->buckets);
 
        free_heap(&ca->heap);
-       free_fifo(&ca->unused);
        free_fifo(&ca->free_inc);
 
        for (i = 0; i < RESERVE_NR; i++)
@@ -1819,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
            !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
            !init_fifo(&ca->free_inc,   free << 2, GFP_KERNEL) ||
-           !init_fifo(&ca->unused,     free << 2, GFP_KERNEL) ||
            !init_heap(&ca->heap,       free << 3, GFP_KERNEL) ||
            !(ca->buckets       = vzalloc(sizeof(struct bucket) *
                                          ca->sb.nbuckets)) ||
@@ -1834,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
        for_each_bucket(b, ca)
                atomic_set(&b->pin, 0);
 
-       if (bch_cache_allocator_init(ca))
-               goto err;
-
        return 0;
-err:
-       kobject_put(&ca->kobj);
-       return -ENOMEM;
 }
 
 static void register_cache(struct cache_sb *sb, struct page *sb_page,
@@ -1869,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
        if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
                goto err;
 
+       mutex_lock(&bch_register_lock);
        err = register_cache_set(ca);
+       mutex_unlock(&bch_register_lock);
+
        if (err)
                goto err;
 
@@ -1931,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
        if (!try_module_get(THIS_MODULE))
                return -EBUSY;
 
-       mutex_lock(&bch_register_lock);
-
        if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
            !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
                goto err;
@@ -1965,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
                if (!dc)
                        goto err_close;
 
+               mutex_lock(&bch_register_lock);
                register_bdev(sb, sb_page, bdev, dc);
+               mutex_unlock(&bch_register_lock);
        } else {
                struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
                if (!ca)
@@ -1978,7 +1981,6 @@ out:
                put_page(sb_page);
        kfree(sb);
        kfree(path);
-       mutex_unlock(&bch_register_lock);
        module_put(THIS_MODULE);
        return ret;
 
@@ -2057,7 +2059,6 @@ static void bcache_exit(void)
 {
        bch_debug_exit();
        bch_request_exit();
-       bch_btree_exit();
        if (bcache_kobj)
                kobject_put(bcache_kobj);
        if (bcache_wq)
@@ -2087,7 +2088,6 @@ static int __init bcache_init(void)
        if (!(bcache_wq = create_workqueue("bcache")) ||
            !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
            sysfs_create_files(bcache_kobj, files) ||
-           bch_btree_init() ||
            bch_request_init() ||
            bch_debug_init(bcache_kobj))
                goto err;
index d8458d477a1282110a8c685859ecc66dcf947057..b3ff57d61ddea7d053c7b01d5cb1819d2a5a326e 100644 (file)
@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc,  sec, ms);
 sysfs_time_stats_attribute(btree_split, sec, us);
 sysfs_time_stats_attribute(btree_sort, ms,  us);
 sysfs_time_stats_attribute(btree_read, ms,  us);
-sysfs_time_stats_attribute(try_harder, ms,  us);
 
 read_attribute(btree_nodes);
 read_attribute(btree_used_percent);
@@ -406,7 +405,7 @@ struct bset_stats_op {
        struct bset_stats stats;
 };
 
-static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
+static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b)
 {
        struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
 
@@ -424,7 +423,7 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
        memset(&op, 0, sizeof(op));
        bch_btree_op_init(&op.op, -1);
 
-       ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
+       ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats);
        if (ret < 0)
                return ret;
 
@@ -442,81 +441,81 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
                        op.stats.floats, op.stats.failed);
 }
 
-SHOW(__bch_cache_set)
+static unsigned bch_root_usage(struct cache_set *c)
 {
-       unsigned root_usage(struct cache_set *c)
-       {
-               unsigned bytes = 0;
-               struct bkey *k;
-               struct btree *b;
-               struct btree_iter iter;
+       unsigned bytes = 0;
+       struct bkey *k;
+       struct btree *b;
+       struct btree_iter iter;
 
-               goto lock_root;
+       goto lock_root;
 
-               do {
-                       rw_unlock(false, b);
+       do {
+               rw_unlock(false, b);
 lock_root:
-                       b = c->root;
-                       rw_lock(false, b, b->level);
-               } while (b != c->root);
+               b = c->root;
+               rw_lock(false, b, b->level);
+       } while (b != c->root);
 
-               for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
-                       bytes += bkey_bytes(k);
+       for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+               bytes += bkey_bytes(k);
 
-               rw_unlock(false, b);
+       rw_unlock(false, b);
 
-               return (bytes * 100) / btree_bytes(c);
-       }
+       return (bytes * 100) / btree_bytes(c);
+}
 
-       size_t cache_size(struct cache_set *c)
-       {
-               size_t ret = 0;
-               struct btree *b;
+static size_t bch_cache_size(struct cache_set *c)
+{
+       size_t ret = 0;
+       struct btree *b;
 
-               mutex_lock(&c->bucket_lock);
-               list_for_each_entry(b, &c->btree_cache, list)
-                       ret += 1 << (b->keys.page_order + PAGE_SHIFT);
+       mutex_lock(&c->bucket_lock);
+       list_for_each_entry(b, &c->btree_cache, list)
+               ret += 1 << (b->keys.page_order + PAGE_SHIFT);
 
-               mutex_unlock(&c->bucket_lock);
-               return ret;
-       }
-
-       unsigned cache_max_chain(struct cache_set *c)
-       {
-               unsigned ret = 0;
-               struct hlist_head *h;
+       mutex_unlock(&c->bucket_lock);
+       return ret;
+}
 
-               mutex_lock(&c->bucket_lock);
+static unsigned bch_cache_max_chain(struct cache_set *c)
+{
+       unsigned ret = 0;
+       struct hlist_head *h;
 
-               for (h = c->bucket_hash;
-                    h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
-                    h++) {
-                       unsigned i = 0;
-                       struct hlist_node *p;
+       mutex_lock(&c->bucket_lock);
 
-                       hlist_for_each(p, h)
-                               i++;
+       for (h = c->bucket_hash;
+            h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+            h++) {
+               unsigned i = 0;
+               struct hlist_node *p;
 
-                       ret = max(ret, i);
-               }
+               hlist_for_each(p, h)
+                       i++;
 
-               mutex_unlock(&c->bucket_lock);
-               return ret;
+               ret = max(ret, i);
        }
 
-       unsigned btree_used(struct cache_set *c)
-       {
-               return div64_u64(c->gc_stats.key_bytes * 100,
-                                (c->gc_stats.nodes ?: 1) * btree_bytes(c));
-       }
+       mutex_unlock(&c->bucket_lock);
+       return ret;
+}
 
-       unsigned average_key_size(struct cache_set *c)
-       {
-               return c->gc_stats.nkeys
-                       ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
-                       : 0;
-       }
+static unsigned bch_btree_used(struct cache_set *c)
+{
+       return div64_u64(c->gc_stats.key_bytes * 100,
+                        (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+}
 
+static unsigned bch_average_key_size(struct cache_set *c)
+{
+       return c->gc_stats.nkeys
+               ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+               : 0;
+}
+
+SHOW(__bch_cache_set)
+{
        struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 
        sysfs_print(synchronous,                CACHE_SYNC(&c->sb));
@@ -524,21 +523,20 @@ lock_root:
        sysfs_hprint(bucket_size,               bucket_bytes(c));
        sysfs_hprint(block_size,                block_bytes(c));
        sysfs_print(tree_depth,                 c->root->level);
-       sysfs_print(root_usage_percent,         root_usage(c));
+       sysfs_print(root_usage_percent,         bch_root_usage(c));
 
-       sysfs_hprint(btree_cache_size,          cache_size(c));
-       sysfs_print(btree_cache_max_chain,      cache_max_chain(c));
+       sysfs_hprint(btree_cache_size,          bch_cache_size(c));
+       sysfs_print(btree_cache_max_chain,      bch_cache_max_chain(c));
        sysfs_print(cache_available_percent,    100 - c->gc_stats.in_use);
 
        sysfs_print_time_stats(&c->btree_gc_time,       btree_gc, sec, ms);
        sysfs_print_time_stats(&c->btree_split_time,    btree_split, sec, us);
        sysfs_print_time_stats(&c->sort.time,           btree_sort, ms, us);
        sysfs_print_time_stats(&c->btree_read_time,     btree_read, ms, us);
-       sysfs_print_time_stats(&c->try_harder_time,     try_harder, ms, us);
 
-       sysfs_print(btree_used_percent, btree_used(c));
+       sysfs_print(btree_used_percent, bch_btree_used(c));
        sysfs_print(btree_nodes,        c->gc_stats.nodes);
-       sysfs_hprint(average_key_size,  average_key_size(c));
+       sysfs_hprint(average_key_size,  bch_average_key_size(c));
 
        sysfs_print(cache_read_races,
                    atomic_long_read(&c->cache_read_races));
@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = {
        sysfs_time_stats_attribute_list(btree_split, sec, us)
        sysfs_time_stats_attribute_list(btree_sort, ms, us)
        sysfs_time_stats_attribute_list(btree_read, ms, us)
-       sysfs_time_stats_attribute_list(try_harder, ms, us)
 
        &sysfs_btree_nodes,
        &sysfs_btree_used_percent,
@@ -761,7 +758,9 @@ SHOW(__bch_cache)
                int cmp(const void *l, const void *r)
                {       return *((uint16_t *) r) - *((uint16_t *) l); }
 
-               size_t n = ca->sb.nbuckets, i, unused, btree;
+               struct bucket *b;
+               size_t n = ca->sb.nbuckets, i;
+               size_t unused = 0, available = 0, dirty = 0, meta = 0;
                uint64_t sum = 0;
                /* Compute 31 quantiles */
                uint16_t q[31], *p, *cached;
@@ -772,6 +771,17 @@ SHOW(__bch_cache)
                        return -ENOMEM;
 
                mutex_lock(&ca->set->bucket_lock);
+               for_each_bucket(b, ca) {
+                       if (!GC_SECTORS_USED(b))
+                               unused++;
+                       if (GC_MARK(b) == GC_MARK_RECLAIMABLE)
+                               available++;
+                       if (GC_MARK(b) == GC_MARK_DIRTY)
+                               dirty++;
+                       if (GC_MARK(b) == GC_MARK_METADATA)
+                               meta++;
+               }
+
                for (i = ca->sb.first_bucket; i < n; i++)
                        p[i] = ca->buckets[i].prio;
                mutex_unlock(&ca->set->bucket_lock);
@@ -786,10 +796,7 @@ SHOW(__bch_cache)
 
                while (cached < p + n &&
                       *cached == BTREE_PRIO)
-                       cached++;
-
-               btree = cached - p;
-               n -= btree;
+                       cached++, n--;
 
                for (i = 0; i < n; i++)
                        sum += INITIAL_PRIO - cached[i];
@@ -805,12 +812,16 @@ SHOW(__bch_cache)
 
                ret = scnprintf(buf, PAGE_SIZE,
                                "Unused:                %zu%%\n"
+                               "Clean:         %zu%%\n"
+                               "Dirty:         %zu%%\n"
                                "Metadata:      %zu%%\n"
                                "Average:       %llu\n"
                                "Sectors per Q: %zu\n"
                                "Quantiles:     [",
                                unused * 100 / (size_t) ca->sb.nbuckets,
-                               btree * 100 / (size_t) ca->sb.nbuckets, sum,
+                               available * 100 / (size_t) ca->sb.nbuckets,
+                               dirty * 100 / (size_t) ca->sb.nbuckets,
+                               meta * 100 / (size_t) ca->sb.nbuckets, sum,
                                n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
 
                for (i = 0; i < ARRAY_SIZE(q); i++)
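
The sysfs changes above classify every bucket as unused, clean (reclaimable), dirty or metadata and report each class as a percentage of ca->sb.nbuckets. A minimal sketch of that accounting; the enum and sample arrays are invented examples:

/* Sketch only; the enum and sample arrays are invented. */
#include <stdio.h>

enum gc_mark { MARK_RECLAIMABLE, MARK_DIRTY, MARK_METADATA };

int main(void)
{
	enum gc_mark marks[] = { MARK_RECLAIMABLE, MARK_DIRTY, MARK_METADATA,
				 MARK_RECLAIMABLE, MARK_RECLAIMABLE };
	unsigned sectors_used[] = { 0, 100, 64, 0, 12 };
	size_t nbuckets = sizeof(marks) / sizeof(marks[0]);
	size_t unused = 0, available = 0, dirty = 0, meta = 0;
	size_t i;

	for (i = 0; i < nbuckets; i++) {
		if (!sectors_used[i])
			unused++;
		if (marks[i] == MARK_RECLAIMABLE)
			available++;
		if (marks[i] == MARK_DIRTY)
			dirty++;
		if (marks[i] == MARK_METADATA)
			meta++;
	}

	printf("Unused: %zu%% Clean: %zu%% Dirty: %zu%% Metadata: %zu%%\n",
	       unused * 100 / nbuckets, available * 100 / nbuckets,
	       dirty * 100 / nbuckets, meta * 100 / nbuckets);
	return 0;
}
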
index adbc3df17a8063933fd395de7763c5e9d560ea67..b7820b0d2621a1d34aa971c84601e884b33fc023 100644 (file)
@@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
 
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
index 7110897c3dfa595e385b8a946f7547ca05583fec..c9c3c044b32f060b749c63381b956fceabc64f89 100644 (file)
@@ -399,26 +399,43 @@ TRACE_EVENT(bcache_keyscan,
 
 /* Allocator */
 
-TRACE_EVENT(bcache_alloc_invalidate,
-       TP_PROTO(struct cache *ca),
-       TP_ARGS(ca),
+TRACE_EVENT(bcache_invalidate,
+       TP_PROTO(struct cache *ca, size_t bucket),
+       TP_ARGS(ca, bucket),
 
        TP_STRUCT__entry(
-               __field(unsigned,       free                    )
-               __field(unsigned,       free_inc                )
-               __field(unsigned,       free_inc_size           )
-               __field(unsigned,       unused                  )
+               __field(unsigned,       sectors                 )
+               __field(dev_t,          dev                     )
+               __field(__u64,          offset                  )
        ),
 
        TP_fast_assign(
-               __entry->free           = fifo_used(&ca->free[RESERVE_NONE]);
-               __entry->free_inc       = fifo_used(&ca->free_inc);
-               __entry->free_inc_size  = ca->free_inc.size;
-               __entry->unused         = fifo_used(&ca->unused);
+               __entry->dev            = ca->bdev->bd_dev;
+               __entry->offset         = bucket << ca->set->bucket_bits;
+               __entry->sectors        = GC_SECTORS_USED(&ca->buckets[bucket]);
        ),
 
-       TP_printk("free %u free_inc %u/%u unused %u", __entry->free,
-                 __entry->free_inc, __entry->free_inc_size, __entry->unused)
+       TP_printk("invalidated %u sectors at %d,%d sector=%llu",
+                 __entry->sectors, MAJOR(__entry->dev),
+                 MINOR(__entry->dev), __entry->offset)
+);
+
+TRACE_EVENT(bcache_alloc,
+       TP_PROTO(struct cache *ca, size_t bucket),
+       TP_ARGS(ca, bucket),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(__u64,          offset                  )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = ca->bdev->bd_dev;
+               __entry->offset         = bucket << ca->set->bucket_bits;
+       ),
+
+       TP_printk("allocated %d,%d sector=%llu", MAJOR(__entry->dev),
+                 MINOR(__entry->dev), __entry->offset)
 );
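
The reworked bcache_invalidate and new bcache_alloc tracepoints identify the bucket by its device and starting sector, computed by shifting the bucket index by the cache set's bucket_bits. A small sketch of that offset calculation with example values:

/* Example values only; bucket_bits and bucket are made up. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned bucket_bits = 10;	/* 1024 sectors per bucket = 512KiB */
	size_t	 bucket      = 1234;
	unsigned sectors     = 768;	/* GC_SECTORS_USED at invalidate time */

	uint64_t offset = (uint64_t)bucket << bucket_bits;

	printf("invalidated %u sectors at sector=%llu\n",
	       sectors, (unsigned long long)offset);
	printf("allocated sector=%llu\n", (unsigned long long)offset);
	return 0;
}
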
 
 TRACE_EVENT(bcache_alloc_fail,
@@ -426,21 +443,22 @@ TRACE_EVENT(bcache_alloc_fail,
        TP_ARGS(ca, reserve),
 
        TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
                __field(unsigned,       free                    )
                __field(unsigned,       free_inc                )
-               __field(unsigned,       unused                  )
                __field(unsigned,       blocked                 )
        ),
 
        TP_fast_assign(
+               __entry->dev            = ca->bdev->bd_dev;
                __entry->free           = fifo_used(&ca->free[reserve]);
                __entry->free_inc       = fifo_used(&ca->free_inc);
-               __entry->unused         = fifo_used(&ca->unused);
                __entry->blocked        = atomic_read(&ca->set->prio_blocked);
        ),
 
-       TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free,
-                 __entry->free_inc, __entry->unused, __entry->blocked)
+       TP_printk("alloc fail %d,%d free %u free_inc %u blocked %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->free,
+                 __entry->free_inc, __entry->blocked)
 );
 
 /* Background writeback */