writeback: don't embed root bdi_writeback_congested in bdi_writeback
author Tejun Heo <tj@kernel.org>
Thu, 2 Jul 2015 14:44:34 +0000 (08:44 -0600)
committer Jens Axboe <axboe@fb.com>
Thu, 2 Jul 2015 14:46:00 +0000 (08:46 -0600)
52ebea749aae ("writeback: make backing_dev_info host cgroup-specific
bdi_writebacks") made bdi (backing_dev_info) host per-cgroup wb's
(bdi_writeback's).  As the congested state needs to be per-wb and
referenced from blkcg side and multiple wbs, the patch made all
non-root cong's (bdi_writeback_congested's) reference counted and
indexed on bdi.

When a bdi is destroyed, cgwb_bdi_destroy() tries to drain all
non-root cong's; however, this can hang indefinitely because cong's can
also be referenced from blkcg_gq's which are destroyed after bdi
destruction is complete.

To fix the bug, bdi destruction will be updated to not wait for cong's
to drain, which naturally means that cong's may outlive the associated
bdi.  This is fine for non-root cong's but is problematic for the root
cong's which are embedded in their bdi's as they may end up getting
dereferenced after the containing bdi's are freed.

This patch makes root cong's behave the same as non-root cong's.  They
are no longer embedded in their bdi's but allocated separately during
bdi initialization, indexed and reference counted the same way.

* As cong handling is the same for all wb's, wb->congested
  initialization is moved into wb_init().

* When !CONFIG_CGROUP_WRITEBACK, there was no indexing or refcnting.
  bdi->wb_congested is now a pointer pointing to the root cong
  allocated during bdi init and minimal refcnting operations are
  implemented.

* The above makes root wb init paths diverge depending on
  CONFIG_CGROUP_WRITEBACK.  root wb init is moved to cgwb_bdi_init().

This patch in itself shouldn't cause any consequential behavior
differences but prepares for the actual fix.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jon Christopherson <jon@jons.org>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=100681
Tested-by: Jon Christopherson <jon@jons.org>
Added <linux/slab.h> include to backing-dev.h for kfree() definition.

Signed-off-by: Jens Axboe <axboe@fb.com>
include/linux/backing-dev-defs.h
include/linux/backing-dev.h
mm/backing-dev.c

index a48d90e3bcbb86d98d911f21d248897c3c563730..a23209b43842106c6a74de8a51d6b4b752613157 100644 (file)
@@ -50,10 +50,10 @@ enum wb_stat_item {
  */
 struct bdi_writeback_congested {
        unsigned long state;            /* WB_[a]sync_congested flags */
+       atomic_t refcnt;                /* nr of attached wb's and blkg */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
        struct backing_dev_info *bdi;   /* the associated bdi */
-       atomic_t refcnt;                /* nr of attached wb's and blkg */
        int blkcg_id;                   /* ID of the associated blkcg */
        struct rb_node rb_node;         /* on bdi->cgwb_congestion_tree */
 #endif
@@ -150,11 +150,12 @@ struct backing_dev_info {
        atomic_long_t tot_write_bandwidth;
 
        struct bdi_writeback wb;  /* the root writeback info for this bdi */
-       struct bdi_writeback_congested wb_congested; /* its congested state */
 #ifdef CONFIG_CGROUP_WRITEBACK
        struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
        struct rb_root cgwb_congested_tree; /* their congested states */
        atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
+#else
+       struct bdi_writeback_congested *wb_congested;
 #endif
        wait_queue_head_t wb_waitq;
 
index 0e6d4828a77a358edd3c77ef7d14eecc6f6001b3..0fe9df983ab7410c67143ecdc7a26bac958b4597 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/writeback.h>
 #include <linux/blk-cgroup.h>
 #include <linux/backing-dev-defs.h>
+#include <linux/slab.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
@@ -465,11 +466,14 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 static inline struct bdi_writeback_congested *
 wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
 {
-       return bdi->wb.congested;
+       atomic_inc(&bdi->wb_congested->refcnt);
+       return bdi->wb_congested;
 }
 
 static inline void wb_congested_put(struct bdi_writeback_congested *congested)
 {
+       if (atomic_dec_and_test(&congested->refcnt))
+               kfree(congested);
 }
 
 static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
index 7756da31b02bcbb2a7f7036a4bbbdd093883ad6c..51cc461e7256cb4c5e61b40cbbbdf40561f722e0 100644 (file)
@@ -287,7 +287,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
 #define INIT_BW                (100 << (20 - PAGE_SHIFT))
 
 static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
-                  gfp_t gfp)
+                  int blkcg_id, gfp_t gfp)
 {
        int i, err;
 
@@ -311,21 +311,29 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
        INIT_LIST_HEAD(&wb->work_list);
        INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
 
+       wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
+       if (!wb->congested)
+               return -ENOMEM;
+
        err = fprop_local_init_percpu(&wb->completions, gfp);
        if (err)
-               return err;
+               goto out_put_cong;
 
        for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
                err = percpu_counter_init(&wb->stat[i], 0, gfp);
-               if (err) {
-                       while (--i)
-                               percpu_counter_destroy(&wb->stat[i]);
-                       fprop_local_destroy_percpu(&wb->completions);
-                       return err;
-               }
+               if (err)
+                       goto out_destroy_stat;
        }
 
        return 0;
+
+out_destroy_stat:
+       while (--i)
+               percpu_counter_destroy(&wb->stat[i]);
+       fprop_local_destroy_percpu(&wb->completions);
+out_put_cong:
+       wb_congested_put(wb->congested);
+       return err;
 }
 
 /*
@@ -361,6 +369,7 @@ static void wb_exit(struct bdi_writeback *wb)
                percpu_counter_destroy(&wb->stat[i]);
 
        fprop_local_destroy_percpu(&wb->completions);
+       wb_congested_put(wb->congested);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -392,9 +401,6 @@ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
        struct bdi_writeback_congested *new_congested = NULL, *congested;
        struct rb_node **node, *parent;
        unsigned long flags;
-
-       if (blkcg_id == 1)
-               return &bdi->wb_congested;
 retry:
        spin_lock_irqsave(&cgwb_lock, flags);
 
@@ -453,9 +459,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
        struct backing_dev_info *bdi = congested->bdi;
        unsigned long flags;
 
-       if (congested->blkcg_id == 1)
-               return;
-
        local_irq_save(flags);
        if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
                local_irq_restore(flags);
@@ -480,7 +483,6 @@ static void cgwb_release_workfn(struct work_struct *work)
 
        css_put(wb->memcg_css);
        css_put(wb->blkcg_css);
-       wb_congested_put(wb->congested);
 
        fprop_local_destroy_percpu(&wb->memcg_completions);
        percpu_ref_exit(&wb->refcnt);
@@ -541,7 +543,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
        if (!wb)
                return -ENOMEM;
 
-       ret = wb_init(wb, bdi, gfp);
+       ret = wb_init(wb, bdi, blkcg_css->id, gfp);
        if (ret)
                goto err_free;
 
@@ -553,12 +555,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
        if (ret)
                goto err_ref_exit;
 
-       wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
-       if (!wb->congested) {
-               ret = -ENOMEM;
-               goto err_fprop_exit;
-       }
-
        wb->memcg_css = memcg_css;
        wb->blkcg_css = blkcg_css;
        INIT_WORK(&wb->release_work, cgwb_release_workfn);
@@ -588,12 +584,10 @@ static int cgwb_create(struct backing_dev_info *bdi,
        if (ret) {
                if (ret == -EEXIST)
                        ret = 0;
-               goto err_put_congested;
+               goto err_fprop_exit;
        }
        goto out_put;
 
-err_put_congested:
-       wb_congested_put(wb->congested);
 err_fprop_exit:
        fprop_local_destroy_percpu(&wb->memcg_completions);
 err_ref_exit:
@@ -662,14 +656,20 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
        return wb;
 }
 
-static void cgwb_bdi_init(struct backing_dev_info *bdi)
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
 {
-       bdi->wb.memcg_css = mem_cgroup_root_css;
-       bdi->wb.blkcg_css = blkcg_root_css;
-       bdi->wb_congested.blkcg_id = 1;
+       int ret;
+
        INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
        bdi->cgwb_congested_tree = RB_ROOT;
        atomic_set(&bdi->usage_cnt, 1);
+
+       ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+       if (!ret) {
+               bdi->wb.memcg_css = mem_cgroup_root_css;
+               bdi->wb.blkcg_css = blkcg_root_css;
+       }
+       return ret;
 }
 
 static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
@@ -732,15 +732,28 @@ void wb_blkcg_offline(struct blkcg *blkcg)
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
-static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+       int err;
+
+       bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
+       if (!bdi->wb_congested)
+               return -ENOMEM;
+
+       err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+       if (err) {
+               kfree(bdi->wb_congested);
+               return err;
+       }
+       return 0;
+}
+
 static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
 
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-       int err;
-
        bdi->dev = NULL;
 
        bdi->min_ratio = 0;
@@ -749,15 +762,7 @@ int bdi_init(struct backing_dev_info *bdi)
        INIT_LIST_HEAD(&bdi->bdi_list);
        init_waitqueue_head(&bdi->wb_waitq);
 
-       err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
-       if (err)
-               return err;
-
-       bdi->wb_congested.state = 0;
-       bdi->wb.congested = &bdi->wb_congested;
-
-       cgwb_bdi_init(bdi);
-       return 0;
+       return cgwb_bdi_init(bdi);
 }
 EXPORT_SYMBOL(bdi_init);