diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 091a36444972fa0b746118b0cc58ca2a3dcaa97f..60d6fc2e0e4b9f5ec4fabd5bfb257d91e9ee6b5e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 #define WB_FRN_HIST_MAX_SLOTS  (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect up to 5 slots */
 
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
        struct backing_dev_info *bdi = inode_to_bdi(inode);
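
Taken together, the two additions above set up a "count and drain" scheme for inode-to-wb switches: a global in-flight counter plus a dedicated workqueue that can be flushed on its own at umount. A condensed view of the lifecycle, pieced together from the hunks below (the call_rcu() step itself is outside the shown hunks and assumed here; this is a sketch, not applyable code):

	atomic_inc(&isw_nr_in_flight);	/* inode_switch_wbs(), before scheduling */
	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
		/* ...RCU grace period elapses... */
	queue_work(isw_wq, &isw->work);	/* inode_switch_wbs_rcu_fn() */
		/* ...inode_switch_wbs_work_fn() runs... */
	atomic_dec(&isw_nr_in_flight);	/* last statement of the work fn */
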
@@ -278,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
                wb_get(wb);
                spin_unlock(&inode->i_lock);
                spin_lock(&wb->list_lock);
-               wb_put(wb);             /* not gonna deref it anymore */
 
                /* i_wb may have changed in between, can't use inode_to_wb() */
-               if (likely(wb == inode->i_wb))
-                       return wb;      /* @inode already has ref */
+               if (likely(wb == inode->i_wb)) {
+                       wb_put(wb);     /* @inode already has ref */
+                       return wb;
+               }
 
                spin_unlock(&wb->list_lock);
+               wb_put(wb);
                cpu_relax();
                spin_lock(&inode->i_lock);
        }
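
The reordering in this hunk is a lifetime fix, not a cleanup: previously the pinning reference was dropped as soon as wb->list_lock was taken, so if @inode had switched to another wb in the window, nothing kept @wb alive while its lock was still held. A minimal annotated view of the corrected flow:

	spin_lock(&wb->list_lock);
	/* old code did wb_put(wb) here: @wb could then be freed while
	 * this CPU still held (and later unlocked) its list_lock */
	if (likely(wb == inode->i_wb)) {
		wb_put(wb);	/* safe: @inode itself still holds a ref */
		return wb;
	}
	spin_unlock(&wb->list_lock);	/* release the lock first ... */
	wb_put(wb);			/* ... only then drop the pin */
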
@@ -424,6 +429,8 @@ skip_switch:
 
        iput(inode);
        kfree(isw);
+
+       atomic_dec(&isw_nr_in_flight);
 }
 
 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -433,7 +440,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
 
        /* needs to grab bh-unsafe locks, bounce to work item */
        INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
-       schedule_work(&isw->work);
+       queue_work(isw_wq, &isw->work);
 }
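
Routing the work item to a private queue rather than the shared system workqueue is what makes the umount-time drain practical: flushing system_wq would mean waiting on arbitrary unrelated work, whereas flushing isw_wq waits only for pending switches.

	/* umount side, see cgroup_writeback_umount() below */
	flush_workqueue(isw_wq);	/* waits only for wb-switch items */
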
 
 /**
@@ -469,7 +476,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 
        /* while holding I_WB_SWITCH, no one else can update the association */
        spin_lock(&inode->i_lock);
-       if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+       if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+           inode->i_state & (I_WB_SWITCH | I_FREEING) ||
            inode_to_wb(inode) == isw->new_wb) {
                spin_unlock(&inode->i_lock);
                goto out_free;
@@ -480,6 +488,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
        ihold(inode);
        isw->inode = inode;
 
+       atomic_inc(&isw_nr_in_flight);
+
        /*
         * In addition to synchronizing among switchers, I_WB_SWITCH tells
         * the RCU protected stat update paths to grab the mapping's
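
The new MS_ACTIVE test pairs with the umount-time flush added further down. Presumably (the caller lives outside this file) the unmount side is ordered as sketched below, which closes the race: every switch either observes MS_ACTIVE already cleared and bails out here, or has already bumped isw_nr_in_flight and will be waited for.

	/*
	 * generic_shutdown_super()		(fs/super.c, assumed)
	 *	sb->s_flags &= ~MS_ACTIVE;	-- no new switches start
	 *	...
	 *	cgroup_writeback_umount();	-- drains those already started
	 */
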
@@ -778,19 +788,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
 {
-       int next_memcg_id = 0;
-       struct bdi_writeback *wb;
-       struct wb_iter iter;
+       struct bdi_writeback *last_wb = NULL;
+       struct bdi_writeback *wb = list_entry(&bdi->wb_list,
+                                             struct bdi_writeback, bdi_node);
 
        might_sleep();
 restart:
        rcu_read_lock();
-       bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+       list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;
 
+               if (last_wb) {
+                       wb_put(last_wb);
+                       last_wb = NULL;
+               }
+
                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,14 +834,51 @@ restart:
 
                wb_queue_work(wb, work);
 
-               next_memcg_id = wb->memcg_css->id + 1;
+               /*
+                * Pin @wb so that it stays on @bdi->wb_list.  This allows
+                * continuing iteration from @wb after dropping and
+                * regrabbing rcu read lock.
+                */
+               wb_get(wb);
+               last_wb = wb;
+
                rcu_read_unlock();
                wb_wait_for_completion(bdi, &fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();
+
+       if (last_wb)
+               wb_put(last_wb);
+}
+
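
The rewritten bdi_split_work_to_wbs() above replaces the memcg-id based bdi_for_each_wb() iterator with a plain RCU list walk that can be suspended: because wb_wait_for_completion() sleeps, the loop must drop the RCU read lock, so it pins the current position with wb_get() and resumes after it via list_for_each_entry_continue_rcu(). Seeding @wb with list_entry(&bdi->wb_list, ...) makes the first pass start at the first real member, since the _continue variant begins at the cursor's next pointer. The generic shape of the pattern, with elem_get(), elem_put() and do_blocking_work() as hypothetical placeholders:

	struct elem *pos = list_entry(&head, struct elem, node); /* cursor = list head */
	struct elem *pinned = NULL;
restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(pos, &head, node) {
		if (pinned) {		/* iterated past the old cursor, drop it */
			elem_put(pinned);
			pinned = NULL;
		}
		elem_get(pos);		/* keep @pos linked on the list ... */
		pinned = pos;
		rcu_read_unlock();
		do_blocking_work();	/* ... while sleeping here */
		goto restart;
	}
	rcu_read_unlock();
	if (pinned)
		elem_put(pinned);
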
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches.  An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished.  As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+       if (atomic_read(&isw_nr_in_flight)) {
+               synchronize_rcu();
+               flush_workqueue(isw_wq);
+       }
 }
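
The order of the two calls is significant. A switch travels atomic_inc() -> call_rcu() -> queue_work() -> work function, so the drain must follow the same order: synchronize_rcu() guarantees that every RCU callback issued so far has run and therefore queued its work item; only then can flush_workqueue() see, and wait for, all of them. Reversing the calls could miss a switch still sitting in the RCU stage:

	/*
	 *   switcher				umount
	 *   --------				------
	 *   atomic_inc(&isw_nr_in_flight);
	 *   call_rcu(...);			if (atomic_read(...))
	 *     -> callback queues work		    synchronize_rcu();     [1]
	 *     -> work fn atomic_dec()s		    flush_workqueue(isw_wq); [2]
	 *
	 * [1] forces every issued callback to finish, i.e. every pending
	 *     switch is now queued on isw_wq;
	 * [2] then waits for all of those work items.
	 */
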
 
+static int __init cgroup_writeback_init(void)
+{
+       isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+       if (!isw_wq)
+               return -ENOMEM;
+       return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static struct bdi_writeback *
@@ -1289,10 +1341,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 * we reach e.g. from filesystem code. The flusher thread uses __writeback_single_inode()
  * and does more profound writeback list handling in writeback_sb_inodes().
  */
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
-                      struct writeback_control *wbc)
+static int writeback_single_inode(struct inode *inode,
+                                 struct writeback_control *wbc)
 {
+       struct bdi_writeback *wb;
        int ret = 0;
 
        spin_lock(&inode->i_lock);
@@ -1330,7 +1382,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
        ret = __writeback_single_inode(inode, wbc);
 
        wbc_detach_inode(wbc);
-       spin_lock(&wb->list_lock);
+
+       wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
        /*
         * If inode is clean, remove it from writeback lists. Otherwise don't
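
writeback_single_inode() dropping its wb parameter is the core of this change: by the time the inode has been written, cgroup writeback may have switched inode->i_wb, so a wb captured by the caller beforehand may no longer be the one whose lists the inode sits on. inode_to_wb_and_lock_list() (the unlocked wrapper around the retry loop fixed in the hunk near line 281) re-resolves the association and returns with the right lock held. Its contract as used here, sketched:

	wb = inode_to_wb_and_lock_list(inode);
	/* wb == inode->i_wb at the moment wb->list_lock was taken, and
	 * wb->list_lock is held on return, so the inode's position on
	 * this wb's b_dirty/b_io/b_more_io lists can be updated safely */
	spin_lock(&inode->i_lock);
	/* ... requeue or delete @inode ... */
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
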
@@ -1405,6 +1458,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 
        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
+               struct bdi_writeback *tmp_wb;
 
                if (inode->i_sb != sb) {
                        if (work->sb) {
@@ -1495,15 +1549,23 @@ static long writeback_sb_inodes(struct super_block *sb,
                        cond_resched();
                }
 
-
-               spin_lock(&wb->list_lock);
+               /*
+                * Requeue @inode if still dirty.  Be careful as @inode may
+                * have been switched to another wb in the meantime.
+                */
+               tmp_wb = inode_to_wb_and_lock_list(inode);
                spin_lock(&inode->i_lock);
                if (!(inode->i_state & I_DIRTY_ALL))
                        wrote++;
-               requeue_inode(inode, wb, &wbc);
+               requeue_inode(inode, tmp_wb, &wbc);
                inode_sync_complete(inode);
                spin_unlock(&inode->i_lock);
 
+               if (unlikely(tmp_wb != wb)) {
+                       spin_unlock(&tmp_wb->list_lock);
+                       spin_lock(&wb->list_lock);
+               }
+
                /*
                 * bail out to wb_writeback() often enough to check
                 * background threshold and other termination conditions.
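
The same re-resolution happens in writeback_sb_inodes(), with one extra twist: the surrounding while loop holds wb->list_lock as its invariant, but the requeue must run under the lock of whichever wb the inode now belongs to. Hence the unlikely(tmp_wb != wb) branch, which swaps the locks back before the next iteration:

	/*
	 * lock held at loop top:  wb->list_lock
	 * during the requeue:     tmp_wb->list_lock  (tmp_wb may == wb)
	 * after the swap-back:    wb->list_lock      (invariant restored)
	 */
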
@@ -1857,12 +1919,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
-               struct wb_iter iter;
 
                if (!bdi_has_dirty_io(bdi))
                        continue;
 
-               bdi_for_each_wb(wb, bdi, &iter, 0)
+               list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                        wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
                                           false, reason);
        }
@@ -1894,11 +1955,10 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
-               struct wb_iter iter;
 
-               bdi_for_each_wb(wb, bdi, &iter, 0)
-                       if (!list_empty(&bdi->wb.b_dirty_time))
-                               wb_wakeup(&bdi->wb);
+               list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
+                       if (!list_empty(&wb->b_dirty_time))
+                               wb_wakeup(wb);
        }
        rcu_read_unlock();
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
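
Beyond the mechanical conversion to the new list walk, this hunk folds in a real bug fix worth calling out:

	/*
	 * The old body tested bdi->wb.b_dirty_time and woke bdi->wb --
	 * i.e. only the root wb -- on every pass, so a member wb whose
	 * b_dirty_time list held expired inodes was never woken.  The
	 * new body tests and wakes each @wb itself.
	 */
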
@@ -1968,9 +2028,9 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  * page->mapping->host, so the page-dirtying time is recorded in the internal
  * blockdev inode.
  */
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
        struct super_block *sb = inode->i_sb;
        int dirtytime;
 
@@ -2080,6 +2140,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 out_unlock_inode:
        spin_unlock(&inode->i_lock);
 
+#undef I_DIRTY_INODE
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
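
Moving the #define into the function body and pairing it with #undef narrows the shorthand to the only function that uses it, so the name cannot leak into (or collide with) the rest of the file. The general pattern, with hypothetical FLAG_A/FLAG_B:

	void some_func(void)
	{
	#define LOCAL_MASK	(FLAG_A | FLAG_B)
		/* ... LOCAL_MASK is meaningful only inside this function ... */
	#undef LOCAL_MASK
	}
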
 
@@ -2136,7 +2197,12 @@ static void wait_sb_inodes(struct super_block *sb)
                iput(old_inode);
                old_inode = inode;
 
-               filemap_fdatawait(mapping);
+               /*
+                * We keep the error status of each individual mapping so that
+                * applications can catch writeback errors using fsync(2).
+                * See filemap_fdatawait_keep_errors() for details.
+                */
+               filemap_fdatawait_keep_errors(mapping);
 
                cond_resched();
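
The _keep_errors variant matters here because wait_sb_inodes() ignores the return value. Based on the helper's name and the comment above (mm/filemap.c is authoritative), the difference is:

	/*
	 * filemap_fdatawait(mapping)
	 *	waits for writeback, then test-and-clears the mapping's
	 *	AS_EIO/AS_ENOSPC bits and returns the error -- calling it
	 *	here would consume the error with nobody looking at it.
	 *
	 * filemap_fdatawait_keep_errors(mapping)
	 *	waits but leaves the error bits set, so a later fsync(2)
	 *	on the same file can still report the failure.
	 */
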
 
@@ -2286,7 +2352,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
  */
 int write_inode_now(struct inode *inode, int sync)
 {
-       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
        struct writeback_control wbc = {
                .nr_to_write = LONG_MAX,
                .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -2298,7 +2363,7 @@ int write_inode_now(struct inode *inode, int sync)
                wbc.nr_to_write = 0;
 
        might_sleep();
-       return writeback_single_inode(inode, wb, &wbc);
+       return writeback_single_inode(inode, &wbc);
 }
 EXPORT_SYMBOL(write_inode_now);
 
@@ -2315,7 +2380,7 @@ EXPORT_SYMBOL(write_inode_now);
  */
 int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
-       return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
+       return writeback_single_inode(inode, wbc);
 }
 EXPORT_SYMBOL(sync_inode);