writeback: do not sleep on the congestion queue if there are no congested BDIs or...
authorMel Gorman <mel@csn.ul.ie>
Tue, 26 Oct 2010 21:21:45 +0000 (14:21 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Oct 2010 23:52:07 +0000 (16:52 -0700)
If congestion_wait() is called with no BDI congested, the caller will
sleep for the full timeout and this may be an unnecessary sleep.  This
patch adds a wait_iff_congested() that checks congestion and only sleeps
if a BDI is congested else, it calls cond_resched() to ensure the caller
is not hogging the CPU longer than its quota but otherwise will not sleep.

This is aimed at reducing some of the major desktop stalls reported during
IO.  For example, while kswapd is operating, it calls congestion_wait()
but it could just have been reclaiming clean page cache pages with no
congestion.  Without this patch, it would sleep for a full timeout but
after this patch, it'll just call schedule() if it has been on the CPU too
long.  Similar logic applies to direct reclaimers that are not making
enough progress.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/backing-dev.h
include/linux/mmzone.h
include/trace/events/writeback.h
mm/backing-dev.c
mm/page_alloc.c
mm/vmscan.c

index 35b00746c712fc11776e78345edf786b98f9063b..f1b402a50679c8f97d25404cd53e7bbe1277eb0d 100644 (file)
@@ -285,7 +285,7 @@ enum {
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
 {
index c3c17fb675eedb1e5c7ca5c10c697a7e40c8e797..39c24ebe9cfd4e75b8841deec06aa6d78c91c2ad 100644 (file)
@@ -423,6 +423,9 @@ struct zone {
 typedef enum {
        ZONE_RECLAIM_LOCKED,            /* prevents concurrent reclaim */
        ZONE_OOM_LOCKED,                /* zone is in OOM killer zonelist */
+       ZONE_CONGESTED,                 /* zone has many dirty pages backed by
+                                        * a congested BDI
+                                        */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
        clear_bit(flag, &zone->flags);
 }
 
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+       return test_bit(ZONE_CONGESTED, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
        return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
index d2b2654606ecee415d4cd3cfab9b53259379bd00..89a2b2db43751686129d90f8d58fd3c08787726b 100644 (file)
@@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
        TP_ARGS(usec_timeout, usec_delayed)
 );
 
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+
+       TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+       TP_ARGS(usec_timeout, usec_delayed)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
index 55627306abe0042968ab1bf70b17f9cd1a74fdec..5ad3c106606b5318437c46fb5fdc7b7bbc33451d 100644 (file)
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
        wait_queue_head_t *wqh = &congestion_wqh[sync];
 
        bit = sync ? BDI_sync_congested : BDI_async_congested;
-       clear_bit(bit, &bdi->state);
+       if (test_and_clear_bit(bit, &bdi->state))
+               atomic_dec(&nr_bdi_congested[sync]);
        smp_mb__after_clear_bit();
        if (waitqueue_active(wqh))
                wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
        enum bdi_state bit;
 
        bit = sync ? BDI_sync_congested : BDI_async_congested;
-       set_bit(bit, &bdi->state);
+       if (!test_and_set_bit(bit, &bdi->state))
+               atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absense of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+       long ret;
+       unsigned long start = jiffies;
+       DEFINE_WAIT(wait);
+       wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+       /*
+        * If there is no congestion, or heavy congestion is not being
+        * encountered in the current zone, yield if necessary instead
+        * of sleeping on the congestion queue
+        */
+       if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+                       !zone_is_reclaim_congested(zone)) {
+               cond_resched();
+
+               /* In case we scheduled, work out time remaining */
+               ret = timeout - (jiffies - start);
+               if (ret < 0)
+                       ret = 0;
+
+               goto out;
+       }
+
+       /* Sleep until uncongested or a write happens */
+       prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+       ret = io_schedule_timeout(timeout);
+       finish_wait(wqh, &wait);
+
+out:
+       trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+                                       jiffies_to_usecs(jiffies - start));
+
+       return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
index 6a683f819439a0da44a7dd5287178a6ddcefa86b..b13bc5e5bd7d9213bcb6ac839310c6ba0aba1590 100644 (file)
@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
                        preferred_zone, migratetype);
 
                if (!page && gfp_mask & __GFP_NOFAIL)
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
        } while (!page && (gfp_mask & __GFP_NOFAIL));
 
        return page;
@@ -2095,7 +2095,7 @@ rebalance:
        pages_reclaimed += did_some_progress;
        if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
        }
 
index 130ad0239f524c90b78c0ae71954e9524fb96c7b..30fd658bb2897076d4fec53a5b0435d095daaec6 100644 (file)
@@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
-       if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-               disable_lumpy_reclaim_mode(sc);
+       if (!may_write_to_queue(mapping->backing_dev_info, sc))
                return PAGE_KEEP;
-       }
 
        if (clear_page_dirty_for_io(page)) {
                int res;
@@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+                                     struct zone *zone,
                                      struct scan_control *sc)
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
        int pgactivate = 0;
+       unsigned long nr_dirty = 0;
+       unsigned long nr_congested = 0;
        unsigned long nr_reclaimed = 0;
 
        cond_resched();
@@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        goto keep;
 
                VM_BUG_ON(PageActive(page));
+               VM_BUG_ON(page_zone(page) != zone);
 
                sc->nr_scanned++;
 
@@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                }
 
                if (PageDirty(page)) {
+                       nr_dirty++;
+
                        if (references == PAGEREF_RECLAIM_CLEAN)
                                goto keep_locked;
                        if (!may_enter_fs)
@@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        /* Page is dirty, try to write it out here */
                        switch (pageout(page, mapping, sc)) {
                        case PAGE_KEEP:
+                               nr_congested++;
                                goto keep_locked;
                        case PAGE_ACTIVATE:
                                goto activate_locked;
@@ -902,6 +907,15 @@ keep_lumpy:
                VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
        }
 
+       /*
+        * Tag a zone as congested if all the dirty pages encountered were
+        * backed by a congested BDI. In this case, reclaimers should just
+        * back off and wait for congestion to clear because further reclaim
+        * will encounter the same problem
+        */
+       if (nr_dirty == nr_congested)
+               zone_set_flag(zone, ZONE_CONGESTED);
+
        free_page_list(&free_pages);
 
        list_splice(&ret_pages, page_list);
@@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
        spin_unlock_irq(&zone->lru_lock);
 
-       nr_reclaimed = shrink_page_list(&page_list, sc);
+       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
        /* Check if we should syncronously wait for writeback */
        if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                set_lumpy_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, sc);
+               nr_reclaimed += shrink_page_list(&page_list, zone, sc);
        }
 
        local_irq_disable();
@@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
                /* Take a nap, wait for some writeback to complete */
                if (!sc->hibernation_mode && sc->nr_scanned &&
-                   priority < DEF_PRIORITY - 2)
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
+                   priority < DEF_PRIORITY - 2) {
+                       struct zone *preferred_zone;
+
+                       first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+                                                       NULL, &preferred_zone);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+               }
        }
 
 out:
@@ -2282,6 +2301,15 @@ loop_again:
                                if (!zone_watermark_ok(zone, order,
                                            min_wmark_pages(zone), end_zone, 0))
                                        has_under_min_watermark_zone = 1;
+                       } else {
+                               /*
+                                * If a zone reaches its high watermark,
+                                * consider it to be no longer congested. It's
+                                * possible there are dirty pages backed by
+                                * congested BDIs but as pressure is relieved,
+                                * spectulatively avoid congestion waits
+                                */
+                               zone_clear_flag(zone, ZONE_CONGESTED);
                        }
 
                }