#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/compaction.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
nr_lumpy_taken++;
if (PageDirty(cursor_page))
nr_lumpy_dirty++;
struct page *page;
list_for_each_entry(page, page_list, lru) {
+ int numpages = hpage_nr_pages(page);
lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
- nr_active++;
+ nr_active += numpages;
}
if (count)
- count[lru]++;
+ count[lru] += numpages;
}
return nr_active;
spin_lock_irq(&zone->lru_lock);
continue;
}
- SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
+ int numpages = hpage_nr_pages(page);
+ reclaim_stat->recent_rotated[file] += numpages;
+ if (putback_active_lru_page(zone, page))
+ continue;
}
+ SetPageLRU(page);
+ add_page_to_lru_list(zone, page, lru);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
- pgmoved++;
+ pgmoved += hpage_nr_pages(page);
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
}
if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
- nr_rotated++;
+ nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
}
/* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+ int classzone_idx)
{
int i;
unsigned long balanced = 0;
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return 1;
+ return true;
- /* If after HZ/10, a zone is below the high mark, it's premature */
+ /* Check the watermark levels */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable)
+ /*
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well if kswapd
+ * is to sleep
+ */
+ if (zone->all_unreclaimable) {
+ balanced += zone->present_pages;
continue;
+ }
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- 0, 0))
+ classzone_idx, 0))
all_zones_ok = false;
else
balanced += zone->present_pages;
* must be balanced
*/
if (order)
- return pgdat_balanced(pgdat, balanced, 0);
+ return pgdat_balanced(pgdat, balanced, classzone_idx);
else
return !all_zones_ok;
}
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int classzone_idx)
+ int *classzone_idx)
{
int all_zones_ok;
unsigned long balanced;
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
+ *classzone_idx = i;
break;
}
}
* cause too much scanning of the lower zones.
*/
for (i = 0; i <= end_zone; i++) {
+ int compaction;
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
+
+ compaction = 0;
+ if (order &&
+ zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone),
+ end_zone, 0) &&
+ !zone_watermark_ok(zone, order,
+ high_wmark_pages(zone),
+ end_zone, 0)) {
+ compact_zone_order(zone,
+ order,
+ sc.gfp_mask, false,
+ COMPACT_MODE_KSWAPD);
+ compaction = 1;
+ }
+
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 && !zone_reclaimable(zone))
+ if (!compaction && nr_slab == 0 &&
+ !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- /*
- * Compact the zone for higher orders to reduce
- * latencies for higher-order allocations that
- * would ordinarily call try_to_compact_pages()
- */
- if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
- compact_zone_order(zone, sc.order, sc.gfp_mask,
- false);
-
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
* spectulatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
- if (i <= classzone_idx)
+ if (i <= *classzone_idx)
balanced += zone->present_pages;
}
}
- if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx)))
+ if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
* high-order: Balanced zones must make up at least 25% of the node
* for the node to be balanced
*/
- if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx)))) {
+ if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
cond_resched();
try_to_freeze();
}
}
- return sc.nr_reclaimed;
+ /*
+ * Return the order we were reclaiming at so sleeping_prematurely()
+ * makes a decision on the order we were last reclaiming at. However,
+ * if another caller entered the allocator slow path while kswapd
+ * was awake, order will remain at the higher level
+ */
+ *classzone_idx = end_zone;
+ return order;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (!sleeping_prematurely(pgdat, order, remaining)) {
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, order);
+ kswapd_try_to_sleep(pgdat, order, classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
}
ret = try_to_freeze();
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balance_pgdat(pgdat, order, classzone_idx);
+ order = balance_pgdat(pgdat, order, &classzone_idx);
}
}
return 0;