X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=mm%2Fmemcontrol.c;h=eaa3accb01e708a11986135a8df1146f7dbc1b4b;hb=650a7901c0fe0c3b0e5d9c1c945f140058585a6f;hp=194721839cf5d303a0de2b4df611b700db895043;hpb=bd2931b5cff6a3bf39bfe15fae051fb8229c0029;p=firefly-linux-kernel-4.4.55.git

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..eaa3accb01e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -302,6 +302,7 @@ struct mem_cgroup {
 
 	bool		oom_lock;
 	atomic_t	under_oom;
+	atomic_t	oom_wakeups;
 
 	atomic_t	refcnt;
 
@@ -379,7 +380,7 @@ struct mem_cgroup {
 static size_t memcg_size(void)
 {
 	return sizeof(struct mem_cgroup) +
-		nr_node_ids * sizeof(struct mem_cgroup_per_node);
+		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
 }
 
 /* internal only representation about the status of kmem accounting. */
@@ -1220,7 +1221,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 			if (dead_count == iter->last_dead_count) {
 				smp_rmb();
 				last_visited = iter->last_visited;
-				if (last_visited &&
+				if (last_visited && last_visited != root &&
 				    !css_tryget(&last_visited->css))
 					last_visited = NULL;
 			}
@@ -1229,7 +1230,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
-			if (last_visited)
+			if (last_visited && last_visited != root)
 				css_put(&last_visited->css);
 
 			iter->last_visited = memcg;
@@ -2075,15 +2076,18 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 	return total;
 }
 
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
- * Has to be called with memcg_oom_lock
  */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter, *failed = NULL;
 
+	spin_lock(&memcg_oom_lock);
+
 	for_each_mem_cgroup_tree(iter, memcg) {
 		if (iter->oom_lock) {
 			/*
@@ -2097,33 +2101,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 		iter->oom_lock = true;
 	}
 
-	if (!failed)
-		return true;
-
-	/*
-	 * OK, we failed to lock the whole subtree so we have to clean up
-	 * what we set up to the failing subtree
-	 */
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter == failed) {
-			mem_cgroup_iter_break(memcg, iter);
-			break;
+	if (failed) {
+		/*
+		 * OK, we failed to lock the whole subtree so we have
+		 * to clean up what we set up to the failing subtree
+		 */
+		for_each_mem_cgroup_tree(iter, memcg) {
+			if (iter == failed) {
+				mem_cgroup_iter_break(memcg, iter);
+				break;
+			}
+			iter->oom_lock = false;
 		}
-		iter->oom_lock = false;
 	}
-	return false;
+
+	spin_unlock(&memcg_oom_lock);
+
+	return !failed;
 }
 
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 
+	spin_lock(&memcg_oom_lock);
 	for_each_mem_cgroup_tree(iter, memcg)
 		iter->oom_lock = false;
-	return 0;
+	spin_unlock(&memcg_oom_lock);
 }
 
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2147,7 +2151,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 		atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
@@ -2177,6 +2180,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+	atomic_inc(&memcg->oom_wakeups);
 	/* for filtering, pass "memcg" as argument. */
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
@@ -2187,57 +2191,97 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 		memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+{
+	if (!current->memcg_oom.may_oom)
+		return;
+	/*
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
+	 */
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
  */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-				  int order)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
 	struct oom_wait_info owait;
-	bool locked, need_to_kill;
+	bool locked;
+
+	/* OOM is global, do not handle */
+	if (!memcg)
+		return false;
+
+	if (!handle)
+		goto cleanup;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
-	need_to_kill = true;
-	mem_cgroup_mark_under_oom(memcg);
 
-	/* At first, try to OOM lock hierarchy under memcg.*/
-	spin_lock(&memcg_oom_lock);
-	locked = mem_cgroup_oom_lock(memcg);
-	/*
-	 * Even if signal_pending(), we can't quit charge() loop without
-	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-	 * under OOM is always welcomed, use TASK_KILLABLE here.
-	 */
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	if (!locked || memcg->oom_kill_disable)
-		need_to_kill = false;
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
 	if (locked)
 		mem_cgroup_oom_notify(memcg);
-	spin_unlock(&memcg_oom_lock);
 
-	if (need_to_kill) {
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
 		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(memcg, mask, order);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
 	} else {
 		schedule();
+		mem_cgroup_unmark_under_oom(memcg);
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
-	spin_lock(&memcg_oom_lock);
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
-	memcg_wakeup_oom(memcg);
-	spin_unlock(&memcg_oom_lock);
 
-	mem_cgroup_unmark_under_oom(memcg);
-
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		return false;
-	/* Give chance to dying process */
-	schedule_timeout_uninterruptible(1);
+	if (locked) {
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges. Wake any sleepers explicitely.
+		 */
+		memcg_oom_recover(memcg);
+	}
+cleanup:
+	current->memcg_oom.memcg = NULL;
+	css_put(&memcg->css);
 	return true;
 }
 
@@ -2550,12 +2594,11 @@ enum {
 	CHARGE_RETRY,		/* need to retry but retry is not bad */
 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
 };
 
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, unsigned int min_pages,
-				bool oom_check)
+				bool invoke_oom)
 {
 	unsigned long csize = nr_pages * PAGE_SIZE;
 	struct mem_cgroup *mem_over_limit;
@@ -2612,14 +2655,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;
 
-	/* If we don't need to call oom-killer at el, return immediately */
-	if (!oom_check)
-		return CHARGE_NOMEM;
-	/* check OOM */
-	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-		return CHARGE_OOM_DIE;
+	if (invoke_oom)
+		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
-	return CHARGE_RETRY;
+	return CHARGE_NOMEM;
 }
 
 /*
@@ -2663,6 +2702,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		     || fatal_signal_pending(current)))
 		goto bypass;
 
+	if (unlikely(task_in_memcg_oom(current)))
+		goto bypass;
+
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
@@ -2722,7 +2764,7 @@ again:
 	}
 
 	do {
-		bool oom_check;
+		bool invoke_oom = oom && !nr_oom_retries;
 
 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current)) {
@@ -2730,14 +2772,8 @@ again:
 			goto bypass;
 		}
 
-		oom_check = false;
-		if (oom && !nr_oom_retries) {
-			oom_check = true;
-			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		}
-
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-					   oom_check);
+		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+					   nr_pages, invoke_oom);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
@@ -2750,16 +2786,12 @@ again:
 			css_put(&memcg->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom) {
+			if (!oom || invoke_oom) {
 				css_put(&memcg->css);
 				goto nomem;
 			}
-			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
-		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-			css_put(&memcg->css);
-			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
@@ -3186,11 +3218,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
 	if (!s->memcg_params)
 		return -ENOMEM;
 
-	INIT_WORK(&s->memcg_params->destroy,
-			kmem_cache_destroy_work_func);
 	if (memcg) {
 		s->memcg_params->memcg = memcg;
 		s->memcg_params->root_cache = root_cache;
+		INIT_WORK(&s->memcg_params->destroy,
+				kmem_cache_destroy_work_func);
 	} else
 		s->memcg_params->is_root_cache = true;
 
@@ -5584,7 +5616,13 @@ static int compare_thresholds(const void *a, const void *b)
 	const struct mem_cgroup_threshold *_a = a;
 	const struct mem_cgroup_threshold *_b = b;
 
-	return _a->threshold - _b->threshold;
+	if (_a->threshold > _b->threshold)
+		return 1;
+
+	if (_a->threshold < _b->threshold)
+		return -1;
+
+	return 0;
 }
 
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
@@ -6296,16 +6334,6 @@ mem_cgroup_css_online(struct cgroup *cont)
 
 	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
 	mutex_unlock(&memcg_create_mutex);
-	if (error) {
-		/*
-		 * We call put now because our (and parent's) refcnts
-		 * are already in place. mem_cgroup_put() will internally
-		 * call __mem_cgroup_free, so return directly
-		 */
-		mem_cgroup_put(memcg);
-		if (parent->use_hierarchy)
-			mem_cgroup_put(parent);
-	}
 	return error;
 }
 
@@ -6330,9 +6358,23 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
 static void mem_cgroup_css_offline(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	struct cgroup *iter;
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
+
+	/*
+	 * This requires that offlining is serialized. Right now that is
+	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+	 */
+	rcu_read_lock();
+	cgroup_for_each_descendant_post(iter, cont) {
+		rcu_read_unlock();
+		mem_cgroup_reparent_charges(mem_cgroup_from_cont(iter));
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
 	mem_cgroup_reparent_charges(memcg);
+	mem_cgroup_destroy_all_caches(memcg);
 }
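
Note on the OOM rework above: the charge path no longer blocks in mem_cgroup_handle_oom(); mem_cgroup_oom() only records the OOM context in current->memcg_oom, and the kerneldoc for mem_cgroup_oom_synchronize() requires the page-fault exit path to finish the handling. The following is a minimal sketch of the consumer side, for illustration only; the real call sites live in mm/memory.c and mm/oom_kill.c (not in this blob diff), and the helper function name below is an assumption, not code from this patch.

/*
 * Sketch only -- not part of this diff. Shows how a user page fault is
 * expected to consume the deferred state recorded by mem_cgroup_oom().
 * memcg_oom_fault_epilogue() is a hypothetical name for illustration.
 */
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/sched.h>

/* End of a user-mode page fault, after the fault handler returned 'ret'. */
static void memcg_oom_fault_epilogue(unsigned int ret)
{
	if (!task_in_memcg_oom(current))
		return;			/* no memcg OOM state was recorded */

	if (!(ret & VM_FAULT_OOM)) {
		/*
		 * The failed charge was handled gracefully (e.g. optional
		 * readahead): just clear current->memcg_oom, kill nothing.
		 */
		mem_cgroup_oom_synchronize(false);
		return;
	}

	/*
	 * Otherwise the generic OOM path (pagefault_out_of_memory()) calls
	 * mem_cgroup_oom_synchronize(true), which OOM-kills inside the memcg
	 * or sleeps on memcg_oom_waitq when oom_kill_disable is set.
	 */
}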