Merge tag 'fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/arm...
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b11fb2fe77c15775718b5d07379ddf543e58dbaa..795e525afaba8914f3f8863de6a7299d84898ada 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -379,7 +379,6 @@ static bool move_file(void)
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_ANON,
-       MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
        MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
        NR_CHARGE_TYPE,
@@ -406,6 +405,12 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+       return container_of(s, struct mem_cgroup, css);
+}
+
 /* Writing them here to avoid exposing memcg's inner layout */
 #ifdef CONFIG_MEMCG_KMEM
 #include <net/sock.h>
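
For reference, a minimal user-space sketch (not part of the patch) of the container_of() arithmetic that the new mem_cgroup_from_css() helper wraps: given a pointer to an embedded member, subtract the member's offset to recover the enclosing object. The struct names below are hypothetical, and the simplified container_of() omits the kernel's type checking.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css { int refcnt; };

struct memcg {
	long usage;
	struct css css;			/* embedded, like mem_cgroup.css */
};

int main(void)
{
	struct memcg m = { .usage = 42 };
	struct css *s = &m.css;

	/* the step mem_cgroup_from_css() performs: member -> container */
	struct memcg *back = container_of(s, struct memcg, css);

	printf("%ld\n", back->usage);	/* prints 42 */
	return 0;
}
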
@@ -863,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
-       return container_of(cgroup_subsys_state(cont,
-                               mem_cgroup_subsys_id), struct mem_cgroup,
-                               css);
+       return mem_cgroup_from_css(
+               cgroup_subsys_state(cont, mem_cgroup_subsys_id));
 }
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -878,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
        if (unlikely(!p))
                return NULL;
 
-       return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
-                               struct mem_cgroup, css);
+       return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
 }
 
 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -965,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
                if (css) {
                        if (css == &root->css || css_tryget(css))
-                               memcg = container_of(css,
-                                                    struct mem_cgroup, css);
+                               memcg = mem_cgroup_from_css(css);
                } else
                        id = 0;
                rcu_read_unlock();
@@ -1453,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
-u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
        u64 limit;
        u64 memsw;
@@ -1469,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
        return min(limit, memsw);
 }
 
+void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+                             int order)
+{
+       struct mem_cgroup *iter;
+       unsigned long chosen_points = 0;
+       unsigned long totalpages;
+       unsigned int points = 0;
+       struct task_struct *chosen = NULL;
+
+       /*
+        * If current has a pending SIGKILL, then automatically select it.  The
+        * goal is to allow it to allocate so that it may quickly exit and free
+        * its memory.
+        */
+       if (fatal_signal_pending(current)) {
+               set_thread_flag(TIF_MEMDIE);
+               return;
+       }
+
+       check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
+       totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+       for_each_mem_cgroup_tree(iter, memcg) {
+               struct cgroup *cgroup = iter->css.cgroup;
+               struct cgroup_iter it;
+               struct task_struct *task;
+
+               cgroup_iter_start(cgroup, &it);
+               while ((task = cgroup_iter_next(cgroup, &it))) {
+                       switch (oom_scan_process_thread(task, totalpages, NULL,
+                                                       false)) {
+                       case OOM_SCAN_SELECT:
+                               if (chosen)
+                                       put_task_struct(chosen);
+                               chosen = task;
+                               chosen_points = ULONG_MAX;
+                               get_task_struct(chosen);
+                               /* fall through */
+                       case OOM_SCAN_CONTINUE:
+                               continue;
+                       case OOM_SCAN_ABORT:
+                               cgroup_iter_end(cgroup, &it);
+                               mem_cgroup_iter_break(memcg, iter);
+                               if (chosen)
+                                       put_task_struct(chosen);
+                               return;
+                       case OOM_SCAN_OK:
+                               break;
+                       }
+                       points = oom_badness(task, memcg, NULL, totalpages);
+                       if (points > chosen_points) {
+                               if (chosen)
+                                       put_task_struct(chosen);
+                               chosen = task;
+                               chosen_points = points;
+                               get_task_struct(chosen);
+                       }
+               }
+               cgroup_iter_end(cgroup, &it);
+       }
+
+       if (!chosen)
+               return;
+       points = chosen_points * 1000 / totalpages;
+       oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
+                        NULL, "Memory cgroup out of memory");
+}
+
 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
                                        gfp_t gfp_mask,
                                        unsigned long flags)
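
A note on the selection arithmetic above, with a stand-alone sketch: oom_badness() returns a task's footprint in pages, the ULONG_MAX set for OOM_SCAN_SELECT trumps every other candidate, and the winner's score is normalized to the 0..1000 range that oom_kill_process() reports. The numbers below are made up purely for illustration.

#include <stdio.h>

int main(void)
{
	unsigned long totalpages = 262144;	/* e.g. a 1G limit in 4K pages */
	unsigned long chosen_points = 131072;	/* victim uses about half of it */
	unsigned int points;

	/* same normalization as above: chosen_points * 1000 / totalpages */
	points = chosen_points * 1000 / totalpages;
	printf("badness %u/1000\n", points);	/* -> 500 */
	return 0;
}
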
@@ -1898,7 +1967,7 @@ again:
                return;
        /*
         * If this memory cgroup is not under account moving, we don't
-        * need to take move_lock_page_cgroup(). Because we already hold
+        * need to take move_lock_mem_cgroup(). Because we already hold
         * rcu_read_lock(), any calls to move_account will be delayed until
         * rcu_read_unlock() if mem_cgroup_stolen() == true.
         */
@@ -1920,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
        /*
         * It's guaranteed that pc->mem_cgroup never changes while the
         * lock is held, because any routine modifying pc->mem_cgroup
-        * should take move_lock_page_cgroup().
+        * should take move_lock_mem_cgroup().
         */
        move_unlock_mem_cgroup(pc->mem_cgroup, flags);
 }
@@ -2267,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
-        * set, if so charge the init_mm (happens for pagecache usage).
+        * set; if so, charge the root memcg (happens for pagecache usage).
         */
        if (!*ptr && !mm)
                *ptr = root_mem_cgroup;
@@ -2428,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
        css = css_lookup(&mem_cgroup_subsys, id);
        if (!css)
                return NULL;
-       return container_of(css, struct mem_cgroup, css);
+       return mem_cgroup_from_css(css);
 }
 
 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2472,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
        bool anon;
 
        lock_page_cgroup(pc);
-       if (unlikely(PageCgroupUsed(pc))) {
-               unlock_page_cgroup(pc);
-               __mem_cgroup_cancel_charge(memcg, nr_pages);
-               return;
-       }
+       VM_BUG_ON(PageCgroupUsed(pc));
        /*
         * we don't need the page_cgroup lock for tail pages, because they are not
         * accessed by any other context at this point.
@@ -2729,64 +2794,32 @@ int mem_cgroup_newpage_charge(struct page *page,
                                        MEM_CGROUP_CHARGE_TYPE_ANON);
 }
 
-static void
-__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
-                                       enum charge_type ctype);
-
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
-                               gfp_t gfp_mask)
-{
-       struct mem_cgroup *memcg = NULL;
-       enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
-       int ret;
-
-       if (mem_cgroup_disabled())
-               return 0;
-       if (PageCompound(page))
-               return 0;
-
-       if (unlikely(!mm))
-               mm = &init_mm;
-       if (!page_is_file_cache(page))
-               type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-
-       if (!PageSwapCache(page))
-               ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
-       else { /* page is swapcache/shmem */
-               ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
-               if (!ret)
-                       __mem_cgroup_commit_charge_swapin(page, memcg, type);
-       }
-       return ret;
-}
-
 /*
  * During swap-in the sequence is try_charge -> commit or cancel, with
  * the page locked.  When try_charge() returns successfully, one refcnt
  * to the memcg is held without any struct page_cgroup; it is consumed
  * by "commit()" or released by "cancel()".
  */
-int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-                                struct page *page,
-                                gfp_t mask, struct mem_cgroup **memcgp)
+static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+                                         struct page *page,
+                                         gfp_t mask,
+                                         struct mem_cgroup **memcgp)
 {
        struct mem_cgroup *memcg;
+       struct page_cgroup *pc;
        int ret;
 
-       *memcgp = NULL;
-
-       if (mem_cgroup_disabled())
-               return 0;
-
-       if (!do_swap_account)
-               goto charge_cur_mm;
+       pc = lookup_page_cgroup(page);
        /*
-        * A racing thread's fault, or swapoff, may have already updated
-        * the pte, and even removed page from swap cache: in those cases
-        * do_swap_page()'s pte_same() test will fail; but there's also a
-        * KSM case which does need to charge the page.
+        * Every swap fault against a single page tries to charge the
+        * page, so bail out as early as possible.  shmem_unuse() encounters
+        * already charged pages, too.  The USED bit is protected by
+        * the page lock, which serializes swap cache removal, which
+        * in turn serializes uncharging.
         */
-       if (!PageSwapCache(page))
+       if (PageCgroupUsed(pc))
+               return 0;
+       if (!do_swap_account)
                goto charge_cur_mm;
        memcg = try_get_mem_cgroup_from_page(page);
        if (!memcg)
@@ -2798,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
                ret = 0;
        return ret;
 charge_cur_mm:
-       if (unlikely(!mm))
-               mm = &init_mm;
        ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
        if (ret == -EINTR)
                ret = 0;
        return ret;
 }
 
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
+                                gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+       *memcgp = NULL;
+       if (mem_cgroup_disabled())
+               return 0;
+       /*
+        * A racing thread's fault, or swapoff, may have already
+        * updated the pte, and even removed the page from swap cache: in
+        * those cases unuse_pte()'s pte_same() test will fail; but
+        * there's also a KSM case which does need to charge the page.
+        */
+       if (!PageSwapCache(page)) {
+               int ret;
+
+               ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
+               if (ret == -EINTR)
+                       ret = 0;
+               return ret;
+       }
+       return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
+}
+
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
+{
+       if (mem_cgroup_disabled())
+               return;
+       if (!memcg)
+               return;
+       __mem_cgroup_cancel_charge(memcg, 1);
+}
+
 static void
 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
                                        enum charge_type ctype)
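
The callers of these hooks follow a try -> commit-or-cancel protocol: try_charge acquires a provisional charge, and exactly one of commit() or cancel() resolves it, depending on whether the swap-in raced. Below is a stand-alone model of that pattern on a plain counter; all names are hypothetical and it stands in for the res_counter only loosely.

#include <stdbool.h>
#include <stdio.h>

static long charged;			/* stand-in for res_counter usage */
static const long limit = 4;

static bool try_charge(void)
{
	if (charged >= limit)
		return false;		/* over limit, caller must bail */
	charged++;			/* provisional charge, no owner yet */
	return true;
}

static void commit_charge(void)
{
	/* the page now owns the charge; nothing to undo */
}

static void cancel_charge(void)
{
	charged--;			/* raced, e.g. pte_same() failed: undo */
}

int main(void)
{
	if (try_charge()) {
		bool raced = false;

		if (raced)
			cancel_charge();
		else
			commit_charge();
	}
	printf("charged = %ld\n", charged);	/* -> 1 */
	return 0;
}
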
@@ -2843,13 +2906,27 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
                                          MEM_CGROUP_CHARGE_TYPE_ANON);
 }
 
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
+int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+                               gfp_t gfp_mask)
 {
+       struct mem_cgroup *memcg = NULL;
+       enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+       int ret;
+
        if (mem_cgroup_disabled())
-               return;
-       if (!memcg)
-               return;
-       __mem_cgroup_cancel_charge(memcg, 1);
+               return 0;
+       if (PageCompound(page))
+               return 0;
+
+       if (!PageSwapCache(page))
+               ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
+       else { /* page is swapcache/shmem */
+               ret = __mem_cgroup_try_charge_swapin(mm, page,
+                                                    gfp_mask, &memcg);
+               if (!ret)
+                       __mem_cgroup_commit_charge_swapin(page, memcg, type);
+       }
+       return ret;
 }
 
 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2909,7 +2986,8 @@ direct_uncharge:
  * uncharge if !page_mapped(page)
  */
 static struct mem_cgroup *
-__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
+__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
+                            bool end_migration)
 {
        struct mem_cgroup *memcg = NULL;
        unsigned int nr_pages = 1;
@@ -2919,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        if (mem_cgroup_disabled())
                return NULL;
 
-       if (PageSwapCache(page))
-               return NULL;
+       VM_BUG_ON(PageSwapCache(page));
 
        if (PageTransHuge(page)) {
                nr_pages <<= compound_order(page);
@@ -2953,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                /* fallthrough */
        case MEM_CGROUP_CHARGE_TYPE_DROP:
                /* See mem_cgroup_prepare_migration() */
-               if (page_mapped(page) || PageCgroupMigration(pc))
+               if (page_mapped(page))
+                       goto unlock_out;
+               /*
+                * Pages under migration may not be uncharged.  But
+                * end_migration() /must/ be the one uncharging the
+                * unused post-migration page, and so it has to get here
+                * with the migration bit still set.  See the
+                * res_counter handling below.
+                */
+               if (!end_migration && PageCgroupMigration(pc))
                        goto unlock_out;
                break;
        case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2987,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                mem_cgroup_swap_statistics(memcg, true);
                mem_cgroup_get(memcg);
        }
-       if (!mem_cgroup_is_root(memcg))
+       /*
+        * Migration does not charge the res_counter for the
+        * replacement page, so leave it alone when phasing out the
+        * page that is unused after the migration.
+        */
+       if (!end_migration && !mem_cgroup_is_root(memcg))
                mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
 
        return memcg;
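
To make the end_migration reasoning above concrete: one res_counter charge backs the old/new page pair for the whole migration, so phasing out the loser page must clear its page state without paying the charge back. A toy model under those assumptions (hypothetical names, illustration only):

#include <stdbool.h>
#include <stdio.h>

static long res_usage = 1;		/* one charge for the migration pair */

struct page { bool cgroup_used; };	/* roughly PageCgroupUsed */

static void uncharge_common(struct page *pg, bool end_migration)
{
	pg->cgroup_used = false;	/* always drop the page->memcg link */
	if (!end_migration)
		res_usage--;		/* a normal uncharge pays back */
	/* end_migration: the charge stays with the surviving page */
}

int main(void)
{
	struct page oldpage = { true }, newpage = { true };

	uncharge_common(&oldpage, true);	/* page unused after migration */
	printf("usage=%ld new.used=%d\n", res_usage, newpage.cgroup_used);
	/* -> usage=1 new.used=1: the single charge now backs only newpage */
	return 0;
}
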
@@ -3003,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
        if (page_mapped(page))
                return;
        VM_BUG_ON(page->mapping && !PageAnon(page));
-       __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON);
+       if (PageSwapCache(page))
+               return;
+       __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
 }
 
 void mem_cgroup_uncharge_cache_page(struct page *page)
 {
        VM_BUG_ON(page_mapped(page));
        VM_BUG_ON(page->mapping);
-       __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
+       __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
 }
 
 /*
@@ -3074,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
        if (!swapout) /* this was a swap cache but the swap is unused ! */
                ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
 
-       memcg = __mem_cgroup_uncharge_common(page, ctype);
+       memcg = __mem_cgroup_uncharge_common(page, ctype, false);
 
        /*
         * record memcg information; if swapout && memcg != NULL,
@@ -3164,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
  * Before starting migration, account PAGE_SIZE to the mem_cgroup that the old
  * page belongs to.
  */
-int mem_cgroup_prepare_migration(struct page *page,
-       struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
+void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
+                                 struct mem_cgroup **memcgp)
 {
        struct mem_cgroup *memcg = NULL;
        struct page_cgroup *pc;
        enum charge_type ctype;
-       int ret = 0;
 
        *memcgp = NULL;
 
        VM_BUG_ON(PageTransHuge(page));
        if (mem_cgroup_disabled())
-               return 0;
+               return;
 
        pc = lookup_page_cgroup(page);
        lock_page_cgroup(pc);
@@ -3221,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
         * we return here.
         */
        if (!memcg)
-               return 0;
+               return;
 
        *memcgp = memcg;
-       ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
-       css_put(&memcg->css);/* drop extra refcnt */
-       if (ret) {
-               if (PageAnon(page)) {
-                       lock_page_cgroup(pc);
-                       ClearPageCgroupMigration(pc);
-                       unlock_page_cgroup(pc);
-                       /*
-                        * The old page may be fully unmapped while we kept it.
-                        */
-                       mem_cgroup_uncharge_page(page);
-               }
-               /* we'll need to revisit this error code (we have -EINTR) */
-               return -ENOMEM;
-       }
        /*
         * We charge the new page before it's used/mapped. So, even if unlock_page()
         * is called before end_migration, we can catch all events on this new
@@ -3247,12 +3324,14 @@ int mem_cgroup_prepare_migration(struct page *page,
         */
        if (PageAnon(page))
                ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
-       else if (page_is_file_cache(page))
-               ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
        else
-               ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+               ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+       /*
+        * The page is committed to the memcg, but it's not actually
+        * charged to the res_counter since we plan on replacing the
+        * old one and only one page is going to be left afterwards.
+        */
        __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
-       return ret;
 }
 
 /* remove redundant charge if migration failed */
@@ -3274,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
                used = newpage;
                unused = oldpage;
        }
+       anon = PageAnon(used);
+       __mem_cgroup_uncharge_common(unused,
+                                    anon ? MEM_CGROUP_CHARGE_TYPE_ANON
+                                    : MEM_CGROUP_CHARGE_TYPE_CACHE,
+                                    true);
+       css_put(&memcg->css);
        /*
         * We disallowed uncharge of pages under migration because the
         * mapcount of the page temporarily drops to zero.
@@ -3283,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
        lock_page_cgroup(pc);
        ClearPageCgroupMigration(pc);
        unlock_page_cgroup(pc);
-       anon = PageAnon(used);
-       __mem_cgroup_uncharge_common(unused,
-               anon ? MEM_CGROUP_CHARGE_TYPE_ANON
-                    : MEM_CGROUP_CHARGE_TYPE_CACHE);
 
        /*
         * If a page is a file cache, radix-tree replacement is very atomic
@@ -3338,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
         */
        if (!memcg)
                return;
-
-       if (PageSwapBacked(oldpage))
-               type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-
        /*
         * Even if newpage->mapping was NULL before starting replacement,
         * the newpage may be on the LRU (or a pagevec for the LRU) already. We lock
@@ -3416,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                /*
                 * Rather than hiding all of this in some function, do it
                 * open-coded here so it is obvious what really happens.
-                * We have to guarantee memcg->res.limit < memcg->memsw.limit.
+                * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
                 */
                mutex_lock(&set_limit_mutex);
                memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3477,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                /*
                 * Rather than hiding all of this in some function, do it
                 * open-coded here so it is obvious what really happens.
-                * We have to guarantee memcg->res.limit < memcg->memsw.limit.
+                * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
                 */
                mutex_lock(&set_limit_mutex);
                memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
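
The corrected comments matter because equal limits are legitimate: memsw accounts memory+swap, so res.limit == memsw.limit simply leaves no headroom for swap, while res.limit > memsw.limit would be inconsistent. A minimal sketch of the invariant on plain integers (hypothetical names, not the res_counter API):

#include <stdio.h>

static long res_limit = 512;		/* memcg->res.limit, made-up units */
static long memsw_limit = 768;		/* memcg->memsw.limit */

static int resize_res_limit(long val)
{
	if (val > memsw_limit)
		return -1;		/* would break res.limit <= memsw.limit */
	res_limit = val;
	return 0;
}

int main(void)
{
	printf("%d\n", resize_res_limit(768));	/* 0: equal is allowed */
	printf("%d\n", resize_res_limit(1024));	/* -1: rejected */
	return 0;
}
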