Merge branch 'slab/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/penber...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 12 Jan 2012 02:52:23 +0000 (18:52 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 12 Jan 2012 02:52:23 +0000 (18:52 -0800)
* 'slab/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg/linux:
  slub: disallow changing cpu_partial from userspace for debug caches
  slub: add missed accounting
  slub: Extract get_freelist from __slab_alloc
  slub: Switch per cpu partial page support off for debugging
  slub: fix a possible memleak in __slab_alloc()
  slub: fix slub_max_order Documentation
  slub: add missed accounting
  slab: add taint flag outputting to debug paths.
  slub: add taint flag outputting to debug paths
  slab: introduce slab_max_order kernel parameter
  slab: rename slab_break_gfp_order to slab_max_order

Documentation/kernel-parameters.txt
mm/slab.c
mm/slub.c

diff --combined Documentation/kernel-parameters.txt
index c92b1532f05adadf9f8aaeac671434080b444424,1aefc79031a4164b147dc363b3cddc5f9bd648d7..a8d389d72405030eec327a6974811f822f56997c
+++ b/Documentation/kernel-parameters.txt
@@@ -329,11 -329,6 +329,11 @@@ bytes respectively. Such letter suffixe
                                    is a lot faster
                        off       - do not initialize any AMD IOMMU found in
                                    the system
 +                      force_isolation - Force device isolation for all
 +                                        devices. The IOMMU driver is no
 +                                        longer allowed to lift isolation
 +                                        requirements as needed. This option
 +                                        does not override iommu=pt
  
        amijoy.map=     [HW,JOY] Amiga joystick support
                        Map of devices attached to JOY0DAT and JOY1DAT
        no_debug_objects
                        [KNL] Disable object debugging
  
 +      debug_guardpage_minorder=
 +                      [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
 +                      parameter allows control of the order of pages that will
 +                      be intentionally kept free (and hence protected) by the
 +                      buddy allocator. A bigger value increases the probability
 +                      of catching random memory corruption, but reduces the
 +                      amount of memory for normal system use. The maximum
 +                      possible value is MAX_ORDER/2.  Setting this parameter
 +                      to 1 or 2 should be enough to identify most random
 +                      memory corruption problems caused by bugs in kernel or
 +                      driver code when a CPU writes to (or reads from) a
 +                      random memory location. Note that there exists a class
 +                      of memory corruption problems caused by buggy H/W or
 +                      F/W or by drivers badly programming DMA (basically when
 +                      memory is written at bus level and the CPU MMU is
 +                      bypassed) which are not detectable by
 +                      CONFIG_DEBUG_PAGEALLOC, hence this option will not help
 +                      track down these problems.
 +
        debugpat        [X86] Enable PAT debugging
  
        decnet.addr=    [HW,NET]
                nomerge
                forcesac
                soft
 -              pt      [x86, IA-64]
 +              pt              [x86, IA-64]
 +              group_mf        [x86, IA-64]
 +
  
        io7=            [HW] IO7 for Marvel based alpha systems
                        See comment before marvel_specify_io7 in
        kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
                        Default is 0 (don't ignore, but inject #GP)
  
 -      kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
 -                      Default is 1 (enabled)
 -
        kvm.mmu_audit=  [KVM] This is an R/W parameter which allows auditing
                        of the KVM MMU at runtime.
                        Default is 0 (off)
                        The default is to return 64-bit inode numbers.
  
        nfs.nfs4_disable_idmapping=
 -                      [NFSv4] When set, this option disables the NFSv4
 -                      idmapper on the client, but only if the mount
 -                      is using the 'sec=sys' security flavour. This may
 -                      make migration from legacy NFSv2/v3 systems easier
 -                      provided that the server has the appropriate support.
 -                      The default is to always enable NFSv4 idmapping.
 +                      [NFSv4] When set to the default of '1', this option
 +                      ensures that both the RPC level authentication
 +                      scheme and the NFS level operations agree to use
 +                      numeric uids/gids if the mount is using the
 +                      'sec=sys' security flavour. In effect this disables
 +                      idmapping, which can make migration from
 +                      legacy NFSv2/v3 systems to NFSv4 easier.
 +                      Servers that do not support this mode of operation
 +                      will be autodetected by the client, and it will fall
 +                      back to using the idmapper.
 +                      To turn off this behaviour, set the value to '0'.
  
        nmi_debug=      [KNL,AVR32,SH] Specify one or more actions to take
                        when an NMI is triggered.
                        arch_perfmon: [X86] Force use of architectural
                                perfmon on Intel CPUs instead of the
                                CPU specific event set.
 +                      timer: [X86] Force use of architectural NMI
 +                              timer mode (see also oprofile.timer
 +                              for generic hr timer mode)
 +                              [s390] Force legacy basic mode sampling
 +                                (report cpu_type "timer")
  
        oops=panic      Always panic on oopses. Default is to just kill the
                        process, but there is a small probability of
  
        slram=          [HW,MTD]
  
+       slab_max_order= [MM, SLAB]
+                       Determines the maximum allowed order for slabs.
+                       A high setting may cause OOMs due to memory
+                       fragmentation.  Defaults to 1 for systems with
+                       more than 32MB of RAM, 0 otherwise.
        slub_debug[=options[,slabs]]    [MM, SLUB]
                        Enabling slub_debug allows one to determine the
                        culprit if slab objects become corrupted. Enabling
                        [USB] Start with the old device initialization
                        scheme (default 0 = off).
  
 +      usbcore.usbfs_memory_mb=
 +                      [USB] Memory limit (in MB) for buffers allocated by
 +                      usbfs (default = 16, 0 = max = 2047).
 +
        usbcore.use_both_schemes=
                        [USB] Try the other device initialization scheme
                        if the first one fails (default 1 = enabled).
                        functions are at fixed addresses, they make nice
                        targets for exploits that can control RIP.
  
 -                      emulate     Vsyscalls turn into traps and are emulated
 -                                  reasonably safely.
 +                      emulate     [default] Vsyscalls turn into traps and are
 +                                  emulated reasonably safely.
  
 -                      native      [default] Vsyscalls are native syscall
 -                                  instructions.
 +                      native      Vsyscalls are native syscall instructions.
                                    This is a little bit faster than trapping
                                    and makes a few dynamic recompilers work
                                    better than they would in emulation mode.
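
As a usage illustration of the slab_max_order= parameter documented in the
kernel-parameters hunk above (the value below is only an example, it is not
part of this merge), a CONFIG_SLAB kernel can be held to order-0 slab pages
by booting with:

    slab_max_order=0

Out-of-range values are clamped by the setup handler added to mm/slab.c
further down: negative input falls back to 0 and anything above MAX_ORDER-1
is capped.
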
diff --combined mm/slab.c
index 2acfa0d9094379ae999c1937dd9d4ed475af1837,4ef42baf66f0bb6a6ae73aff3ee2ffed1a276740..f0bd7857ab3bed2adf6649e60dda6ad712ef0b92
+++ b/mm/slab.c
  #include      <asm/tlbflush.h>
  #include      <asm/page.h>
  
 +#include <trace/events/kmem.h>
 +
  /*
   * DEBUG      - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
   *              0 for faster, smaller code (especially in the critical paths).
@@@ -481,11 -479,13 +481,13 @@@ EXPORT_SYMBOL(slab_buffer_size)
  #endif
  
  /*
-  * Do not go above this order unless 0 objects fit into the slab.
+  * Do not go above this order unless 0 objects fit into the slab or
+  * it is overridden on the command line.
   */
- #define       BREAK_GFP_ORDER_HI      1
- #define       BREAK_GFP_ORDER_LO      0
- static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+ #define       SLAB_MAX_ORDER_HI       1
+ #define       SLAB_MAX_ORDER_LO       0
+ static int slab_max_order = SLAB_MAX_ORDER_LO;
+ static bool slab_max_order_set __initdata;
  
  /*
   * Functions for storing/retrieving the cachep and or slab from the page
@@@ -854,6 -854,17 +856,17 @@@ static int __init noaliencache_setup(ch
  }
  __setup("noaliencache", noaliencache_setup);
  
+ static int __init slab_max_order_setup(char *str)
+ {
+       get_option(&str, &slab_max_order);
+       slab_max_order = slab_max_order < 0 ? 0 :
+                               min(slab_max_order, MAX_ORDER - 1);
+       slab_max_order_set = true;
+       return 1;
+ }
+ __setup("slab_max_order=", slab_max_order_setup);
  #ifdef CONFIG_NUMA
  /*
   * Special reaping functions for NUMA systems called from cache_reap().
@@@ -1502,10 -1513,11 +1515,11 @@@ void __init kmem_cache_init(void
  
        /*
         * Fragmentation resistance on low memory - only use bigger
-        * page orders on machines with more than 32MB of memory.
+        * page orders on machines with more than 32MB of memory if
+        * not overridden on the command line.
         */
-       if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
-               slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+       if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+               slab_max_order = SLAB_MAX_ORDER_HI;
  
        /* Bootstrap is tricky, because several objects are allocated
         * from caches that do not exist yet:
@@@ -1932,8 -1944,8 +1946,8 @@@ static void check_poison_obj(struct kme
                        /* Print header */
                        if (lines == 0) {
                                printk(KERN_ERR
-                                       "Slab corruption: %s start=%p, len=%d\n",
-                                       cachep->name, realobj, size);
+                                       "Slab corruption (%s): %s start=%p, len=%d\n",
+                                       print_tainted(), cachep->name, realobj, size);
                                print_objinfo(cachep, objp, 0);
                        }
                        /* Hexdump the affected line */
@@@ -2117,7 -2129,7 +2131,7 @@@ static size_t calculate_slab_order(stru
                 * Large number of objects is good, but very large slabs are
                 * currently bad for the gfp()s.
                 */
-               if (gfporder >= slab_break_gfp_order)
+               if (gfporder >= slab_max_order)
                        break;
  
                /*
@@@ -3042,8 -3054,9 +3056,9 @@@ static void check_slabp(struct kmem_cac
        if (entries != cachep->num - slabp->inuse) {
  bad:
                printk(KERN_ERR "slab: Internal list corruption detected in "
-                               "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-                       cachep->name, cachep->num, slabp, slabp->inuse);
+                       "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+                       cachep->name, cachep->num, slabp, slabp->inuse,
+                       print_tainted());
                print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
                        sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
                        1);
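
The clamping rule in slab_max_order_setup() above is simple enough to check
in isolation. The user-space sketch below mirrors only that rule; MAX_ORDER
is hard-coded to a common default here and clamp_slab_max_order() is a
made-up name for the illustration, neither comes from this merge.

/* Sketch of the slab_max_order clamping rule from slab_max_order_setup():
 * negative values fall back to 0, values above MAX_ORDER-1 are capped.
 * MAX_ORDER=11 is an assumed typical default, not taken from this merge. */
#include <stdio.h>

#define MAX_ORDER 11

static int clamp_slab_max_order(int requested)
{
        if (requested < 0)
                return 0;
        return requested < MAX_ORDER - 1 ? requested : MAX_ORDER - 1;
}

int main(void)
{
        const int samples[] = { -5, 0, 3, 64 };
        int i;

        for (i = 0; i < 4; i++)
                printf("slab_max_order=%d -> %d\n",
                       samples[i], clamp_slab_max_order(samples[i]));
        return 0;
}
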
diff --combined mm/slub.c
index d99acbf14e0179c766332b5f54f4f946b37a5d0f,19436f53876079b6de95e13270d1ef5b22a169c6..5d37b5e44140f2cd0884fc1ae7798ec02b0afb12
+++ b/mm/slub.c
@@@ -368,7 -368,7 +368,7 @@@ static inline bool __cmpxchg_double_sla
        VM_BUG_ON(!irqs_disabled());
  #ifdef CONFIG_CMPXCHG_DOUBLE
        if (s->flags & __CMPXCHG_DOUBLE) {
 -              if (cmpxchg_double(&page->freelist,
 +              if (cmpxchg_double(&page->freelist, &page->counters,
                        freelist_old, counters_old,
                        freelist_new, counters_new))
                return 1;
@@@ -402,7 -402,7 +402,7 @@@ static inline bool cmpxchg_double_slab(
  {
  #ifdef CONFIG_CMPXCHG_DOUBLE
        if (s->flags & __CMPXCHG_DOUBLE) {
 -              if (cmpxchg_double(&page->freelist,
 +              if (cmpxchg_double(&page->freelist, &page->counters,
                        freelist_old, counters_old,
                        freelist_new, counters_new))
                return 1;
@@@ -570,7 -570,7 +570,7 @@@ static void slab_bug(struct kmem_cache 
        va_end(args);
        printk(KERN_ERR "========================================"
                        "=====================================\n");
-       printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+       printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
        printk(KERN_ERR "----------------------------------------"
                        "-------------------------------------\n\n");
  }
@@@ -1901,11 -1901,14 +1901,14 @@@ static void unfreeze_partials(struct km
                        }
  
                        if (l != m) {
-                               if (l == M_PARTIAL)
+                               if (l == M_PARTIAL) {
                                        remove_partial(n, page);
-                               else
+                                       stat(s, FREE_REMOVE_PARTIAL);
+                               } else {
                                        add_partial(n, page,
                                                DEACTIVATE_TO_TAIL);
+                                       stat(s, FREE_ADD_PARTIAL);
+                               }
  
                                l = m;
                        }
@@@ -1978,7 -1981,7 +1981,7 @@@ int put_cpu_partial(struct kmem_cache *
                page->pobjects = pobjects;
                page->next = oldpage;
  
 -      } while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
 +      } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
        stat(s, CPU_PARTIAL_FREE);
        return pobjects;
  }
@@@ -2123,6 -2126,37 +2126,37 @@@ static inline void *new_slab_objects(st
        return object;
  }
  
+ /*
+  * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
+  * or deactivate the page.
+  *
+  * The page is still frozen if the return value is not NULL.
+  *
+  * If this function returns NULL then the page has been unfrozen.
+  */
+ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+ {
+       struct page new;
+       unsigned long counters;
+       void *freelist;
+       do {
+               freelist = page->freelist;
+               counters = page->counters;
+               new.counters = counters;
+               VM_BUG_ON(!new.frozen);
+               new.inuse = page->objects;
+               new.frozen = freelist != NULL;
+       } while (!cmpxchg_double_slab(s, page,
+               freelist, counters,
+               NULL, new.counters,
+               "get_freelist"));
+       return freelist;
+ }
  /*
   * Slow path. The lockless freelist is empty or we need to perform
   * debugging duties.
@@@ -2144,8 -2178,6 +2178,6 @@@ static void *__slab_alloc(struct kmem_c
  {
        void **object;
        unsigned long flags;
-       struct page new;
-       unsigned long counters;
  
        local_irq_save(flags);
  #ifdef CONFIG_PREEMPT
@@@ -2166,31 -2198,14 +2198,14 @@@ redo
                goto new_slab;
        }
  
-       stat(s, ALLOC_SLOWPATH);
-       do {
-               object = c->page->freelist;
-               counters = c->page->counters;
-               new.counters = counters;
-               VM_BUG_ON(!new.frozen);
-               /*
-                * If there is no object left then we use this loop to
-                * deactivate the slab which is simple since no objects
-                * are left in the slab and therefore we do not need to
-                * put the page back onto the partial list.
-                *
-                * If there are objects left then we retrieve them
-                * and use them to refill the per cpu queue.
-                */
+       /* must check again c->freelist in case of cpu migration or IRQ */
+       object = c->freelist;
+       if (object)
+               goto load_freelist;
  
-               new.inuse = c->page->objects;
-               new.frozen = object != NULL;
+       stat(s, ALLOC_SLOWPATH);
  
-       } while (!__cmpxchg_double_slab(s, c->page,
-                       object, counters,
-                       NULL, new.counters,
-                       "__slab_alloc"));
+       object = get_freelist(s, c->page);
  
        if (!object) {
                c->page = NULL;
@@@ -2304,7 -2319,7 +2319,7 @@@ redo
                 * Since this is without lock semantics the protection is only against
                 * code executing on this cpu *not* from access by other cpus.
                 */
 -              if (unlikely(!irqsafe_cpu_cmpxchg_double(
 +              if (unlikely(!this_cpu_cmpxchg_double(
                                s->cpu_slab->freelist, s->cpu_slab->tid,
                                object, tid,
                                get_freepointer_safe(s, object), next_tid(tid)))) {
@@@ -2534,7 -2549,7 +2549,7 @@@ redo
        if (likely(page == c->page)) {
                set_freepointer(s, object, c->freelist);
  
 -              if (unlikely(!irqsafe_cpu_cmpxchg_double(
 +              if (unlikely(!this_cpu_cmpxchg_double(
                                s->cpu_slab->freelist, s->cpu_slab->tid,
                                c->freelist, tid,
                                object, next_tid(tid)))) {
@@@ -3028,7 -3043,9 +3043,9 @@@ static int kmem_cache_open(struct kmem_
         *    per node list when we run out of per cpu objects. We only fetch 50%
         *    to keep some capacity around for frees.
         */
-       if (s->size >= PAGE_SIZE)
+       if (kmem_cache_debug(s))
+               s->cpu_partial = 0;
+       else if (s->size >= PAGE_SIZE)
                s->cpu_partial = 2;
        else if (s->size >= 1024)
                s->cpu_partial = 6;
@@@ -3654,9 -3671,6 +3671,9 @@@ void __init kmem_cache_init(void
        struct kmem_cache *temp_kmem_cache_node;
        unsigned long kmalloc_size;
  
 +      if (debug_guardpage_minorder())
 +              slub_max_order = 0;
 +
        kmem_size = offsetof(struct kmem_cache, node) +
                                nr_node_ids * sizeof(struct kmem_cache_node *);
  
@@@ -4637,6 -4651,8 +4654,8 @@@ static ssize_t cpu_partial_store(struc
        err = strict_strtoul(buf, 10, &objects);
        if (err)
                return err;
+       if (objects && kmem_cache_debug(s))
+               return -EINVAL;
  
        s->cpu_partial = objects;
        flush_all(s);
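
The check added to cpu_partial_store() above is the sysfs-facing half of
"slub: disallow changing cpu_partial from userspace for debug caches":
writing a non-zero value to /sys/kernel/slab/<cache>/cpu_partial now fails
with -EINVAL for debug caches. A minimal user-space sketch of that rule,
using simplified stand-in types rather than the kernel's structures:

/* Stand-alone sketch of the new cpu_partial_store() rule: a debug cache
 * only accepts 0.  struct fake_cache and set_cpu_partial() are simplified
 * stand-ins invented for this example. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_cache {
        const char *name;
        bool debug;                     /* stands in for kmem_cache_debug(s) */
        unsigned long cpu_partial;
};

static int set_cpu_partial(struct fake_cache *s, unsigned long objects)
{
        if (objects && s->debug)
                return -EINVAL;         /* partial pages stay off for debug caches */
        s->cpu_partial = objects;
        return 0;
}

int main(void)
{
        struct fake_cache dbg   = { "kmalloc-64 (debug)", true,  0 };
        struct fake_cache plain = { "kmalloc-64",         false, 0 };

        printf("%s <- 2: %d\n", dbg.name,   set_cpu_partial(&dbg, 2));   /* -EINVAL */
        printf("%s <- 0: %d\n", dbg.name,   set_cpu_partial(&dbg, 0));   /* 0 */
        printf("%s <- 2: %d\n", plain.name, set_cpu_partial(&plain, 2)); /* 0 */
        return 0;
}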