Merge branch 'slab/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/penber...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 12 Jan 2012 02:52:23 +0000 (18:52 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 12 Jan 2012 02:52:23 +0000 (18:52 -0800)
* 'slab/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg/linux:
  slub: disallow changing cpu_partial from userspace for debug caches
  slub: add missed accounting
  slub: Extract get_freelist from __slab_alloc
  slub: Switch per cpu partial page support off for debugging
  slub: fix a possible memleak in __slab_alloc()
  slub: fix slub_max_order Documentation
  slub: add missed accounting
  slab: add taint flag outputting to debug paths.
  slub: add taint flag outputting to debug paths
  slab: introduce slab_max_order kernel parameter
  slab: rename slab_break_gfp_order to slab_max_order

Documentation/kernel-parameters.txt
mm/slab.c
mm/slub.c

diff --combined Documentation/kernel-parameters.txt
index c92b1532f05adadf9f8aaeac671434080b444424,1aefc79031a4164b147dc363b3cddc5f9bd648d7..a8d389d72405030eec327a6974811f822f56997c
+++ b/Documentation/kernel-parameters.txt
@@@ -329,11 -329,6 +329,11 @@@ bytes respectively. Such letter suffixe
                                    is a lot faster
                        off       - do not initialize any AMD IOMMU found in
                                    the system
 +                      force_isolation - Force device isolation for all
 +                                        devices. The IOMMU driver is no
 +                                        longer allowed to lift isolation
 +                                        requirements as needed. This option
 +                                        does not override iommu=pt
  
        amijoy.map=     [HW,JOY] Amiga joystick support
                        Map of devices attached to JOY0DAT and JOY1DAT
        no_debug_objects
                        [KNL] Disable object debugging
  
 +      debug_guardpage_minorder=
 +                      [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
 +                      parameter allows control of the order of pages that will
 +                      be intentionally kept free (and hence protected) by the
 +                      buddy allocator. A bigger value increases the probability
 +                      of catching random memory corruption, but reduces the
 +                      amount of memory for normal system use. The maximum
 +                      possible value is MAX_ORDER/2.  Setting this parameter
 +                      to 1 or 2 should be enough to identify most random
 +                      memory corruption problems caused by bugs in kernel or
 +                      driver code when a CPU writes to (or reads from) a
 +                      random memory location. Note that there exists a class
 +                      of memory corruption problems caused by buggy H/W or
 +                      F/W or by drivers badly programming DMA (basically when
 +                      memory is written at bus level and the CPU MMU is
 +                      bypassed) which are not detectable by
 +                      CONFIG_DEBUG_PAGEALLOC, hence this option will not help
 +                      track down these problems.
 +
        debugpat        [X86] Enable PAT debugging
  
        decnet.addr=    [HW,NET]
                nomerge
                forcesac
                soft
 -              pt      [x86, IA-64]
 +              pt              [x86, IA-64]
 +              group_mf        [x86, IA-64]
 +
  
        io7=            [HW] IO7 for Marvel based alpha systems
                        See comment before marvel_specify_io7 in
        kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
                        Default is 0 (don't ignore, but inject #GP)
  
 -      kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
 -                      Default is 1 (enabled)
 -
        kvm.mmu_audit=  [KVM] This is an R/W parameter which allows auditing
                        of the KVM MMU at runtime.
                        Default is 0 (off)
                        The default is to return 64-bit inode numbers.
  
        nfs.nfs4_disable_idmapping=
 -                      [NFSv4] When set, this option disables the NFSv4
 -                      idmapper on the client, but only if the mount
 -                      is using the 'sec=sys' security flavour. This may
 -                      make migration from legacy NFSv2/v3 systems easier
 -                      provided that the server has the appropriate support.
 -                      The default is to always enable NFSv4 idmapping.
 +                      [NFSv4] When set to the default of '1', this option
 +                      ensures that both the RPC level authentication
 +                      scheme and the NFS level operations agree to use
 +                      numeric uids/gids if the mount is using the
 +                      'sec=sys' security flavour. In effect this disables
 +                      idmapping, which can make migration from
 +                      legacy NFSv2/v3 systems to NFSv4 easier.
 +                      Servers that do not support this mode of operation
 +                      will be autodetected by the client, and it will fall
 +                      back to using the idmapper.
 +                      To turn off this behaviour, set the value to '0'.
  
        nmi_debug=      [KNL,AVR32,SH] Specify one or more actions to take
                        when an NMI is triggered.
                        arch_perfmon: [X86] Force use of architectural
                                perfmon on Intel CPUs instead of the
                                CPU specific event set.
 +                      timer: [X86] Force use of architectural NMI
 +                              timer mode (see also oprofile.timer
 +                              for generic hr timer mode)
 +                              [s390] Force legacy basic mode sampling
 +                                (report cpu_type "timer")
  
        oops=panic      Always panic on oopses. Default is to just kill the
                        process, but there is a small probability of
  
        slram=          [HW,MTD]
  
+       slab_max_order= [MM, SLAB]
+                       Determines the maximum allowed order for slabs.
+                       A high setting may cause OOMs due to memory
+                       fragmentation.  Defaults to 1 for systems with
+                       more than 32MB of RAM, 0 otherwise.
        slub_debug[=options[,slabs]]    [MM, SLUB]
                        Enabling slub_debug allows one to determine the
                        culprit if slab objects become corrupted. Enabling
                        [USB] Start with the old device initialization
                        scheme (default 0 = off).
  
 +      usbcore.usbfs_memory_mb=
 +                      [USB] Memory limit (in MB) for buffers allocated by
 +                      usbfs (default = 16, 0 = max = 2047).
 +
        usbcore.use_both_schemes=
                        [USB] Try the other device initialization scheme
                        if the first one fails (default 1 = enabled).
                        functions are at fixed addresses, they make nice
                        targets for exploits that can control RIP.
  
 -                      emulate     Vsyscalls turn into traps and are emulated
 -                                  reasonably safely.
 +                      emulate     [default] Vsyscalls turn into traps and are
 +                                  emulated reasonably safely.
  
 -                      native      [default] Vsyscalls are native syscall
 -                                  instructions.
 +                      native      Vsyscalls are native syscall instructions.
                                    This is a little bit faster than trapping
                                    and makes a few dynamic recompilers work
                                    better than they would in emulation mode.
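
As a usage illustration of the slab_max_order= parameter documented in the
kernel-parameters hunk above (the value below is only an example, it is not
part of this merge), a CONFIG_SLAB kernel can be held to order-0 slab pages
by booting with:

    slab_max_order=0

Out-of-range values are clamped by the setup handler added to mm/slab.c
further down: negative input falls back to 0 and anything above MAX_ORDER-1
is capped.
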
diff --combined mm/slab.c
index 2acfa0d9094379ae999c1937dd9d4ed475af1837,4ef42baf66f0bb6a6ae73aff3ee2ffed1a276740..f0bd7857ab3bed2adf6649e60dda6ad712ef0b92
+++ b/mm/slab.c
  #include      <asm/tlbflush.h>
  #include      <asm/page.h>
  
 +#include <trace/events/kmem.h>
 +
  /*
   * DEBUG      - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
   *              0 for faster, smaller code (especially in the critical paths).
@@@ -481,11 -479,13 +481,13 @@@ EXPORT_SYMBOL(slab_buffer_size)
  #endif
  
  /*
-  * Do not go above this order unless 0 objects fit into the slab.
+  * Do not go above this order unless 0 objects fit into the slab or
+  * it is overridden on the command line.
   */
- #define       BREAK_GFP_ORDER_HI      1
- #define       BREAK_GFP_ORDER_LO      0
- static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+ #define       SLAB_MAX_ORDER_HI       1
+ #define       SLAB_MAX_ORDER_LO       0
+ static int slab_max_order = SLAB_MAX_ORDER_LO;
+ static bool slab_max_order_set __initdata;
  
  /*
   * Functions for storing/retrieving the cachep and or slab from the page
@@@ -854,6 -854,17 +856,17 @@@ static int __init noaliencache_setup(ch
  }
  __setup("noaliencache", noaliencache_setup);
  
+ static int __init slab_max_order_setup(char *str)
+ {
+       get_option(&str, &slab_max_order);
+       slab_max_order = slab_max_order < 0 ? 0 :
+                               min(slab_max_order, MAX_ORDER - 1);
+       slab_max_order_set = true;
+       return 1;
+ }
+ __setup("slab_max_order=", slab_max_order_setup);
  #ifdef CONFIG_NUMA
  /*
   * Special reaping functions for NUMA systems called from cache_reap().
@@@ -1502,10 -1513,11 +1515,11 @@@ void __init kmem_cache_init(void
  
        /*
         * Fragmentation resistance on low memory - only use bigger
-        * page orders on machines with more than 32MB of memory.
+        * page orders on machines with more than 32MB of memory if
+        * not overridden on the command line.
         */
-       if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
-               slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+       if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+               slab_max_order = SLAB_MAX_ORDER_HI;
  
        /* Bootstrap is tricky, because several objects are allocated
         * from caches that do not exist yet:
@@@ -1932,8 -1944,8 +1946,8 @@@ static void check_poison_obj(struct kme
                        /* Print header */
                        if (lines == 0) {
                                printk(KERN_ERR
-                                       "Slab corruption: %s start=%p, len=%d\n",
-                                       cachep->name, realobj, size);
+                                       "Slab corruption (%s): %s start=%p, len=%d\n",
+                                       print_tainted(), cachep->name, realobj, size);
                                print_objinfo(cachep, objp, 0);
                        }
                        /* Hexdump the affected line */
@@@ -2117,7 -2129,7 +2131,7 @@@ static size_t calculate_slab_order(stru
                 * Large number of objects is good, but very large slabs are
                 * currently bad for the gfp()s.
                 */
-               if (gfporder >= slab_break_gfp_order)
+               if (gfporder >= slab_max_order)
                        break;
  
                /*
@@@ -3042,8 -3054,9 +3056,9 @@@ static void check_slabp(struct kmem_cac
        if (entries != cachep->num - slabp->inuse) {
  bad:
                printk(KERN_ERR "slab: Internal list corruption detected in "
-                               "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-                       cachep->name, cachep->num, slabp, slabp->inuse);
+                       "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+                       cachep->name, cachep->num, slabp, slabp->inuse,
+                       print_tainted());
                print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
                        sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
                        1);
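
The clamping rule in slab_max_order_setup() above is simple enough to check
in isolation. The user-space sketch below mirrors only that rule; MAX_ORDER
is hard-coded to a common default here and clamp_slab_max_order() is a
made-up name for the illustration, neither comes from this merge.

/* Sketch of the slab_max_order clamping rule from slab_max_order_setup():
 * negative values fall back to 0, values above MAX_ORDER-1 are capped.
 * MAX_ORDER=11 is an assumed typical default, not taken from this merge. */
#include <stdio.h>

#define MAX_ORDER 11

static int clamp_slab_max_order(int requested)
{
        if (requested < 0)
                return 0;
        return requested < MAX_ORDER - 1 ? requested : MAX_ORDER - 1;
}

int main(void)
{
        const int samples[] = { -5, 0, 3, 64 };
        int i;

        for (i = 0; i < 4; i++)
                printf("slab_max_order=%d -> %d\n",
                       samples[i], clamp_slab_max_order(samples[i]));
        return 0;
}
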
diff --combined mm/slub.c
index d99acbf14e0179c766332b5f54f4f946b37a5d0f,19436f53876079b6de95e13270d1ef5b22a169c6..5d37b5e44140f2cd0884fc1ae7798ec02b0afb12
+++ b/mm/slub.c
@@@ -368,7 -368,7 +368,7 @@@ static inline bool __cmpxchg_double_sla
        VM_BUG_ON(!irqs_disabled());
  #ifdef CONFIG_CMPXCHG_DOUBLE
        if (s->flags & __CMPXCHG_DOUBLE) {
 -              if (cmpxchg_double(&page->freelist,
 +              if (cmpxchg_double(&page->freelist, &page->counters,
                        freelist_old, counters_old,
                        freelist_new, counters_new))
                return 1;
@@@ -402,7 -402,7 +402,7 @@@ static inline bool cmpxchg_double_slab(
  {
  #ifdef CONFIG_CMPXCHG_DOUBLE
        if (s->flags & __CMPXCHG_DOUBLE) {
 -              if (cmpxchg_double(&page->freelist,
 +              if (cmpxchg_double(&page->freelist, &page->counters,
                        freelist_old, counters_old,
                        freelist_new, counters_new))
                return 1;
@@@ -570,7 -570,7 +570,7 @@@ static void slab_bug(struct kmem_cache 
        va_end(args);
        printk(KERN_ERR "========================================"
                        "=====================================\n");
-       printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+       printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
        printk(KERN_ERR "----------------------------------------"
                        "-------------------------------------\n\n");
  }
@@@ -1901,11 -1901,14 +1901,14 @@@ static void unfreeze_partials(struct km
                        }
  
                        if (l != m) {
-                               if (l == M_PARTIAL)
+                               if (l == M_PARTIAL) {
                                        remove_partial(n, page);
-                               else
+                                       stat(s, FREE_REMOVE_PARTIAL);
+                               } else {
                                        add_partial(n, page,
                                                DEACTIVATE_TO_TAIL);
+                                       stat(s, FREE_ADD_PARTIAL);
+                               }
  
                                l = m;
                        }
@@@ -1978,7 -1981,7 +1981,7 @@@ int put_cpu_partial(struct kmem_cache *
                page->pobjects = pobjects;
                page->next = oldpage;
  
 -      } while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
 +      } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
        stat(s, CPU_PARTIAL_FREE);
        return pobjects;
  }
@@@ -2123,6 -2126,37 +2126,37 @@@ static inline void *new_slab_objects(st
        return object;
  }
  
+ /*
+  * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
+  * or deactivate the page.
+  *
+  * The page is still frozen if the return value is not NULL.
+  *
+  * If this function returns NULL then the page has been unfrozen.
+  */
+ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+ {
+       struct page new;
+       unsigned long counters;
+       void *freelist;
+       do {
+               freelist = page->freelist;
+               counters = page->counters;
+               new.counters = counters;
+               VM_BUG_ON(!new.frozen);
+               new.inuse = page->objects;
+               new.frozen = freelist != NULL;
+       } while (!cmpxchg_double_slab(s, page,
+               freelist, counters,
+               NULL, new.counters,
+               "get_freelist"));
+       return freelist;
+ }
  /*
   * Slow path. The lockless freelist is empty or we need to perform
   * debugging duties.
@@@ -2144,8 -2178,6 +2178,6 @@@ static void *__slab_alloc(struct kmem_c
  {
        void **object;
        unsigned long flags;
-       struct page new;
-       unsigned long counters;
  
        local_irq_save(flags);
  #ifdef CONFIG_PREEMPT
@@@ -2166,31 -2198,14 +2198,14 @@@ redo
                goto new_slab;
        }
  
-       stat(s, ALLOC_SLOWPATH);
-       do {
-               object = c->page->freelist;
-               counters = c->page->counters;
-               new.counters = counters;
-               VM_BUG_ON(!new.frozen);
-               /*
-                * If there is no object left then we use this loop to
-                * deactivate the slab which is simple since no objects
-                * are left in the slab and therefore we do not need to
-                * put the page back onto the partial list.
-                *
-                * If there are objects left then we retrieve them
-                * and use them to refill the per cpu queue.
-                */
+       /* must check again c->freelist in case of cpu migration or IRQ */
+       object = c->freelist;
+       if (object)
+               goto load_freelist;
  
-               new.inuse = c->page->objects;
-               new.frozen = object != NULL;
+       stat(s, ALLOC_SLOWPATH);
  
-       } while (!__cmpxchg_double_slab(s, c->page,
-                       object, counters,
-                       NULL, new.counters,
-                       "__slab_alloc"));
+       object = get_freelist(s, c->page);
  
        if (!object) {
                c->page = NULL;
@@@ -2304,7 -2319,7 +2319,7 @@@ redo
                 * Since this is without lock semantics the protection is only against
                 * code executing on this cpu *not* from access by other cpus.
                 */
 -              if (unlikely(!irqsafe_cpu_cmpxchg_double(
 +              if (unlikely(!this_cpu_cmpxchg_double(
                                s->cpu_slab->freelist, s->cpu_slab->tid,
                                object, tid,
                                get_freepointer_safe(s, object), next_tid(tid)))) {
@@@ -2534,7 -2549,7 +2549,7 @@@ redo
        if (likely(page == c->page)) {
                set_freepointer(s, object, c->freelist);
  
 -              if (unlikely(!irqsafe_cpu_cmpxchg_double(
 +              if (unlikely(!this_cpu_cmpxchg_double(
                                s->cpu_slab->freelist, s->cpu_slab->tid,
                                c->freelist, tid,
                                object, next_tid(tid)))) {
@@@ -3028,7 -3043,9 +3043,9 @@@ static int kmem_cache_open(struct kmem_
         *    per node list when we run out of per cpu objects. We only fetch 50%
         *    to keep some capacity around for frees.
         */
-       if (s->size >= PAGE_SIZE)
+       if (kmem_cache_debug(s))
+               s->cpu_partial = 0;
+       else if (s->size >= PAGE_SIZE)
                s->cpu_partial = 2;
        else if (s->size >= 1024)
                s->cpu_partial = 6;
@@@ -3654,9 -3671,6 +3671,9 @@@ void __init kmem_cache_init(void
        struct kmem_cache *temp_kmem_cache_node;
        unsigned long kmalloc_size;
  
 +      if (debug_guardpage_minorder())
 +              slub_max_order = 0;
 +
        kmem_size = offsetof(struct kmem_cache, node) +
                                nr_node_ids * sizeof(struct kmem_cache_node *);
  
@@@ -4637,6 -4651,8 +4654,8 @@@ static ssize_t cpu_partial_store(struc
        err = strict_strtoul(buf, 10, &objects);
        if (err)
                return err;
+       if (objects && kmem_cache_debug(s))
+               return -EINVAL;
  
        s->cpu_partial = objects;
        flush_all(s);
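
The check added to cpu_partial_store() above is the sysfs-facing half of
"slub: disallow changing cpu_partial from userspace for debug caches":
writing a non-zero value to /sys/kernel/slab/<cache>/cpu_partial now fails
with -EINVAL for debug caches. A minimal user-space sketch of that rule,
using simplified stand-in types rather than the kernel's structures:

/* Stand-alone sketch of the new cpu_partial_store() rule: a debug cache
 * only accepts 0.  struct fake_cache and set_cpu_partial() are simplified
 * stand-ins invented for this example. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_cache {
        const char *name;
        bool debug;                     /* stands in for kmem_cache_debug(s) */
        unsigned long cpu_partial;
};

static int set_cpu_partial(struct fake_cache *s, unsigned long objects)
{
        if (objects && s->debug)
                return -EINVAL;         /* partial pages stay off for debug caches */
        s->cpu_partial = objects;
        return 0;
}

int main(void)
{
        struct fake_cache dbg   = { "kmalloc-64 (debug)", true,  0 };
        struct fake_cache plain = { "kmalloc-64",         false, 0 };

        printf("%s <- 2: %d\n", dbg.name,   set_cpu_partial(&dbg, 2));   /* -EINVAL */
        printf("%s <- 0: %d\n", dbg.name,   set_cpu_partial(&dbg, 0));   /* 0 */
        printf("%s <- 2: %d\n", plain.name, set_cpu_partial(&plain, 2)); /* 0 */
        return 0;
}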