Merge branch 'akpm' (patches from Andrew)
author    Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 25 Jun 2015 03:47:21 +0000 (20:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 25 Jun 2015 03:47:21 +0000 (20:47 -0700)
Merge first patchbomb from Andrew Morton:

 - a few misc things

 - ocfs2 updates

 - kernel/watchdog.c feature work (took ages to get right)

 - most of MM.  A few tricky bits are held up and probably won't make 4.2.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (91 commits)
  mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc()
  mm, thp: respect MPOL_PREFERRED policy with non-local node
  tmpfs: truncate prealloc blocks past i_size
  mm/memory hotplug: print the last vmemmap region at the end of hot add memory
  mm/mmap.c: optimization of do_mmap_pgoff function
  mm: kmemleak: optimise kmemleak_lock acquiring during kmemleak_scan
  mm: kmemleak: avoid deadlock on the kmemleak object insertion error path
  mm: kmemleak: do not acquire scan_mutex in kmemleak_do_cleanup()
  mm: kmemleak: fix delete_object_*() race when called on the same memory block
  mm: kmemleak: allow safe memory scanning during kmemleak disabling
  memcg: convert mem_cgroup->under_oom from atomic_t to int
  memcg: remove unused mem_cgroup->oom_wakeups
  frontswap: allow multiple backends
  x86, mirror: x86 enabling - find mirrored memory ranges
  mm/memblock: allocate boot time data structures from mirrored memory
  mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute
  mm: do not ignore mapping_gfp_mask in page cache allocation paths
  mm/cma.c: fix typos in comments
  mm/oom_kill.c: print points as unsigned int
  mm/hugetlb: handle races in alloc_huge_page and hugetlb_reserve_pages
  ...

151 files changed:
Documentation/lockup-watchdogs.txt
Documentation/sysctl/kernel.txt
Documentation/vm/unevictable-lru.txt
arch/alpha/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/arc/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/arm/include/asm/hugetlb.h
arch/arm/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/arm/mm/hugetlbpage.c
arch/arm64/include/asm/hugetlb.h
arch/arm64/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/arm64/mm/hugetlbpage.c
arch/avr32/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/blackfin/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/c6x/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/cris/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/frv/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/hexagon/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/ia64/include/asm/hugetlb.h
arch/ia64/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/ia64/mm/hugetlbpage.c
arch/m32r/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/m68k/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/metag/include/asm/dma-mapping.h
arch/metag/include/asm/hugetlb.h
arch/metag/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/metag/mm/hugetlbpage.c
arch/microblaze/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/mips/include/asm/hugetlb.h
arch/mips/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/mips/include/asm/pgtable.h
arch/mips/mm/hugetlbpage.c
arch/mn10300/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/nios2/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/openrisc/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/parisc/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/parisc/kernel/pci-dma.c
arch/powerpc/include/asm/hugetlb.h
arch/powerpc/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/powerpc/include/asm/mmu_context.h
arch/powerpc/include/asm/pgtable-ppc64.h
arch/powerpc/kernel/vio.c
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/pgtable_64.c
arch/s390/include/asm/hugetlb.h
arch/s390/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/s390/include/asm/pgtable.h
arch/s390/kernel/crash_dump.c
arch/s390/mm/hugetlbpage.c
arch/score/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/sh/include/asm/hugetlb.h
arch/sh/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/sh/mm/hugetlbpage.c
arch/sparc/include/asm/hugetlb.h
arch/sparc/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/sparc/include/asm/pgtable_64.h
arch/sparc/kernel/ldc.c
arch/sparc/mm/hugetlbpage.c
arch/sparc/mm/init_64.c
arch/tile/include/asm/hugetlb.h
arch/tile/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/tile/include/asm/pgtable.h
arch/tile/mm/hugetlbpage.c
arch/um/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/unicore32/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/x86/include/asm/hugetlb.h
arch/x86/include/asm/mm-arch-hooks.h [new file with mode: 0644]
arch/x86/include/asm/pgtable.h
arch/x86/kernel/check.c
arch/x86/kernel/e820.c
arch/x86/kernel/setup.c
arch/x86/mm/init_32.c
arch/x86/platform/efi/efi.c
arch/xtensa/include/asm/dma-mapping.h
arch/xtensa/include/asm/mm-arch-hooks.h [new file with mode: 0644]
drivers/staging/android/lowmemorykiller.c
drivers/tty/sysrq.c
drivers/xen/tmem.c
fs/configfs/item.c
fs/hugetlbfs/inode.c
fs/ntfs/file.c
fs/ntfs/malloc.h
fs/ocfs2/alloc.c
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/cluster/masklog.c
fs/ocfs2/cluster/masklog.h
fs/ocfs2/cluster/tcp.c
fs/ocfs2/dir.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/file.c
fs/ocfs2/journal.c
fs/ocfs2/namei.c
fs/ocfs2/namei.h
fs/ocfs2/ocfs2.h
fs/ocfs2/refcounttree.c
fs/ocfs2/xattr.c
fs/proc/array.c
fs/splice.c
include/asm-generic/pgtable.h
include/linux/bootmem.h
include/linux/configfs.h
include/linux/efi.h
include/linux/frontswap.h
include/linux/fsnotify_backend.h
include/linux/kmemleak.h
include/linux/memblock.h
include/linux/mm-arch-hooks.h [new file with mode: 0644]
include/linux/mm.h
include/linux/mmu_notifier.h
include/linux/nmi.h
include/linux/oom.h
include/linux/slab.h
include/linux/smpboot.h
include/ras/ras_event.h
kernel/exit.c
kernel/smpboot.c
kernel/sysctl.c
kernel/watchdog.c
mm/Kconfig
mm/cma.c
mm/filemap.c
mm/frontswap.c
mm/huge_memory.c
mm/hugetlb.c
mm/hwpoison-inject.c
mm/kmemleak.c
mm/memblock.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/memtest.c
mm/migrate.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/nobootmem.c
mm/nommu.c
mm/oom_kill.c
mm/page_alloc.c
mm/percpu.c
mm/pgtable-generic.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/slab.h
mm/slab_common.c
mm/slub.c
mm/swap.c
mm/vmscan.c

index ab0baa692c13be183e0ead6e67ecf9e70afc0dd1..22dd6af2e4bd42152edbe872b224b85a769e7184 100644 (file)
@@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows
 administrators to configure the period of the hrtimer and the perf
 event. The right value for a particular environment is a trade-off
 between fast response to lockups and detection overhead.
+
+By default, the watchdog runs on all online cores.  However, on a
+kernel configured with NO_HZ_FULL, by default the watchdog runs only
+on the housekeeping cores, not the cores specified in the "nohz_full"
+boot argument.  If we allowed the watchdog to run by default on
+the "nohz_full" cores, we would have to run timer ticks to activate
+the scheduler, which would prevent the "nohz_full" functionality
+from protecting the user code on those cores from the kernel.
+Of course, disabling it by default on the nohz_full cores means that
+when those cores do enter the kernel, by default we will not be
+able to detect if they lock up.  However, allowing the watchdog
+to continue to run on the housekeeping (non-tickless) cores means
+that we will continue to detect lockups properly on those cores.
+
+In either case, the set of cores excluded from running the watchdog
+may be adjusted via the kernel.watchdog_cpumask sysctl.  For
+nohz_full cores, this may be useful for debugging a case where the
+kernel seems to be hanging on the nohz_full cores.
index c831001c45f1162334b7a30544853a8baa47c6a5..e5d528e0c46e88fa8ca7050f40f70213aebc7204 100644 (file)
@@ -923,6 +923,27 @@ and nmi_watchdog.
 
 ==============================================================
 
+watchdog_cpumask:
+
+This value can be used to control on which cpus the watchdog may run.
+The default cpumask is all possible cores, but if NO_HZ_FULL is
+enabled in the kernel config, and cores are specified with the
+nohz_full= boot argument, those cores are excluded by default.
+Offline cores can be included in this mask, and if the core is later
+brought online, the watchdog will be started based on the mask value.
+
+Typically this value would only be touched in the nohz_full case
+to re-enable cores that by default were not running the watchdog,
+if a kernel lockup was suspected on those cores.
+
+The argument value is the standard cpulist format for cpumasks,
+so for example to enable the watchdog on cores 0, 2, 3, and 4 you
+might say:
+
+  echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
+
+==============================================================
+
 watchdog_thresh:
 
 This value can be used to control the frequency of hrtimer and NMI
index 3be0bfc4738df5f7c00ed5aa6d722a8f23a0949f..32ee3a67dba20e64aa46762e24de99fb8ba825c3 100644 (file)
@@ -467,7 +467,13 @@ mmap(MAP_LOCKED) SYSTEM CALL HANDLING
 
 In addition the mlock()/mlockall() system calls, an application can request
 that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap()
-call.  Furthermore, any mmap() call or brk() call that expands the heap by a
+call.  There is one important and subtle difference here, though: mmap() + mlock()
+will fail with ENOMEM if the range cannot be faulted in (e.g. because
+mm_populate() fails), while mmap(MAP_LOCKED) will not fail.  The mmapped
+area will still have the properties of a locked area - i.e. pages will not get
+swapped out - but major page faults to fault memory in might still happen.
+
+Furthermore, any mmap() call or brk() call that expands the heap by a
 task that has previously called mlockall() with the MCL_FUTURE flag will result
 in the newly mapped memory being mlocked.  Before the unevictable/mlock
 changes, the kernel simply called make_pages_present() to allocate pages and
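As an illustration of the difference documented above (a sketch, not part of the
patch), the two userspace call patterns look like this; with mmap() + mlock() a
population failure is reported by mlock(), while mmap(MAP_LOCKED) returns a
usable mapping either way:

#define _GNU_SOURCE		/* MAP_ANONYMOUS/MAP_LOCKED visibility on some libcs */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16UL << 20;	/* 16 MB; the size is purely illustrative */
	void *a, *b;

	/* Pattern 1: mmap() then mlock() - a fault-in failure surfaces here (ENOMEM). */
	a = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (a != MAP_FAILED && mlock(a, len) != 0)
		fprintf(stderr, "mlock: %s\n", strerror(errno));

	/* Pattern 2: mmap(MAP_LOCKED) - the mapping is created even if fault-in fails. */
	b = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
	if (b == MAP_FAILED)
		fprintf(stderr, "mmap(MAP_LOCKED): %s\n", strerror(errno));

	return 0;
}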
diff --git a/arch/alpha/include/asm/mm-arch-hooks.h b/arch/alpha/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..b07fd86
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ALPHA_MM_ARCH_HOOKS_H
+#define _ASM_ALPHA_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ALPHA_MM_ARCH_HOOKS_H */
diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..c37541c
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARC_MM_ARCH_HOOKS_H
+#define _ASM_ARC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARC_MM_ARCH_HOOKS_H */
index 1f1b1cd112f3adf08c4f65bec4e132b89433ce63..31bb7dccb971caa51e2b4f4dbc4676978c8e68e6 100644 (file)
@@ -53,10 +53,6 @@ static inline int prepare_hugepage_range(struct file *file,
        return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline int huge_pte_none(pte_t pte)
 {
        return pte_none(pte);
diff --git a/arch/arm/include/asm/mm-arch-hooks.h b/arch/arm/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..7056660
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARM_MM_ARCH_HOOKS_H
+#define _ASM_ARM_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARM_MM_ARCH_HOOKS_H */
index c724124150934f3a196a1437f85abebc7c355bf4..fcafb521f14eddea24b0252b18ba7c16a62a614e 100644 (file)
@@ -41,11 +41,6 @@ int pud_huge(pud_t pud)
        return 0;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 int pmd_huge(pmd_t pmd)
 {
        return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
index 5b7ca8ace95f466faebb8fbb58d13ac974c7cf33..734c17e89e94582539372ec1e4ea6ceab7fb7fb1 100644 (file)
@@ -86,10 +86,6 @@ static inline int prepare_hugepage_range(struct file *file,
        return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline int huge_pte_none(pte_t pte)
 {
        return pte_none(pte);
diff --git a/arch/arm64/include/asm/mm-arch-hooks.h b/arch/arm64/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..562b655
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARM64_MM_ARCH_HOOKS_H
+#define _ASM_ARM64_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARM64_MM_ARCH_HOOKS_H */
index 2de9d2e59d96808325829a37bca559c2ddb48a1c..cccc4af87a0372976e25e16667143892562b3457 100644 (file)
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 
-#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-#endif
-
 int pmd_huge(pmd_t pmd)
 {
        return !(pmd_val(pmd) & PMD_TABLE_BIT);
diff --git a/arch/avr32/include/asm/mm-arch-hooks.h b/arch/avr32/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..145452f
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_AVR32_MM_ARCH_HOOKS_H
+#define _ASM_AVR32_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_AVR32_MM_ARCH_HOOKS_H */
diff --git a/arch/blackfin/include/asm/mm-arch-hooks.h b/arch/blackfin/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..1c5211e
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_BLACKFIN_MM_ARCH_HOOKS_H
+#define _ASM_BLACKFIN_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_BLACKFIN_MM_ARCH_HOOKS_H */
diff --git a/arch/c6x/include/asm/mm-arch-hooks.h b/arch/c6x/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..bb3c4a6
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_C6X_MM_ARCH_HOOKS_H
+#define _ASM_C6X_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_C6X_MM_ARCH_HOOKS_H */
diff --git a/arch/cris/include/asm/mm-arch-hooks.h b/arch/cris/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..314f774
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_CRIS_MM_ARCH_HOOKS_H
+#define _ASM_CRIS_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_CRIS_MM_ARCH_HOOKS_H */
diff --git a/arch/frv/include/asm/mm-arch-hooks.h b/arch/frv/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..51d13a8
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_FRV_MM_ARCH_HOOKS_H
+#define _ASM_FRV_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_FRV_MM_ARCH_HOOKS_H */
diff --git a/arch/hexagon/include/asm/mm-arch-hooks.h b/arch/hexagon/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..05e8b93
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_HEXAGON_MM_ARCH_HOOKS_H
+#define _ASM_HEXAGON_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_HEXAGON_MM_ARCH_HOOKS_H */
index aa910054b8e7323995adfdadfe3f8eb51a896b61..ff1377bc02a657ed3870c54be7ecdc0b87ab042d 100644 (file)
@@ -20,10 +20,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
                REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                                   pte_t *ptep, pte_t pte)
 {
diff --git a/arch/ia64/include/asm/mm-arch-hooks.h b/arch/ia64/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..ab4b5c6
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_IA64_MM_ARCH_HOOKS_H
+#define _ASM_IA64_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_IA64_MM_ARCH_HOOKS_H */
index 52b7604b5215f899b0820bf6741618b5bc774dfa..f50d4b3f501ad6a5cd0fcf10dfd66b82500b0e54 100644 (file)
@@ -65,11 +65,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
 
 /*
diff --git a/arch/m32r/include/asm/mm-arch-hooks.h b/arch/m32r/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..6d60b47
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_M32R_MM_ARCH_HOOKS_H
+#define _ASM_M32R_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_M32R_MM_ARCH_HOOKS_H */
diff --git a/arch/m68k/include/asm/mm-arch-hooks.h b/arch/m68k/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..7e8709b
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_M68K_MM_ARCH_HOOKS_H
+#define _ASM_M68K_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_M68K_MM_ARCH_HOOKS_H */
index 14b23efd9b7a91be8036e8bb8fea130453de4f51..eb5cdec94be031f0eb0702b6d29ec83616211e1b 100644 (file)
@@ -134,20 +134,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
 }
 
 static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
                    enum dma_data_direction direction)
 {
        int i;
-       for (i = 0; i < nelems; i++, sg++)
+       struct scatterlist *sg;
+
+       for_each_sg(sglist, sg, nelems, i)
                dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
 }
 
 static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-                      enum dma_data_direction direction)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+                      int nelems, enum dma_data_direction direction)
 {
        int i;
-       for (i = 0; i < nelems; i++, sg++)
+       struct scatterlist *sg;
+
+       for_each_sg(sglist, sg, nelems, i)
                dma_sync_for_device(sg_virt(sg), sg->length, direction);
 }
 
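The metag hunk above and the parisc, powerpc vio and sparc ldc hunks below all
make the same mechanical conversion: walk a scatterlist with for_each_sg()
instead of a raw sg++, which is not safe once scatterlists are chained.  A
minimal sketch of the resulting pattern (the helper name is illustrative):

#include <linux/printk.h>
#include <linux/scatterlist.h>

/* Illustrative only: visit every entry of a possibly chained scatterlist. */
static void example_walk_sg(struct scatterlist *sglist, int nelems)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i)
		pr_debug("sg %d: length=%u\n", i, sg->length);
}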
index 471f481e67f3ebb07b34364890dc9a3feb56b013..f730b396d79bdd1f08a269b45af0fa4b0880715e 100644 (file)
@@ -14,10 +14,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
 int prepare_hugepage_range(struct file *file, unsigned long addr,
                                                unsigned long len);
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                          unsigned long addr, unsigned long end,
                                          unsigned long floor,
diff --git a/arch/metag/include/asm/mm-arch-hooks.h b/arch/metag/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..b0072b2
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_METAG_MM_ARCH_HOOKS_H
+#define _ASM_METAG_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_METAG_MM_ARCH_HOOKS_H */
index 7ca80ac42ed5a9ec13ce90ad291ed024594b819f..53f0f6c470273b9c7c48ad6976b3e526a929ed52 100644 (file)
@@ -89,11 +89,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 int pmd_huge(pmd_t pmd)
 {
        return pmd_page_shift(pmd) > PAGE_SHIFT;
diff --git a/arch/microblaze/include/asm/mm-arch-hooks.h b/arch/microblaze/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..5c40659
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
+#define _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MICROBLAZE_MM_ARCH_HOOKS_H */
index fe0d15d3266015445555901c7e32b41312c37be7..4a5bb5453408edd2eb729794f967e4692673e91a 100644 (file)
@@ -38,10 +38,6 @@ static inline int prepare_hugepage_range(struct file *file,
        return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                          unsigned long addr,
                                          unsigned long end,
diff --git a/arch/mips/include/asm/mm-arch-hooks.h b/arch/mips/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..b5609fe
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MIPS_MM_ARCH_HOOKS_H
+#define _ASM_MIPS_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MIPS_MM_ARCH_HOOKS_H */
index 819af9d057a8ba65fa1cb1e39ee5462556513aba..9d810675814291d14ce004f9d4447678af2227c3 100644 (file)
@@ -568,12 +568,12 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 }
 
 /*
- * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a
+ * The generic version pmdp_huge_get_and_clear uses a version of pmd_clear() with a
  * different prototype.
  */
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-                                      unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                           unsigned long address, pmd_t *pmdp)
 {
        pmd_t old = *pmdp;
 
index 06e0f421b41b197b628b0f75cf8d7145eee9ba3a..74aa6f62468f2eb463372c58320cc5e7c69cc2fe 100644 (file)
@@ -51,11 +51,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return (pte_t *) pmd;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 /*
  * This function checks for proper alignment of input addr and len parameters.
  */
diff --git a/arch/mn10300/include/asm/mm-arch-hooks.h b/arch/mn10300/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..e2029a6
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MN10300_MM_ARCH_HOOKS_H
+#define _ASM_MN10300_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MN10300_MM_ARCH_HOOKS_H */
diff --git a/arch/nios2/include/asm/mm-arch-hooks.h b/arch/nios2/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..d7290dc
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_NIOS2_MM_ARCH_HOOKS_H
+#define _ASM_NIOS2_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_NIOS2_MM_ARCH_HOOKS_H */
diff --git a/arch/openrisc/include/asm/mm-arch-hooks.h b/arch/openrisc/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..6d33cb5
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_OPENRISC_MM_ARCH_HOOKS_H
+#define _ASM_OPENRISC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_OPENRISC_MM_ARCH_HOOKS_H */
diff --git a/arch/parisc/include/asm/mm-arch-hooks.h b/arch/parisc/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..654ec63
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_PARISC_MM_ARCH_HOOKS_H
+#define _ASM_PARISC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_PARISC_MM_ARCH_HOOKS_H */
index ff834fd67478acce421a2b6c6be35585de35aacb..b9402c9b34545e81c42b894cd9e1532bac66fccf 100644 (file)
@@ -478,14 +478,16 @@ static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, siz
 static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
        int i;
+       struct scatterlist *sg;
 
        BUG_ON(direction == DMA_NONE);
 
-       for (i = 0; i < nents; i++, sglist++ ) {
-               unsigned long vaddr = (unsigned long)sg_virt(sglist);
-               sg_dma_address(sglist) = (dma_addr_t) virt_to_phys(vaddr);
-               sg_dma_len(sglist) = sglist->length;
-               flush_kernel_dcache_range(vaddr, sglist->length);
+       for_each_sg(sglist, sg, nents, i) {
+               unsigned long vaddr = (unsigned long)sg_virt(sg);
+
+               sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr);
+               sg_dma_len(sg) = sg->length;
+               flush_kernel_dcache_range(vaddr, sg->length);
        }
        return nents;
 }
@@ -493,6 +495,7 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n
 static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
        int i;
+       struct scatterlist *sg;
 
        BUG_ON(direction == DMA_NONE);
 
@@ -501,8 +504,8 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in
 
        /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-       for (i = 0; i < nents; i++, sglist++ )
-               flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+       for_each_sg(sglist, sg, nents, i)
+               flush_kernel_vmap_range(sg_virt(sg), sg->length);
        return;
 }
 
@@ -523,21 +526,23 @@ static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_h
 static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
        int i;
+       struct scatterlist *sg;
 
        /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-       for (i = 0; i < nents; i++, sglist++ )
-               flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+       for_each_sg(sglist, sg, nents, i)
+               flush_kernel_vmap_range(sg_virt(sg), sg->length);
 }
 
 static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
 {
        int i;
+       struct scatterlist *sg;
 
        /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
-       for (i = 0; i < nents; i++, sglist++ )
-               flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+       for_each_sg(sglist, sg, nents, i)
+               flush_kernel_vmap_range(sg_virt(sg), sg->length);
 }
 
 struct hppa_dma_ops pcxl_dma_ops = {
index 1d53a65b4ec17e08d08db4912ed3a079495db351..4bbd3c8c2888ecbb9d8e07c28bfd0c32a0c235ca 100644 (file)
@@ -112,11 +112,6 @@ static inline int prepare_hugepage_range(struct file *file,
        return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
-
 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                                   pte_t *ptep, pte_t pte)
 {
diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..f2a2da8
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H
+#define _ASM_POWERPC_MM_ARCH_HOOKS_H
+
+static inline void arch_remap(struct mm_struct *mm,
+                             unsigned long old_start, unsigned long old_end,
+                             unsigned long new_start, unsigned long new_end)
+{
+       /*
+        * mremap() doesn't allow moving multiple vmas so we can limit the
+        * check to old_start == vdso_base.
+        */
+       if (old_start == mm->context.vdso_base)
+               mm->context.vdso_base = new_start;
+}
+#define arch_remap arch_remap
+
+#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */
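Nearly all of the new mm-arch-hooks.h headers in this merge are empty like the
ones above; powerpc is the exception, overriding arch_remap() so that
context.vdso_base follows a moved VDSO.  The generic fallback lives in
include/linux/mm-arch-hooks.h (listed in this merge but not shown in this
section); assuming the override convention suggested by the
"#define arch_remap arch_remap" line, a no-op default would look roughly like:

/* Assumed shape of the generic default; an arch overrides it by defining arch_remap. */
#ifndef arch_remap
static inline void arch_remap(struct mm_struct *mm,
			      unsigned long old_start, unsigned long old_end,
			      unsigned long new_start, unsigned long new_end)
{
}
#define arch_remap arch_remap
#endif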
index 3e5184210d9b984fca5aa57cfe3c70e6539d74bb..878c27771717260cb1bc5590ad4c6138b2b096f7 100644 (file)
@@ -8,7 +8,6 @@
 #include <linux/spinlock.h>
 #include <asm/mmu.h>   
 #include <asm/cputable.h>
-#include <asm-generic/mm_hooks.h>
 #include <asm/cputhreads.h>
 
 /*
@@ -127,5 +126,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
+static inline void arch_dup_mmap(struct mm_struct *oldmm,
+                                struct mm_struct *mm)
+{
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+                             struct vm_area_struct *vma,
+                             unsigned long start, unsigned long end)
+{
+       if (start <= mm->context.vdso_base && mm->context.vdso_base < end)
+               mm->context.vdso_base = 0;
+}
+
+static inline void arch_bprm_mm_init(struct mm_struct *mm,
+                                    struct vm_area_struct *vma)
+{
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
index f890f7ce159323d8a3f35fca2f95a594ed0fa9e9..3bb7488bd24b19aa76dce072b0d3b1710beb46d3 100644 (file)
@@ -569,13 +569,9 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-                               unsigned long addr, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
-                             pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                    unsigned long addr, pmd_t *pmdp);
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
@@ -592,6 +588,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 extern void pmdp_splitting_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
 
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+                                unsigned long address, pmd_t *pmdp);
+#define pmdp_collapse_flush pmdp_collapse_flush
+
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
 extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                       pgtable_t pgtable);
index b41426c60ef62ea0f9757a62578ff86d143af3bf..5f8dcdaa2820167496b5878f9ab883585922a2cf 100644 (file)
@@ -557,11 +557,11 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
        struct vio_dev *viodev = to_vio_dev(dev);
        struct iommu_table *tbl;
        struct scatterlist *sgl;
-       int ret, count = 0;
+       int ret, count;
        size_t alloc_size = 0;
 
        tbl = get_iommu_table_base(dev);
-       for (sgl = sglist; count < nelems; count++, sgl++)
+       for_each_sg(sglist, sgl, nelems, count)
                alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
        if (vio_cmo_alloc(viodev, alloc_size)) {
@@ -577,7 +577,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
                return ret;
        }
 
-       for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+       for_each_sg(sglist, sgl, ret, count)
                alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
        if (alloc_size)
                vio_cmo_dealloc(viodev, alloc_size);
@@ -594,10 +594,10 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
        struct iommu_table *tbl;
        struct scatterlist *sgl;
        size_t alloc_size = 0;
-       int count = 0;
+       int count;
 
        tbl = get_iommu_table_base(dev);
-       for (sgl = sglist; count < nelems; count++, sgl++)
+       for_each_sg(sglist, sgl, nelems, count)
                alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 
        dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
index 3385e3d0506ec575f3eeebad77d2c65264a3acf3..38bd5d998c81aadda711b03093a9b854e1018cc8 100644 (file)
@@ -439,11 +439,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 }
 #endif
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 #ifdef CONFIG_PPC_FSL_BOOK3E
 #define HUGEPD_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
index 6bfadf1aa5cbbfadbd7237e5f3da8ac8653a5651..876232d641268831c7a1963be7d7d6fbffc55ab1 100644 (file)
@@ -554,47 +554,42 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
        return old;
 }
 
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
-                      pmd_t *pmdp)
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+                         pmd_t *pmdp)
 {
        pmd_t pmd;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       if (pmd_trans_huge(*pmdp)) {
-               pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
-       } else {
-               /*
-                * khugepaged calls this for normal pmd
-                */
-               pmd = *pmdp;
-               pmd_clear(pmdp);
-               /*
-                * Wait for all pending hash_page to finish. This is needed
-                * in case of subpage collapse. When we collapse normal pages
-                * to hugepage, we first clear the pmd, then invalidate all
-                * the PTE entries. The assumption here is that any low level
-                * page fault will see a none pmd and take the slow path that
-                * will wait on mmap_sem. But we could very well be in a
-                * hash_page with local ptep pointer value. Such a hash page
-                * can result in adding new HPTE entries for normal subpages.
-                * That means we could be modifying the page content as we
-                * copy them to a huge page. So wait for parallel hash_page
-                * to finish before invalidating HPTE entries. We can do this
-                * by sending an IPI to all the cpus and executing a dummy
-                * function there.
-                */
-               kick_all_cpus_sync();
-               /*
-                * Now invalidate the hpte entries in the range
-                * covered by pmd. This make sure we take a
-                * fault and will find the pmd as none, which will
-                * result in a major fault which takes mmap_sem and
-                * hence wait for collapse to complete. Without this
-                * the __collapse_huge_page_copy can result in copying
-                * the old content.
-                */
-               flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
-       }
+       VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+       pmd = *pmdp;
+       pmd_clear(pmdp);
+       /*
+        * Wait for all pending hash_page to finish. This is needed
+        * in case of subpage collapse. When we collapse normal pages
+        * to hugepage, we first clear the pmd, then invalidate all
+        * the PTE entries. The assumption here is that any low level
+        * page fault will see a none pmd and take the slow path that
+        * will wait on mmap_sem. But we could very well be in a
+        * hash_page with local ptep pointer value. Such a hash page
+        * can result in adding new HPTE entries for normal subpages.
+        * That means we could be modifying the page content as we
+        * copy them to a huge page. So wait for parallel hash_page
+        * to finish before invalidating HPTE entries. We can do this
+        * by sending an IPI to all the cpus and executing a dummy
+        * function there.
+        */
+       kick_all_cpus_sync();
+       /*
+        * Now invalidate the hpte entries in the range
+        * covered by pmd. This make sure we take a
+        * fault and will find the pmd as none, which will
+        * result in a major fault which takes mmap_sem and
+        * hence wait for collapse to complete. Without this
+        * the __collapse_huge_page_copy can result in copying
+        * the old content.
+        */
+       flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
        return pmd;
 }
 
@@ -817,8 +812,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
        return;
 }
 
-pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-                        unsigned long addr, pmd_t *pmdp)
+pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+                             unsigned long addr, pmd_t *pmdp)
 {
        pmd_t old_pmd;
        pgtable_t pgtable;
index 11eae5f55b709d1e37e69f2b44eb762d48475e6e..dfb542ade6b16653cb320cc84c472b62568bf9cc 100644 (file)
@@ -35,7 +35,6 @@ static inline int prepare_hugepage_range(struct file *file,
        return 0;
 }
 
-#define hugetlb_prefault_arch_hook(mm)         do { } while (0)
 #define arch_clear_hugepage_flags(page)                do { } while (0)
 
 int arch_prepare_hugepage(struct page *page);
diff --git a/arch/s390/include/asm/mm-arch-hooks.h b/arch/s390/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..07680b2
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_S390_MM_ARCH_HOOKS_H
+#define _ASM_S390_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_S390_MM_ARCH_HOOKS_H */
index 0bb2da79adf351dd2935077cd2ca6fd82b7d4e8a..f66d82798a6a7725f7e3eeeb547d8bb2d433719e 100644 (file)
@@ -1498,9 +1498,9 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
        return pmd_young(pmd);
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-                                      unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                           unsigned long address, pmd_t *pmdp)
 {
        pmd_t pmd = *pmdp;
 
@@ -1509,10 +1509,10 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
        return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
-                                           unsigned long address,
-                                           pmd_t *pmdp, int full)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+                                                unsigned long address,
+                                                pmd_t *pmdp, int full)
 {
        pmd_t pmd = *pmdp;
 
@@ -1522,11 +1522,11 @@ static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
        return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
-                                    unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
+                                         unsigned long address, pmd_t *pmdp)
 {
-       return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+       return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
@@ -1548,6 +1548,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
        }
 }
 
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+                                       unsigned long address,
+                                       pmd_t *pmdp)
+{
+       return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+}
+#define pmdp_collapse_flush pmdp_collapse_flush
+
 #define pfn_pmd(pfn, pgprot)   mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
 #define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
 
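The s390 hunk above is part of a tree-wide rename also visible in the mips,
powerpc, sparc, tile and x86 hunks: the transparent-hugepage primitives become
pmdp_huge_get_and_clear()/pmdp_huge_clear_flush() so they no longer share names
with the regular-pmd helpers, and a dedicated pmdp_collapse_flush() is added
for khugepaged's collapse path.  A hypothetical caller, only to show the new
spelling (not code from this merge):

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Hypothetical helper: clear a huge pmd and return the old value for teardown. */
static pmd_t example_clear_huge_pmd(struct vm_area_struct *vma,
				    unsigned long addr, pmd_t *pmdp)
{
	return pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
}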
index d9f0dcfcae5eaed275f30bb2fb91cf2ef7c1dd01..7a75ad4594e3e721bd91daa1a80fb95fdcd47a9a 100644 (file)
@@ -33,11 +33,12 @@ static struct memblock_type oldmem_type = {
 };
 
 #define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid)         \
-       for (i = 0, __next_mem_range(&i, nid, &memblock.physmem,        \
+       for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE,            \
+                                    &memblock.physmem,                 \
                                     &oldmem_type, p_start,             \
                                     p_end, p_nid);                     \
             i != (u64)ULLONG_MAX;                                      \
-            __next_mem_range(&i, nid, &memblock.physmem,               \
+            __next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\
                              &oldmem_type,                             \
                              p_start, p_end, p_nid))
 
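This s390 hunk, like the sparc and x86 memblock hunks later in this merge,
adapts callers to the new memblock interface: __next_mem_range() and the
for_each_free_mem_range() family now take a flags argument so that memory can
be selected by attribute (the hook used by the mirrored-memory patches), with
MEMBLOCK_NONE meaning no filtering.  A minimal sketch of a caller using the
updated iterator (the function name is illustrative):

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/numa.h>

/* Illustrative only: sum up all free memory, with no attribute filtering. */
static phys_addr_t __init example_total_free(void)
{
	phys_addr_t start, end, total = 0;
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
				&start, &end, NULL)
		total += end - start;

	return total;
}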
index e617e74b7be22aade65168f2c103c32b18b18653..c3f8e3df92ff9de3dff55395d57541304487dfc2 100644 (file)
@@ -193,11 +193,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return (pte_t *) pmdp;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 int pmd_huge(pmd_t pmd)
 {
        if (!MACHINE_HAS_HPAGE)
diff --git a/arch/score/include/asm/mm-arch-hooks.h b/arch/score/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..5e38689
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SCORE_MM_ARCH_HOOKS_H
+#define _ASM_SCORE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SCORE_MM_ARCH_HOOKS_H */
index 699255d6d1c634bc9b3fe53d1c2abb5778a6a7dd..b788a9bc89185bc7f7af515f7d6b065d175967da 100644 (file)
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
        return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
-}
-
 static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                          unsigned long addr, unsigned long end,
                                          unsigned long floor,
diff --git a/arch/sh/include/asm/mm-arch-hooks.h b/arch/sh/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..1808729
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SH_MM_ARCH_HOOKS_H
+#define _ASM_SH_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SH_MM_ARCH_HOOKS_H */
index 534bc978af8a58f069709cfc95f5f8c02a1a1a50..6385f60209b65adefc2bbbcc8e5309a5d574455d 100644 (file)
@@ -62,11 +62,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 int pmd_huge(pmd_t pmd)
 {
        return 0;
index e4cab465b81f86eb4d7f2a3b03c5a62b67d28a8d..3130d763631287cb4315316b1ed380e0ccb9d2fb 100644 (file)
@@ -11,10 +11,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep);
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline int is_hugepage_only_range(struct mm_struct *mm,
                                         unsigned long addr,
                                         unsigned long len) {
diff --git a/arch/sparc/include/asm/mm-arch-hooks.h b/arch/sparc/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..b89ba44
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SPARC_MM_ARCH_HOOKS_H
+#define _ASM_SPARC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SPARC_MM_ARCH_HOOKS_H */
index 2a52c91d2c8acbf5f904e082400ba782d7279947..131d36fcd07a60af83ae1b6e8968e48577df54f0 100644 (file)
@@ -865,10 +865,10 @@ static inline unsigned long pud_pfn(pud_t pud)
 void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
                   pte_t *ptep, pte_t orig, int fullmm);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-                                      unsigned long addr,
-                                      pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                           unsigned long addr,
+                                           pmd_t *pmdp)
 {
        pmd_t pmd = *pmdp;
        set_pmd_at(mm, addr, pmdp, __pmd(0UL));
index 7d3ca30fcd1506d81dfca9859ffb4cc290fae109..1ae5eb1bb045130f05c8edc4573dbcd75e00475f 100644 (file)
@@ -2086,6 +2086,7 @@ int ldc_map_sg(struct ldc_channel *lp,
        struct cookie_state state;
        struct ldc_iommu *iommu;
        int err;
+       struct scatterlist *s;
 
        if (map_perm & ~LDC_MAP_ALL)
                return -EINVAL;
@@ -2112,9 +2113,10 @@ int ldc_map_sg(struct ldc_channel *lp,
        state.pte_idx = (base - iommu->page_table);
        state.nc = 0;
 
-       for (i = 0; i < num_sg; i++)
-               fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT,
-                            sg[i].offset, sg[i].length);
+       for_each_sg(sg, s, num_sg, i) {
+               fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT,
+                            s->offset, s->length);
+       }
 
        return state.nc;
 }
index 4242eab12e10738e8bdbf6117f7be26d45528a48..131eaf4ad7f598aacaaf31a8f59da9ad1335c2da 100644 (file)
@@ -172,11 +172,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry)
 {
index c5d08b89a96c811ce3d77d81e0ec7b14570d0ac8..4ac88b7575147fb50443902fb15e1dd7a007bc84 100644 (file)
@@ -1966,7 +1966,8 @@ static phys_addr_t __init available_memory(void)
        phys_addr_t pa_start, pa_end;
        u64 i;
 
-       for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL)
+       for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+                               &pa_end, NULL)
                available = available + (pa_end  - pa_start);
 
        return available;
@@ -1992,7 +1993,8 @@ static void __init reduce_memory(phys_addr_t limit_ram)
        if (limit_ram >= avail_ram)
                return;
 
-       for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) {
+       for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+                               &pa_end, NULL) {
                phys_addr_t region_size = pa_end - pa_start;
                phys_addr_t clip_start = pa_start;
 
index 3257733003f8dcd550376b02bbaa3acb7a79895b..1abd00c5523695ccc8d1556f69f7e97e1e6929ae 100644 (file)
@@ -40,10 +40,6 @@ static inline int prepare_hugepage_range(struct file *file,
        return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
 static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                          unsigned long addr, unsigned long end,
                                          unsigned long floor,
diff --git a/arch/tile/include/asm/mm-arch-hooks.h b/arch/tile/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..d1709ea
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_TILE_MM_ARCH_HOOKS_H
+#define _ASM_TILE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_TILE_MM_ARCH_HOOKS_H */
index 95a4f19d16c52625e40990b2a3d9f0dc0039b84c..2b05ccbebed9b88623871eef69234a24915bcdbb 100644 (file)
@@ -414,10 +414,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 }
 
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-                                      unsigned long address,
-                                      pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                           unsigned long address,
+                                           pmd_t *pmdp)
 {
        return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp)));
 }
index 8416240c322c929f6becd672673e6316603173d0..c034dc3fe2d42cb80502517816dd188ff8801557 100644 (file)
@@ -160,11 +160,6 @@ int pud_huge(pud_t pud)
        return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
 }
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-       return 0;
-}
-
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                unsigned long addr, unsigned long len,
diff --git a/arch/um/include/asm/mm-arch-hooks.h b/arch/um/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..a7c8b0d
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_UM_MM_ARCH_HOOKS_H
+#define _ASM_UM_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_UM_MM_ARCH_HOOKS_H */
diff --git a/arch/unicore32/include/asm/mm-arch-hooks.h b/arch/unicore32/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..4d79a85
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_UNICORE32_MM_ARCH_HOOKS_H
+#define _ASM_UNICORE32_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_UNICORE32_MM_ARCH_HOOKS_H */
index 68c05398bba9b449a1324d54b584ce52d52aa8d1..dab7a3a750bfe521b9e23f80484caf139ed34f29 100644 (file)
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
        return 0;
 }
 
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
-}
-
 static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                          unsigned long addr, unsigned long end,
                                          unsigned long floor,
diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..4e881a3
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_MM_ARCH_HOOKS_H
+#define _ASM_X86_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_X86_MM_ARCH_HOOKS_H */
index 2562e303405b9d3c314c46a1ce180d4d097d7a11..867da5bbb4a33df1d268d3f282cda30d0fdf671d 100644 (file)
@@ -805,8 +805,8 @@ static inline int pmd_write(pmd_t pmd)
        return pmd_flags(pmd) & _PAGE_RW;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pmd_t *pmdp)
 {
        pmd_t pmd = native_pmdp_get_and_clear(pmdp);
index 83a7995625a6de35c293537c7e78c3a0ba6a1a63..58118e207a69c4b5a249c6e9458f6b54f6295e52 100644 (file)
@@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void)
 
        corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-       for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
+       for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+                               NULL) {
                start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
                                PAGE_SIZE, corruption_check_size);
                end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
index e2ce85db228303b61d1afd80e9c0656e2c343423..c8dda42cb6a326a950a50f81335272354f4bf634 100644 (file)
@@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void)
                nr_pages += end_pfn - start_pfn;
        }
 
-       for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+       for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+                               NULL) {
                start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
                end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
                if (start_pfn < end_pfn)
index 39ca113676fe54a73ffe3d01276e89cbc2252b62..d3b95b89e9b2974401b48b534bf71604d7db57cc 100644 (file)
@@ -1105,6 +1105,9 @@ void __init setup_arch(char **cmdline_p)
        memblock_set_current_limit(ISA_END_ADDRESS);
        memblock_x86_fill();
 
+       if (efi_enabled(EFI_BOOT))
+               efi_find_mirror();
+
        /*
         * The EFI specification says that boot service code won't be called
         * after ExitBootServices(). This is, in fact, a lie.
index c8140e12816a51f77702b273289222071ecb2350..8340e45c891a1dc879a6e23ab95fe0e2e82b5eec 100644 (file)
@@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid,
        phys_addr_t start, end;
        u64 i;
 
-       for_each_free_mem_range(i, nid, &start, &end, NULL) {
+       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
                unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
                                            start_pfn, end_pfn);
                unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
index 3b984c3aa1b0b5ba6e7b5e5321edb4fbc7013e5d..c1c382c58c60d58c3ea7a93e09b5f2e04112c6d8 100644 (file)
@@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now)
        now->tv_nsec = 0;
 }
 
+void __init efi_find_mirror(void)
+{
+       void *p;
+       u64 mirror_size = 0, total_size = 0;
+
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               efi_memory_desc_t *md = p;
+               unsigned long long start = md->phys_addr;
+               unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+               total_size += size;
+               if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+                       memblock_mark_mirror(start, size);
+                       mirror_size += size;
+               }
+       }
+       if (mirror_size)
+               pr_info("Memory: %lldM/%lldM mirrored memory\n",
+                       mirror_size>>20, total_size>>20);
+}
+
 /*
  * Tell the kernel about the EFI memory map.  This might include
  * more than the max 128 entries that can fit in the e820 legacy
index ba78ccf651e7764e9db92cfca37927a8d68e3892..1f5f6dc097365ab4f4be0a00a481859dc29e4256 100644 (file)
@@ -52,14 +52,15 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 }
 
 static inline int
-dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
           enum dma_data_direction direction)
 {
        int i;
+       struct scatterlist *sg;
 
        BUG_ON(direction == DMA_NONE);
 
-       for (i = 0; i < nents; i++, sg++ ) {
+       for_each_sg(sglist, sg, nents, i) {
                BUG_ON(!sg_page(sg));
 
                sg->dma_address = sg_phys(sg);
@@ -124,20 +125,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
        consistent_sync((void *)bus_to_virt(dma_handle)+offset,size,direction);
 }
 static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
                 enum dma_data_direction dir)
 {
        int i;
-       for (i = 0; i < nelems; i++, sg++)
+       struct scatterlist *sg;
+
+       for_each_sg(sglist, sg, nelems, i)
                consistent_sync(sg_virt(sg), sg->length, dir);
 }
 
 static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
-                enum dma_data_direction dir)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+                      int nelems, enum dma_data_direction dir)
 {
        int i;
-       for (i = 0; i < nelems; i++, sg++)
+       struct scatterlist *sg;
+
+       for_each_sg(sglist, sg, nelems, i)
                consistent_sync(sg_virt(sg), sg->length, dir);
 }
 static inline int
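
For reference, the for_each_sg() pattern that the metag DMA hunks above switch to follows scatterlist chaining correctly, which the removed "sg++" walks did not. A minimal sketch outside this commit, with a hypothetical helper name and an already-built scatterlist assumed:

	#include <linux/scatterlist.h>
	#include <linux/string.h>

	/* Zero every segment of a (possibly chained) scatterlist.  sg_virt()
	 * and sg->length are the same accessors used in the hunks above. */
	static void zero_each_segment(struct scatterlist *sglist, int nents)
	{
		struct scatterlist *sg;
		int i;

		for_each_sg(sglist, sg, nents, i)
			memset(sg_virt(sg), 0, sg->length);
	}
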
diff --git a/arch/xtensa/include/asm/mm-arch-hooks.h b/arch/xtensa/include/asm/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..d2e5cfd
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_XTENSA_MM_ARCH_HOOKS_H
+#define _ASM_XTENSA_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_XTENSA_MM_ARCH_HOOKS_H */
index feafa172b155a2ecabea27dfe3b149efb0dd2745..2345ee7342d9bfb428f672c7cb3af2cc56cc89f6 100644 (file)
@@ -165,7 +165,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
                 * infrastructure. There is no real reason why the selected
                 * task should have access to the memory reserves.
                 */
-               mark_tsk_oom_victim(selected);
+               mark_oom_victim(selected);
                send_sig(SIGKILL, selected, 0);
                rem += selected_tasksize;
        }
index 9ffdfcf2ec6ed498f8bcc259141c9b2a5da2a034..1c4791033b723d322d484f0b1e029f6dce3a8696 100644 (file)
@@ -353,9 +353,11 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
+       mutex_lock(&oom_lock);
        if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
                           GFP_KERNEL, 0, NULL, true))
                pr_info("OOM request ignored because killer is disabled\n");
+       mutex_unlock(&oom_lock);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
index c4211a31612d447d3f40ad61f11c10e4cb4d0e5d..d88f36754bf7efcd67750cce72ec607564bc6fd4 100644 (file)
@@ -381,15 +381,9 @@ static int __init xen_tmem_init(void)
 #ifdef CONFIG_FRONTSWAP
        if (tmem_enabled && frontswap) {
                char *s = "";
-               struct frontswap_ops *old_ops;
 
                tmem_frontswap_poolid = -1;
-               old_ops = frontswap_register_ops(&tmem_frontswap_ops);
-               if (IS_ERR(old_ops) || old_ops) {
-                       if (IS_ERR(old_ops))
-                               return PTR_ERR(old_ops);
-                       s = " (WARNING: frontswap_ops overridden)";
-               }
+               frontswap_register_ops(&tmem_frontswap_ops);
                pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n",
                        s);
        }
index e65f9ffbb99951ba83843a2836fb311f5e6af034..4d6a30e76168c3f548ce48dbbb1428e0baf81d49 100644 (file)
@@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref);
  *     config_item_init - initialize item.
  *     @item:  item in question.
  */
-void config_item_init(struct config_item *item)
+static void config_item_init(struct config_item *item)
 {
        kref_init(&item->ci_kref);
        INIT_LIST_HEAD(&item->ci_entry);
 }
-EXPORT_SYMBOL(config_item_init);
 
 /**
  *     config_item_set_name - Set the name of an item
index 87724c1d7be66361a4b36fc5365818f02bd6b60b..0cf74df68617b8738342a5f7be7992ccc596bf0a 100644 (file)
@@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
                goto out;
 
        ret = 0;
-       hugetlb_prefault_arch_hook(vma->vm_mm);
        if (vma->vm_flags & VM_WRITE && inode->i_size < len)
                inode->i_size = len;
 out:
index 7bb487e663b478ed52633e43ecc94feb2a636a7f..2cd65367076458e84532eebcb67869944e3aa327 100644 (file)
@@ -525,7 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
                                }
                        }
                        err = add_to_page_cache_lru(*cached_page, mapping,
-                                       index, GFP_KERNEL);
+                                       index,
+                                       GFP_KERNEL & mapping_gfp_mask(mapping));
                        if (unlikely(err)) {
                                if (err == -EEXIST)
                                        continue;
index a44b14cbceebcd9f8fd7cb9be561c12373c0c271..ab172e5f51d91f874f4dfa19f03fb1600c37d01c 100644 (file)
@@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
 
 static inline void ntfs_free(void *addr)
 {
-       if (!is_vmalloc_addr(addr)) {
-               kfree(addr);
-               /* free_page((unsigned long)addr); */
-               return;
-       }
-       vfree(addr);
+       kvfree(addr);
 }
 
 #endif /* _LINUX_NTFS_MALLOC_H */
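
The ntfs_free() hunk above relies on kvfree(), which inspects the address itself and dispatches to vfree() or kfree(). A small equivalence sketch (generic, not ntfs-specific; the helper names are placeholders):

	#include <linux/mm.h>
	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	static void release_buffer_old(void *addr)
	{
		/* Open-coded dispatch of the kind this commit removes */
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}

	static void release_buffer_new(void *addr)
	{
		kvfree(addr);	/* same effect, one call */
	}
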
index 2d7f76e52c379cf04b7f820c610c171e491d4830..5997c00a1515a6f7ec4d33a96f8eed549ea57bc2 100644 (file)
@@ -2925,7 +2925,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
        struct ocfs2_path *right_path = NULL;
        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
 
-       BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+       if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
+               return 0;
 
        *empty_extent_path = NULL;
 
@@ -4311,13 +4312,13 @@ out:
        return ret;
 }
 
-static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
                               struct ocfs2_path *path,
                               struct ocfs2_extent_list *el, int index,
-                              struct ocfs2_extent_rec *split_rec)
+                              struct ocfs2_extent_rec *split_rec,
+                              struct ocfs2_merge_ctxt *ctxt)
 {
-       int status;
+       int status = 0;
        enum ocfs2_contig_type ret = CONTIG_NONE;
        u32 left_cpos, right_cpos;
        struct ocfs2_extent_rec *rec = NULL;
@@ -4336,8 +4337,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
 
                if (left_cpos != 0) {
                        left_path = ocfs2_new_path_from_path(path);
-                       if (!left_path)
+                       if (!left_path) {
+                               status = -ENOMEM;
+                               mlog_errno(status);
                                goto exit;
+                       }
 
                        status = ocfs2_find_path(et->et_ci, left_path,
                                                 left_cpos);
@@ -4392,8 +4396,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
                        goto free_left_path;
 
                right_path = ocfs2_new_path_from_path(path);
-               if (!right_path)
+               if (!right_path) {
+                       status = -ENOMEM;
+                       mlog_errno(status);
                        goto free_left_path;
+               }
 
                status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
                if (status)
@@ -4433,7 +4440,10 @@ free_right_path:
 free_left_path:
        ocfs2_free_path(left_path);
 exit:
-       return ret;
+       if (status == 0)
+               ctxt->c_contig_type = ret;
+
+       return status;
 }
 
 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
@@ -5039,9 +5049,14 @@ int ocfs2_split_extent(handle_t *handle,
                goto out;
        }
 
-       ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
-                                                           split_index,
-                                                           split_rec);
+       ret = ocfs2_figure_merge_contig_type(et, path, el,
+                                            split_index,
+                                            split_rec,
+                                            &ctxt);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
 
        /*
         * The core merge / split code wants to know how much room is
index f906a250da6addcd0d2cdc796f383a312fe80224..1a35c6139656344516aacd59c7120f6fb877f2a2 100644 (file)
@@ -523,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
        unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
        unsigned long len = bh_result->b_size;
-       unsigned int clusters_to_alloc = 0;
+       unsigned int clusters_to_alloc = 0, contig_clusters = 0;
 
        cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
 
@@ -560,8 +560,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                /* fill hole, allocate blocks can't be larger than the size
                 * of the hole */
                clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
-               if (clusters_to_alloc > contig_blocks)
-                       clusters_to_alloc = contig_blocks;
+               contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
+                               contig_blocks);
+               if (clusters_to_alloc > contig_clusters)
+                       clusters_to_alloc = contig_clusters;
 
                /* allocate extent and insert them into the extent tree */
                ret = ocfs2_extend_allocation(inode, cpos,
@@ -619,9 +621,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
 
-       if (ocfs2_iocb_is_sem_locked(iocb))
-               ocfs2_iocb_clear_sem_locked(iocb);
-
        if (ocfs2_iocb_is_unaligned_aio(iocb)) {
                ocfs2_iocb_clear_unaligned_aio(iocb);
 
@@ -925,13 +924,23 @@ clean_orphan:
                int update_isize = written > 0 ? 1 : 0;
                loff_t end = update_isize ? offset + written : 0;
 
-               tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+               tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
+               if (tmp_ret < 0) {
+                       ret = tmp_ret;
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
                                update_isize, end);
                if (tmp_ret < 0) {
                        ret = tmp_ret;
+                       mlog_errno(ret);
                        goto out;
                }
 
+               ocfs2_inode_unlock(inode, 1);
+
                tmp_ret = jbd2_journal_force_commit(journal);
                if (tmp_ret < 0) {
                        ret = tmp_ret;
index dd59599b022d5ab26dffd82807d048cac170a154..24e496d6bdcdba9036dbc67d5f118aa9d10f431a 100644 (file)
@@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
 enum ocfs2_iocb_lock_bits {
        OCFS2_IOCB_RW_LOCK = 0,
        OCFS2_IOCB_RW_LOCK_LEVEL,
-       OCFS2_IOCB_SEM,
        OCFS2_IOCB_UNALIGNED_IO,
        OCFS2_IOCB_NUM_LOCKS
 };
@@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits {
        clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
 #define ocfs2_iocb_rw_locked_level(iocb) \
        test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_sem_locked(iocb) \
-       set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_sem_locked(iocb) \
-       clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_sem_locked(iocb) \
-       test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
 
 #define ocfs2_iocb_set_unaligned_aio(iocb) \
        set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
index af7598bff1b51364affd08eba9cfedba8b3acf16..dfe162f5fd4cf3ef23098187ea7bd9823f4b0527 100644 (file)
@@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
        return count;
 }
 
+void __mlog_printk(const u64 *mask, const char *func, int line,
+                  const char *fmt, ...)
+{
+       struct va_format vaf;
+       va_list args;
+       const char *level;
+       const char *prefix = "";
+
+       if (!__mlog_test_u64(*mask, mlog_and_bits) ||
+           __mlog_test_u64(*mask, mlog_not_bits))
+               return;
+
+       if (*mask & ML_ERROR) {
+               level = KERN_ERR;
+               prefix = "ERROR: ";
+       } else if (*mask & ML_NOTICE) {
+               level = KERN_NOTICE;
+       } else {
+               level = KERN_INFO;
+       }
+
+       va_start(args, fmt);
+
+       vaf.fmt = fmt;
+       vaf.va = &args;
+
+       printk("%s(%s,%u,%u):%s:%d %s%pV",
+              level, current->comm, task_pid_nr(current),
+              raw_smp_processor_id(), func, line, prefix, &vaf);
+
+       va_end(args);
+}
+EXPORT_SYMBOL_GPL(__mlog_printk);
+
 struct mlog_attribute {
        struct attribute attr;
        u64 mask;
index 7fdc25a4d8c0e76af574c505d83eb86f1f452e31..308ea0eb35fd112f29a5546c23dba6f80e60c787 100644 (file)
@@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 
 #endif
 
-/*
- * smp_processor_id() "helpfully" screams when called outside preemptible
- * regions in current kernels.  sles doesn't have the variants that don't
- * scream.  just do this instead of trying to guess which we're building
- * against.. *sigh*.
- */
-#define __mlog_cpu_guess ({            \
-       unsigned long _cpu = get_cpu(); \
-       put_cpu();                      \
-       _cpu;                           \
-})
+__printf(4, 5)
+void __mlog_printk(const u64 *m, const char *func, int line,
+                  const char *fmt, ...);
 
-/* In the following two macros, the whitespace after the ',' just
- * before ##args is intentional. Otherwise, gcc 2.95 will eat the
- * previous token if args expands to nothing.
+/*
+ * Testing before the __mlog_printk call lets the compiler eliminate the
+ * call completely when (m & ML_ALLOWED_BITS) is 0.
  */
-#define __mlog_printk(level, fmt, args...)                             \
-       printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm,           \
-              task_pid_nr(current), __mlog_cpu_guess,                  \
-              __PRETTY_FUNCTION__, __LINE__ , ##args)
-
-#define mlog(mask, fmt, args...) do {                                  \
-       u64 __m = MLOG_MASK_PREFIX | (mask);                            \
-       if ((__m & ML_ALLOWED_BITS) &&                                  \
-           __mlog_test_u64(__m, mlog_and_bits) &&                      \
-           !__mlog_test_u64(__m, mlog_not_bits)) {                     \
-               if (__m & ML_ERROR)                                     \
-                       __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
-               else if (__m & ML_NOTICE)                               \
-                       __mlog_printk(KERN_NOTICE, fmt , ##args);       \
-               else __mlog_printk(KERN_INFO, fmt , ##args);            \
-       }                                                               \
+#define mlog(mask, fmt, ...)                                           \
+do {                                                                   \
+       u64 _m = MLOG_MASK_PREFIX | (mask);                             \
+       if (_m & ML_ALLOWED_BITS)                                       \
+               __mlog_printk(&_m, __func__, __LINE__, fmt,             \
+                             ##__VA_ARGS__);                           \
 } while (0)
 
 #define mlog_errno(st) ({                                              \
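
Caller-side usage of mlog() is unchanged by the masklog rework above; only the expansion differs, with the and/not mask tests and the prefix formatting now done once in __mlog_printk() via %pV. A short usage sketch, assuming the usual per-file mask prefix convention in fs/ocfs2:

	#define MLOG_MASK_PREFIX ML_INODE	/* set before the include, as each ocfs2 file does */
	#include "cluster/masklog.h"

	static void report_read_error(unsigned long long blkno, int rc)
	{
		/* Expands to one __mlog_printk() call, guarded inline only by ML_ALLOWED_BITS */
		mlog(ML_ERROR, "failed to read block %llu, rc = %d\n", blkno, rc);
	}
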
index 56c403a563bc3d01a514516ef1ef974b0b0ae037..2d0acd6678fe4c680615685996497dc5ebcf78df 100644 (file)
@@ -2204,7 +2204,7 @@ out:
        kfree(o2net_hand);
        kfree(o2net_keep_req);
        kfree(o2net_keep_resp);
-
+       o2net_debugfs_exit();
        o2quo_exit();
        return -ENOMEM;
 }
index ccd4dcfc36457c211762e4dc7348949774c7df5b..02878a83f0b4e88655114ff78b9883e0494432a6 100644 (file)
@@ -1617,7 +1617,7 @@ int __ocfs2_add_entry(handle_t *handle,
        struct ocfs2_dir_entry *de, *de1;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
        struct super_block *sb = dir->i_sb;
-       int retval, status;
+       int retval;
        unsigned int size = sb->s_blocksize;
        struct buffer_head *insert_bh = lookup->dl_leaf_bh;
        char *data_start = insert_bh->b_data;
@@ -1695,25 +1695,25 @@ int __ocfs2_add_entry(handle_t *handle,
                        }
 
                        if (insert_bh == parent_fe_bh)
-                               status = ocfs2_journal_access_di(handle,
+                               retval = ocfs2_journal_access_di(handle,
                                                                 INODE_CACHE(dir),
                                                                 insert_bh,
                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                        else {
-                               status = ocfs2_journal_access_db(handle,
+                               retval = ocfs2_journal_access_db(handle,
                                                                 INODE_CACHE(dir),
                                                                 insert_bh,
                                              OCFS2_JOURNAL_ACCESS_WRITE);
 
-                               if (ocfs2_dir_indexed(dir)) {
-                                       status = ocfs2_dx_dir_insert(dir,
+                               if (!retval && ocfs2_dir_indexed(dir))
+                                       retval = ocfs2_dx_dir_insert(dir,
                                                                handle,
                                                                lookup);
-                                       if (status) {
-                                               mlog_errno(status);
-                                               goto bail;
-                                       }
-                               }
+                       }
+
+                       if (retval) {
+                               mlog_errno(retval);
+                               goto bail;
                        }
 
                        /* By now the buffer is marked for journaling */
@@ -3543,13 +3543,10 @@ static void dx_leaf_sort_swap(void *a, void *b, int size)
 {
        struct ocfs2_dx_entry *entry1 = a;
        struct ocfs2_dx_entry *entry2 = b;
-       struct ocfs2_dx_entry tmp;
 
        BUG_ON(size != sizeof(*entry1));
 
-       tmp = *entry1;
-       *entry1 = *entry2;
-       *entry2 = tmp;
+       swap(*entry1, *entry2);
 }
 
 static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
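
The dx_leaf_sort_swap() hunk above (and swap_refcount_rec() and ocfs2_double_lock() later in this diff) replaces open-coded three-statement exchanges with the generic swap() macro from <linux/kernel.h>. The pattern in isolation, using a stand-in struct:

	#include <linux/kernel.h>

	struct pair { int x, y; };	/* any assignable type works */

	static void exchange(struct pair *a, struct pair *b)
	{
		/* typeof-based exchange; no explicit tmp variable needed */
		swap(*a, *b);
	}
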
index fae17c640df3eebb9fe572485709cb395f73a4bf..e88ccf8c83fff7b13310d76745f34697ea4202b5 100644 (file)
@@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 
 /* will exit holding res->spinlock, but may drop in function */
 void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
-void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
 
 /* will exit holding res->spinlock, but may drop in function */
 static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
index d8b670cbd909292a6da33e5b16d3c16467727cb3..fbfadb289e628ce32decb024bab92792e4ebc995 100644 (file)
@@ -2250,7 +2250,7 @@ out:
 static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                                    struct iov_iter *from)
 {
-       int direct_io, appending, rw_level, have_alloc_sem  = 0;
+       int direct_io, appending, rw_level;
        int can_do_direct, has_refcount = 0;
        ssize_t written = 0;
        ssize_t ret;
@@ -2279,16 +2279,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 
        mutex_lock(&inode->i_mutex);
 
-       ocfs2_iocb_clear_sem_locked(iocb);
-
 relock:
-       /* to match setattr's i_mutex -> rw_lock ordering */
-       if (direct_io) {
-               have_alloc_sem = 1;
-               /* communicate with ocfs2_dio_end_io */
-               ocfs2_iocb_set_sem_locked(iocb);
-       }
-
        /*
         * Concurrent O_DIRECT writes are allowed with
         * mount_option "coherency=buffered".
@@ -2298,7 +2289,7 @@ relock:
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
                mlog_errno(ret);
-               goto out_sems;
+               goto out_mutex;
        }
 
        /*
@@ -2347,7 +2338,6 @@ relock:
        if (direct_io && !can_do_direct) {
                ocfs2_rw_unlock(inode, rw_level);
 
-               have_alloc_sem = 0;
                rw_level = -1;
 
                direct_io = 0;
@@ -2416,7 +2406,6 @@ no_sync:
         */
        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                rw_level = -1;
-               have_alloc_sem = 0;
                unaligned_dio = 0;
        }
 
@@ -2429,10 +2418,7 @@ out:
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
 
-out_sems:
-       if (have_alloc_sem)
-               ocfs2_iocb_clear_sem_locked(iocb);
-
+out_mutex:
        mutex_unlock(&inode->i_mutex);
 
        if (written)
@@ -2473,7 +2459,7 @@ bail:
 static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
                                   struct iov_iter *to)
 {
-       int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
+       int ret = 0, rw_level = -1, lock_level = 0;
        struct file *filp = iocb->ki_filp;
        struct inode *inode = file_inode(filp);
 
@@ -2490,16 +2476,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
                goto bail;
        }
 
-       ocfs2_iocb_clear_sem_locked(iocb);
-
        /*
         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
         * need locks to protect pending reads from racing with truncate.
         */
        if (iocb->ki_flags & IOCB_DIRECT) {
-               have_alloc_sem = 1;
-               ocfs2_iocb_set_sem_locked(iocb);
-
                ret = ocfs2_rw_lock(inode, 0);
                if (ret < 0) {
                        mlog_errno(ret);
@@ -2535,13 +2516,9 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
        /* see ocfs2_file_write_iter */
        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
                rw_level = -1;
-               have_alloc_sem = 0;
        }
 
 bail:
-       if (have_alloc_sem)
-               ocfs2_iocb_clear_sem_locked(iocb);
-
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
 
index ff531928269ed1d4e1e0f79bfd6335db3cbc3671..7c099f7032fdbcc6ce61a53eb7cad23c3a1bac29 100644 (file)
@@ -108,7 +108,7 @@ struct ocfs2_replay_map {
        unsigned char rm_replay_slots[0];
 };
 
-void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
 {
        if (!osb->replay_map)
                return;
@@ -153,7 +153,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
        return 0;
 }
 
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
                enum ocfs2_orphan_reco_type orphan_reco_type)
 {
        struct ocfs2_replay_map *replay_map = osb->replay_map;
@@ -173,7 +173,7 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
        replay_map->rm_state = REPLAY_DONE;
 }
 
-void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
 {
        struct ocfs2_replay_map *replay_map = osb->replay_map;
 
@@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
             (unsigned long)bh,
             (unsigned long long)bh->b_blocknr);
 
-       /* We aren't guaranteed to have the superblock here - but if we
-        * don't, it'll just crash. */
-       ocfs2_error(bh->b_assoc_map->host->i_sb,
+       ocfs2_error(bh->b_bdev->bd_super,
                    "JBD2 has aborted our journal, ocfs2 cannot continue\n");
 }
 
@@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
        trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
 
        status = jbd2_journal_dirty_metadata(handle, bh);
-       BUG_ON(status);
+       if (status) {
+               mlog_errno(status);
+               if (!is_handle_aborted(handle)) {
+                       journal_t *journal = handle->h_transaction->t_journal;
+                       struct super_block *sb = bh->b_bdev->bd_super;
+
+                       mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
+                                       "Aborting transaction and journal.\n");
+                       handle->h_err = status;
+                       jbd2_journal_abort_handle(handle);
+                       jbd2_journal_abort(journal, status);
+                       ocfs2_abort(sb, "Journal already aborted.\n");
+               }
+       }
 }
 
 #define OCFS2_DEFAULT_COMMIT_INTERVAL  (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -1884,7 +1895,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
  * hasn't happened.  The node queues a scan and increments the
  * sequence number in the LVB.
  */
-void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 {
        struct ocfs2_orphan_scan *os;
        int status, i;
@@ -1933,7 +1944,7 @@ out:
 }
 
 /* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
-void ocfs2_orphan_scan_work(struct work_struct *work)
+static void ocfs2_orphan_scan_work(struct work_struct *work)
 {
        struct ocfs2_orphan_scan *os;
        struct ocfs2_super *osb;
@@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
        struct inode *inode = NULL;
        struct inode *iter;
        struct ocfs2_inode_info *oi;
+       struct buffer_head *di_bh = NULL;
+       struct ocfs2_dinode *di = NULL;
 
        trace_ocfs2_recover_orphans(slot);
 
@@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                iter = oi->ip_next_orphan;
                oi->ip_next_orphan = NULL;
 
+               ret = ocfs2_rw_lock(inode, 1);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto next;
+               }
                /*
                 * We need to take and drop the inode lock to
                 * force read inode from disk.
                 */
-               ret = ocfs2_inode_lock(inode, NULL, 0);
+               ret = ocfs2_inode_lock(inode, &di_bh, 1);
                if (ret) {
                        mlog_errno(ret);
-                       goto next;
+                       goto unlock_rw;
                }
-               ocfs2_inode_unlock(inode, 0);
+
+               di = (struct ocfs2_dinode *)di_bh->b_data;
 
                if (inode->i_nlink == 0) {
                        spin_lock(&oi->ip_lock);
@@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                         * ocfs2_delete_inode. */
                        oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
                        spin_unlock(&oi->ip_lock);
-               } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
-                       struct buffer_head *di_bh = NULL;
-
-                       ret = ocfs2_rw_lock(inode, 1);
-                       if (ret) {
-                               mlog_errno(ret);
-                               goto next;
-                       }
-
-                       ret = ocfs2_inode_lock(inode, &di_bh, 1);
-                       if (ret < 0) {
-                               ocfs2_rw_unlock(inode, 1);
-                               mlog_errno(ret);
-                               goto next;
-                       }
-
+               } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
+                               (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
                        ret = ocfs2_truncate_file(inode, di_bh,
                                        i_size_read(inode));
-                       ocfs2_inode_unlock(inode, 1);
-                       ocfs2_rw_unlock(inode, 1);
-                       brelse(di_bh);
                        if (ret < 0) {
                                if (ret != -ENOSPC)
                                        mlog_errno(ret);
-                               goto next;
+                               goto unlock_inode;
                        }
 
-                       ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+                       ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
                        if (ret)
                                mlog_errno(ret);
 
                        wake_up(&OCFS2_I(inode)->append_dio_wq);
                } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-
+unlock_inode:
+               ocfs2_inode_unlock(inode, 1);
+unlock_rw:
+               ocfs2_rw_unlock(inode, 1);
 next:
                iput(inode);
-
+               brelse(di_bh);
+               di_bh = NULL;
                inode = iter;
        }
 
index 176fe6afd94eccf584c7ce8faa5556eee776457b..6e6abb93fda590468406951b54419b9d26bcbcc4 100644 (file)
@@ -1116,8 +1116,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
        int inode1_is_ancestor, inode2_is_ancestor;
        struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
        struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
-       struct buffer_head **tmpbh;
-       struct inode *tmpinode;
 
        trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
                                (unsigned long long)oi2->ip_blkno);
@@ -1148,13 +1146,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
                                (oi1->ip_blkno < oi2->ip_blkno &&
                                inode2_is_ancestor == 0)) {
                        /* switch id1 and id2 around */
-                       tmpbh = bh2;
-                       bh2 = bh1;
-                       bh1 = tmpbh;
-
-                       tmpinode = inode2;
-                       inode2 = inode1;
-                       inode1 = tmpinode;
+                       swap(bh2, bh1);
+                       swap(inode2, inode1);
                }
                /* lock id2 */
                status = ocfs2_inode_lock_nested(inode2, bh2, 1,
@@ -2670,30 +2663,22 @@ bail:
 }
 
 int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
-               struct inode *inode, int update_isize,
-               loff_t end)
+               struct inode *inode, struct buffer_head *di_bh,
+               int update_isize, loff_t end)
 {
        struct inode *orphan_dir_inode = NULL;
        struct buffer_head *orphan_dir_bh = NULL;
-       struct buffer_head *di_bh = NULL;
-       struct ocfs2_dinode *di = NULL;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        handle_t *handle = NULL;
        int status = 0;
 
-       status = ocfs2_inode_lock(inode, &di_bh, 1);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
-       di = (struct ocfs2_dinode *) di_bh->b_data;
-
        orphan_dir_inode = ocfs2_get_system_file_inode(osb,
                        ORPHAN_DIR_SYSTEM_INODE,
                        le16_to_cpu(di->i_dio_orphaned_slot));
        if (!orphan_dir_inode) {
                status = -ENOENT;
                mlog_errno(status);
-               goto bail_unlock_inode;
+               goto bail;
        }
 
        mutex_lock(&orphan_dir_inode->i_mutex);
@@ -2702,7 +2687,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
                mutex_unlock(&orphan_dir_inode->i_mutex);
                iput(orphan_dir_inode);
                mlog_errno(status);
-               goto bail_unlock_inode;
+               goto bail;
        }
 
        handle = ocfs2_start_trans(osb,
@@ -2749,10 +2734,6 @@ bail_unlock_orphan:
        brelse(orphan_dir_bh);
        iput(orphan_dir_inode);
 
-bail_unlock_inode:
-       ocfs2_inode_unlock(inode, 1);
-       brelse(di_bh);
-
 bail:
        return status;
 }
index 5ddecce172fad738d2d436df76618bda9f19b756..e173329eb83057ed6ce74c769c3dd3f769e08b24 100644 (file)
@@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
                struct inode *inode);
 int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
-               struct inode *inode, int update_isize,
-               loff_t end);
+               struct inode *inode, struct buffer_head *di_bh,
+               int update_isize, loff_t end);
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
                                   struct inode *new_inode,
                                   struct dentry *new_dentry);
index 460c6c37e683f844bd9612d51e6b89224cabb22b..690ddc60189b5270246f971c4e21a63bfb0a0254 100644 (file)
@@ -717,6 +717,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
        return (u64)clusters << c_to_b_bits;
 }
 
+static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb,
+               u64 blocks)
+{
+       int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+                       sb->s_blocksize_bits;
+
+       blocks += (1 << b_to_c_bits) - 1;
+       return (u32)(blocks >> b_to_c_bits);
+}
+
 static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
                                           u64 blocks)
 {
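
As a worked example of the new ocfs2_clusters_for_blocks() helper above (the geometry is hypothetical, not taken from this commit): with 4 KB blocks (s_blocksize_bits = 12) and a 32 KB cluster size (s_clustersize_bits = 15), b_to_c_bits = 15 - 12 = 3, so

	clusters = (blocks + (1 << 3) - 1) >> 3

and a 9-block extent rounds up to (9 + 7) >> 3 = 2 clusters rather than truncating to 1. This is what lets the ocfs2_direct_IO_get_blocks() hunk earlier in this diff compare clusters_to_alloc against contig_clusters in like units.
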
index d8c6af101f3ff79ebb59589cdafab63a3f3c8f95..b69dd14c0b9ba1e39aa7662aec0350e8b5044702 100644 (file)
@@ -1406,11 +1406,9 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
 
 static void swap_refcount_rec(void *a, void *b, int size)
 {
-       struct ocfs2_refcount_rec *l = a, *r = b, tmp;
+       struct ocfs2_refcount_rec *l = a, *r = b;
 
-       tmp = *l;
-       *l = *r;
-       *r = tmp;
+       swap(*l, *r);
 }
 
 /*
index d03bfbf3d27d503b3b10bc912f30da9d494c1edb..889f3796a0d732638ce8d76fe9bf0b0c484eb7e0 100644 (file)
@@ -7271,7 +7271,7 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
                               name, value, size, flags);
 }
 
-int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
                     void *fs_info)
 {
        const struct xattr *xattr;
index fd02a9ebfc30e5086bf45e19b3aa6ca8d1e13e2a..3f57dac31ba66983f4f1295ff9fc99ff15211e4c 100644 (file)
@@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk)
 {
        unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
 
+       /*
+        * Parked tasks do not run; they sit in __kthread_parkme().
+        * Without this check, we would report them as running, which is
+        * clearly wrong, so we report them as sleeping instead.
+        */
+       if (tsk->state == TASK_PARKED)
+               state = TASK_INTERRUPTIBLE;
+
        BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
 
        return task_state_array[fls(state)];
index 4f355a1c1a9e87e45678d573ca86a3496d7dd4a5..5fc1e50a7f30c4258c018f560709c54fda005b40 100644 (file)
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                                break;
 
                        error = add_to_page_cache_lru(page, mapping, index,
-                                               GFP_KERNEL);
+                                       GFP_KERNEL & mapping_gfp_mask(mapping));
                        if (unlikely(error)) {
                                page_cache_release(page);
                                if (error == -EEXIST)
index bd910ceaccfa2d5b66da77b1b5becc760c50a615..29c57b2cb344dfc0bd73034927bfa50848d6709b 100644 (file)
@@ -96,11 +96,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 }
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-                                      unsigned long address,
-                                      pmd_t *pmdp)
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+                                           unsigned long address,
+                                           pmd_t *pmdp)
 {
        pmd_t pmd = *pmdp;
        pmd_clear(pmdp);
@@ -109,13 +109,13 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pmd_t *pmdp,
                                            int full)
 {
-       return pmdp_get_and_clear(mm, address, pmdp);
+       return pmdp_huge_get_and_clear(mm, address, pmdp);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
@@ -152,8 +152,8 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
                              pte_t *ptep);
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
-extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pmd_t *pmdp);
 #endif
@@ -189,6 +189,22 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
 #endif
 
+#ifndef pmdp_collapse_flush
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+                                unsigned long address, pmd_t *pmdp);
+#else
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+                                       unsigned long address,
+                                       pmd_t *pmdp)
+{
+       BUILD_BUG();
+       return *pmdp;
+}
+#define pmdp_collapse_flush pmdp_collapse_flush
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
 extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                       pgtable_t pgtable);
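
The rename from pmdp_get_and_clear() to pmdp_huge_get_and_clear() (and of the _full and _clear_flush variants) makes explicit that a huge PMD entry, not a pointer to a page table, is being cleared. A hedged caller-side sketch, loosely modelled on the THP teardown paths; the helper name is hypothetical and locking, rmap and accounting are left to the caller:

	#include <linux/mm.h>
	#include <asm/pgtable.h>

	/* Assumes the caller holds the PMD lock and mmap_sem, as the real
	 * THP paths do. */
	static struct page *clear_huge_pmd(struct mm_struct *mm,
					   unsigned long addr, pmd_t *pmd)
	{
		pmd_t orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);

		return pmd_page(orig_pmd);
	}
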
index 0995c2de8162c2f6368647503ddda9017a68f6c6..f589222bfa87e457aa211469c22be8341450b6d5 100644 (file)
@@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename,
 /* Only NUMA needs hash distribution. 64bit NUMA architectures have
  * sufficient vmalloc space.
  */
-#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT)
-#define HASHDIST_DEFAULT 1
+#ifdef CONFIG_NUMA
+#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT)
+extern int hashdist;           /* Distribute hashes across NUMA nodes? */
 #else
-#define HASHDIST_DEFAULT 0
+#define hashdist (0)
 #endif
-extern int hashdist;           /* Distribute hashes across NUMA nodes? */
 
 
 #endif /* _LINUX_BOOTMEM_H */
index 34025df6182962952e094f9c98e7a38b3c25bab4..c9e5c57e4edf2c09ccc4c09c72c54a87c1ecbd9c 100644 (file)
@@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item)
        return item->ci_name;
 }
 
-extern void config_item_init(struct config_item *);
 extern void config_item_init_type_name(struct config_item *item,
                                       const char *name,
                                       struct config_item_type *type);
index 2092965afca3994606ee8a255a97929a38df8095..5f19efe4eb3f0cb52d43e777be43dc65c9a41bc3 100644 (file)
@@ -96,6 +96,8 @@ typedef       struct {
 #define EFI_MEMORY_WP          ((u64)0x0000000000001000ULL)    /* write-protect */
 #define EFI_MEMORY_RP          ((u64)0x0000000000002000ULL)    /* read-protect */
 #define EFI_MEMORY_XP          ((u64)0x0000000000004000ULL)    /* execute-protect */
+#define EFI_MEMORY_MORE_RELIABLE \
+                               ((u64)0x0000000000010000ULL)    /* higher reliability */
 #define EFI_MEMORY_RUNTIME     ((u64)0x8000000000000000ULL)    /* range requires runtime mapping */
 #define EFI_MEMORY_DESCRIPTOR_VERSION  1
 
@@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void);  /* switch EFI to virtual mode, if pos
 extern void efi_late_init(void);
 extern void efi_free_boot_services(void);
 extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size);
+extern void efi_find_mirror(void);
 #else
 static inline void efi_late_init(void) {}
 static inline void efi_free_boot_services(void) {}
index 8293262401de39db28832ddca2d6e5bc02fd5923..e65ef959546cd8f35ec4a3a78e4d97d822781f04 100644 (file)
@@ -6,16 +6,16 @@
 #include <linux/bitops.h>
 
 struct frontswap_ops {
-       void (*init)(unsigned);
-       int (*store)(unsigned, pgoff_t, struct page *);
-       int (*load)(unsigned, pgoff_t, struct page *);
-       void (*invalidate_page)(unsigned, pgoff_t);
-       void (*invalidate_area)(unsigned);
+       void (*init)(unsigned); /* this swap type was just swapon'ed */
+       int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
+       int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
+       void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
+       void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
+       struct frontswap_ops *next; /* private pointer to next ops */
 };
 
 extern bool frontswap_enabled;
-extern struct frontswap_ops *
-       frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_register_ops(struct frontswap_ops *ops);
 extern void frontswap_shrink(unsigned long);
 extern unsigned long frontswap_curr_pages(void);
 extern void frontswap_writethrough(bool);
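
With backends now kept on a list through the new ->next member, frontswap_register_ops() returns void and never fails; a later backend simply chains behind earlier ones, which is why the Xen tmem hunk earlier in this diff drops its old_ops handling. A registration sketch with stub callbacks (all names here are placeholders, not part of the commit):

	#include <linux/frontswap.h>
	#include <linux/module.h>

	static void my_init(unsigned type) { }
	static int my_store(unsigned type, pgoff_t off, struct page *page) { return -1; }
	static int my_load(unsigned type, pgoff_t off, struct page *page) { return -1; }
	static void my_invalidate_page(unsigned type, pgoff_t off) { }
	static void my_invalidate_area(unsigned type) { }

	static struct frontswap_ops my_frontswap_ops = {
		.init		 = my_init,
		.store		 = my_store,
		.load		 = my_load,
		.invalidate_page = my_invalidate_page,
		.invalidate_area = my_invalidate_area,
		/* .next is linked up by the frontswap core at registration */
	};

	static int __init my_backend_init(void)
	{
		frontswap_register_ops(&my_frontswap_ops);
		return 0;
	}
	module_init(my_backend_init);
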
index 0f313f93c586f8e60e2c39cf5d1b99ae48b1f6cd..65a517dd32f7ad0e23ce5ed69129be657d3b3fdf 100644 (file)
@@ -84,8 +84,6 @@ struct fsnotify_fname;
  * Each group much define these ops.  The fsnotify infrastructure will call
  * these operations for each relevant group.
  *
- * should_send_event - given a group, inode, and mask this function determines
- *             if the group is interested in this event.
  * handle_event - main call for a group to handle an fs event
  * free_group_priv - called when a group refcnt hits 0 to clean up the private union
  * freeing_mark - called when a mark is being destroyed for some reason.  The group
index e705467ddb478d1d0a56f53269f5bebf19822b32..d0a1f99e24e3eb43219e9f8b8aad400c1cc867a9 100644 (file)
@@ -28,7 +28,8 @@
 extern void kmemleak_init(void) __ref;
 extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
                           gfp_t gfp) __ref;
-extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref;
+extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+                                 gfp_t gfp) __ref;
 extern void kmemleak_free(const void *ptr) __ref;
 extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
 extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
@@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
                                            gfp_t gfp)
 {
 }
-static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+                                        gfp_t gfp)
 {
 }
 static inline void kmemleak_free(const void *ptr)
index 9497ec7c77ea1b45d0a48e9d390d292dd6db5153..0215ffd630690b35016dce6df131ea5f81902a27 100644 (file)
 #define INIT_PHYSMEM_REGIONS   4
 
 /* Definition of memblock flags. */
-#define MEMBLOCK_HOTPLUG       0x1     /* hotpluggable region */
+enum {
+       MEMBLOCK_NONE           = 0x0,  /* No special request */
+       MEMBLOCK_HOTPLUG        = 0x1,  /* hotpluggable region */
+       MEMBLOCK_MIRROR         = 0x2,  /* mirrored region */
+};
 
 struct memblock_region {
        phys_addr_t base;
@@ -61,7 +65,7 @@ extern bool movable_node_enabled;
 
 phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
                                            phys_addr_t start, phys_addr_t end,
-                                           int nid);
+                                           int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
                                   phys_addr_t size, phys_addr_t align);
 phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -75,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
+int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
+ulong choose_memblock_flags(void);
 
 /* Low level functions */
 int memblock_add_range(struct memblock_type *type,
@@ -85,11 +91,13 @@ int memblock_remove_range(struct memblock_type *type,
                          phys_addr_t base,
                          phys_addr_t size);
 
-void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range(u64 *idx, int nid, ulong flags,
+                     struct memblock_type *type_a,
                      struct memblock_type *type_b, phys_addr_t *out_start,
                      phys_addr_t *out_end, int *out_nid);
 
-void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
+                         struct memblock_type *type_a,
                          struct memblock_type *type_b, phys_addr_t *out_start,
                          phys_addr_t *out_end, int *out_nid);
 
@@ -100,16 +108,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
  * @type_a: ptr to memblock_type to iterate
  * @type_b: ptr to memblock_type which excludes from the iteration
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
  */
-#define for_each_mem_range(i, type_a, type_b, nid,                     \
+#define for_each_mem_range(i, type_a, type_b, nid, flags,              \
                           p_start, p_end, p_nid)                       \
-       for (i = 0, __next_mem_range(&i, nid, type_a, type_b,           \
+       for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b,    \
                                     p_start, p_end, p_nid);            \
             i != (u64)ULLONG_MAX;                                      \
-            __next_mem_range(&i, nid, type_a, type_b,                  \
+            __next_mem_range(&i, nid, flags, type_a, type_b,           \
                              p_start, p_end, p_nid))
 
 /**
@@ -119,17 +128,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
  * @type_a: ptr to memblock_type to iterate
  * @type_b: ptr to memblock_type which excludes from the iteration
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
  */
-#define for_each_mem_range_rev(i, type_a, type_b, nid,                 \
+#define for_each_mem_range_rev(i, type_a, type_b, nid, flags,          \
                               p_start, p_end, p_nid)                   \
        for (i = (u64)ULLONG_MAX,                                       \
-                    __next_mem_range_rev(&i, nid, type_a, type_b,      \
+                    __next_mem_range_rev(&i, nid, flags, type_a, type_b,\
                                         p_start, p_end, p_nid);        \
             i != (u64)ULLONG_MAX;                                      \
-            __next_mem_range_rev(&i, nid, type_a, type_b,              \
+            __next_mem_range_rev(&i, nid, flags, type_a, type_b,       \
                                  p_start, p_end, p_nid))
 
 #ifdef CONFIG_MOVABLE_NODE
@@ -153,6 +163,11 @@ static inline bool movable_node_is_enabled(void)
 }
 #endif
 
+static inline bool memblock_is_mirror(struct memblock_region *m)
+{
+       return m->flags & MEMBLOCK_MIRROR;
+}
+
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
                            unsigned long  *end_pfn);
@@ -181,13 +196,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flags: pick from blocks based on memory attributes
  *
  * Walks over free (memory && !reserved) areas of memblock.  Available as
  * soon as memblock is initialized.
  */
-#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid)         \
+#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid)  \
        for_each_mem_range(i, &memblock.memory, &memblock.reserved,     \
-                          nid, p_start, p_end, p_nid)
+                          nid, flags, p_start, p_end, p_nid)
 
 /**
  * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@@ -196,13 +212,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flags: pick from blocks based on memory attributes
  *
  * Walks over free (memory && !reserved) areas of memblock in reverse
  * order.  Available as soon as memblock is initialized.
  */
-#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \
+#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \
+                                       p_nid)                          \
        for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
-                              nid, p_start, p_end, p_nid)
+                              nid, flags, p_start, p_end, p_nid)
 
 static inline void memblock_set_region_flags(struct memblock_region *r,
                                             unsigned long flags)
@@ -273,7 +291,8 @@ static inline bool memblock_bottom_up(void) { return false; }
 #define MEMBLOCK_ALLOC_ACCESSIBLE      0
 
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
-                                       phys_addr_t start, phys_addr_t end);
+                                       phys_addr_t start, phys_addr_t end,
+                                       ulong flags);
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
                                phys_addr_t max_addr);
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
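
As an illustration of the extra flags argument, boot-time code can now walk mirrored memory first and fall back to any memory. A minimal sketch using only the iterators and flags introduced above (the helper name and fallback policy are illustrative, not part of this series):

static phys_addr_t __init find_boot_range(phys_addr_t size, int nid)
{
        phys_addr_t start, end;
        u64 i;

        /* First pass: only ranges tagged MEMBLOCK_MIRROR are returned. */
        for_each_free_mem_range(i, nid, MEMBLOCK_MIRROR, &start, &end, NULL)
                if (end - start >= size)
                        return start;

        /* Second pass: MEMBLOCK_NONE applies no attribute filtering. */
        for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL)
                if (end - start >= size)
                        return start;

        return 0;
}
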
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
new file mode 100644 (file)
index 0000000..4efc3f5
--- /dev/null
@@ -0,0 +1,25 @@
+/*
+ * Generic mm no-op hooks.
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _LINUX_MM_ARCH_HOOKS_H
+#define _LINUX_MM_ARCH_HOOKS_H
+
+#include <asm/mm-arch-hooks.h>
+
+#ifndef arch_remap
+static inline void arch_remap(struct mm_struct *mm,
+                             unsigned long old_start, unsigned long old_end,
+                             unsigned long new_start, unsigned long new_end)
+{
+}
+#define arch_remap arch_remap
+#endif
+
+#endif /* _LINUX_MM_ARCH_HOOKS_H */
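
The generic header only supplies a no-op default: an architecture that needs to react to mremap() provides its own asm/mm-arch-hooks.h and defines the hook itself. A hypothetical override (the architecture and body below are illustrative only):

/* arch/<arch>/include/asm/mm-arch-hooks.h -- hypothetical override */
#ifndef _ASM_EXAMPLE_MM_ARCH_HOOKS_H
#define _ASM_EXAMPLE_MM_ARCH_HOOKS_H

static inline void arch_remap(struct mm_struct *mm,
                              unsigned long old_start, unsigned long old_end,
                              unsigned long new_start, unsigned long new_end)
{
        /* e.g. refresh a per-mm cached address if the moved range held it */
}
#define arch_remap arch_remap

#endif /* _ASM_EXAMPLE_MM_ARCH_HOOKS_H */
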
index 0755b9fd03a7d936e805efb71cfc1d371ddea0d6..24ad583596d1219b4ec1111e5aea3045230ee650 100644 (file)
@@ -499,7 +499,7 @@ static inline int page_count(struct page *page)
 
 static inline bool __compound_tail_refcounted(struct page *page)
 {
-       return !PageSlab(page) && !PageHeadHuge(page);
+       return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
 }
 
 /*
@@ -2146,12 +2146,47 @@ enum mf_flags {
 extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
+extern int get_hwpoison_page(struct page *page);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t num_poisoned_pages;
 extern int soft_offline_page(struct page *page, int flags);
 
+
+/*
+ * Error handlers for various types of pages.
+ */
+enum mf_result {
+       MF_IGNORED,     /* Error: cannot be handled */
+       MF_FAILED,      /* Error: handling failed */
+       MF_DELAYED,     /* Will be handled later */
+       MF_RECOVERED,   /* Successfully recovered */
+};
+
+enum mf_action_page_type {
+       MF_MSG_KERNEL,
+       MF_MSG_KERNEL_HIGH_ORDER,
+       MF_MSG_SLAB,
+       MF_MSG_DIFFERENT_COMPOUND,
+       MF_MSG_POISONED_HUGE,
+       MF_MSG_HUGE,
+       MF_MSG_FREE_HUGE,
+       MF_MSG_UNMAP_FAILED,
+       MF_MSG_DIRTY_SWAPCACHE,
+       MF_MSG_CLEAN_SWAPCACHE,
+       MF_MSG_DIRTY_MLOCKED_LRU,
+       MF_MSG_CLEAN_MLOCKED_LRU,
+       MF_MSG_DIRTY_UNEVICTABLE_LRU,
+       MF_MSG_CLEAN_UNEVICTABLE_LRU,
+       MF_MSG_DIRTY_LRU,
+       MF_MSG_CLEAN_LRU,
+       MF_MSG_TRUNCATED_LRU,
+       MF_MSG_BUDDY,
+       MF_MSG_BUDDY_2ND,
+       MF_MSG_UNKNOWN,
+};
+
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
 extern void clear_huge_page(struct page *page,
                            unsigned long addr,
index 95243d28a0ee72cfaa0abdeb77dfb83bd4738cd9..61cd67f4d7881cbbd8eba481729c06b31d45e9c6 100644 (file)
@@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        ___pte;                                                         \
 })
 
-#define pmdp_clear_flush_notify(__vma, __haddr, __pmd)                 \
+#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd)            \
 ({                                                                     \
        unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;              \
        struct mm_struct *___mm = (__vma)->vm_mm;                       \
        pmd_t ___pmd;                                                   \
                                                                        \
-       ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd);               \
+       ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd);          \
        mmu_notifier_invalidate_range(___mm, ___haddr,                  \
                                      ___haddr + HPAGE_PMD_SIZE);       \
                                                                        \
        ___pmd;                                                         \
 })
 
-#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd)                        \
+#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd)           \
 ({                                                                     \
        unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;              \
        pmd_t ___pmd;                                                   \
                                                                        \
-       ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd);              \
+       ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd);         \
        mmu_notifier_invalidate_range(__mm, ___haddr,                   \
                                      ___haddr + HPAGE_PMD_SIZE);       \
                                                                        \
@@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define        ptep_clear_flush_notify ptep_clear_flush
-#define pmdp_clear_flush_notify pmdp_clear_flush
-#define pmdp_get_and_clear_notify pmdp_get_and_clear
+#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
+#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
index 3d46fb4708e051ef78f4be83651f4a707a14b56d..f94da0e65dea90e1fa0950420a66c14f3dcb472a 100644 (file)
@@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled;
 extern int soft_watchdog_enabled;
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
+extern unsigned long *watchdog_cpumask_bits;
 extern int sysctl_softlockup_all_cpu_backtrace;
 struct ctl_table;
 extern int proc_watchdog(struct ctl_table *, int ,
@@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int ,
                              void __user *, size_t *, loff_t *);
 extern int proc_watchdog_thresh(struct ctl_table *, int ,
                                void __user *, size_t *, loff_t *);
+extern int proc_watchdog_cpumask(struct ctl_table *, int,
+                                void __user *, size_t *, loff_t *);
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
index 44b2f6f7bbd8323b6462417f769fc14e7cf37a2c..7deecb7bca5e3f76b480fb6fd514fbdcfc1c90b9 100644 (file)
@@ -32,6 +32,8 @@ enum oom_scan_t {
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN                ((__force oom_flags_t)0x1)
 
+extern struct mutex oom_lock;
+
 static inline void set_current_oom_origin(void)
 {
        current->signal->oom_flags |= OOM_FLAG_ORIGIN;
@@ -47,9 +49,7 @@ static inline bool oom_task_origin(const struct task_struct *p)
        return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
 }
 
-extern void mark_tsk_oom_victim(struct task_struct *tsk);
-
-extern void unmark_oom_victim(void);
+extern void mark_oom_victim(struct task_struct *tsk);
 
 extern unsigned long oom_badness(struct task_struct *p,
                struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -62,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                             struct mem_cgroup *memcg, nodemask_t *nodemask,
                             const char *message);
 
-extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
-
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
                               int order, const nodemask_t *nodemask,
                               struct mem_cgroup *memcg);
@@ -75,6 +72,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 
 extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                int order, nodemask_t *mask, bool force_kill);
+
+extern void exit_oom_victim(void);
+
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
index ffd24c8301513f956c6ede0927e8548f73a131c9..9de2fdc8b5e4643b99c94176eb8dea6f1144852f 100644 (file)
@@ -153,8 +153,30 @@ size_t ksize(const void *);
 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
 #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
+/*
+ * KMALLOC_LOOP_LOW is the starting index for the loop in
+ * create_kmalloc_caches() that creates the kmalloc_caches objects.  The
+ * first two caches, 96 and 192 bytes, map to kmalloc_index() values 1
+ * and 2: index 1 (96) is used when KMALLOC_MIN_SIZE <= 32 and index 2
+ * (192) when KMALLOC_MIN_SIZE <= 64.  If KMALLOC_MIN_SIZE is larger than
+ * 64, neither cache is needed and the loop starts at KMALLOC_SHIFT_LOW.
+ */
+#if KMALLOC_MIN_SIZE <= 32
+#define KMALLOC_LOOP_LOW 1
+#elif KMALLOC_MIN_SIZE <= 64
+#define KMALLOC_LOOP_LOW 2
+#else
+#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW
+#endif
+
 #else
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
+/*
+ * KMALLOC_MIN_SIZE is 2^3 for slub and slob and 2^5 for slab, so even
+ * with slab it is <= 32 and the kmalloc-96 and kmalloc-192 caches must
+ * also be initialized.
+ */
+#define KMALLOC_LOOP_LOW 1
 #endif
 
 /*
@@ -240,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
  * belongs to.
  * 0 = zero alloc
  * 1 =  65 .. 96 bytes
- * 2 = 120 .. 192 bytes
- * n = 2^(n-1) .. 2^n -1
+ * 2 = 129 .. 192 bytes
+ * n = 2^(n-1)+1 .. 2^n
  */
 static __always_inline int kmalloc_index(size_t size)
 {
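
The corrected comment reflects what kmalloc_index() actually returns. A small sanity sketch for a configuration with KMALLOC_MIN_SIZE <= 32 (the init function is illustrative and not wired up anywhere):

static int __init kmalloc_index_demo(void)
{
        WARN_ON(kmalloc_index(96)  != 1);       /*  65..96  -> kmalloc-96  */
        WARN_ON(kmalloc_index(129) != 2);       /* 129..192 -> kmalloc-192 */
        WARN_ON(kmalloc_index(192) != 2);
        WARN_ON(kmalloc_index(193) != 8);       /* 193..256 -> kmalloc-256 */
        return 0;
}
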
index d600afb2192673702a9000d3e64b1788f15ec1af..da3c593f9845b09702453e4db9096806ff7271b7 100644 (file)
@@ -27,6 +27,8 @@ struct smpboot_thread_data;
  * @pre_unpark:                Optional unpark function, called before the thread is
  *                     unparked (cpu online). This is not guaranteed to be
  *                     called on the target cpu of the thread. Careful!
+ * @cpumask:           Internal state.  To update which threads are unparked,
+ *                     call smpboot_update_cpumask_percpu_thread().
  * @selfparking:       Thread is not parked by the park function.
  * @thread_comm:       The base name of the thread
  */
@@ -41,11 +43,14 @@ struct smp_hotplug_thread {
        void                            (*park)(unsigned int cpu);
        void                            (*unpark)(unsigned int cpu);
        void                            (*pre_unpark)(unsigned int cpu);
+       cpumask_var_t                   cpumask;
        bool                            selfparking;
        const char                      *thread_comm;
 };
 
 int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+                                        const struct cpumask *);
 
 #endif
index 79abb9c71772efa32c41fb448fd3beaddbd3e66e..1443d79e4fe66bb6456840d0597fadeed5b90bf9 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 #include <linux/aer.h>
 #include <linux/cper.h>
+#include <linux/mm.h>
 
 /*
  * MCE Extended Error Log trace event
@@ -232,6 +233,90 @@ TRACE_EVENT(aer_event,
                __print_flags(__entry->status, "|", aer_uncorrectable_errors))
 );
 
+/*
+ * memory-failure recovery action result event
+ *
+ * unsigned long pfn - Page Frame Number of the corrupted page
+ * int type    -       Page types of the corrupted page
+ * int result  -       Result of recovery action
+ */
+
+#ifdef CONFIG_MEMORY_FAILURE
+#define MF_ACTION_RESULT       \
+       EM ( MF_IGNORED, "Ignored" )    \
+       EM ( MF_FAILED,  "Failed" )     \
+       EM ( MF_DELAYED, "Delayed" )    \
+       EMe ( MF_RECOVERED, "Recovered" )
+
+#define MF_PAGE_TYPE           \
+       EM ( MF_MSG_KERNEL, "reserved kernel page" )                    \
+       EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" )       \
+       EM ( MF_MSG_SLAB, "kernel slab page" )                          \
+       EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
+       EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" )      \
+       EM ( MF_MSG_HUGE, "huge page" )                                 \
+       EM ( MF_MSG_FREE_HUGE, "free huge page" )                       \
+       EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )             \
+       EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )           \
+       EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )           \
+       EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" )       \
+       EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" )       \
+       EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" )       \
+       EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" )       \
+       EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" )                       \
+       EM ( MF_MSG_CLEAN_LRU, "clean LRU page" )                       \
+       EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" )       \
+       EM ( MF_MSG_BUDDY, "free buddy page" )                          \
+       EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" )            \
+       EMe ( MF_MSG_UNKNOWN, "unknown page" )
+
+/*
+ * First define the enums in MM_ACTION_RESULT to be exported to userspace
+ * via TRACE_DEFINE_ENUM().
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)      TRACE_DEFINE_ENUM(a);
+
+MF_ACTION_RESULT
+MF_PAGE_TYPE
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)               { a, b },
+#define EMe(a, b)      { a, b }
+
+TRACE_EVENT(memory_failure_event,
+       TP_PROTO(unsigned long pfn,
+                int type,
+                int result),
+
+       TP_ARGS(pfn, type, result),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, pfn)
+               __field(int, type)
+               __field(int, result)
+       ),
+
+       TP_fast_assign(
+               __entry->pfn    = pfn;
+               __entry->type   = type;
+               __entry->result = result;
+       ),
+
+       TP_printk("pfn %#lx: recovery action for %s: %s",
+               __entry->pfn,
+               __print_symbolic(__entry->type, MF_PAGE_TYPE),
+               __print_symbolic(__entry->result, MF_ACTION_RESULT)
+       )
+);
+#endif /* CONFIG_MEMORY_FAILURE */
 #endif /* _TRACE_HW_EVENT_MC_H */
 
 /* This part must be outside protection */
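
With the enums exported through TRACE_DEFINE_ENUM(), the memory-failure handler can report an outcome with a single call. A minimal sketch of such a call site (the wrapper name is illustrative; it assumes ras/ras_event.h is included and CONFIG_MEMORY_FAILURE is set):

static void report_mf_result(unsigned long pfn,
                             enum mf_action_page_type type,
                             enum mf_result result)
{
        /* Emits "pfn 0x...: recovery action for <type>: <result>". */
        trace_memory_failure_event(pfn, type, result);
}
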
index 22fcc05dec4022fa54f2507cb13eddf2137abacc..185752a729f6657fb9e83c54c609cb145760bbe9 100644 (file)
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk)
        mm_update_next_owner(mm);
        mmput(mm);
        if (test_thread_flag(TIF_MEMDIE))
-               unmark_oom_victim();
+               exit_oom_victim();
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
index c697f73d82d6a4157a15ef9b0039b007083f7857..7c434c39f02a250f4721475910e881b43b603313 100644 (file)
@@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu)
 
        mutex_lock(&smpboot_threads_lock);
        list_for_each_entry(cur, &hotplug_threads, list)
-               smpboot_unpark_thread(cur, cpu);
+               if (cpumask_test_cpu(cpu, cur->cpumask))
+                       smpboot_unpark_thread(cur, cpu);
        mutex_unlock(&smpboot_threads_lock);
 }
 
@@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 {
        unsigned int cpu;
 
+       /* Unpark any threads that were voluntarily parked. */
+       for_each_cpu_not(cpu, ht->cpumask) {
+               if (cpu_online(cpu)) {
+                       struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+                       if (tsk)
+                               kthread_unpark(tsk);
+               }
+       }
+
        /* We need to destroy also the parked threads of offline cpus */
        for_each_possible_cpu(cpu) {
                struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
        unsigned int cpu;
        int ret = 0;
 
+       if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
+               return -ENOMEM;
+       cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+
        get_online_cpus();
        mutex_lock(&smpboot_threads_lock);
        for_each_online_cpu(cpu) {
@@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
        smpboot_destroy_threads(plug_thread);
        mutex_unlock(&smpboot_threads_lock);
        put_online_cpus();
+       free_cpumask_var(plug_thread->cpumask);
 }
 EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
 
+/**
+ * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
+ * @plug_thread:       Hotplug thread descriptor
+ * @new:               Revised mask to use
+ *
+ * The cpumask field in the smp_hotplug_thread must not be updated directly
+ * by the client, but only by calling this function.
+ * This function can only be called on a registered smp_hotplug_thread.
+ */
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+                                        const struct cpumask *new)
+{
+       struct cpumask *old = plug_thread->cpumask;
+       cpumask_var_t tmp;
+       unsigned int cpu;
+
+       if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
+               return -ENOMEM;
+
+       get_online_cpus();
+       mutex_lock(&smpboot_threads_lock);
+
+       /* Park threads that were exclusively enabled on the old mask. */
+       cpumask_andnot(tmp, old, new);
+       for_each_cpu_and(cpu, tmp, cpu_online_mask)
+               smpboot_park_thread(plug_thread, cpu);
+
+       /* Unpark threads that are exclusively enabled on the new mask. */
+       cpumask_andnot(tmp, new, old);
+       for_each_cpu_and(cpu, tmp, cpu_online_mask)
+               smpboot_unpark_thread(plug_thread, cpu);
+
+       cpumask_copy(old, new);
+
+       mutex_unlock(&smpboot_threads_lock);
+       put_online_cpus();
+
+       free_cpumask_var(tmp);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
+
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
 
 /*
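
For illustration, a client that has already registered a struct smp_hotplug_thread could confine its threads to a subset of cpus as below; the descriptor and the chosen mask are hypothetical:

static int restrict_hotplug_threads(struct smp_hotplug_thread *ht)
{
        cpumask_var_t mask;
        int err;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        /* keep the per-cpu threads parked everywhere except cpus 0 and 1 */
        cpumask_clear(mask);
        cpumask_set_cpu(0, mask);
        cpumask_set_cpu(1, mask);

        err = smpboot_update_cpumask_percpu_thread(ht, mask);
        free_cpumask_var(mask);
        return err;
}
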
index b13e9d2de302411438ba62898ca27697130d38b0..812fcc3fd3906f7a095888a3fde0fe4893027cb6 100644 (file)
@@ -871,6 +871,13 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
+       {
+               .procname       = "watchdog_cpumask",
+               .data           = &watchdog_cpumask_bits,
+               .maxlen         = NR_CPUS,
+               .mode           = 0644,
+               .proc_handler   = proc_watchdog_cpumask,
+       },
        {
                .procname       = "softlockup_panic",
                .data           = &softlockup_panic,
index 581a68a04c64089b847d3b76d1abc138a83bb209..a6ffa43f299301dd750e9be092975df0d5e83786 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/sysctl.h>
 #include <linux/smpboot.h>
 #include <linux/sched/rt.h>
+#include <linux/tick.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
@@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace;
 #else
 #define sysctl_softlockup_all_cpu_backtrace 0
 #endif
+static struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+       for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
 static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
@@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void)
         * do we care if a 0 races with a timestamp?
         * all it means is the softlock check starts one cycle later
         */
-       for_each_online_cpu(cpu)
+       for_each_watchdog_cpu(cpu)
                per_cpu(watchdog_touch_ts, cpu) = 0;
 }
 
@@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void)
                goto unlock;
 
        get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_watchdog_cpu(cpu)
                watchdog_nmi_enable(cpu);
        put_online_cpus();
 
@@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void)
                goto unlock;
 
        get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_watchdog_cpu(cpu)
                watchdog_nmi_disable(cpu);
        put_online_cpus();
 
@@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void)
        int cpu;
 
        get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_watchdog_cpu(cpu)
                update_watchdog(cpu);
        put_online_cpus();
 }
@@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void)
                err = smpboot_register_percpu_thread(&watchdog_threads);
                if (err)
                        pr_err("Failed to create watchdog threads, disabled\n");
-               else
+               else {
+                       if (smpboot_update_cpumask_percpu_thread(
+                                   &watchdog_threads, &watchdog_cpumask))
+                               pr_err("Failed to set cpumask for watchdog threads\n");
                        watchdog_running = 1;
+               }
        } else {
                /*
                 * Enable/disable the lockup detectors or
@@ -879,12 +890,58 @@ out:
        mutex_unlock(&watchdog_proc_mutex);
        return err;
 }
+
+/*
+ * The cpumask is the mask of possible cpus that the watchdog can run
+ * on, not the mask of cpus it is actually running on.  This allows the
+ * user to specify a mask that will include cpus that have not yet
+ * been brought online, if desired.
+ */
+int proc_watchdog_cpumask(struct ctl_table *table, int write,
+                         void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       int err;
+
+       mutex_lock(&watchdog_proc_mutex);
+       err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+       if (!err && write) {
+               /* Remove impossible cpus to keep sysctl output cleaner. */
+               cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
+                           cpu_possible_mask);
+
+               if (watchdog_running) {
+                       /*
+                        * Failure would be due to being unable to allocate
+                        * a temporary cpumask, so we are likely not in a
+                        * position to do much else to make things better.
+                        */
+                       if (smpboot_update_cpumask_percpu_thread(
+                                   &watchdog_threads, &watchdog_cpumask) != 0)
+                               pr_err("cpumask update failed\n");
+               }
+       }
+       mutex_unlock(&watchdog_proc_mutex);
+       return err;
+}
+
 #endif /* CONFIG_SYSCTL */
 
 void __init lockup_detector_init(void)
 {
        set_sample_period();
 
+#ifdef CONFIG_NO_HZ_FULL
+       if (tick_nohz_full_enabled()) {
+               if (!cpumask_empty(tick_nohz_full_mask))
+                       pr_info("Disabling watchdog on nohz_full cores by default\n");
+               cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
+                              tick_nohz_full_mask);
+       } else
+               cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#else
+       cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#endif
+
        if (watchdog_enabled)
                watchdog_enable_all_cpus();
 }
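
From userspace the mask is driven through /proc/sys/kernel/watchdog_cpumask, which proc_do_large_bitmap() parses as a cpu range list. A hypothetical helper that confines the watchdog to cpus 0-3:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/watchdog_cpumask", "w");

        if (!f)
                return 1;
        /* range-list format, as accepted by proc_do_large_bitmap() */
        fprintf(f, "0-3\n");
        return fclose(f) ? 1 : 0;
}
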
index 390214da45463e0b134709af6f072ddcc9162f7d..c180af880ed5169cdbf1799c0650b61d68130990 100644 (file)
@@ -368,6 +368,7 @@ config MEMORY_FAILURE
        depends on ARCH_SUPPORTS_MEMORY_FAILURE
        bool "Enable recovery from hardware memory errors"
        select MEMORY_ISOLATION
+       select RAS
        help
          Enables code to recover from some memory failures on systems
          with MCA recovery. This allows a system to continue running
index 3a7a67b933942f5232091094f5d719d59bda13ab..e7d1db5330254da4a8d265b8784d1eb645693447 100644 (file)
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -182,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
        if (!size || !memblock_is_region_reserved(base, size))
                return -EINVAL;
 
-       /* ensure minimal alignment requied by mm core */
+       /* ensure minimal alignment required by mm core */
        alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
 
        /* alignment should be aligned with order_per_bit */
@@ -238,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base,
        /*
         * high_memory isn't direct mapped memory so retrieving its physical
         * address isn't appropriate.  But it would be useful to check the
-        * physical address of the highmem boundary so it's justfiable to get
+        * physical address of the highmem boundary so it's justifiable to get
         * the physical address from it.  On x86 there is a validation check for
         * this case, so the following workaround is needed to avoid it.
         */
@@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base,
                 */
                if (base < highmem_start && limit > highmem_start) {
                        addr = memblock_alloc_range(size, alignment,
-                                                   highmem_start, limit);
+                                                   highmem_start, limit,
+                                                   MEMBLOCK_NONE);
                        limit = highmem_start;
                }
 
                if (!addr) {
                        addr = memblock_alloc_range(size, alignment, base,
-                                                   limit);
+                                                   limit,
+                                                   MEMBLOCK_NONE);
                        if (!addr) {
                                ret = -ENOMEM;
                                goto err;
index 6bf5e42d560a46eea8e4916bd017d855033b8d65..8d17ceea8dbeb1f641687407f2a27ebbff480533 100644 (file)
@@ -196,7 +196,9 @@ void __delete_from_page_cache(struct page *page, void *shadow)
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
 
-       __dec_zone_page_state(page, NR_FILE_PAGES);
+       /* hugetlb pages do not participate in page cache accounting. */
+       if (!PageHuge(page))
+               __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
@@ -483,7 +485,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
-               __inc_zone_page_state(new, NR_FILE_PAGES);
+
+               /*
+                * hugetlb pages do not participate in page cache accounting.
+                */
+               if (!PageHuge(new))
+                       __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
                spin_unlock_irq(&mapping->tree_lock);
@@ -575,7 +582,10 @@ static int __add_to_page_cache_locked(struct page *page,
        radix_tree_preload_end();
        if (unlikely(error))
                goto err_insert;
-       __inc_zone_page_state(page, NR_FILE_PAGES);
+
+       /* hugetlb pages do not participate in page cache accounting. */
+       if (!huge)
+               __inc_zone_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false);
@@ -1654,8 +1664,8 @@ no_cached_page:
                        error = -ENOMEM;
                        goto out;
                }
-               error = add_to_page_cache_lru(page, mapping,
-                                               index, GFP_KERNEL);
+               error = add_to_page_cache_lru(page, mapping, index,
+                                       GFP_KERNEL & mapping_gfp_mask(mapping));
                if (error) {
                        page_cache_release(page);
                        if (error == -EEXIST) {
@@ -1756,7 +1766,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
                if (!page)
                        return -ENOMEM;
 
-               ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+               ret = add_to_page_cache_lru(page, mapping, offset,
+                               GFP_KERNEL & mapping_gfp_mask(mapping));
                if (ret == 0)
                        ret = mapping->a_ops->readpage(file, page);
                else if (ret == -EEXIST)
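
The practical effect is that a filesystem's gfp mask on the mapping is no longer widened by the read paths. A sketch of the filesystem side; mapping_set_gfp_mask() is the existing pagemap helper, and the GFP_NOFS choice here is only an example, not something taken from this patch:

static void example_restrict_readahead_gfp(struct inode *inode)
{
        /*
         * Clear __GFP_FS for this mapping.  add_to_page_cache_lru() in the
         * paths above now allocates with GFP_KERNEL & mapping_gfp_mask(),
         * so page cache pages for this file avoid fs reclaim recursion.
         */
        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
}
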
index 8d82809eb0859a49faea0ade42251daab1419982..27a9924caf617465937c784291766afd8f873476 100644 (file)
 #include <linux/swapfile.h>
 
 /*
- * frontswap_ops is set by frontswap_register_ops to contain the pointers
- * to the frontswap "backend" implementation functions.
+ * frontswap_ops are added by frontswap_register_ops, and provide the
+ * frontswap "backend" implementation functions.  Multiple implementations
+ * may be registered, but implementations can never deregister.  This
+ * is a simple singly-linked list of all registered implementations.
  */
 static struct frontswap_ops *frontswap_ops __read_mostly;
 
+#define for_each_frontswap_ops(ops)            \
+       for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
+
 /*
  * If enabled, frontswap_store will return failure even on success.  As
  * a result, the swap subsystem will always write the page to swap, in
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { }
  * on all frontswap functions to not call the backend until the backend
  * has registered.
  *
- * Specifically when no backend is registered (nobody called
- * frontswap_register_ops) all calls to frontswap_init (which is done via
- * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
- * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
- * backend registers with frontswap at some later point the previous
- * calls to frontswap_init are executed (by iterating over the need_init
- * bitmap) to create tmem_pools and set the respective poolids. All of that is
- * guarded by us using atomic bit operations on the 'need_init' bitmap.
- *
  * This would not guard us against the user deciding to call swapoff right as
  * we are calling the backend to initialize (so swapon is in action).
  * Fortunately for us, the swapon_mutex has been taken by the callee so we are
@@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { }
  *
  * Obviously the opposite (unloading the backend) must be done after all
  * the frontswap_[store|load|invalidate_area|invalidate_page] start
- * ignorning or failing the requests - at which point frontswap_ops
- * would have to be made in some fashion atomic.
+ * ignoring or failing the requests.  However, there is currently no way
+ * to unload a backend once it is registered.
  */
-static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
 
 /*
- * Register operations for frontswap, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for frontswap
  */
-struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
+void frontswap_register_ops(struct frontswap_ops *ops)
 {
-       struct frontswap_ops *old = frontswap_ops;
-       int i;
-
-       for (i = 0; i < MAX_SWAPFILES; i++) {
-               if (test_and_clear_bit(i, need_init)) {
-                       struct swap_info_struct *sis = swap_info[i];
-                       /* __frontswap_init _should_ have set it! */
-                       if (!sis->frontswap_map)
-                               return ERR_PTR(-EINVAL);
-                       ops->init(i);
-               }
+       DECLARE_BITMAP(a, MAX_SWAPFILES);
+       DECLARE_BITMAP(b, MAX_SWAPFILES);
+       struct swap_info_struct *si;
+       unsigned int i;
+
+       bitmap_zero(a, MAX_SWAPFILES);
+       bitmap_zero(b, MAX_SWAPFILES);
+
+       spin_lock(&swap_lock);
+       plist_for_each_entry(si, &swap_active_head, list) {
+               if (!WARN_ON(!si->frontswap_map))
+                       set_bit(si->type, a);
        }
+       spin_unlock(&swap_lock);
+
+       /* the new ops needs to know the currently active swap devices */
+       for_each_set_bit(i, a, MAX_SWAPFILES)
+               ops->init(i);
+
        /*
-        * We MUST have frontswap_ops set _after_ the frontswap_init's
-        * have been called. Otherwise __frontswap_store might fail. Hence
-        * the barrier to make sure compiler does not re-order us.
+        * Setting frontswap_ops must happen after the ops->init() calls
+        * above; cmpxchg implies smp_mb() which will ensure the init is
+        * complete at this point.
         */
-       barrier();
-       frontswap_ops = ops;
-       return old;
+       do {
+               ops->next = frontswap_ops;
+       } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+
+       spin_lock(&swap_lock);
+       plist_for_each_entry(si, &swap_active_head, list) {
+               if (si->frontswap_map)
+                       set_bit(si->type, b);
+       }
+       spin_unlock(&swap_lock);
+
+       /*
+        * On the very unlikely chance that a swap device was added or
+        * removed between setting the "a" list bits and the ops init
+        * calls, we re-check and do init or invalidate for any changed
+        * bits.
+        */
+       if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
+               for (i = 0; i < MAX_SWAPFILES; i++) {
+                       if (!test_bit(i, a) && test_bit(i, b))
+                               ops->init(i);
+                       else if (test_bit(i, a) && !test_bit(i, b))
+                               ops->invalidate_area(i);
+               }
+       }
 }
 EXPORT_SYMBOL(frontswap_register_ops);
 
@@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
 void __frontswap_init(unsigned type, unsigned long *map)
 {
        struct swap_info_struct *sis = swap_info[type];
+       struct frontswap_ops *ops;
 
        BUG_ON(sis == NULL);
 
@@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map)
         * p->frontswap set to something valid to work properly.
         */
        frontswap_map_set(sis, map);
-       if (frontswap_ops)
-               frontswap_ops->init(type);
-       else {
-               BUG_ON(type >= MAX_SWAPFILES);
-               set_bit(type, need_init);
-       }
+
+       for_each_frontswap_ops(ops)
+               ops->init(type);
 }
 EXPORT_SYMBOL(__frontswap_init);
 
 bool __frontswap_test(struct swap_info_struct *sis,
                                pgoff_t offset)
 {
-       bool ret = false;
-
-       if (frontswap_ops && sis->frontswap_map)
-               ret = test_bit(offset, sis->frontswap_map);
-       return ret;
+       if (sis->frontswap_map)
+               return test_bit(offset, sis->frontswap_map);
+       return false;
 }
 EXPORT_SYMBOL(__frontswap_test);
 
+static inline void __frontswap_set(struct swap_info_struct *sis,
+                                  pgoff_t offset)
+{
+       set_bit(offset, sis->frontswap_map);
+       atomic_inc(&sis->frontswap_pages);
+}
+
 static inline void __frontswap_clear(struct swap_info_struct *sis,
-                               pgoff_t offset)
+                                    pgoff_t offset)
 {
        clear_bit(offset, sis->frontswap_map);
        atomic_dec(&sis->frontswap_pages);
@@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis,
  */
 int __frontswap_store(struct page *page)
 {
-       int ret = -1, dup = 0;
+       int ret = -1;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);
+       struct frontswap_ops *ops;
 
        /*
         * Return if no backend registered.
         * Don't need to inc frontswap_failed_stores here.
         */
        if (!frontswap_ops)
-               return ret;
+               return -1;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(sis == NULL);
-       if (__frontswap_test(sis, offset))
-               dup = 1;
-       ret = frontswap_ops->store(type, offset, page);
+
+       /*
+        * If this is a duplicate, the old page must be removed first: it
+        * cannot be left behind whether the store of the new page succeeds
+        * or fails, and the new page cannot be relied on to replace it, as
+        * it may be stored to a different implementation than the old page.
+        */
+       if (__frontswap_test(sis, offset)) {
+               __frontswap_clear(sis, offset);
+               for_each_frontswap_ops(ops)
+                       ops->invalidate_page(type, offset);
+       }
+
+       /* Try to store in each implementation, until one succeeds. */
+       for_each_frontswap_ops(ops) {
+               ret = ops->store(type, offset, page);
+               if (!ret) /* successful store */
+                       break;
+       }
        if (ret == 0) {
-               set_bit(offset, sis->frontswap_map);
+               __frontswap_set(sis, offset);
                inc_frontswap_succ_stores();
-               if (!dup)
-                       atomic_inc(&sis->frontswap_pages);
        } else {
-               /*
-                 failed dup always results in automatic invalidate of
-                 the (older) page from frontswap
-                */
                inc_frontswap_failed_stores();
-               if (dup) {
-                       __frontswap_clear(sis, offset);
-                       frontswap_ops->invalidate_page(type, offset);
-               }
        }
        if (frontswap_writethrough_enabled)
                /* report failure so swap also writes to swap device */
@@ -268,14 +301,22 @@ int __frontswap_load(struct page *page)
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);
+       struct frontswap_ops *ops;
+
+       if (!frontswap_ops)
+               return -1;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(sis == NULL);
-       /*
-        * __frontswap_test() will check whether there is backend registered
-        */
-       if (__frontswap_test(sis, offset))
-               ret = frontswap_ops->load(type, offset, page);
+       if (!__frontswap_test(sis, offset))
+               return -1;
+
+       /* Try loading from each implementation, until one succeeds. */
+       for_each_frontswap_ops(ops) {
+               ret = ops->load(type, offset, page);
+               if (!ret) /* successful load */
+                       break;
+       }
        if (ret == 0) {
                inc_frontswap_loads();
                if (frontswap_tmem_exclusive_gets_enabled) {
@@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load);
 void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
 {
        struct swap_info_struct *sis = swap_info[type];
+       struct frontswap_ops *ops;
+
+       if (!frontswap_ops)
+               return;
 
        BUG_ON(sis == NULL);
-       /*
-        * __frontswap_test() will check whether there is backend registered
-        */
-       if (__frontswap_test(sis, offset)) {
-               frontswap_ops->invalidate_page(type, offset);
-               __frontswap_clear(sis, offset);
-               inc_frontswap_invalidates();
-       }
+       if (!__frontswap_test(sis, offset))
+               return;
+
+       for_each_frontswap_ops(ops)
+               ops->invalidate_page(type, offset);
+       __frontswap_clear(sis, offset);
+       inc_frontswap_invalidates();
 }
 EXPORT_SYMBOL(__frontswap_invalidate_page);
 
@@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
 void __frontswap_invalidate_area(unsigned type)
 {
        struct swap_info_struct *sis = swap_info[type];
+       struct frontswap_ops *ops;
 
-       if (frontswap_ops) {
-               BUG_ON(sis == NULL);
-               if (sis->frontswap_map == NULL)
-                       return;
-               frontswap_ops->invalidate_area(type);
-               atomic_set(&sis->frontswap_pages, 0);
-               bitmap_zero(sis->frontswap_map, sis->max);
-       }
-       clear_bit(type, need_init);
+       if (!frontswap_ops)
+               return;
+
+       BUG_ON(sis == NULL);
+       if (sis->frontswap_map == NULL)
+               return;
+
+       for_each_frontswap_ops(ops)
+               ops->invalidate_area(type);
+       atomic_set(&sis->frontswap_pages, 0);
+       bitmap_zero(sis->frontswap_map, sis->max);
 }
 EXPORT_SYMBOL(__frontswap_invalidate_area);
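
A minimal sketch of a backend under the multiple-backend scheme: frontswap_register_ops() now returns void and links the ops onto the list, so a second backend can register alongside an existing one such as zswap. The stub callbacks below follow the struct frontswap_ops callbacks used here and do no real work:

static void demo_init(unsigned type)
{
}

static int demo_store(unsigned type, pgoff_t offset, struct page *page)
{
        return -1;      /* reject; __frontswap_store() tries the next backend */
}

static int demo_load(unsigned type, pgoff_t offset, struct page *page)
{
        return -1;
}

static void demo_invalidate_page(unsigned type, pgoff_t offset)
{
}

static void demo_invalidate_area(unsigned type)
{
}

static struct frontswap_ops demo_ops = {
        .init                   = demo_init,
        .store                  = demo_store,
        .load                   = demo_load,
        .invalidate_page        = demo_invalidate_page,
        .invalidate_area        = demo_invalidate_area,
};

static int __init demo_register(void)
{
        frontswap_register_ops(&demo_ops);
        return 0;
}
late_initcall(demo_register);
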
 
index 078832cf3636579e1d9cc1c396b037319e52a6bd..c107094f79bae9ee895bd6bf30976d900f16c141 100644 (file)
@@ -1031,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                goto out_free_pages;
        VM_BUG_ON_PAGE(!PageHead(page), page);
 
-       pmdp_clear_flush_notify(vma, haddr, pmd);
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
 
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1174,7 +1174,7 @@ alloc:
                pmd_t entry;
                entry = mk_huge_pmd(new_page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               pmdp_clear_flush_notify(vma, haddr, pmd);
+               pmdp_huge_clear_flush_notify(vma, haddr, pmd);
                page_add_new_anon_rmap(new_page, vma, haddr);
                mem_cgroup_commit_charge(new_page, memcg, false);
                lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1396,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                pmd_t orig_pmd;
                /*
                 * For architectures like ppc64 we look at deposited pgtable
-                * when calling pmdp_get_and_clear. So do the
+                * when calling pmdp_huge_get_and_clear. So do the
                 * pgtable_trans_huge_withdraw after finishing pmdp related
                 * operations.
                 */
-               orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
-                                                  tlb->fullmm);
+               orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+                                                       tlb->fullmm);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
                if (is_huge_zero_pmd(orig_pmd)) {
@@ -1459,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
-               pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+               pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
                VM_BUG_ON(!pmd_none(*new_pmd));
 
                if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1505,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                }
 
                if (!prot_numa || !pmd_protnone(*pmd)) {
-                       entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+                       entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
                        entry = pmd_modify(entry, newprot);
                        if (preserve_write)
                                entry = pmd_mkwrite(entry);
@@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
         * huge and small TLB entries for the same virtual address
         * to avoid the risk of CPU bugs in that area.
         */
-       _pmd = pmdp_clear_flush(vma, address, pmd);
+       _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
@@ -2799,7 +2799,7 @@ static void khugepaged_do_scan(void)
 
                cond_resched();
 
-               if (unlikely(kthread_should_stop() || freezing(current)))
+               if (unlikely(kthread_should_stop() || try_to_freeze()))
                        break;
 
                spin_lock(&khugepaged_mm_lock);
@@ -2820,8 +2820,6 @@ static void khugepaged_do_scan(void)
 
 static void khugepaged_wait_work(void)
 {
-       try_to_freeze();
-
        if (khugepaged_has_work()) {
                if (!khugepaged_scan_sleep_millisecs)
                        return;
@@ -2865,7 +2863,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
        pmd_t _pmd;
        int i;
 
-       pmdp_clear_flush_notify(vma, haddr, pmd);
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
 
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
index 271e4432734c376baf0bf4b8953a38e391ac011c..75c0eef52c5df4f3b534e439a56495f8ac8534f3 100644 (file)
@@ -40,6 +40,11 @@ int hugepages_treat_as_movable;
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
+/*
+ * Minimum page order among possible hugepage sizes, set to a proper value
+ * at boot time.
+ */
+static unsigned int minimum_order __read_mostly = UINT_MAX;
 
 __initdata LIST_HEAD(huge_boot_pages);
 
@@ -212,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
  *
- * The region data structures are embedded into a resv_map and
- * protected by a resv_map's lock
+ * The region data structures are embedded into a resv_map and protected
+ * by a resv_map's lock.  The set of regions within the resv_map represent
+ * reservations for huge pages, or huge pages that have already been
+ * instantiated within the map.  The from and to elements are huge page
+ * indicies into the associated mapping.  from indicates the starting index
+ * of the region.  to represents the first index past the end of  the region.
+ *
+ * For example, a file region structure with from == 0 and to == 4 represents
+ * four huge pages in a mapping.  It is important to note that the to element
+ * represents the first element past the end of the region. This is used in
+ * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
+ *
+ * Interval notation of the form [from, to) will be used to indicate that
+ * the endpoint from is inclusive and to is exclusive.
  */
 struct file_region {
        struct list_head link;
@@ -221,10 +238,22 @@ struct file_region {
        long to;
 };
 
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map.  Existing regions will be expanded to accommodate the
+ * specified range.  We know only existing regions need to be
+ * expanded, because region_add is only called after region_chg
+ * with the same range.  If a new file_region structure must
+ * be allocated, it is done in region_chg.
+ *
+ * Return the number of new huge pages added to the map.  This
+ * number is greater than or equal to zero.
+ */
 static long region_add(struct resv_map *resv, long f, long t)
 {
        struct list_head *head = &resv->regions;
        struct file_region *rg, *nrg, *trg;
+       long add = 0;
 
        spin_lock(&resv->lock);
        /* Locate the region we are either in or before. */
@@ -250,16 +279,45 @@ static long region_add(struct resv_map *resv, long f, long t)
                if (rg->to > t)
                        t = rg->to;
                if (rg != nrg) {
+                       /* Decrement return value by the deleted range.
+                        * Another range will span this area so that by
+                        * end of routine add will be >= zero
+                        */
+                       add -= (rg->to - rg->from);
                        list_del(&rg->link);
                        kfree(rg);
                }
        }
+
+       add += (nrg->from - f);         /* Added to beginning of region */
        nrg->from = f;
+       add += t - nrg->to;             /* Added to end of region */
        nrg->to = t;
+
        spin_unlock(&resv->lock);
-       return 0;
+       VM_BUG_ON(add < 0);
+       return add;
 }
 
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented.  This routine is called before a subsequent
+ * call to region_add that will actually modify the reserve
+ * map to add the specified range [f, t).  region_chg does
+ * not change the number of huge pages represented by the
+ * map.  However, if the existing regions in the map can not
+ * be expanded to represent the new range, a new file_region
+ * structure is added to the map as a placeholder.  This is
+ * so that the subsequent region_add call will have all the
+ * regions it needs and will not fail.
+ *
+ * Returns the number of huge pages that need to be added
+ * to the existing reservation map for the range [f, t).
+ * This number is greater than or equal to zero.  -ENOMEM is
+ * returned if a new file_region structure is needed and can
+ * not be allocated.
+ */
 static long region_chg(struct resv_map *resv, long f, long t)
 {
        struct list_head *head = &resv->regions;
@@ -326,6 +384,11 @@ out_nrg:
        return chg;
 }
 
+/*
+ * Truncate the reserve map at index 'end'.  Modify/truncate any
+ * region which contains end.  Delete any regions past end.
+ * Return the number of huge pages removed from the map.
+ */
 static long region_truncate(struct resv_map *resv, long end)
 {
        struct list_head *head = &resv->regions;
@@ -361,6 +424,10 @@ out:
        return chg;
 }
 
+/*
+ * Count and return the number of huge pages in the reserve map
+ * that intersect with the range [f, t).
+ */
 static long region_count(struct resv_map *resv, long f, long t)
 {
        struct list_head *head = &resv->regions;
@@ -1188,19 +1255,13 @@ static void dissolve_free_huge_page(struct page *page)
  */
 void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
-       unsigned int order = 8 * sizeof(void *);
        unsigned long pfn;
-       struct hstate *h;
 
        if (!hugepages_supported())
                return;
 
-       /* Set scan step to minimum hugepage size */
-       for_each_hstate(h)
-               if (order > huge_page_order(h))
-                       order = huge_page_order(h);
-       VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
-       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+       VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
+       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
                dissolve_free_huge_page(pfn_to_page(pfn));
 }
 
@@ -1423,46 +1484,56 @@ static void return_unused_surplus_pages(struct hstate *h,
 }
 
 /*
- * Determine if the huge page at addr within the vma has an associated
- * reservation.  Where it does not we will need to logically increase
- * reservation and actually increase subpool usage before an allocation
- * can occur.  Where any new reservation would be required the
- * reservation change is prepared, but not committed.  Once the page
- * has been allocated from the subpool and instantiated the change should
- * be committed via vma_commit_reservation.  No action is required on
- * failure.
+ * vma_needs_reservation and vma_commit_reservation are used by the huge
+ * page allocation routines to manage reservations.
+ *
+ * vma_needs_reservation is called to determine if the huge page at addr
+ * within the vma has an associated reservation.  If a reservation is
+ * needed, the value 1 is returned.  The caller is then responsible for
+ * managing the global reservation and subpool usage counts.  After
+ * the huge page has been allocated, vma_commit_reservation is called
+ * to add the page to the reservation map.
+ *
+ * In the normal case, vma_commit_reservation returns the same value
+ * as the preceding vma_needs_reservation call.  The only time this
+ * is not the case is if a reserve map was changed between calls.  It
+ * is the responsibility of the caller to notice the difference and
+ * take appropriate action.
  */
-static long vma_needs_reservation(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long addr)
+static long __vma_reservation_common(struct hstate *h,
+                               struct vm_area_struct *vma, unsigned long addr,
+                               bool commit)
 {
        struct resv_map *resv;
        pgoff_t idx;
-       long chg;
+       long ret;
 
        resv = vma_resv_map(vma);
        if (!resv)
                return 1;
 
        idx = vma_hugecache_offset(h, vma, addr);
-       chg = region_chg(resv, idx, idx + 1);
+       if (commit)
+               ret = region_add(resv, idx, idx + 1);
+       else
+               ret = region_chg(resv, idx, idx + 1);
 
        if (vma->vm_flags & VM_MAYSHARE)
-               return chg;
+               return ret;
        else
-               return chg < 0 ? chg : 0;
+               return ret < 0 ? ret : 0;
 }
-static void vma_commit_reservation(struct hstate *h,
+
+static long vma_needs_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
 {
-       struct resv_map *resv;
-       pgoff_t idx;
-
-       resv = vma_resv_map(vma);
-       if (!resv)
-               return;
+       return __vma_reservation_common(h, vma, addr, false);
+}
 
-       idx = vma_hugecache_offset(h, vma, addr);
-       region_add(resv, idx, idx + 1);
+static long vma_commit_reservation(struct hstate *h,
+                       struct vm_area_struct *vma, unsigned long addr)
+{
+       return __vma_reservation_common(h, vma, addr, true);
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -1471,7 +1542,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
        struct hugepage_subpool *spool = subpool_vma(vma);
        struct hstate *h = hstate_vma(vma);
        struct page *page;
-       long chg;
+       long chg, commit;
        int ret, idx;
        struct hugetlb_cgroup *h_cg;
 
@@ -1512,7 +1583,22 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 
        set_page_private(page, (unsigned long)spool);
 
-       vma_commit_reservation(h, vma, addr);
+       commit = vma_commit_reservation(h, vma, addr);
+       if (unlikely(chg > commit)) {
+               /*
+                * The page was added to the reservation map between
+                * vma_needs_reservation and vma_commit_reservation.
+                * This indicates a race with hugetlb_reserve_pages.
+                * Adjust for the subpool count incremented above AND
+                * in hugetlb_reserve_pages for the same page.  Also,
+                * the reservation count added in hugetlb_reserve_pages
+                * no longer applies.
+                */
+               long rsv_adjust;
+
+               rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+               hugetlb_acct_memory(h, -rsv_adjust);
+       }
        return page;
 
 out_uncharge_cgroup:
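A minimal userspace sketch of the accounting in the hunk above, using a plain counter (subpool_pages) as a hypothetical stand-in for the hugepage subpool; chg and commit play the roles of the vma_needs_reservation() and vma_commit_reservation() return values. It only illustrates why one page is handed back when the two values differ.

#include <stdio.h>

/* hypothetical stand-in for the per-mount hugepage subpool counter */
static long subpool_pages = 8;

static void subpool_get(long n) { subpool_pages -= n; }
static void subpool_put(long n) { subpool_pages += n; }

/*
 * chg    - vma_needs_reservation() (region_chg) result before allocating
 * commit - vma_commit_reservation() (region_add) result after allocating
 */
static void alloc_one_page_model(long chg, long commit)
{
	if (chg)
		subpool_get(1);		/* charged here for the newly needed page */

	if (chg > commit)
		/*
		 * A racing hugetlb_reserve_pages() added the same page to the
		 * reservation map (and charged the subpool for it) between the
		 * two calls, so the page was charged twice; hand one back.
		 */
		subpool_put(1);
}

int main(void)
{
	alloc_one_page_model(1, 0);	/* raced: the charge is handed back */
	alloc_one_page_model(1, 1);	/* normal: one page stays charged */
	printf("subpool pages left: %ld\n", subpool_pages);	/* prints 7 */
	return 0;
}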
@@ -1627,10 +1713,14 @@ static void __init hugetlb_init_hstates(void)
        struct hstate *h;
 
        for_each_hstate(h) {
+               if (minimum_order > huge_page_order(h))
+                       minimum_order = huge_page_order(h);
+
                /* oversize hugepages were init'ed in early boot */
                if (!hstate_is_gigantic(h))
                        hugetlb_hstate_alloc_pages(h);
        }
+       VM_BUG_ON(minimum_order == UINT_MAX);
 }
 
 static char * __init memfmt(char *buf, unsigned long n)
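A stand-alone sketch of the minimum_order bookkeeping introduced above, with invented example hstate orders (9 for 2 MB pages, 18 for 1 GB pages): the smallest order becomes the pfn stride used by dissolve_free_huge_pages().

#include <limits.h>
#include <stdio.h>

int main(void)
{
	/* invented example hstate orders: 2 MB (order 9) and 1 GB (order 18) */
	unsigned int orders[] = { 9, 18 };
	unsigned int minimum_order = UINT_MAX;	/* sentinel, as in the patch */
	unsigned long start_pfn = 0, end_pfn = 4096, pfn;

	for (unsigned int i = 0; i < sizeof(orders) / sizeof(orders[0]); i++)
		if (minimum_order > orders[i])
			minimum_order = orders[i];

	/* one candidate head page per smallest-order hugepage */
	for (pfn = start_pfn; pfn < end_pfn; pfn += 1UL << minimum_order)
		printf("check pfn %lu\n", pfn);

	return 0;
}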
@@ -3626,8 +3716,24 @@ int hugetlb_reserve_pages(struct inode *inode,
         * consumed reservations are stored in the map. Hence, nothing
         * else has to be done for private mappings here
         */
-       if (!vma || vma->vm_flags & VM_MAYSHARE)
-               region_add(resv_map, from, to);
+       if (!vma || vma->vm_flags & VM_MAYSHARE) {
+               long add = region_add(resv_map, from, to);
+
+               if (unlikely(chg > add)) {
+                       /*
+                        * pages in this range were added to the reserve
+                        * map between region_chg and region_add.  This
+                        * indicates a race with alloc_huge_page.  Adjust
+                        * the subpool and reserve counts modified above
+                        * based on the difference.
+                        */
+                       long rsv_adjust;
+
+                       rsv_adjust = hugepage_subpool_put_pages(spool,
+                                                               chg - add);
+                       hugetlb_acct_memory(h, -rsv_adjust);
+               }
+       }
        return 0;
 out_err:
        if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
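The range case follows the same arithmetic as the single-page path; a worked example with invented numbers (it does not model the reserve map, only the chg/add difference):

#include <stdio.h>

int main(void)
{
	/*
	 * Invented numbers: reserving pages [0, 6) made region_chg() report 6
	 * new pages, which were charged to the subpool and the reserve.  A
	 * racing alloc_huge_page() committed one page of that range before
	 * region_add() ran, so region_add() only adds 5.
	 */
	long chg = 6, add = 5;

	if (chg > add) {
		long rsv_adjust = chg - add;	/* pages charged twice */

		printf("give %ld page(s) back to the subpool, drop %ld from the reserve\n",
		       rsv_adjust, rsv_adjust);
	}
	return 0;
}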
@@ -3789,6 +3895,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 {
        return NULL;
 }
+
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
 #define want_pmd_share()       (0)
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
index 4ca5fe0042e17c2eac0dd6d16f0065c41a5dfd4e..bf73ac17dad424d9e46857334e5b0befb6805fff 100644 (file)
@@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val)
        /*
         * This implies unable to support free buddy pages.
         */
-       if (!get_page_unless_zero(hpage))
+       if (!get_hwpoison_page(p))
                return 0;
 
        if (!hwpoison_filter_enable)
@@ -58,7 +58,7 @@ inject:
        pr_info("Injecting memory failure at pfn %#lx\n", pfn);
        return memory_failure(pfn, 18, MF_COUNT_INCREASED);
 put_out:
-       put_page(hpage);
+       put_page(p);
        return 0;
 }
 
index f0fe4f2c1fa7aa865055731834cdd78e0684cca5..cf79f110157c9122afb7e6a72ded4b090ea814af 100644 (file)
  *   modifications to the memory scanning parameters including the scan_thread
  *   pointer
  *
+ * Locks and mutexes are acquired/nested in the following order:
+ *
+ *   scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING)
+ *
+ * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex
+ * regions.
+ *
  * The kmemleak_object structures have a use_count incremented or decremented
  * using the get_object()/put_object() functions. When the use_count becomes
  * 0, this count can no longer be incremented and put_object() schedules the
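A compilable model of the documented nesting order, with pthread primitives standing in for the kernel's scan_mutex, the kmemleak_lock rwlock and the per-object spinlocks (the SINGLE_DEPTH_NESTING annotation is lockdep-only and is not modelled):

#include <pthread.h>

/* toy objects standing in for struct kmemleak_object */
struct object {
	pthread_mutex_t lock;
};

static pthread_mutex_t scan_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t kmemleak_lock = PTHREAD_RWLOCK_INITIALIZER;

/* acquire the locks strictly in the documented order */
static void scan_one(struct object *scanned, struct object *other)
{
	pthread_mutex_lock(&scan_mutex);	/* outermost: scan_mutex */
	pthread_mutex_lock(&scanned->lock);	/* object->lock of the scanned object */
	pthread_rwlock_rdlock(&kmemleak_lock);	/* then kmemleak_lock (read side, as in scan_block) */
	pthread_mutex_lock(&other->lock);	/* innermost: other_object->lock */

	/* ... inspect *other with every level of the hierarchy held ... */

	pthread_mutex_unlock(&other->lock);
	pthread_rwlock_unlock(&kmemleak_lock);
	pthread_mutex_unlock(&scanned->lock);
	pthread_mutex_unlock(&scan_mutex);
}

int main(void)
{
	struct object a = { PTHREAD_MUTEX_INITIALIZER };
	struct object b = { PTHREAD_MUTEX_INITIALIZER };

	scan_one(&a, &b);
	return 0;
}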
@@ -195,6 +202,8 @@ static struct kmem_cache *scan_area_cache;
 
 /* set if tracing memory operations is enabled */
 static int kmemleak_enabled;
+/* same as above but only for the kmemleak_free() callback */
+static int kmemleak_free_enabled;
 /* set in the late_initcall if there were no errors */
 static int kmemleak_initialized;
 /* enables or disables early logging of the memory operations */
@@ -483,8 +492,7 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
 
        rcu_read_lock();
        read_lock_irqsave(&kmemleak_lock, flags);
-       if (ptr >= min_addr && ptr < max_addr)
-               object = lookup_object(ptr, alias);
+       object = lookup_object(ptr, alias);
        read_unlock_irqrestore(&kmemleak_lock, flags);
 
        /* check whether the object is still available */
@@ -495,6 +503,27 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
        return object;
 }
 
+/*
+ * Look up an object in the object search tree and remove it from both
+ * object_tree_root and object_list. The returned object's use_count should be
+ * at least 1, as initially set by create_object().
+ */
+static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
+{
+       unsigned long flags;
+       struct kmemleak_object *object;
+
+       write_lock_irqsave(&kmemleak_lock, flags);
+       object = lookup_object(ptr, alias);
+       if (object) {
+               rb_erase(&object->rb_node, &object_tree_root);
+               list_del_rcu(&object->object_list);
+       }
+       write_unlock_irqrestore(&kmemleak_lock, flags);
+
+       return object;
+}
+
 /*
  * Save stack trace to the given array of MAX_TRACE size.
  */
@@ -580,11 +609,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
                        kmemleak_stop("Cannot insert 0x%lx into the object "
                                      "search tree (overlaps existing)\n",
                                      ptr);
+                       /*
+                        * No need for parent->lock here since "parent" cannot
+                        * be freed while the kmemleak_lock is held.
+                        */
+                       dump_object_info(parent);
                        kmem_cache_free(object_cache, object);
-                       object = parent;
-                       spin_lock(&object->lock);
-                       dump_object_info(object);
-                       spin_unlock(&object->lock);
+                       object = NULL;
                        goto out;
                }
        }
@@ -598,20 +629,14 @@ out:
 }
 
 /*
- * Remove the metadata (struct kmemleak_object) for a memory block from the
- * object_list and object_tree_root and decrement its use_count.
+ * Mark the object as not allocated and schedule RCU freeing via put_object().
  */
 static void __delete_object(struct kmemleak_object *object)
 {
        unsigned long flags;
 
-       write_lock_irqsave(&kmemleak_lock, flags);
-       rb_erase(&object->rb_node, &object_tree_root);
-       list_del_rcu(&object->object_list);
-       write_unlock_irqrestore(&kmemleak_lock, flags);
-
        WARN_ON(!(object->flags & OBJECT_ALLOCATED));
-       WARN_ON(atomic_read(&object->use_count) < 2);
+       WARN_ON(atomic_read(&object->use_count) < 1);
 
        /*
         * Locking here also ensures that the corresponding memory block
@@ -631,7 +656,7 @@ static void delete_object_full(unsigned long ptr)
 {
        struct kmemleak_object *object;
 
-       object = find_and_get_object(ptr, 0);
+       object = find_and_remove_object(ptr, 0);
        if (!object) {
 #ifdef DEBUG
                kmemleak_warn("Freeing unknown object at 0x%08lx\n",
@@ -640,7 +665,6 @@ static void delete_object_full(unsigned long ptr)
                return;
        }
        __delete_object(object);
-       put_object(object);
 }
 
 /*
@@ -653,7 +677,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
        struct kmemleak_object *object;
        unsigned long start, end;
 
-       object = find_and_get_object(ptr, 1);
+       object = find_and_remove_object(ptr, 1);
        if (!object) {
 #ifdef DEBUG
                kmemleak_warn("Partially freeing unknown object at 0x%08lx "
@@ -661,7 +685,6 @@ static void delete_object_part(unsigned long ptr, size_t size)
 #endif
                return;
        }
-       __delete_object(object);
 
        /*
         * Create one or two objects that may result from the memory block
@@ -679,7 +702,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
                create_object(ptr + size, end - ptr - size, object->min_count,
                              GFP_KERNEL);
 
-       put_object(object);
+       __delete_object(object);
 }
 
 static void __paint_it(struct kmemleak_object *object, int color)
@@ -907,12 +930,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
  * kmemleak_alloc_percpu - register a newly allocated __percpu object
  * @ptr:       __percpu pointer to beginning of the object
  * @size:      size of the object
+ * @gfp:       flags used for kmemleak internal memory allocations
  *
  * This function is called from the kernel percpu allocator when a new object
- * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL
- * allocation.
+ * (memory block) is allocated (alloc_percpu).
  */
-void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+                                gfp_t gfp)
 {
        unsigned int cpu;
 
@@ -925,7 +949,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
        if (kmemleak_enabled && ptr && !IS_ERR(ptr))
                for_each_possible_cpu(cpu)
                        create_object((unsigned long)per_cpu_ptr(ptr, cpu),
-                                     size, 0, GFP_KERNEL);
+                                     size, 0, gfp);
        else if (kmemleak_early_log)
                log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
 }
@@ -942,7 +966,7 @@ void __ref kmemleak_free(const void *ptr)
 {
        pr_debug("%s(0x%p)\n", __func__, ptr);
 
-       if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+       if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
                delete_object_full((unsigned long)ptr);
        else if (kmemleak_early_log)
                log_early(KMEMLEAK_FREE, ptr, 0, 0);
@@ -982,7 +1006,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
 
        pr_debug("%s(0x%p)\n", __func__, ptr);
 
-       if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+       if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
                for_each_possible_cpu(cpu)
                        delete_object_full((unsigned long)per_cpu_ptr(ptr,
                                                                      cpu));
@@ -1148,19 +1172,18 @@ static int scan_should_stop(void)
  * found to the gray list.
  */
 static void scan_block(void *_start, void *_end,
-                      struct kmemleak_object *scanned, int allow_resched)
+                      struct kmemleak_object *scanned)
 {
        unsigned long *ptr;
        unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
        unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+       unsigned long flags;
 
+       read_lock_irqsave(&kmemleak_lock, flags);
        for (ptr = start; ptr < end; ptr++) {
                struct kmemleak_object *object;
-               unsigned long flags;
                unsigned long pointer;
 
-               if (allow_resched)
-                       cond_resched();
                if (scan_should_stop())
                        break;
 
@@ -1173,26 +1196,31 @@ static void scan_block(void *_start, void *_end,
                pointer = *ptr;
                kasan_enable_current();
 
-               object = find_and_get_object(pointer, 1);
+               if (pointer < min_addr || pointer >= max_addr)
+                       continue;
+
+               /*
+                * No need for get_object() here since we hold kmemleak_lock.
+                * object->use_count cannot be dropped to 0 while the object
+                * is still present in object_tree_root and object_list
+                * (with updates protected by kmemleak_lock).
+                */
+               object = lookup_object(pointer, 1);
                if (!object)
                        continue;
-               if (object == scanned) {
+               if (object == scanned)
                        /* self referenced, ignore */
-                       put_object(object);
                        continue;
-               }
 
                /*
                 * Avoid the lockdep recursive warning on object->lock being
                 * previously acquired in scan_object(). These locks are
                 * enclosed by scan_mutex.
                 */
-               spin_lock_irqsave_nested(&object->lock, flags,
-                                        SINGLE_DEPTH_NESTING);
+               spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
                if (!color_white(object)) {
                        /* non-orphan, ignored or new */
-                       spin_unlock_irqrestore(&object->lock, flags);
-                       put_object(object);
+                       spin_unlock(&object->lock);
                        continue;
                }
 
@@ -1204,13 +1232,27 @@ static void scan_block(void *_start, void *_end,
                 */
                object->count++;
                if (color_gray(object)) {
+                       /* put_object() called when removing from gray_list */
+                       WARN_ON(!get_object(object));
                        list_add_tail(&object->gray_list, &gray_list);
-                       spin_unlock_irqrestore(&object->lock, flags);
-                       continue;
                }
+               spin_unlock(&object->lock);
+       }
+       read_unlock_irqrestore(&kmemleak_lock, flags);
+}
 
-               spin_unlock_irqrestore(&object->lock, flags);
-               put_object(object);
+/*
+ * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
+ */
+static void scan_large_block(void *start, void *end)
+{
+       void *next;
+
+       while (start < end) {
+               next = min(start + MAX_SCAN_SIZE, end);
+               scan_block(start, next, NULL);
+               start = next;
+               cond_resched();
        }
 }
 
@@ -1236,22 +1278,25 @@ static void scan_object(struct kmemleak_object *object)
        if (hlist_empty(&object->area_list)) {
                void *start = (void *)object->pointer;
                void *end = (void *)(object->pointer + object->size);
+               void *next;
 
-               while (start < end && (object->flags & OBJECT_ALLOCATED) &&
-                      !(object->flags & OBJECT_NO_SCAN)) {
-                       scan_block(start, min(start + MAX_SCAN_SIZE, end),
-                                  object, 0);
-                       start += MAX_SCAN_SIZE;
+               do {
+                       next = min(start + MAX_SCAN_SIZE, end);
+                       scan_block(start, next, object);
+
+                       start = next;
+                       if (start >= end)
+                               break;
 
                        spin_unlock_irqrestore(&object->lock, flags);
                        cond_resched();
                        spin_lock_irqsave(&object->lock, flags);
-               }
+               } while (object->flags & OBJECT_ALLOCATED);
        } else
                hlist_for_each_entry(area, &object->area_list, node)
                        scan_block((void *)area->start,
                                   (void *)(area->start + area->size),
-                                  object, 0);
+                                  object);
 out:
        spin_unlock_irqrestore(&object->lock, flags);
 }
@@ -1328,14 +1373,14 @@ static void kmemleak_scan(void)
        rcu_read_unlock();
 
        /* data/bss scanning */
-       scan_block(_sdata, _edata, NULL, 1);
-       scan_block(__bss_start, __bss_stop, NULL, 1);
+       scan_large_block(_sdata, _edata);
+       scan_large_block(__bss_start, __bss_stop);
 
 #ifdef CONFIG_SMP
        /* per-cpu sections scanning */
        for_each_possible_cpu(i)
-               scan_block(__per_cpu_start + per_cpu_offset(i),
-                          __per_cpu_end + per_cpu_offset(i), NULL, 1);
+               scan_large_block(__per_cpu_start + per_cpu_offset(i),
+                                __per_cpu_end + per_cpu_offset(i));
 #endif
 
        /*
@@ -1356,7 +1401,7 @@ static void kmemleak_scan(void)
                        /* only scan if page is in use */
                        if (page_count(page) == 0)
                                continue;
-                       scan_block(page, page + 1, NULL, 1);
+                       scan_block(page, page + 1, NULL);
                }
        }
        put_online_mems();
@@ -1370,7 +1415,7 @@ static void kmemleak_scan(void)
                read_lock(&tasklist_lock);
                do_each_thread(g, p) {
                        scan_block(task_stack_page(p), task_stack_page(p) +
-                                  THREAD_SIZE, NULL, 0);
+                                  THREAD_SIZE, NULL);
                } while_each_thread(g, p);
                read_unlock(&tasklist_lock);
        }
@@ -1747,15 +1792,20 @@ static void __kmemleak_do_cleanup(void)
  */
 static void kmemleak_do_cleanup(struct work_struct *work)
 {
-       mutex_lock(&scan_mutex);
        stop_scan_thread();
 
+       /*
+        * Once the scan thread has stopped, it is safe to no longer track
+        * object freeing. Ordering of the scan thread stopping and the memory
+        * accesses below is guaranteed by the kthread_stop() function.
+        */
+       kmemleak_free_enabled = 0;
+
        if (!kmemleak_found_leaks)
                __kmemleak_do_cleanup();
        else
                pr_info("Kmemleak disabled without freeing internal data. "
                        "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
-       mutex_unlock(&scan_mutex);
 }
 
 static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
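A small pthread model of the shutdown ordering described in the comment above: joining the scanner plays the role kthread_stop() plays for the kmemleak scan thread, after which the free-tracking flag can be cleared safely. The names here (free_tracking_enabled, scan_thread_fn) are invented for the sketch.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static volatile bool free_tracking_enabled = true;

static void *scan_thread_fn(void *arg)
{
	(void)arg;
	/* would walk the object list here; kept empty in the sketch */
	return NULL;
}

int main(void)
{
	pthread_t scan_thread;

	pthread_create(&scan_thread, NULL, scan_thread_fn, NULL);

	/* joining the thread orders all of its accesses before what follows,
	 * the role kthread_stop() plays for the real scan thread */
	pthread_join(scan_thread, NULL);

	/* only now is it safe to stop tracking frees */
	free_tracking_enabled = false;

	printf("tracking enabled: %d\n", free_tracking_enabled);
	return 0;
}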
@@ -1776,6 +1826,8 @@ static void kmemleak_disable(void)
        /* check whether it is too early for a kernel thread */
        if (kmemleak_initialized)
                schedule_work(&cleanup_work);
+       else
+               kmemleak_free_enabled = 0;
 
        pr_info("Kernel memory leak detector disabled\n");
 }
@@ -1840,8 +1892,10 @@ void __init kmemleak_init(void)
        if (kmemleak_error) {
                local_irq_restore(flags);
                return;
-       } else
+       } else {
                kmemleak_enabled = 1;
+               kmemleak_free_enabled = 1;
+       }
        local_irq_restore(flags);
 
        /*
index 9318b567ed7959721cb20239f410686b2635f335..1b444c730846ddf59d9ce591aacce123f975bc13 100644 (file)
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock;
 #ifdef CONFIG_MOVABLE_NODE
 bool movable_node_enabled __initdata_memblock = false;
 #endif
+static bool system_has_some_mirror __initdata_memblock = false;
 static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
 static int memblock_reserved_in_slab __initdata_memblock = 0;
 
+ulong __init_memblock choose_memblock_flags(void)
+{
+       return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
+}
+
 /* inline so we don't get a warning when pr_debug is compiled out */
 static __init_memblock const char *
 memblock_type_name(struct memblock_type *type)
@@ -107,6 +113,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
  *
  * Utility called from memblock_find_in_range_node(), find free area bottom-up.
  *
@@ -115,12 +122,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  */
 static phys_addr_t __init_memblock
 __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
-                               phys_addr_t size, phys_addr_t align, int nid)
+                               phys_addr_t size, phys_addr_t align, int nid,
+                               ulong flags)
 {
        phys_addr_t this_start, this_end, cand;
        u64 i;
 
-       for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+       for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
                this_start = clamp(this_start, start, end);
                this_end = clamp(this_end, start, end);
 
@@ -139,6 +147,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
  *
  * Utility called from memblock_find_in_range_node(), find free area top-down.
  *
@@ -147,12 +156,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
  */
 static phys_addr_t __init_memblock
 __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
-                              phys_addr_t size, phys_addr_t align, int nid)
+                              phys_addr_t size, phys_addr_t align, int nid,
+                              ulong flags)
 {
        phys_addr_t this_start, this_end, cand;
        u64 i;
 
-       for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+       for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
+                                       NULL) {
                this_start = clamp(this_start, start, end);
                this_end = clamp(this_end, start, end);
 
@@ -174,6 +185,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  * @start: start of candidate range
  * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
@@ -190,7 +202,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
                                        phys_addr_t align, phys_addr_t start,
-                                       phys_addr_t end, int nid)
+                                       phys_addr_t end, int nid, ulong flags)
 {
        phys_addr_t kernel_end, ret;
 
@@ -215,7 +227,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 
                /* ok, try bottom-up allocation first */
                ret = __memblock_find_range_bottom_up(bottom_up_start, end,
-                                                     size, align, nid);
+                                                     size, align, nid, flags);
                if (ret)
                        return ret;
 
@@ -233,7 +245,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
                             "memory hotunplug may be affected\n");
        }
 
-       return __memblock_find_range_top_down(start, end, size, align, nid);
+       return __memblock_find_range_top_down(start, end, size, align, nid,
+                                             flags);
 }
 
 /**
@@ -252,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
                                        phys_addr_t end, phys_addr_t size,
                                        phys_addr_t align)
 {
-       return memblock_find_in_range_node(size, align, start, end,
-                                           NUMA_NO_NODE);
+       phys_addr_t ret;
+       ulong flags = choose_memblock_flags();
+
+again:
+       ret = memblock_find_in_range_node(size, align, start, end,
+                                           NUMA_NO_NODE, flags);
+
+       if (!ret && (flags & MEMBLOCK_MIRROR)) {
+               pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+                       &size);
+               flags &= ~MEMBLOCK_MIRROR;
+               goto again;
+       }
+
+       return ret;
 }
 
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
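A compilable sketch of the "prefer mirrored, then fall back" retry used above. MODEL_MIRROR, try_find_range() and find_prefer_mirror() are invented stand-ins for MEMBLOCK_MIRROR, memblock_find_in_range_node() and memblock_find_in_range():

#include <stdio.h>

#define MODEL_MIRROR	0x1UL	/* stands in for MEMBLOCK_MIRROR */

/* invented allocator: pretend the mirrored pool is exhausted */
static unsigned long try_find_range(unsigned long size, unsigned long flags)
{
	(void)size;			/* size is ignored in this toy */
	if (flags & MODEL_MIRROR)
		return 0;		/* no suitable mirrored range */
	return 0x100000;		/* arbitrary address from ordinary memory */
}

static unsigned long find_prefer_mirror(unsigned long size)
{
	unsigned long flags = MODEL_MIRROR;	/* what choose_memblock_flags() would pick */
	unsigned long ret;

again:
	ret = try_find_range(size, flags);
	if (!ret && (flags & MODEL_MIRROR)) {
		/* warn, drop the mirror requirement and retry from scratch */
		fprintf(stderr, "could not allocate %lu bytes of mirrored memory\n", size);
		flags &= ~MODEL_MIRROR;
		goto again;
	}
	return ret;
}

int main(void)
{
	printf("got 0x%lx\n", find_prefer_mirror(4096));
	return 0;
}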
@@ -778,10 +804,26 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
        return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
 }
 
+/**
+ * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
+{
+       system_has_some_mirror = true;
+
+       return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
+}
+
+
 /**
  * __next__mem_range - next function for for_each_free_mem_range() etc.
  * @idx: pointer to u64 loop variable
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @type_a: pointer to memblock_type from where the range is taken
  * @type_b: pointer to memblock_type which excludes memory from being taken
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -803,7 +845,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
  * As both region arrays are sorted, the function advances the two indices
  * in lockstep and returns each intersection.
  */
-void __init_memblock __next_mem_range(u64 *idx, int nid,
+void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
                                      struct memblock_type *type_a,
                                      struct memblock_type *type_b,
                                      phys_addr_t *out_start,
@@ -831,6 +873,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
                if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
                        continue;
 
+               /* if we want mirror memory skip non-mirror memory regions */
+               if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+                       continue;
+
                if (!type_b) {
                        if (out_start)
                                *out_start = m_start;
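On the iterator side the same flag reduces to a per-region skip test; a toy version over an invented region array:

#include <stdio.h>

#define MODEL_MIRROR	0x1UL	/* stands in for MEMBLOCK_MIRROR */

struct region {
	unsigned long base, size, flags;
};

int main(void)
{
	/* invented regions: only the middle one is marked mirrored */
	struct region regions[] = {
		{ 0x00000000UL, 0x40000000UL, 0 },
		{ 0x40000000UL, 0x10000000UL, MODEL_MIRROR },
		{ 0x50000000UL, 0x30000000UL, 0 },
	};
	unsigned long want = MODEL_MIRROR;

	for (unsigned int i = 0; i < 3; i++) {
		/* if we want mirror memory, skip non-mirror regions */
		if ((want & MODEL_MIRROR) && !(regions[i].flags & MODEL_MIRROR))
			continue;
		printf("candidate: base=0x%lx size=0x%lx\n",
		       regions[i].base, regions[i].size);
	}
	return 0;
}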
@@ -895,6 +941,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
  *
  * @idx: pointer to u64 loop variable
 * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @type_a: pointer to memblock_type from where the range is taken
  * @type_b: pointer to memblock_type which excludes memory from being taken
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -903,7 +950,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
  *
  * Reverse of __next_mem_range().
  */
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
                                          struct memblock_type *type_a,
                                          struct memblock_type *type_b,
                                          phys_addr_t *out_start,
@@ -935,6 +982,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
                if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
                        continue;
 
+               /* if we want mirror memory skip non-mirror memory regions */
+               if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+                       continue;
+
                if (!type_b) {
                        if (out_start)
                                *out_start = m_start;
@@ -1050,14 +1101,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 
 static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
                                        phys_addr_t align, phys_addr_t start,
-                                       phys_addr_t end, int nid)
+                                       phys_addr_t end, int nid, ulong flags)
 {
        phys_addr_t found;
 
        if (!align)
                align = SMP_CACHE_BYTES;
 
-       found = memblock_find_in_range_node(size, align, start, end, nid);
+       found = memblock_find_in_range_node(size, align, start, end, nid,
+                                           flags);
        if (found && !memblock_reserve(found, size)) {
                /*
                 * The min_count is set to 0 so that memblock allocations are
@@ -1070,26 +1122,40 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 }
 
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
-                                       phys_addr_t start, phys_addr_t end)
+                                       phys_addr_t start, phys_addr_t end,
+                                       ulong flags)
 {
-       return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+       return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+                                       flags);
 }
 
 static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
                                        phys_addr_t align, phys_addr_t max_addr,
-                                       int nid)
+                                       int nid, ulong flags)
 {
-       return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+       return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
 }
 
 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
-       return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+       ulong flags = choose_memblock_flags();
+       phys_addr_t ret;
+
+again:
+       ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
+                                     nid, flags);
+
+       if (!ret && (flags & MEMBLOCK_MIRROR)) {
+               flags &= ~MEMBLOCK_MIRROR;
+               goto again;
+       }
+       return ret;
 }
 
 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
 {
-       return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
+       return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
+                                      MEMBLOCK_NONE);
 }
 
 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1153,6 +1219,7 @@ static void * __init memblock_virt_alloc_internal(
 {
        phys_addr_t alloc;
        void *ptr;
+       ulong flags = choose_memblock_flags();
 
        if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
                nid = NUMA_NO_NODE;
@@ -1173,13 +1240,14 @@ static void * __init memblock_virt_alloc_internal(
 
 again:
        alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
-                                           nid);
+                                           nid, flags);
        if (alloc)
                goto done;
 
        if (nid != NUMA_NO_NODE) {
                alloc = memblock_find_in_range_node(size, align, min_addr,
-                                                   max_addr,  NUMA_NO_NODE);
+                                                   max_addr, NUMA_NO_NODE,
+                                                   flags);
                if (alloc)
                        goto done;
        }
@@ -1187,10 +1255,16 @@ again:
        if (min_addr) {
                min_addr = 0;
                goto again;
-       } else {
-               goto error;
        }
 
+       if (flags & MEMBLOCK_MIRROR) {
+               flags &= ~MEMBLOCK_MIRROR;
+               pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+                       &size);
+               goto again;
+       }
+
+       return NULL;
 done:
        memblock_reserve(alloc, size);
        ptr = phys_to_virt(alloc);
@@ -1205,9 +1279,6 @@ done:
        kmemleak_alloc(ptr, size, 0, 0);
 
        return ptr;
-
-error:
-       return NULL;
 }
 
 /**
index a04225d372ba3ab77516b970c10135b19def3ac4..e65f7b0131d3598cb5ba0ce3497d47b43d676dea 100644 (file)
@@ -285,9 +285,9 @@ struct mem_cgroup {
         */
        bool use_hierarchy;
 
+       /* protected by memcg_oom_lock */
        bool            oom_lock;
-       atomic_t        under_oom;
-       atomic_t        oom_wakeups;
+       int             under_oom;
 
        int     swappiness;
        /* OOM-Killer disable */
@@ -1530,14 +1530,16 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
        unsigned int points = 0;
        struct task_struct *chosen = NULL;
 
+       mutex_lock(&oom_lock);
+
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-               mark_tsk_oom_victim(current);
-               return;
+               mark_oom_victim(current);
+               goto unlock;
        }
 
        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
@@ -1564,7 +1566,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
-                               return;
+                               goto unlock;
                        case OOM_SCAN_OK:
                                break;
                        };
@@ -1585,11 +1587,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                css_task_iter_end(&it);
        }
 
-       if (!chosen)
-               return;
-       points = chosen_points * 1000 / totalpages;
-       oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
-                        NULL, "Memory cgroup out of memory");
+       if (chosen) {
+               points = chosen_points * 1000 / totalpages;
+               oom_kill_process(chosen, gfp_mask, order, points, totalpages,
+                                memcg, NULL, "Memory cgroup out of memory");
+       }
+unlock:
+       mutex_unlock(&oom_lock);
 }
 
 #if MAX_NUMNODES > 1
@@ -1806,8 +1810,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *iter;
 
+       spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
-               atomic_inc(&iter->under_oom);
+               iter->under_oom++;
+       spin_unlock(&memcg_oom_lock);
 }
 
 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
@@ -1816,11 +1822,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 
        /*
         * When a new child is created while the hierarchy is under oom,
-        * mem_cgroup_oom_lock() may not be called. We have to use
-        * atomic_add_unless() here.
+        * mem_cgroup_oom_lock() may not be called. Watch for underflow.
         */
+       spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
-               atomic_add_unless(&iter->under_oom, -1, 0);
+               if (iter->under_oom > 0)
+                       iter->under_oom--;
+       spin_unlock(&memcg_oom_lock);
 }
 
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1846,17 +1854,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
        return autoremove_wake_function(wait, mode, sync, arg);
 }
 
-static void memcg_wakeup_oom(struct mem_cgroup *memcg)
-{
-       atomic_inc(&memcg->oom_wakeups);
-       /* for filtering, pass "memcg" as argument. */
-       __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
-}
-
 static void memcg_oom_recover(struct mem_cgroup *memcg)
 {
-       if (memcg && atomic_read(&memcg->under_oom))
-               memcg_wakeup_oom(memcg);
+       /*
+        * For the following lockless ->under_oom test, the only required
+        * guarantee is that it must see the state asserted by an OOM when
+        * this function is called as a result of userland actions
+        * triggered by the notification of the OOM.  This is trivially
+        * achieved by invoking mem_cgroup_mark_under_oom() before
+        * triggering notification.
+        */
+       if (memcg && memcg->under_oom)
+               __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
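A userspace model of the new under_oom scheme: updates are serialised by a lock standing in for memcg_oom_lock, underflow is guarded against, and the recovery path only does a lockless read. All names below are invented for the sketch.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;	/* plays memcg_oom_lock */
static int under_oom;

static void mark_under_oom(void)
{
	pthread_mutex_lock(&oom_lock);
	under_oom++;
	pthread_mutex_unlock(&oom_lock);
}

static void unmark_under_oom(void)
{
	pthread_mutex_lock(&oom_lock);
	if (under_oom > 0)	/* watch for underflow, as in the patch */
		under_oom--;
	pthread_mutex_unlock(&oom_lock);
}

static void oom_recover(void)
{
	/* lockless read: only needs to observe a mark that happened before
	 * the notification that led here */
	if (under_oom)
		printf("wake up oom waiters\n");
}

int main(void)
{
	mark_under_oom();
	oom_recover();
	unmark_under_oom();
	return 0;
}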
@@ -3864,7 +3873,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
        list_add(&event->list, &memcg->oom_notify);
 
        /* already in OOM ? */
-       if (atomic_read(&memcg->under_oom))
+       if (memcg->under_oom)
                eventfd_signal(eventfd, 1);
        spin_unlock(&memcg_oom_lock);
 
@@ -3893,7 +3902,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
 
        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
-       seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
+       seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
        return 0;
 }
 
index 501820c815b335b017ea87cf3dd3f1a0d034bd98..c53543d892828e75796239d6ce36afa90203085b 100644 (file)
  * this code has to be extremely careful. Generally it tries to use 
  * normal locking rules, as in get the standard locks, even if that means 
  * the error handling takes potentially a long time.
+ *
+ * It can be very tempting to add handling for obscure cases here.
+ * In general any code for handling new cases should only be added iff:
+ * - You know how to test it.
+ * - You have a test that can be added to mce-test
+ *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
+ * - The case actually shows up as a frequent (top 10) page state in
+ *   tools/vm/page-types when running a real workload.
  * 
  * There are several operations here with exponential complexity because
  * of unsuitable VM data structures. For example the operation to map back 
  * are rare we hope to get away with this. This avoids impacting the core 
  * VM.
  */
-
-/*
- * Notebook:
- * - hugetlb needs more code
- * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
- * - pass bad pages to kdump next kernel
- */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
@@ -56,6 +57,7 @@
 #include <linux/mm_inline.h>
 #include <linux/kfifo.h>
 #include "internal.h"
+#include "ras/ras_event.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
 
@@ -503,68 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill,
        kfree(tk);
 }
 
-/*
- * Error handlers for various types of pages.
- */
-
-enum outcome {
-       IGNORED,        /* Error: cannot be handled */
-       FAILED,         /* Error: handling failed */
-       DELAYED,        /* Will be handled later */
-       RECOVERED,      /* Successfully recovered */
-};
-
 static const char *action_name[] = {
-       [IGNORED] = "Ignored",
-       [FAILED] = "Failed",
-       [DELAYED] = "Delayed",
-       [RECOVERED] = "Recovered",
-};
-
-enum action_page_type {
-       MSG_KERNEL,
-       MSG_KERNEL_HIGH_ORDER,
-       MSG_SLAB,
-       MSG_DIFFERENT_COMPOUND,
-       MSG_POISONED_HUGE,
-       MSG_HUGE,
-       MSG_FREE_HUGE,
-       MSG_UNMAP_FAILED,
-       MSG_DIRTY_SWAPCACHE,
-       MSG_CLEAN_SWAPCACHE,
-       MSG_DIRTY_MLOCKED_LRU,
-       MSG_CLEAN_MLOCKED_LRU,
-       MSG_DIRTY_UNEVICTABLE_LRU,
-       MSG_CLEAN_UNEVICTABLE_LRU,
-       MSG_DIRTY_LRU,
-       MSG_CLEAN_LRU,
-       MSG_TRUNCATED_LRU,
-       MSG_BUDDY,
-       MSG_BUDDY_2ND,
-       MSG_UNKNOWN,
+       [MF_IGNORED] = "Ignored",
+       [MF_FAILED] = "Failed",
+       [MF_DELAYED] = "Delayed",
+       [MF_RECOVERED] = "Recovered",
 };
 
 static const char * const action_page_types[] = {
-       [MSG_KERNEL]                    = "reserved kernel page",
-       [MSG_KERNEL_HIGH_ORDER]         = "high-order kernel page",
-       [MSG_SLAB]                      = "kernel slab page",
-       [MSG_DIFFERENT_COMPOUND]        = "different compound page after locking",
-       [MSG_POISONED_HUGE]             = "huge page already hardware poisoned",
-       [MSG_HUGE]                      = "huge page",
-       [MSG_FREE_HUGE]                 = "free huge page",
-       [MSG_UNMAP_FAILED]              = "unmapping failed page",
-       [MSG_DIRTY_SWAPCACHE]           = "dirty swapcache page",
-       [MSG_CLEAN_SWAPCACHE]           = "clean swapcache page",
-       [MSG_DIRTY_MLOCKED_LRU]         = "dirty mlocked LRU page",
-       [MSG_CLEAN_MLOCKED_LRU]         = "clean mlocked LRU page",
-       [MSG_DIRTY_UNEVICTABLE_LRU]     = "dirty unevictable LRU page",
-       [MSG_CLEAN_UNEVICTABLE_LRU]     = "clean unevictable LRU page",
-       [MSG_DIRTY_LRU]                 = "dirty LRU page",
-       [MSG_CLEAN_LRU]                 = "clean LRU page",
-       [MSG_TRUNCATED_LRU]             = "already truncated LRU page",
-       [MSG_BUDDY]                     = "free buddy page",
-       [MSG_BUDDY_2ND]                 = "free buddy page (2nd try)",
-       [MSG_UNKNOWN]                   = "unknown page",
+       [MF_MSG_KERNEL]                 = "reserved kernel page",
+       [MF_MSG_KERNEL_HIGH_ORDER]      = "high-order kernel page",
+       [MF_MSG_SLAB]                   = "kernel slab page",
+       [MF_MSG_DIFFERENT_COMPOUND]     = "different compound page after locking",
+       [MF_MSG_POISONED_HUGE]          = "huge page already hardware poisoned",
+       [MF_MSG_HUGE]                   = "huge page",
+       [MF_MSG_FREE_HUGE]              = "free huge page",
+       [MF_MSG_UNMAP_FAILED]           = "unmapping failed page",
+       [MF_MSG_DIRTY_SWAPCACHE]        = "dirty swapcache page",
+       [MF_MSG_CLEAN_SWAPCACHE]        = "clean swapcache page",
+       [MF_MSG_DIRTY_MLOCKED_LRU]      = "dirty mlocked LRU page",
+       [MF_MSG_CLEAN_MLOCKED_LRU]      = "clean mlocked LRU page",
+       [MF_MSG_DIRTY_UNEVICTABLE_LRU]  = "dirty unevictable LRU page",
+       [MF_MSG_CLEAN_UNEVICTABLE_LRU]  = "clean unevictable LRU page",
+       [MF_MSG_DIRTY_LRU]              = "dirty LRU page",
+       [MF_MSG_CLEAN_LRU]              = "clean LRU page",
+       [MF_MSG_TRUNCATED_LRU]          = "already truncated LRU page",
+       [MF_MSG_BUDDY]                  = "free buddy page",
+       [MF_MSG_BUDDY_2ND]              = "free buddy page (2nd try)",
+       [MF_MSG_UNKNOWN]                = "unknown page",
 };
 
 /*
@@ -598,7 +566,7 @@ static int delete_from_lru_cache(struct page *p)
  */
 static int me_kernel(struct page *p, unsigned long pfn)
 {
-       return IGNORED;
+       return MF_IGNORED;
 }
 
 /*
@@ -607,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
 static int me_unknown(struct page *p, unsigned long pfn)
 {
        printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
-       return FAILED;
+       return MF_FAILED;
 }
 
 /*
@@ -616,7 +584,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
 static int me_pagecache_clean(struct page *p, unsigned long pfn)
 {
        int err;
-       int ret = FAILED;
+       int ret = MF_FAILED;
        struct address_space *mapping;
 
        delete_from_lru_cache(p);
@@ -626,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         * should be the one m_f() holds.
         */
        if (PageAnon(p))
-               return RECOVERED;
+               return MF_RECOVERED;
 
        /*
         * Now truncate the page in the page cache. This is really
@@ -640,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                /*
 * Page has been torn down in the meanwhile
                 */
-               return FAILED;
+               return MF_FAILED;
        }
 
        /*
@@ -657,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                                !try_to_release_page(p, GFP_NOIO)) {
                        pr_info("MCE %#lx: failed to release buffers\n", pfn);
                } else {
-                       ret = RECOVERED;
+                       ret = MF_RECOVERED;
                }
        } else {
                /*
@@ -665,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                 * This fails on dirty or anything with private pages
                 */
                if (invalidate_inode_page(p))
-                       ret = RECOVERED;
+                       ret = MF_RECOVERED;
                else
                        printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
                                pfn);
@@ -751,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
        ClearPageUptodate(p);
 
        if (!delete_from_lru_cache(p))
-               return DELAYED;
+               return MF_DELAYED;
        else
-               return FAILED;
+               return MF_FAILED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -761,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
        delete_from_swap_cache(p);
 
        if (!delete_from_lru_cache(p))
-               return RECOVERED;
+               return MF_RECOVERED;
        else
-               return FAILED;
+               return MF_FAILED;
 }
 
 /*
@@ -776,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 {
        int res = 0;
        struct page *hpage = compound_head(p);
+
+       if (!PageHuge(hpage))
+               return MF_DELAYED;
+
        /*
         * We can safely recover from error on free or reserved (i.e.
         * not in-use) hugepage by dequeuing it from freelist.
@@ -789,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
        if (!(page_mapping(hpage) || PageAnon(hpage))) {
                res = dequeue_hwpoisoned_huge_page(hpage);
                if (!res)
-                       return RECOVERED;
+                       return MF_RECOVERED;
        }
-       return DELAYED;
+       return MF_DELAYED;
 }
 
 /*
@@ -823,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 static struct page_state {
        unsigned long mask;
        unsigned long res;
-       enum action_page_type type;
+       enum mf_action_page_type type;
        int (*action)(struct page *p, unsigned long pfn);
 } error_states[] = {
-       { reserved,     reserved,       MSG_KERNEL,     me_kernel },
+       { reserved,     reserved,       MF_MSG_KERNEL,  me_kernel },
        /*
         * free pages are specially detected outside this table:
         * PG_buddy pages only make a small fraction of all free pages.
@@ -837,31 +809,31 @@ static struct page_state {
         * currently unused objects without touching them. But just
         * treat it as standard kernel for now.
         */
-       { slab,         slab,           MSG_SLAB,       me_kernel },
+       { slab,         slab,           MF_MSG_SLAB,    me_kernel },
 
 #ifdef CONFIG_PAGEFLAGS_EXTENDED
-       { head,         head,           MSG_HUGE,               me_huge_page },
-       { tail,         tail,           MSG_HUGE,               me_huge_page },
+       { head,         head,           MF_MSG_HUGE,            me_huge_page },
+       { tail,         tail,           MF_MSG_HUGE,            me_huge_page },
 #else
-       { compound,     compound,       MSG_HUGE,               me_huge_page },
+       { compound,     compound,       MF_MSG_HUGE,            me_huge_page },
 #endif
 
-       { sc|dirty,     sc|dirty,       MSG_DIRTY_SWAPCACHE,    me_swapcache_dirty },
-       { sc|dirty,     sc,             MSG_CLEAN_SWAPCACHE,    me_swapcache_clean },
+       { sc|dirty,     sc|dirty,       MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
+       { sc|dirty,     sc,             MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
 
-       { mlock|dirty,  mlock|dirty,    MSG_DIRTY_MLOCKED_LRU,  me_pagecache_dirty },
-       { mlock|dirty,  mlock,          MSG_CLEAN_MLOCKED_LRU,  me_pagecache_clean },
+       { mlock|dirty,  mlock|dirty,    MF_MSG_DIRTY_MLOCKED_LRU,       me_pagecache_dirty },
+       { mlock|dirty,  mlock,          MF_MSG_CLEAN_MLOCKED_LRU,       me_pagecache_clean },
 
-       { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU,      me_pagecache_dirty },
-       { unevict|dirty, unevict,       MSG_CLEAN_UNEVICTABLE_LRU,      me_pagecache_clean },
+       { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU,   me_pagecache_dirty },
+       { unevict|dirty, unevict,       MF_MSG_CLEAN_UNEVICTABLE_LRU,   me_pagecache_clean },
 
-       { lru|dirty,    lru|dirty,      MSG_DIRTY_LRU,  me_pagecache_dirty },
-       { lru|dirty,    lru,            MSG_CLEAN_LRU,  me_pagecache_clean },
+       { lru|dirty,    lru|dirty,      MF_MSG_DIRTY_LRU,       me_pagecache_dirty },
+       { lru|dirty,    lru,            MF_MSG_CLEAN_LRU,       me_pagecache_clean },
 
        /*
         * Catchall entry: must be at end.
         */
-       { 0,            0,              MSG_UNKNOWN,    me_unknown },
+       { 0,            0,              MF_MSG_UNKNOWN, me_unknown },
 };
 
 #undef dirty
@@ -881,8 +853,11 @@ static struct page_state {
  * "Dirty/Clean" indication is not 100% accurate due to the possibility of
  * setting PG_dirty outside page lock. See also comment above set_page_dirty().
  */
-static void action_result(unsigned long pfn, enum action_page_type type, int result)
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+                         enum mf_result result)
 {
+       trace_memory_failure_event(pfn, type, result);
+
        pr_err("MCE %#lx: recovery action for %s: %s\n",
                pfn, action_page_types[type], action_name[result]);
 }
@@ -896,13 +871,13 @@ static int page_action(struct page_state *ps, struct page *p,
        result = ps->action(p, pfn);
 
        count = page_count(p) - 1;
-       if (ps->action == me_swapcache_dirty && result == DELAYED)
+       if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
                count--;
        if (count != 0) {
                printk(KERN_ERR
                       "MCE %#lx: %s still referenced by %d users\n",
                       pfn, action_page_types[ps->type], count);
-               result = FAILED;
+               result = MF_FAILED;
        }
        action_result(pfn, ps->type, result);
 
@@ -911,9 +886,42 @@ static int page_action(struct page_state *ps, struct page *p,
         * Could adjust zone counters here to correct for the missing page.
         */
 
-       return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
+       return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
 }
 
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling:
+ * @page:      raw error page (hit by memory error)
+ *
+ * Return: 0 if the refcount could not be grabbed, otherwise a non-zero
+ * (true) value.
+ */
+int get_hwpoison_page(struct page *page)
+{
+       struct page *head = compound_head(page);
+
+       if (PageHuge(head))
+               return get_page_unless_zero(head);
+
+       /*
+        * Thp tail page has special refcounting rule (refcount of tail pages
+        * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
+        * directly for tail pages.
+        */
+       if (PageTransHuge(head)) {
+               if (get_page_unless_zero(head)) {
+                       if (PageTail(page))
+                               get_page(page);
+                       return 1;
+               } else {
+                       return 0;
+               }
+       }
+
+       return get_page_unless_zero(page);
+}
+EXPORT_SYMBOL_GPL(get_hwpoison_page);
+
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
@@ -927,7 +935,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        int ret;
        int kill = 1, forcekill;
        struct page *hpage = *hpagep;
-       struct page *ppage;
 
        /*
         * Here we are interested only in user-mapped pages, so skip any
@@ -976,59 +983,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
                }
        }
 
-       /*
-        * ppage: poisoned page
-        *   if p is regular page(4k page)
-        *        ppage == real poisoned page;
-        *   else p is hugetlb or THP, ppage == head page.
-        */
-       ppage = hpage;
-
-       if (PageTransHuge(hpage)) {
-               /*
-                * Verify that this isn't a hugetlbfs head page, the check for
-                * PageAnon is just for avoid tripping a split_huge_page
-                * internal debug check, as split_huge_page refuses to deal with
-                * anything that isn't an anon page. PageAnon can't go away fro
-                * under us because we hold a refcount on the hpage, without a
-                * refcount on the hpage. split_huge_page can't be safely called
-                * in the first place, having a refcount on the tail isn't
-                * enough * to be safe.
-                */
-               if (!PageHuge(hpage) && PageAnon(hpage)) {
-                       if (unlikely(split_huge_page(hpage))) {
-                               /*
-                                * FIXME: if splitting THP is failed, it is
-                                * better to stop the following operation rather
-                                * than causing panic by unmapping. System might
-                                * survive if the page is freed later.
-                                */
-                               printk(KERN_INFO
-                                       "MCE %#lx: failed to split THP\n", pfn);
-
-                               BUG_ON(!PageHWPoison(p));
-                               return SWAP_FAIL;
-                       }
-                       /*
-                        * We pinned the head page for hwpoison handling,
-                        * now we split the thp and we are interested in
-                        * the hwpoisoned raw page, so move the refcount
-                        * to it. Similarly, page lock is shifted.
-                        */
-                       if (hpage != p) {
-                               if (!(flags & MF_COUNT_INCREASED)) {
-                                       put_page(hpage);
-                                       get_page(p);
-                               }
-                               lock_page(p);
-                               unlock_page(hpage);
-                               *hpagep = p;
-                       }
-                       /* THP is split, so ppage should be the real poisoned page. */
-                       ppage = p;
-               }
-       }
-
        /*
         * First collect all the processes that have the page
         * mapped in dirty form.  This has to be done before try_to_unmap,
@@ -1038,12 +992,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * there's nothing that can be done.
         */
        if (kill)
-               collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+               collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-       ret = try_to_unmap(ppage, ttu);
+       ret = try_to_unmap(hpage, ttu);
        if (ret != SWAP_SUCCESS)
                printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-                               pfn, page_mapcount(ppage));
+                               pfn, page_mapcount(hpage));
 
        /*
         * Now that the dirty bit has been propagated to the
@@ -1055,7 +1009,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * use a more forceful uncatchable kill to prevent
         * any accesses to the poisoned memory.
         */
-       forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+       forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
        kill_procs(&tokill, forcekill, trapno,
                      ret != SWAP_SUCCESS, p, pfn, flags);
 
@@ -1101,6 +1055,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        struct page_state *ps;
        struct page *p;
        struct page *hpage;
+       struct page *orig_head;
        int res;
        unsigned int nr_pages;
        unsigned long page_flags;
@@ -1116,7 +1071,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        }
 
        p = pfn_to_page(pfn);
-       hpage = compound_head(p);
+       orig_head = hpage = compound_head(p);
        if (TestSetPageHWPoison(p)) {
                printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
                return 0;
@@ -1149,10 +1104,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * In fact it's dangerous to directly bump up page count from 0,
         * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
         */
-       if (!(flags & MF_COUNT_INCREASED) &&
-               !get_page_unless_zero(hpage)) {
+       if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
                if (is_free_buddy_page(p)) {
-                       action_result(pfn, MSG_BUDDY, DELAYED);
+                       action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
                        return 0;
                } else if (PageHuge(hpage)) {
                        /*
@@ -1169,16 +1123,39 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                        }
                        set_page_hwpoison_huge_page(hpage);
                        res = dequeue_hwpoisoned_huge_page(hpage);
-                       action_result(pfn, MSG_FREE_HUGE,
-                                     res ? IGNORED : DELAYED);
+                       action_result(pfn, MF_MSG_FREE_HUGE,
+                                     res ? MF_IGNORED : MF_DELAYED);
                        unlock_page(hpage);
                        return res;
                } else {
-                       action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED);
+                       action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
                        return -EBUSY;
                }
        }
 
+       if (!PageHuge(p) && PageTransHuge(hpage)) {
+               if (!PageAnon(hpage)) {
+                       pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+                       if (TestClearPageHWPoison(p))
+                               atomic_long_sub(nr_pages, &num_poisoned_pages);
+                       put_page(p);
+                       if (p != hpage)
+                               put_page(hpage);
+                       return -EBUSY;
+               }
+               if (unlikely(split_huge_page(hpage))) {
+                       pr_err("MCE: %#lx: thp split failed\n", pfn);
+                       if (TestClearPageHWPoison(p))
+                               atomic_long_sub(nr_pages, &num_poisoned_pages);
+                       put_page(p);
+                       if (p != hpage)
+                               put_page(hpage);
+                       return -EBUSY;
+               }
+               VM_BUG_ON_PAGE(!page_count(p), p);
+               hpage = compound_head(p);
+       }
+
        /*
         * We ignore non-LRU pages for good reasons.
         * - PG_locked is only well defined for LRU pages and a few others
@@ -1188,18 +1165,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * walked by the page reclaim code, however that's not a big loss.
         */
        if (!PageHuge(p)) {
-               if (!PageLRU(hpage))
-                       shake_page(hpage, 0);
-               if (!PageLRU(hpage)) {
+               if (!PageLRU(p))
+                       shake_page(p, 0);
+               if (!PageLRU(p)) {
                        /*
                         * shake_page could have turned it free.
                         */
                        if (is_free_buddy_page(p)) {
                                if (flags & MF_COUNT_INCREASED)
-                                       action_result(pfn, MSG_BUDDY, DELAYED);
+                                       action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
                                else
-                                       action_result(pfn, MSG_BUDDY_2ND,
-                                                     DELAYED);
+                                       action_result(pfn, MF_MSG_BUDDY_2ND,
+                                                     MF_DELAYED);
                                return 0;
                        }
                }
@@ -1211,8 +1188,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * The page could have changed compound pages during the locking.
         * If this happens just bail out.
         */
-       if (compound_head(p) != hpage) {
-               action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
+       if (PageCompound(p) && compound_head(p) != orig_head) {
+               action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
                res = -EBUSY;
                goto out;
        }
@@ -1252,7 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * on the head page to show that the hugepage is hwpoisoned
         */
        if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
-               action_result(pfn, MSG_POISONED_HUGE, IGNORED);
+               action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
                unlock_page(hpage);
                put_page(hpage);
                return 0;
@@ -1281,7 +1258,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         */
        if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
            != SWAP_SUCCESS) {
-               action_result(pfn, MSG_UNMAP_FAILED, IGNORED);
+               action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                res = -EBUSY;
                goto out;
        }
@@ -1290,7 +1267,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * Torn down by someone else?
         */
        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
-               action_result(pfn, MSG_TRUNCATED_LRU, IGNORED);
+               action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
                res = -EBUSY;
                goto out;
        }
@@ -1450,12 +1427,12 @@ int unpoison_memory(unsigned long pfn)
         */
        if (!PageHuge(page) && PageTransHuge(page)) {
                pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
-                       return 0;
+               return 0;
        }
 
        nr_pages = 1 << compound_order(page);
 
-       if (!get_page_unless_zero(page)) {
+       if (!get_hwpoison_page(p)) {
                /*
                 * Since HWPoisoned hugepage should have non-zero refcount,
                 * race between memory failure and unpoison seems to happen.
@@ -1523,7 +1500,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
         * When the target page is a free hugepage, just remove it
         * from free hugepage list.
         */
-       if (!get_page_unless_zero(compound_head(p))) {
+       if (!get_hwpoison_page(p)) {
                if (PageHuge(p)) {
                        pr_info("%s: %#lx free huge page\n", __func__, pfn);
                        ret = 0;
@@ -1694,20 +1671,7 @@ static int __soft_offline_page(struct page *page, int flags)
                        if (ret > 0)
                                ret = -EIO;
                } else {
-                       /*
-                        * After page migration succeeds, the source page can
-                        * be trapped in pagevec and actual freeing is delayed.
-                        * Freeing code works differently based on PG_hwpoison,
-                        * so there's a race. We need to make sure that the
-                        * source page should be freed back to buddy before
-                        * setting PG_hwpoison.
-                        */
-                       if (!is_free_buddy_page(page))
-                               drain_all_pages(page_zone(page));
                        SetPageHWPoison(page);
-                       if (!is_free_buddy_page(page))
-                               pr_info("soft offline: %#lx: page leaked\n",
-                                       pfn);
                        atomic_long_inc(&num_poisoned_pages);
                }
        } else {
@@ -1759,14 +1723,6 @@ int soft_offline_page(struct page *page, int flags)
 
        get_online_mems();
 
-       /*
-        * Isolate the page, so that it doesn't get reallocated if it
-        * was free. This flag should be kept set until the source page
-        * is freed and PG_hwpoison on it is set.
-        */
-       if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
-               set_migratetype_isolate(page, true);
-
        ret = get_any_page(page, pfn, flags);
        put_online_mems();
        if (ret > 0) { /* for in-use pages */
@@ -1785,6 +1741,5 @@ int soft_offline_page(struct page *page, int flags)
                                atomic_long_inc(&num_poisoned_pages);
                }
        }
-       unset_migratetype_isolate(page, MIGRATE_MOVABLE);
        return ret;
 }
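
The memory-failure.c hunks above funnel every page pin through the new get_hwpoison_page() helper (only its tail is visible here); the replaced get_page_unless_zero(compound_head(p)) call suggests the helper pins compound pages through their head. A minimal stand-alone sketch of that rule, under that assumption — the toy_* names below are illustrative, not kernel code:

struct toy_page {
	int refs;		/* 0 means nobody holds a reference any more */
	int huge;		/* hugetlb-style compound page */
	int trans_huge;		/* transparent huge page */
	struct toy_page *head;	/* compound head; points back to itself for a
				 * non-compound page */
};

/* get_page_unless_zero(): only pin a page that has not already been freed */
int toy_get_page_unless_zero(struct toy_page *p)
{
	if (p->refs == 0)
		return 0;
	p->refs++;
	return 1;
}

/*
 * Rough model of get_hwpoison_page(): hugetlb and THP keep their usable
 * refcount on the compound head, so pin that; any other page is pinned
 * directly, matching the visible "return get_page_unless_zero(page);".
 */
int toy_get_hwpoison_page(struct toy_page *p)
{
	if (p->huge || p->trans_huge)
		return toy_get_page_unless_zero(p->head);

	return toy_get_page_unless_zero(p);
}

Callers such as memory_failure() and unpoison_memory() above then treat a zero return as "the page was already free" and take the buddy/huge-page paths instead.
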
index 17734c3c1183ed799257d40eef18da55d809dd4a..11b9ca1767408dddb147c4b225de0aa31b8f17e7 100644 (file)
@@ -2081,11 +2081,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                        goto oom;
                cow_user_page(new_page, old_page, address, vma);
        }
-       __SetPageUptodate(new_page);
 
        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
                goto oom_free_new;
 
+       __SetPageUptodate(new_page);
+
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
        /*
@@ -2689,6 +2690,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!page)
                goto oom;
+
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+               goto oom_free_page;
+
        /*
         * The memory barrier inside __SetPageUptodate makes sure that
         * preceding stores to the page contents become visible before
@@ -2696,9 +2701,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        __SetPageUptodate(page);
 
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
-               goto oom_free_page;
-
        entry = mk_pte(page, vma->vm_page_prot);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));
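
Both memory.c hunks reorder the fault paths so the fallible mem_cgroup_try_charge() call happens before __SetPageUptodate(), i.e. the page is charged first and only then marked up to date and mapped. A compressed stand-alone illustration of that ordering, with stand-in helpers rather than the kernel API:

#include <stdbool.h>
#include <stdlib.h>

struct toy_page {
	bool charged;
	bool uptodate;
};

/* Stand-in for mem_cgroup_try_charge(): the step that is allowed to fail. */
bool toy_charge(struct toy_page *page)
{
	page->charged = true;
	return true;
}

int toy_anonymous_fault(void)
{
	struct toy_page *page = calloc(1, sizeof(*page));

	if (!page)
		return -1;		/* "goto oom" */
	if (!toy_charge(page)) {
		free(page);		/* "goto oom_free_page" */
		return -1;
	}
	page->uptodate = true;		/* __SetPageUptodate() */
	/* ...only now is the pte installed... */
	return 0;
}
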
index 9e88f749aa512395daea45f2727545fa0f281533..26fbba7d888f887c3383c1bb5829cb4f204e2040 100644 (file)
@@ -513,6 +513,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
                        break;
                err = 0;
        }
+       vmemmap_populate_print_last();
 
        return err;
 }
index 747743237d9f4d3ead6117d4ee152c00659cd362..99d4c1d0b8583dc453ef992582074ef015f1fb49 100644 (file)
@@ -1972,35 +1972,41 @@ retry_cpuset:
        pol = get_vma_policy(vma, addr);
        cpuset_mems_cookie = read_mems_allowed_begin();
 
-       if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
-                                       pol->mode != MPOL_INTERLEAVE)) {
+       if (pol->mode == MPOL_INTERLEAVE) {
+               unsigned nid;
+
+               nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+               mpol_cond_put(pol);
+               page = alloc_page_interleave(gfp, order, nid);
+               goto out;
+       }
+
+       if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+               int hpage_node = node;
+
                /*
                 * For hugepage allocation and non-interleave policy which
-                * allows the current node, we only try to allocate from the
-                * current node and don't fall back to other nodes, as the
-                * cost of remote accesses would likely offset THP benefits.
+                * allows the current node (or other explicitly preferred
+                * node) we only try to allocate from the current/preferred
+                * node and don't fall back to other nodes, as the cost of
+                * remote accesses would likely offset THP benefits.
                 *
                 * If the policy is interleave, or does not allow the current
                 * node in its nodemask, we allocate the standard way.
                 */
+               if (pol->mode == MPOL_PREFERRED &&
+                                               !(pol->flags & MPOL_F_LOCAL))
+                       hpage_node = pol->v.preferred_node;
+
                nmask = policy_nodemask(gfp, pol);
-               if (!nmask || node_isset(node, *nmask)) {
+               if (!nmask || node_isset(hpage_node, *nmask)) {
                        mpol_cond_put(pol);
-                       page = alloc_pages_exact_node(node,
+                       page = alloc_pages_exact_node(hpage_node,
                                                gfp | __GFP_THISNODE, order);
                        goto out;
                }
        }
 
-       if (pol->mode == MPOL_INTERLEAVE) {
-               unsigned nid;
-
-               nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
-               mpol_cond_put(pol);
-               page = alloc_page_interleave(gfp, order, nid);
-               goto out;
-       }
-
        nmask = policy_nodemask(gfp, pol);
        zl = policy_zonelist(gfp, pol, node);
        mpol_cond_put(pol);
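
The restructured mempolicy hunk handles MPOL_INTERLEAVE up front and then, for THP allocations, lets an explicit MPOL_PREFERRED node override the local node before the __GFP_THISNODE attempt. A minimal sketch of just that node-selection rule (the toy_* names are stand-ins; the real code also checks the policy's nodemask first):

enum toy_mode { TOY_MPOL_DEFAULT, TOY_MPOL_PREFERRED, TOY_MPOL_INTERLEAVE };

struct toy_policy {
	enum toy_mode mode;
	int local;		/* stand-in for the MPOL_F_LOCAL flag */
	int preferred_node;
};

/*
 * Node used for a THP allocation once interleave has already been handled:
 * stay on the local node unless the policy names an explicit preferred node.
 */
int toy_thp_node(const struct toy_policy *pol, int local_node)
{
	if (pol->mode == TOY_MPOL_PREFERRED && !pol->local)
		return pol->preferred_node;

	return local_node;
}
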
index 1997d934b13b001f1b1decf9ca898847fe6fb6be..0a1cc133f6d72af96a7e16df6481c821dd907a03 100644 (file)
@@ -74,7 +74,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
        u64 i;
        phys_addr_t this_start, this_end;
 
-       for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+       for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
+                               &this_end, NULL) {
                this_start = clamp(this_start, start, end);
                this_end = clamp(this_end, start, end);
                if (this_start < this_end) {
index f53838fe3dfe6e84868b86eb98fedf7abd384ffb..ee401e4e5ef187c92247d03dd6d2ea0893092d1c 100644 (file)
@@ -918,7 +918,8 @@ out:
 static ICE_noinline int unmap_and_move(new_page_t get_new_page,
                                   free_page_t put_new_page,
                                   unsigned long private, struct page *page,
-                                  int force, enum migrate_mode mode)
+                                  int force, enum migrate_mode mode,
+                                  enum migrate_reason reason)
 {
        int rc = 0;
        int *result = NULL;
@@ -949,7 +950,8 @@ out:
                list_del(&page->lru);
                dec_zone_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
-               putback_lru_page(page);
+               if (reason != MR_MEMORY_FAILURE)
+                       putback_lru_page(page);
        }
 
        /*
@@ -1122,7 +1124,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
                                                pass > 2, mode);
                        else
                                rc = unmap_and_move(get_new_page, put_new_page,
-                                               private, page, pass > 2, mode);
+                                               private, page, pass > 2, mode,
+                                               reason);
 
                        switch(rc) {
                        case -ENOMEM:
@@ -1796,7 +1799,7 @@ fail_putback:
         */
        flush_cache_range(vma, mmun_start, mmun_end);
        page_add_anon_rmap(new_page, vma, mmun_start);
-       pmdp_clear_flush_notify(vma, mmun_start, pmd);
+       pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
        set_pmd_at(mm, mmun_start, pmd, entry);
        flush_tlb_range(vma, mmun_start, mmun_end);
        update_mmu_cache_pmd(vma, address, &entry);
index bb50cacc3ea5763af4638984ee3e84476fb59b06..aa632ade2be797a9cc4f7a80e37884bc4e14745a 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1258,6 +1258,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 
        *populate = 0;
 
+       if (!len)
+               return -EINVAL;
+
        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
@@ -1268,9 +1271,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
                        prot |= PROT_EXEC;
 
-       if (!len)
-               return -EINVAL;
-
        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);
 
index 88584838e7046bec724d68c0cafcd94eec65a040..e7d6f1171ecb6ec453e62edd88cc258b31cfe9f3 100644 (file)
@@ -29,6 +29,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 /*
  * For a prot_numa update we only hold mmap_sem for read so there is a
  * potential race with faulting where a pmd was temporarily none. This
@@ -322,6 +324,15 @@ success:
        change_protection(vma, start, end, vma->vm_page_prot,
                          dirty_accountable, 0);
 
+       /*
+        * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
+        * fault on access.
+        */
+       if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
+                       (newflags & VM_WRITE)) {
+               populate_vma_page_range(vma, start, end, NULL);
+       }
+
        vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
        vm_stat_account(mm, newflags, vma->vm_file, nrpages);
        perf_event_mmap(vma);
index 034e2d3606522bf2a3b550c93052b6bf11278851..a7c93eceb1c8d1ce59235d47ec7e49df4efefb86 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/sched/sysctl.h>
 #include <linux/uaccess.h>
+#include <linux/mm-arch-hooks.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                old_len = new_len;
                old_addr = new_addr;
                new_addr = -ENOMEM;
-       } else if (vma->vm_file && vma->vm_file->f_op->mremap) {
-               err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
-               if (err < 0) {
-                       move_page_tables(new_vma, new_addr, vma, old_addr,
-                                        moved_len, true);
-                       return err;
+       } else {
+               if (vma->vm_file && vma->vm_file->f_op->mremap) {
+                       err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+                       if (err < 0) {
+                               move_page_tables(new_vma, new_addr, vma,
+                                                old_addr, moved_len, true);
+                               return err;
+                       }
                }
+               arch_remap(mm, old_addr, old_addr + old_len,
+                          new_addr, new_addr + new_len);
        }
 
        /* Conceal VM_ACCOUNT so old reservation is not undone */
index 90b50468333e38563d4388096e584b6c23fa9132..5258386fa1beb44842dbbf9a7b85de07d54d83e1 100644 (file)
@@ -37,11 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 {
        void *ptr;
        u64 addr;
+       ulong flags = choose_memblock_flags();
 
        if (limit > memblock.current_limit)
                limit = memblock.current_limit;
 
-       addr = memblock_find_in_range_node(size, align, goal, limit, nid);
+again:
+       addr = memblock_find_in_range_node(size, align, goal, limit, nid,
+                                          flags);
+       if (!addr && (flags & MEMBLOCK_MIRROR)) {
+               flags &= ~MEMBLOCK_MIRROR;
+               pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+                       &size);
+               goto again;
+       }
        if (!addr)
                return NULL;
 
@@ -121,7 +130,8 @@ static unsigned long __init free_low_memory_core_early(void)
 
        memblock_clear_hotplug(0, -1);
 
-       for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
+       for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+                               NULL)
                count += __free_memory_core(start, end);
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
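
The nobootmem.c hunk adds a fallback loop: boot-time allocations first ask memblock for mirrored memory and, if that fails, clear MEMBLOCK_MIRROR, warn, and retry from ordinary memory. A stand-alone model of that retry shape (the stub allocator and TOY_* flag are illustrative, not the memblock API):

#include <stdio.h>

#define TOY_FLAG_MIRROR	0x1u

/* Stub range finder: pretend mirrored memory is already exhausted. */
unsigned long toy_find_in_range(unsigned long size, unsigned int flags)
{
	if (flags & TOY_FLAG_MIRROR)
		return 0;		/* no mirrored range large enough */
	return 0x100000;		/* some usable address */
}

unsigned long toy_alloc_boot_memory(unsigned long size, unsigned int flags)
{
	unsigned long addr;

again:
	addr = toy_find_in_range(size, flags);
	if (!addr && (flags & TOY_FLAG_MIRROR)) {
		/* mirror-only attempt failed: fall back to any memory */
		flags &= ~TOY_FLAG_MIRROR;
		fprintf(stderr,
			"could not allocate %lu bytes of mirrored memory\n",
			size);
		goto again;
	}
	return addr;
}
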
index e544508e2a4bc3e3dfa6190abdd3faf0616c95f2..05e7447d960b0628d9dea7ea22a02a0f15488b9d 100644 (file)
 #include <asm/mmu_context.h>
 #include "internal.h"
 
-#if 0
-#define kenter(FMT, ...) \
-       printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) \
-       printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) \
-       printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
-#else
-#define kenter(FMT, ...) \
-       no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) \
-       no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) \
-       no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
-#endif
-
 void *high_memory;
 EXPORT_SYMBOL(high_memory);
 struct page *mem_map;
@@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to)
        for (; from < to; from += PAGE_SIZE) {
                struct page *page = virt_to_page(from);
 
-               kdebug("- free %lx", from);
                atomic_long_dec(&mmap_pages_allocated);
-               if (page_count(page) != 1)
-                       kdebug("free page %p: refcount not one: %d",
-                              page, page_count(page));
                put_page(page);
        }
 }
@@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to)
 static void __put_nommu_region(struct vm_region *region)
        __releases(nommu_region_sem)
 {
-       kenter("%p{%d}", region, region->vm_usage);
-
        BUG_ON(!nommu_region_tree.rb_node);
 
        if (--region->vm_usage == 0) {
@@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region)
 
                /* IO memory and memory shared directly out of the pagecache
                 * from ramfs/tmpfs mustn't be released here */
-               if (region->vm_flags & VM_MAPPED_COPY) {
-                       kdebug("free series");
+               if (region->vm_flags & VM_MAPPED_COPY)
                        free_page_series(region->vm_start, region->vm_top);
-               }
                kmem_cache_free(vm_region_jar, region);
        } else {
                up_write(&nommu_region_sem);
@@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
        struct address_space *mapping;
        struct rb_node **p, *parent, *rb_prev;
 
-       kenter(",%p", vma);
-
        BUG_ON(!vma->vm_region);
 
        mm->map_count++;
@@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
        struct mm_struct *mm = vma->vm_mm;
        struct task_struct *curr = current;
 
-       kenter("%p", vma);
-
        protect_vma(vma, 0);
 
        mm->map_count--;
@@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
  */
 static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-       kenter("%p", vma);
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
        if (vma->vm_file)
@@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file,
        int ret;
 
        /* do the simple checks first */
-       if (flags & MAP_FIXED) {
-               printk(KERN_DEBUG
-                      "%d: Can't do fixed-address/overlay mmap of RAM\n",
-                      current->pid);
+       if (flags & MAP_FIXED)
                return -EINVAL;
-       }
 
        if ((flags & MAP_TYPE) != MAP_PRIVATE &&
            (flags & MAP_TYPE) != MAP_SHARED)
@@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file,
                            ) {
                                capabilities &= ~NOMMU_MAP_DIRECT;
                                if (flags & MAP_SHARED) {
-                                       printk(KERN_WARNING
-                                              "MAP_SHARED not completely supported on !MMU\n");
+                                       pr_warn("MAP_SHARED not completely supported on !MMU\n");
                                        return -EINVAL;
                                }
                        }
@@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
         *   we're allocating is smaller than a page
         */
        order = get_order(len);
-       kdebug("alloc order %d for %lx", order, len);
-
        total = 1 << order;
        point = len >> PAGE_SHIFT;
 
        /* we don't want to allocate a power-of-2 sized page set */
-       if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
+       if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
                total = point;
-               kdebug("try to alloc exact %lu pages", total);
-       }
 
        base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
        if (!base)
@@ -1285,18 +1247,14 @@ unsigned long do_mmap_pgoff(struct file *file,
        unsigned long capabilities, vm_flags, result;
        int ret;
 
-       kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
-
        *populate = 0;
 
        /* decide whether we should attempt the mapping, and if so what sort of
         * mapping */
        ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
                                    &capabilities);
-       if (ret < 0) {
-               kleave(" = %d [val]", ret);
+       if (ret < 0)
                return ret;
-       }
 
        /* we ignore the address hint */
        addr = 0;
@@ -1383,11 +1341,9 @@ unsigned long do_mmap_pgoff(struct file *file,
                        vma->vm_start = start;
                        vma->vm_end = start + len;
 
-                       if (pregion->vm_flags & VM_MAPPED_COPY) {
-                               kdebug("share copy");
+                       if (pregion->vm_flags & VM_MAPPED_COPY)
                                vma->vm_flags |= VM_MAPPED_COPY;
-                       } else {
-                               kdebug("share mmap");
+                       else {
                                ret = do_mmap_shared_file(vma);
                                if (ret < 0) {
                                        vma->vm_region = NULL;
@@ -1467,7 +1423,6 @@ share:
 
        up_write(&nommu_region_sem);
 
-       kleave(" = %lx", result);
        return result;
 
 error_just_free:
@@ -1479,27 +1434,24 @@ error:
        if (vma->vm_file)
                fput(vma->vm_file);
        kmem_cache_free(vm_area_cachep, vma);
-       kleave(" = %d", ret);
        return ret;
 
 sharing_violation:
        up_write(&nommu_region_sem);
-       printk(KERN_WARNING "Attempt to share mismatched mappings\n");
+       pr_warn("Attempt to share mismatched mappings\n");
        ret = -EINVAL;
        goto error;
 
 error_getting_vma:
        kmem_cache_free(vm_region_jar, region);
-       printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
-              " from process %d failed\n",
-              len, current->pid);
+       pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
+                       len, current->pid);
        show_free_areas(0);
        return -ENOMEM;
 
 error_getting_region:
-       printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
-              " from process %d failed\n",
-              len, current->pid);
+       pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
+                       len, current->pid);
        show_free_areas(0);
        return -ENOMEM;
 }
@@ -1563,8 +1515,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_region *region;
        unsigned long npages;
 
-       kenter("");
-
        /* we're only permitted to split anonymous regions (these should have
         * only a single usage on the region) */
        if (vma->vm_file)
@@ -1628,8 +1578,6 @@ static int shrink_vma(struct mm_struct *mm,
 {
        struct vm_region *region;
 
-       kenter("");
-
        /* adjust the VMA's pointers, which may reposition it in the MM's tree
         * and list */
        delete_vma_from_mm(vma);
@@ -1669,8 +1617,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
        unsigned long end;
        int ret;
 
-       kenter(",%lx,%zx", start, len);
-
        len = PAGE_ALIGN(len);
        if (len == 0)
                return -EINVAL;
@@ -1682,11 +1628,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
        if (!vma) {
                static int limit;
                if (limit < 5) {
-                       printk(KERN_WARNING
-                              "munmap of memory not mmapped by process %d"
-                              " (%s): 0x%lx-0x%lx\n",
-                              current->pid, current->comm,
-                              start, start + len - 1);
+                       pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
+                                       current->pid, current->comm,
+                                       start, start + len - 1);
                        limit++;
                }
                return -EINVAL;
@@ -1695,38 +1639,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
        /* we're allowed to split an anonymous VMA but not a file-backed one */
        if (vma->vm_file) {
                do {
-                       if (start > vma->vm_start) {
-                               kleave(" = -EINVAL [miss]");
+                       if (start > vma->vm_start)
                                return -EINVAL;
-                       }
                        if (end == vma->vm_end)
                                goto erase_whole_vma;
                        vma = vma->vm_next;
                } while (vma);
-               kleave(" = -EINVAL [split file]");
                return -EINVAL;
        } else {
                /* the chunk must be a subset of the VMA found */
                if (start == vma->vm_start && end == vma->vm_end)
                        goto erase_whole_vma;
-               if (start < vma->vm_start || end > vma->vm_end) {
-                       kleave(" = -EINVAL [superset]");
+               if (start < vma->vm_start || end > vma->vm_end)
                        return -EINVAL;
-               }
-               if (start & ~PAGE_MASK) {
-                       kleave(" = -EINVAL [unaligned start]");
+               if (start & ~PAGE_MASK)
                        return -EINVAL;
-               }
-               if (end != vma->vm_end && end & ~PAGE_MASK) {
-                       kleave(" = -EINVAL [unaligned split]");
+               if (end != vma->vm_end && end & ~PAGE_MASK)
                        return -EINVAL;
-               }
                if (start != vma->vm_start && end != vma->vm_end) {
                        ret = split_vma(mm, vma, start, 1);
-                       if (ret < 0) {
-                               kleave(" = %d [split]", ret);
+                       if (ret < 0)
                                return ret;
-                       }
                }
                return shrink_vma(mm, vma, start, end);
        }
@@ -1734,7 +1667,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 erase_whole_vma:
        delete_vma_from_mm(vma);
        delete_vma(mm, vma);
-       kleave(" = 0");
        return 0;
 }
 EXPORT_SYMBOL(do_munmap);
@@ -1766,8 +1698,6 @@ void exit_mmap(struct mm_struct *mm)
        if (!mm)
                return;
 
-       kenter("");
-
        mm->total_vm = 0;
 
        while ((vma = mm->mmap)) {
@@ -1776,8 +1706,6 @@ void exit_mmap(struct mm_struct *mm)
                delete_vma(mm, vma);
                cond_resched();
        }
-
-       kleave("");
 }
 
 unsigned long vm_brk(unsigned long addr, unsigned long len)
index 2b665da1b3c92070de5e9d8ee6864d843847115c..dff991e0681e85a5308ba097533e346e621bc954 100644 (file)
@@ -42,7 +42,8 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
-static DEFINE_SPINLOCK(zone_scan_lock);
+
+DEFINE_MUTEX(oom_lock);
 
 #ifdef CONFIG_NUMA
 /**
@@ -405,16 +406,15 @@ static atomic_t oom_victims = ATOMIC_INIT(0);
 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
 bool oom_killer_disabled __read_mostly;
-static DECLARE_RWSEM(oom_sem);
 
 /**
- * mark_tsk_oom_victim - marks the given task as OOM victim.
+ * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
  *
- * Has to be called with oom_sem taken for read and never after
+ * Has to be called with oom_lock held and never after
  * oom has been disabled already.
  */
-void mark_tsk_oom_victim(struct task_struct *tsk)
+void mark_oom_victim(struct task_struct *tsk)
 {
        WARN_ON(oom_killer_disabled);
        /* OOM killer might race with memcg OOM */
@@ -431,23 +431,14 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
 }
 
 /**
- * unmark_oom_victim - unmarks the current task as OOM victim.
- *
- * Wakes up all waiters in oom_killer_disable()
+ * exit_oom_victim - note the exit of an OOM victim
  */
-void unmark_oom_victim(void)
+void exit_oom_victim(void)
 {
-       if (!test_and_clear_thread_flag(TIF_MEMDIE))
-               return;
+       clear_thread_flag(TIF_MEMDIE);
 
-       down_read(&oom_sem);
-       /*
-        * There is no need to signal the lasst oom_victim if there
-        * is nobody who cares.
-        */
-       if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+       if (!atomic_dec_return(&oom_victims))
                wake_up_all(&oom_victims_wait);
-       up_read(&oom_sem);
 }
 
 /**
@@ -469,14 +460,14 @@ bool oom_killer_disable(void)
         * Make sure to not race with an ongoing OOM killer
         * and that the current is not the victim.
         */
-       down_write(&oom_sem);
+       mutex_lock(&oom_lock);
        if (test_thread_flag(TIF_MEMDIE)) {
-               up_write(&oom_sem);
+               mutex_unlock(&oom_lock);
                return false;
        }
 
        oom_killer_disabled = true;
-       up_write(&oom_sem);
+       mutex_unlock(&oom_lock);
 
        wait_event(oom_victims_wait, !atomic_read(&oom_victims));
 
@@ -488,9 +479,7 @@ bool oom_killer_disable(void)
  */
 void oom_killer_enable(void)
 {
-       down_write(&oom_sem);
        oom_killer_disabled = false;
-       up_write(&oom_sem);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -517,7 +506,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
         */
        task_lock(p);
        if (p->mm && task_will_free_mem(p)) {
-               mark_tsk_oom_victim(p);
+               mark_oom_victim(p);
                task_unlock(p);
                put_task_struct(p);
                return;
@@ -528,7 +517,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                dump_header(p, gfp_mask, order, memcg, nodemask);
 
        task_lock(p);
-       pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+       pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
                message, task_pid_nr(p), p->comm, points);
        task_unlock(p);
 
@@ -572,7 +561,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
        /* mm cannot safely be dereferenced after task_unlock(victim) */
        mm = victim->mm;
-       mark_tsk_oom_victim(victim);
+       mark_oom_victim(victim);
        pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
                task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
                K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -645,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
- */
-bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-       struct zoneref *z;
-       struct zone *zone;
-       bool ret = true;
-
-       spin_lock(&zone_scan_lock);
-       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-               if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
-                       ret = false;
-                       goto out;
-               }
-
-       /*
-        * Lock each zone in the zonelist under zone_scan_lock so a parallel
-        * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
-        */
-       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-               set_bit(ZONE_OOM_LOCKED, &zone->flags);
-
-out:
-       spin_unlock(&zone_scan_lock);
-       return ret;
-}
-
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-       struct zoneref *z;
-       struct zone *zone;
-
-       spin_lock(&zone_scan_lock);
-       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-               clear_bit(ZONE_OOM_LOCKED, &zone->flags);
-       spin_unlock(&zone_scan_lock);
-}
-
 /**
  * __out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
@@ -704,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-               int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+                  int order, nodemask_t *nodemask, bool force_kill)
 {
        const nodemask_t *mpol_mask;
        struct task_struct *p;
@@ -715,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
        enum oom_constraint constraint = CONSTRAINT_NONE;
        int killed = 0;
 
+       if (oom_killer_disabled)
+               return false;
+
        blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
        if (freed > 0)
                /* Got some memory back in the last second. */
-               return;
+               goto out;
 
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
@@ -730,8 +676,8 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
         */
        if (current->mm &&
            (fatal_signal_pending(current) || task_will_free_mem(current))) {
-               mark_tsk_oom_victim(current);
-               return;
+               mark_oom_victim(current);
+               goto out;
        }
 
        /*
@@ -771,32 +717,8 @@ out:
         */
        if (killed)
                schedule_timeout_killable(1);
-}
-
-/**
- * out_of_memory -  tries to invoke OOM killer.
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
- *
- * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
- * when it returns false. Otherwise returns true.
- */
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-               int order, nodemask_t *nodemask, bool force_kill)
-{
-       bool ret = false;
-
-       down_read(&oom_sem);
-       if (!oom_killer_disabled) {
-               __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
-               ret = true;
-       }
-       up_read(&oom_sem);
 
-       return ret;
+       return true;
 }
 
 /*
@@ -806,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
  */
 void pagefault_out_of_memory(void)
 {
-       struct zonelist *zonelist;
-
-       down_read(&oom_sem);
        if (mem_cgroup_oom_synchronize(true))
-               goto unlock;
+               return;
 
-       zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
-       if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
-               if (!oom_killer_disabled)
-                       __out_of_memory(NULL, 0, 0, NULL, false);
-               else
-                       /*
-                        * There shouldn't be any user tasks runable while the
-                        * OOM killer is disabled so the current task has to
-                        * be a racing OOM victim for which oom_killer_disable()
-                        * is waiting for.
-                        */
-                       WARN_ON(test_thread_flag(TIF_MEMDIE));
+       if (!mutex_trylock(&oom_lock))
+               return;
 
-               oom_zonelist_unlock(zonelist, GFP_KERNEL);
+       if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+               /*
+                * There shouldn't be any user tasks runnable while the
+                * OOM killer is disabled, so the current task has to
+                * be a racing OOM victim for which oom_killer_disable()
+                        * is waiting.
+                */
+               WARN_ON(test_thread_flag(TIF_MEMDIE));
        }
-unlock:
-       up_read(&oom_sem);
+
+       mutex_unlock(&oom_lock);
 }
index 2fd31aebef30c4abfb27d8890787bc914e880356..5e6fa06f2784c8cf9b2defd40120c0be32ca444e 100644 (file)
@@ -380,20 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order)
        }
 }
 
-static inline void prep_zero_page(struct page *page, unsigned int order,
-                                                       gfp_t gfp_flags)
-{
-       int i;
-
-       /*
-        * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
-        * and __GFP_HIGHMEM from hard or soft interrupt context.
-        */
-       VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
-       for (i = 0; i < (1 << order); i++)
-               clear_highpage(page + i);
-}
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
 bool _debug_pagealloc_enabled __read_mostly;
@@ -975,7 +961,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
        kasan_alloc_pages(page, order);
 
        if (gfp_flags & __GFP_ZERO)
-               prep_zero_page(page, order, gfp_flags);
+               for (i = 0; i < (1 << order); i++)
+                       clear_highpage(page + i);
 
        if (order && (gfp_flags & __GFP_COMP))
                prep_compound_page(page, order);
@@ -2322,48 +2309,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                show_mem(filter);
 }
 
-static inline int
-should_alloc_retry(gfp_t gfp_mask, unsigned int order,
-                               unsigned long did_some_progress,
-                               unsigned long pages_reclaimed)
-{
-       /* Do not loop if specifically requested */
-       if (gfp_mask & __GFP_NORETRY)
-               return 0;
-
-       /* Always retry if specifically requested */
-       if (gfp_mask & __GFP_NOFAIL)
-               return 1;
-
-       /*
-        * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
-        * making forward progress without invoking OOM. Suspend also disables
-        * storage devices so kswapd will not help. Bail if we are suspending.
-        */
-       if (!did_some_progress && pm_suspended_storage())
-               return 0;
-
-       /*
-        * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-        * means __GFP_NOFAIL, but that may not be true in other
-        * implementations.
-        */
-       if (order <= PAGE_ALLOC_COSTLY_ORDER)
-               return 1;
-
-       /*
-        * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-        * specified, then we retry until we no longer reclaim any pages
-        * (above), or we've reclaimed an order of pages at least as
-        * large as the allocation's order. In both cases, if the
-        * allocation still fails, we stop retrying.
-        */
-       if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
-               return 1;
-
-       return 0;
-}
-
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -2373,10 +2318,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        *did_some_progress = 0;
 
        /*
-        * Acquire the per-zone oom lock for each zone.  If that
-        * fails, somebody else is making progress for us.
+        * Acquire the oom lock.  If that fails, somebody else is
+        * making progress for us.
         */
-       if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
+       if (!mutex_trylock(&oom_lock)) {
                *did_some_progress = 1;
                schedule_timeout_uninterruptible(1);
                return NULL;
@@ -2402,16 +2347,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                /* The OOM killer does not needlessly kill tasks for lowmem */
                if (ac->high_zoneidx < ZONE_NORMAL)
                        goto out;
-               /* The OOM killer does not compensate for light reclaim */
+               /* The OOM killer does not compensate for IO-less reclaim */
                if (!(gfp_mask & __GFP_FS)) {
                        /*
                         * XXX: Page reclaim didn't yield anything,
                         * and the OOM killer can't be invoked, but
-                        * keep looping as per should_alloc_retry().
+                        * keep looping as per tradition.
                         */
                        *did_some_progress = 1;
                        goto out;
                }
+               if (pm_suspended_storage())
+                       goto out;
                /* The OOM killer may not free memory on a specific node */
                if (gfp_mask & __GFP_THISNODE)
                        goto out;
@@ -2421,7 +2368,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                        || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
                *did_some_progress = 1;
 out:
-       oom_zonelist_unlock(ac->zonelist, gfp_mask);
+       mutex_unlock(&oom_lock);
        return page;
 }
 
@@ -2794,40 +2741,40 @@ retry:
        if (page)
                goto got_pg;
 
-       /* Check if we should retry the allocation */
+       /* Do not loop if specifically requested */
+       if (gfp_mask & __GFP_NORETRY)
+               goto noretry;
+
+       /* Keep reclaiming pages as long as there is reasonable progress */
        pages_reclaimed += did_some_progress;
-       if (should_alloc_retry(gfp_mask, order, did_some_progress,
-                                               pages_reclaimed)) {
-               /*
-                * If we fail to make progress by freeing individual
-                * pages, but the allocation wants us to keep going,
-                * start OOM killing tasks.
-                */
-               if (!did_some_progress) {
-                       page = __alloc_pages_may_oom(gfp_mask, order, ac,
-                                                       &did_some_progress);
-                       if (page)
-                               goto got_pg;
-                       if (!did_some_progress)
-                               goto nopage;
-               }
+       if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
+           ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
                /* Wait for some write requests to complete then retry */
                wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto retry;
-       } else {
-               /*
-                * High-order allocations do not necessarily loop after
-                * direct reclaim and reclaim/compaction depends on compaction
-                * being called after reclaim so call directly if necessary
-                */
-               page = __alloc_pages_direct_compact(gfp_mask, order,
-                                       alloc_flags, ac, migration_mode,
-                                       &contended_compaction,
-                                       &deferred_compaction);
-               if (page)
-                       goto got_pg;
        }
 
+       /* Reclaim has failed us, start killing things */
+       page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+       if (page)
+               goto got_pg;
+
+       /* Retry as long as the OOM killer is making progress */
+       if (did_some_progress)
+               goto retry;
+
+noretry:
+       /*
+        * High-order allocations do not necessarily loop after
+        * direct reclaim and reclaim/compaction depends on compaction
+        * being called after reclaim so call directly if necessary
+        */
+       page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
+                                           ac, migration_mode,
+                                           &contended_compaction,
+                                           &deferred_compaction);
+       if (page)
+               goto got_pg;
 nopage:
        warn_alloc_failed(gfp_mask, order, NULL);
 got_pg:
@@ -4867,22 +4814,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
                                                unsigned long *zones_size,
                                                unsigned long *zholes_size)
 {
-       unsigned long realtotalpages, totalpages = 0;
+       unsigned long realtotalpages = 0, totalpages = 0;
        enum zone_type i;
 
-       for (i = 0; i < MAX_NR_ZONES; i++)
-               totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
-                                                        node_start_pfn,
-                                                        node_end_pfn,
-                                                        zones_size);
-       pgdat->node_spanned_pages = totalpages;
-
-       realtotalpages = totalpages;
-       for (i = 0; i < MAX_NR_ZONES; i++)
-               realtotalpages -=
-                       zone_absent_pages_in_node(pgdat->node_id, i,
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+               unsigned long size, real_size;
+
+               size = zone_spanned_pages_in_node(pgdat->node_id, i,
+                                                 node_start_pfn,
+                                                 node_end_pfn,
+                                                 zones_size);
+               real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
                                                  node_start_pfn, node_end_pfn,
                                                  zholes_size);
+               zone->spanned_pages = size;
+               zone->present_pages = real_size;
+
+               totalpages += size;
+               realtotalpages += real_size;
+       }
+
+       pgdat->node_spanned_pages = totalpages;
        pgdat->node_present_pages = realtotalpages;
        printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
                                                        realtotalpages);
@@ -4992,8 +4945,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
  * NOTE: pgdat should get zeroed by caller.
  */
 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
-               unsigned long node_start_pfn, unsigned long node_end_pfn,
-               unsigned long *zones_size, unsigned long *zholes_size)
+               unsigned long node_start_pfn, unsigned long node_end_pfn)
 {
        enum zone_type j;
        int nid = pgdat->node_id;
@@ -5014,12 +4966,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, freesize, memmap_pages;
 
-               size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
-                                                 node_end_pfn, zones_size);
-               realsize = freesize = size - zone_absent_pages_in_node(nid, j,
-                                                               node_start_pfn,
-                                                               node_end_pfn,
-                                                               zholes_size);
+               size = zone->spanned_pages;
+               realsize = freesize = zone->present_pages;
 
                /*
                 * Adjust freesize so that it accounts for how much memory
@@ -5054,8 +5002,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                        nr_kernel_pages -= memmap_pages;
                nr_all_pages += freesize;
 
-               zone->spanned_pages = size;
-               zone->present_pages = realsize;
                /*
                 * Set an approximate value for lowmem here, it will be adjusted
                 * when the bootmem allocator frees pages into the buddy system.
@@ -5161,8 +5107,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                (unsigned long)pgdat->node_mem_map);
 #endif
 
-       free_area_init_core(pgdat, start_pfn, end_pfn,
-                           zones_size, zholes_size);
+       free_area_init_core(pgdat, start_pfn, end_pfn);
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6111,9 +6056,9 @@ out:
        return ret;
 }
 
+#ifdef CONFIG_NUMA
 int hashdist = HASHDIST_DEFAULT;
 
-#ifdef CONFIG_NUMA
 static int __init set_hashdist(char *str)
 {
        if (!str)
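
In page_alloc.c, should_alloc_retry() is folded directly into the slow path: reclaim is retried while it makes progress (unconditionally for order <= PAGE_ALLOC_COSTLY_ORDER, and for larger orders only with __GFP_REPEAT until an order's worth of pages has been reclaimed), after which the OOM killer runs and the loop restarts only if it reported progress. The predicate below is a stand-alone restatement of that retry condition (the TOY_* constants stand in for the real gfp flags):

#include <stdbool.h>

#define TOY_GFP_NORETRY	 0x1u
#define TOY_GFP_REPEAT	 0x2u
#define TOY_COSTLY_ORDER 3	/* stand-in for PAGE_ALLOC_COSTLY_ORDER */

/*
 * Keep calling direct reclaim while this returns true; otherwise fall
 * through to the OOM killer and, failing that, direct compaction.
 */
bool toy_should_keep_reclaiming(unsigned int gfp, unsigned int order,
				unsigned long did_some_progress,
				unsigned long pages_reclaimed)
{
	if (gfp & TOY_GFP_NORETRY)
		return false;
	if (did_some_progress && order <= TOY_COSTLY_ORDER)
		return true;
	return (gfp & TOY_GFP_REPEAT) && pages_reclaimed < (1UL << order);
}
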
index dfd02484e8de10913e4fe1264bde04e00cc1e9b6..2dd74487a0aff653814db79e9bd2251e87cf9981 100644 (file)
@@ -1030,7 +1030,7 @@ area_found:
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
 
        ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
-       kmemleak_alloc_percpu(ptr, size);
+       kmemleak_alloc_percpu(ptr, size, gfp);
        return ptr;
 
 fail_unlock:
index c25f94b338115a4156b522c7c398d5d1602a89a2..6b674e00153cea664ecadcec49f7c4c35bb843c0 100644 (file)
@@ -119,14 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 }
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
-                      pmd_t *pmdp)
+pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
+                           pmd_t *pmdp)
 {
        pmd_t pmd;
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+       VM_BUG_ON(!pmd_trans_huge(*pmdp));
+       pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
 }
@@ -198,3 +199,23 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
+
+#ifndef pmdp_collapse_flush
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+                         pmd_t *pmdp)
+{
+       /*
+        * The pmd and the hugepage pte have the same format, so we
+        * can use the same function.
+        */
+       pmd_t pmd;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       VM_BUG_ON(pmd_trans_huge(*pmdp));
+       pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
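
The two generic helpers added above differ mainly in their precondition: pmdp_huge_clear_flush() requires that the entry is already a transparent-huge pmd, while pmdp_collapse_flush() runs while a regular page-table pmd is being collapsed into a huge one, hence the inverted VM_BUG_ON(). A standalone userspace sketch of that contract (the pmd_t model, the PMD_TRANS_HUGE bit and the flush stub are illustrative assumptions, not the kernel's definitions):

/* Userspace model of the two clear-and-flush contracts (sketch only). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_TRANS_HUGE (1u << 0)        /* stand-in for pmd_trans_huge() */

typedef struct { uint32_t val; } pmd_t;

static pmd_t get_and_clear(pmd_t *pmdp)
{
        pmd_t old = *pmdp;

        pmdp->val = 0;
        return old;
}

static void flush_range(void)           /* stub for flush_tlb_range() */
{
        puts("flush TLB range");
}

/* Huge variant: the entry must already be a huge pmd. */
static pmd_t huge_clear_flush(pmd_t *pmdp)
{
        assert(pmdp->val & PMD_TRANS_HUGE);
        pmd_t old = get_and_clear(pmdp);
        flush_range();
        return old;
}

/* Collapse variant: the entry must still be a regular page-table pmd. */
static pmd_t collapse_flush(pmd_t *pmdp)
{
        assert(!(pmdp->val & PMD_TRANS_HUGE));
        pmd_t old = get_and_clear(pmdp);
        flush_range();
        return old;
}

int main(void)
{
        pmd_t table_pmd = { .val = 0x42 };
        pmd_t huge_pmd  = { .val = 0x80 | PMD_TRANS_HUGE };

        collapse_flush(&table_pmd);     /* ok: not huge yet */
        huge_clear_flush(&huge_pmd);    /* ok: already huge */
        return 0;
}
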
index 24dd3f9fee27dfe577c7f605bd0c260f0bc143c7..7af1ecb21ccb2d560ca9f0f21e002a26465737f4 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -625,7 +625,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 
        pmd = pmd_offset(pud, address);
        /*
-        * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+        * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
         * without holding anon_vma lock for write.  So when looking for a
         * genuine pmde (in which to find pte), test present and !THP together.
         */
@@ -950,7 +950,12 @@ void page_move_anon_rmap(struct page *page,
        VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
 
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-       page->mapping = (struct address_space *) anon_vma;
+       /*
+        * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+        * simultaneously, so a concurrent reader (e.g. page_referenced()'s
+        * PageAnon()) will not see one without the other.
+        */
+       WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
 }
 
 /**
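
The WRITE_ONCE() introduced above makes the compiler emit a single store to page->mapping, so the anon_vma pointer and the PAGE_MAPPING_ANON tag bit become visible together. A minimal userspace approximation of the idea (WRITE_ONCE_LIKE, struct page_like and the address used are assumptions for illustration, not the kernel's WRITE_ONCE(); GNU C is assumed for __typeof__):

/* Sketch: publish a tagged pointer with one non-splittable store. */
#include <stdint.h>
#include <stdio.h>

#define WRITE_ONCE_LIKE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))

#define PAGE_MAPPING_ANON 1UL

struct page_like {
        uintptr_t mapping;      /* pointer value tagged in its low bits */
};

int main(void)
{
        struct page_like page = { 0 };
        uintptr_t anon_vma = 0x100000;  /* stand-in for an anon_vma address */

        /*
         * One volatile store: the compiler may not split it into
         * "store pointer, then set the bit", so a concurrent reader
         * sees either the old value or pointer-plus-bit, never the
         * pointer without its ANON tag.
         */
        WRITE_ONCE_LIKE(page.mapping, anon_vma | PAGE_MAPPING_ANON);

        printf("mapping=%#lx anon=%lu\n",
               (unsigned long)page.mapping,
               (unsigned long)(page.mapping & PAGE_MAPPING_ANON));
        return 0;
}
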
index 3759099d8ce438f57398d84f50049b9e8821bd3a..4caf8ed24d6586e32ab910f28f945c01cef6373b 100644 (file)
@@ -569,7 +569,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
                        i_size_write(inode, newsize);
                        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
                }
-               if (newsize < oldsize) {
+               if (newsize <= oldsize) {
                        loff_t holebegin = round_up(newsize, PAGE_SIZE);
                        unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
                        shmem_truncate_range(inode, newsize, (loff_t)-1);
index 7eb38dd1cefa2f988be6297c1b75b12369e05827..200e22412a161fc7a2232bcb923f07bc117362b8 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1454,6 +1454,7 @@ void __init kmem_cache_init(void)
        kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
                                kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
        slab_state = PARTIAL_NODE;
+       setup_kmalloc_cache_index_table();
 
        slab_early_init = 0;
 
index 4c3ac12dd64405478b1b5bd24b7f5ff4f479c981..8da63e4e470f21b935e12f7dc5a47199cca704fe 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags,
 
 #ifndef CONFIG_SLOB
 /* Kmalloc array related functions */
+void setup_kmalloc_cache_index_table(void);
 void create_kmalloc_caches(unsigned long);
 
 /* Find the kmalloc slab corresponding for a certain size */
index 999bb3424d44df71eb9b92d3ae6da75287a391d4..9f8d71f784041af8cafebb4751b39757f13f558e 100644 (file)
@@ -784,25 +784,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 }
 
 /*
- * Create the kmalloc array. Some of the regular kmalloc arrays
- * may already have been created because they were needed to
- * enable allocations for slab creation.
+ * kmalloc_info[] exists so that the slub_debug=,kmalloc-xx boot option works.
+ * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
+ * kmalloc-67108864.
  */
-void __init create_kmalloc_caches(unsigned long flags)
+static struct {
+       const char *name;
+       unsigned long size;
+} const kmalloc_info[] __initconst = {
+       {NULL,                      0},         {"kmalloc-96",             96},
+       {"kmalloc-192",           192},         {"kmalloc-8",               8},
+       {"kmalloc-16",             16},         {"kmalloc-32",             32},
+       {"kmalloc-64",             64},         {"kmalloc-128",           128},
+       {"kmalloc-256",           256},         {"kmalloc-512",           512},
+       {"kmalloc-1024",         1024},         {"kmalloc-2048",         2048},
+       {"kmalloc-4096",         4096},         {"kmalloc-8192",         8192},
+       {"kmalloc-16384",       16384},         {"kmalloc-32768",       32768},
+       {"kmalloc-65536",       65536},         {"kmalloc-131072",     131072},
+       {"kmalloc-262144",     262144},         {"kmalloc-524288",     524288},
+       {"kmalloc-1048576",   1048576},         {"kmalloc-2097152",   2097152},
+       {"kmalloc-4194304",   4194304},         {"kmalloc-8388608",   8388608},
+       {"kmalloc-16777216", 16777216},         {"kmalloc-33554432", 33554432},
+       {"kmalloc-67108864", 67108864}
+};
+
+/*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * MIPS it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+void __init setup_kmalloc_cache_index_table(void)
 {
        int i;
 
-       /*
-        * Patch up the size_index table if we have strange large alignment
-        * requirements for the kmalloc array. This is only the case for
-        * MIPS it seems. The standard arches will not generate any code here.
-        *
-        * Largest permitted alignment is 256 bytes due to the way we
-        * handle the index determination for the smaller caches.
-        *
-        * Make sure that nothing crazy happens if someone starts tinkering
-        * around with ARCH_KMALLOC_MINALIGN
-        */
        BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
                (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
 
@@ -833,39 +853,41 @@ void __init create_kmalloc_caches(unsigned long flags)
                for (i = 128 + 8; i <= 192; i += 8)
                        size_index[size_index_elem(i)] = 8;
        }
-       for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+}
+
+/*
+ * Create the kmalloc array. Some of the regular kmalloc arrays
+ * may already have been created because they were needed to
+ * enable allocations for slab creation.
+ */
+void __init create_kmalloc_caches(unsigned long flags)
+{
+       int i;
+
+       for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
                if (!kmalloc_caches[i]) {
-                       kmalloc_caches[i] = create_kmalloc_cache(NULL,
-                                                       1 << i, flags);
+                       kmalloc_caches[i] = create_kmalloc_cache(
+                                               kmalloc_info[i].name,
+                                               kmalloc_info[i].size,
+                                               flags);
                }
 
                /*
-                * Caches that are not of the two-to-the-power-of size.
-                * These have to be created immediately after the
-                * earlier power of two caches
+                * "i == 2" is the "kmalloc-192" case which is the last special
+                * case for initialization and it's the point to jump to
+                * allocate the minimize size of the object. In slab allocator,
+                * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4
+                * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is
+                * defined, it may be larger than 2^5 and here is also the
+                * trick to skip the empty gap.
                 */
-               if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
-                       kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
-
-               if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
-                       kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
+               if (i == 2)
+                       i = (KMALLOC_SHIFT_LOW - 1);
        }
 
        /* Kmalloc array is now usable */
        slab_state = UP;
 
-       for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
-               struct kmem_cache *s = kmalloc_caches[i];
-               char *n;
-
-               if (s) {
-                       n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
-
-                       BUG_ON(!n);
-                       s->name = n;
-               }
-       }
-
 #ifdef CONFIG_ZONE_DMA
        for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
                struct kmem_cache *s = kmalloc_caches[i];
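
The rewritten loop above creates kmalloc-96 (index 1) and kmalloc-192 (index 2) first and then jumps to the first power-of-two index. A standalone sketch of just that iteration order, assuming illustrative values for KMALLOC_SHIFT_LOW/HIGH and for the loop's starting index (the real values depend on the allocator and architecture):

/* Sketch of the create_kmalloc_caches() iteration order. */
#include <stdio.h>

#define KMALLOC_SHIFT_LOW   5   /* assumed smallest power-of-two index */
#define KMALLOC_SHIFT_HIGH  26  /* assumed largest index (64MB) */
#define LOOP_START          1   /* index 1 = kmalloc-96, index 2 = kmalloc-192 */

static unsigned long kmalloc_size_of(int i)
{
        if (i == 1)
                return 96;
        if (i == 2)
                return 192;
        return 1UL << i;
}

int main(void)
{
        for (int i = LOOP_START; i <= KMALLOC_SHIFT_HIGH; i++) {
                printf("create kmalloc-%lu (index %d)\n",
                       kmalloc_size_of(i), i);

                /*
                 * After the kmalloc-192 special case, skip the unused
                 * indices below KMALLOC_SHIFT_LOW (2^3, 2^4) and continue
                 * from 2^KMALLOC_SHIFT_LOW.
                 */
                if (i == 2)
                        i = KMALLOC_SHIFT_LOW - 1;
        }
        return 0;
}

Running the sketch prints 96, 192 and then 32, 64, ... up to 67108864, mirroring the order in which the patched loop fills kmalloc_caches[].
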
index 54c0876b43d554e4e1d8981e08007a8a3cd0c59a..816df0016555ad8a5cf03c8020e0b39b75b0a498 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3700,6 +3700,7 @@ void __init kmem_cache_init(void)
        kmem_cache_node = bootstrap(&boot_kmem_cache_node);
 
        /* Now we can use the kmem_cache to allocate kmalloc slabs */
+       setup_kmalloc_cache_index_table();
        create_kmalloc_caches(0);
 
 #ifdef CONFIG_SMP
index a7251a8ed53297a7ec129b6254a5229995d86fc3..a3a0a2f1f7c3dc48c43494b949af6aee66adcf8f 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -131,7 +131,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
                 * here, see the comment above this function.
                 */
                VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-               VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
                if (put_page_testzero(page_head)) {
                        /*
                         * If this is the tail of a slab THP page,
index 5e8eadd71bac71bee1dd9a121a3d44f3a4373c56..19ef01e90ac42077c3d7898d5ef0d149a166b1aa 100644 (file)
@@ -2646,7 +2646,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
-               if (!populated_zone(zone))
+               if (!populated_zone(zone) ||
+                   zone_reclaimable_pages(zone) == 0)
                        continue;
 
                pfmemalloc_reserve += min_wmark_pages(zone);
@@ -3596,7 +3597,7 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_OFF 0
 #define RECLAIM_ZONE (1<<0)    /* Run shrink_inactive_list on the zone */
 #define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
-#define RECLAIM_SWAP (1<<2)    /* Swap pages out during reclaim */
+#define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -3638,12 +3639,12 @@ static long zone_pagecache_reclaimable(struct zone *zone)
        long delta = 0;
 
        /*
-        * If RECLAIM_SWAP is set, then all file pages are considered
+        * If RECLAIM_UNMAP is set, then all file pages are considered
         * potentially reclaimable. Otherwise, we have to worry about
         * pages like swapcache and zone_unmapped_file_pages() provides
         * a better estimate
         */
-       if (zone_reclaim_mode & RECLAIM_SWAP)
+       if (zone_reclaim_mode & RECLAIM_UNMAP)
                nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
        else
                nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
@@ -3674,15 +3675,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                .order = order,
                .priority = ZONE_RECLAIM_PRIORITY,
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-               .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+               .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
                .may_swap = 1,
        };
 
        cond_resched();
        /*
-        * We need to be able to allocate from the reserves for RECLAIM_SWAP
+        * We need to be able to allocate from the reserves for RECLAIM_UNMAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
-        * and RECLAIM_SWAP.
+        * and RECLAIM_UNMAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
        lockdep_set_current_reclaim_state(gfp_mask);