Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 11 Sep 2015 01:19:42 +0000 (18:19 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 11 Sep 2015 01:19:42 +0000 (18:19 -0700)
Merge third patch-bomb from Andrew Morton:

 - even more of the rest of MM

 - lib/ updates

 - checkpatch updates

 - small changes to a few scruffy filesystems

 - kmod fixes/cleanups

 - kexec updates

 - a dma-mapping cleanup series from hch

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (81 commits)
  dma-mapping: consolidate dma_set_mask
  dma-mapping: consolidate dma_supported
  dma-mapping: consolidate dma_mapping_error
  dma-mapping: consolidate dma_{alloc,free}_noncoherent
  dma-mapping: consolidate dma_{alloc,free}_{attrs,coherent}
  mm: use vma_is_anonymous() in create_huge_pmd() and wp_huge_pmd()
  mm: make sure all file VMAs have ->vm_ops set
  mm, mpx: add "vm_flags_t vm_flags" arg to do_mmap_pgoff()
  mm: mark most vm_operations_struct const
  namei: fix warning while make xmldocs caused by namei.c
  ipc: convert invalid scenarios to use WARN_ON
  zlib_deflate/deftree: remove bi_reverse()
  lib/decompress_unlzma: Do a NULL check for pointer
  lib/decompressors: use real out buf size for gunzip with kernel
  fs/affs: make root lookup from blkdev logical size
  sysctl: fix int -> unsigned long assignments in INT_MIN case
  kexec: export KERNEL_IMAGE_SIZE to vmcoreinfo
  kexec: align crash_notes allocation to make it be inside one physical page
  kexec: remove unnecessary test in kimage_alloc_crash_control_pages()
  kexec: split kexec_load syscall from kexec core code
  ...

165 files changed:
CREDITS
Documentation/vm/00-INDEX
Documentation/vm/idle_page_tracking.txt [new file with mode: 0644]
Documentation/vm/pagemap.txt
Documentation/vm/zswap.txt
MAINTAINERS
arch/Kconfig
arch/alpha/include/asm/dma-mapping.h
arch/alpha/kernel/pci-noop.c
arch/alpha/kernel/pci_iommu.c
arch/arm/Kconfig
arch/arm/boot/compressed/decompress.c
arch/arm/include/asm/dma-mapping.h
arch/arm/mm/dma-mapping.c
arch/arm64/include/asm/dma-mapping.h
arch/h8300/boot/compressed/misc.c
arch/h8300/include/asm/dma-mapping.h
arch/hexagon/include/asm/dma-mapping.h
arch/hexagon/kernel/dma.c
arch/ia64/Kconfig
arch/ia64/include/asm/dma-mapping.h
arch/m32r/boot/compressed/misc.c
arch/m68k/Kconfig
arch/microblaze/include/asm/dma-mapping.h
arch/mips/Kconfig
arch/mips/boot/compressed/decompress.c
arch/mips/cavium-octeon/dma-octeon.c
arch/mips/include/asm/dma-mapping.h
arch/mips/loongson64/common/dma-swiotlb.c
arch/mips/mm/dma-default.c
arch/mips/netlogic/common/nlm-dma.c
arch/openrisc/include/asm/dma-mapping.h
arch/powerpc/Kconfig
arch/powerpc/include/asm/dma-mapping.h
arch/s390/Kconfig
arch/s390/boot/compressed/misc.c
arch/s390/include/asm/dma-mapping.h
arch/s390/pci/pci_dma.c
arch/sh/Kconfig
arch/sh/boot/compressed/misc.c
arch/sh/include/asm/dma-mapping.h
arch/sparc/include/asm/dma-mapping.h
arch/tile/Kconfig
arch/tile/include/asm/dma-mapping.h
arch/unicore32/boot/compressed/misc.c
arch/unicore32/include/asm/dma-mapping.h
arch/x86/Kconfig
arch/x86/boot/compressed/misc.c
arch/x86/boot/header.S
arch/x86/entry/vsyscall/vsyscall_64.c
arch/x86/include/asm/dma-mapping.h
arch/x86/include/asm/kdebug.h
arch/x86/kernel/Makefile
arch/x86/kernel/kvmclock.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/vmx.c
arch/x86/mm/mpx.c
arch/x86/platform/efi/efi.c
arch/x86/platform/uv/uv_nmi.c
arch/xtensa/include/asm/dma-mapping.h
drivers/android/binder.c
drivers/crypto/qat/qat_common/adf_transport_debug.c
drivers/firmware/efi/Kconfig
drivers/gpu/drm/vgem/vgem_drv.c
drivers/hsi/clients/cmt_speech.c
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/infiniband/hw/qib/qib_mmap.c
drivers/media/platform/omap/omap_vout.c
drivers/misc/genwqe/card_dev.c
drivers/net/wireless/ath/wil6210/debugfs.c
drivers/parisc/ccio-dma.c
drivers/parisc/sba_iommu.c
drivers/pci/pci-driver.c
drivers/s390/crypto/zcrypt_api.c
drivers/staging/android/ion/ion.c
drivers/staging/comedi/comedi_fops.c
drivers/video/fbdev/omap2/omapfb/omapfb-main.c
drivers/xen/gntalloc.c
drivers/xen/gntdev.c
drivers/xen/privcmd.c
drivers/xen/swiotlb-xen.c
fs/affs/super.c
fs/ceph/addr.c
fs/cifs/file.c
fs/coda/upcall.c
fs/coredump.c
fs/hfs/bnode.c
fs/hfs/brec.c
fs/hfsplus/bnode.c
fs/namei.c
fs/proc/base.c
fs/proc/generic.c
fs/proc/page.c
fs/proc/task_mmu.c
fs/seq_file.c
include/asm-generic/dma-mapping-common.h
include/linux/kexec.h
include/linux/kmod.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/mmu_notifier.h
include/linux/page-flags.h
include/linux/page_ext.h
include/linux/page_idle.h [new file with mode: 0644]
include/linux/poison.h
include/linux/printk.h
include/linux/seq_file.h
include/linux/string_helpers.h
include/linux/zpool.h
include/uapi/linux/kernel-page-flags.h
init/initramfs.c
init/main.c
ipc/msgutil.c
ipc/shm.c
kernel/Makefile
kernel/cred.c
kernel/events/core.c
kernel/extable.c
kernel/kexec.c
kernel/kexec_core.c [new file with mode: 0644]
kernel/kexec_file.c [new file with mode: 0644]
kernel/kexec_internal.h [new file with mode: 0644]
kernel/kmod.c
kernel/ksysfs.c
kernel/printk/printk.c
kernel/reboot.c
kernel/sysctl.c
lib/bitmap.c
lib/decompress_bunzip2.c
lib/decompress_inflate.c
lib/decompress_unlz4.c
lib/decompress_unlzma.c
lib/decompress_unlzo.c
lib/decompress_unxz.c
lib/kstrtox.c
lib/string_helpers.c
lib/test-kstrtox.c
lib/test_kasan.c
lib/zlib_deflate/deftree.c
lib/zlib_deflate/defutil.h
mm/Kconfig
mm/Makefile
mm/debug.c
mm/huge_memory.c
mm/hwpoison-inject.c
mm/kmemleak.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/migrate.c
mm/mmap.c
mm/mmu_notifier.c
mm/nommu.c
mm/page_ext.c
mm/page_idle.c [new file with mode: 0644]
mm/rmap.c
mm/swap.c
mm/zpool.c
mm/zswap.c
scripts/checkpatch.pl
security/selinux/selinuxfs.c
virt/kvm/kvm_main.c

diff --git a/CREDITS b/CREDITS
index bcb8efaa945903abd97fb207718bcdfacf516c55..8207cc62ee9d6079bb55032090ef518ef08f6b04 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -2992,6 +2992,10 @@ S: 2200 Mission College Blvd
 S: Santa Clara, CA 95052
 S: USA
 
+N: Anil Ravindranath
+E: anil_ravindranath@pmc-sierra.com
+D: PMC-Sierra MaxRAID driver
+
 N: Eric S. Raymond
 E: esr@thyrsus.com
 W: http://www.tuxedo.org/~esr/
index 081c49777abb81e54bc6bfee8b2ae553b3954b3e..6a5e2a102a451b186344361603b43bff4c4ea15c 100644 (file)
@@ -14,6 +14,8 @@ hugetlbpage.txt
        - a brief summary of hugetlbpage support in the Linux kernel.
 hwpoison.txt
        - explains what hwpoison is
+idle_page_tracking.txt
+       - description of the idle page tracking feature.
 ksm.txt
        - how to use the Kernel Samepage Merging feature.
 numa
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt
new file mode 100644 (file)
index 0000000..85dcc3b
--- /dev/null
@@ -0,0 +1,98 @@
+MOTIVATION
+
+The idle page tracking feature makes it possible to track which memory pages
+are being accessed by a workload and which are idle. This information can be
+useful for estimating the workload's working set size, which, in turn, can be
+taken into account when configuring the workload parameters, setting memory
+cgroup limits, or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+USER API
+
+The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
+it consists of a single read-write file, /sys/kernel/mm/page_idle/bitmap.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64; the byte order is native. When a
+bit is set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the IMPLEMENTATION
+DETAILS section). To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
+
+Only accesses to user memory pages are tracked. These are pages mapped to a
+process address space, page cache and buffer pages, swap cache pages. For other
+page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
+and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+/proc/kpageflags in order to correctly count idle huge pages.
+
+Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
+
+That said, in order to estimate the number of pages that are not used by a
+workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+    /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
+    /proc/pid/pagemap if the workload is represented by a process, or by
+    filtering out alien pages using /proc/kpagecgroup in case the workload is
+    placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
+    one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using /proc/kpageflags.
+
+See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
+/proc/kpageflags, and /proc/kpagecgroup.
+
+IMPLEMENTATION DETAILS
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first under memory shortage conditions. A page is
+considered referenced if it has been accessed recently via a process address
+space, in which case one or more of the PTEs it is mapped to will have the
+Accessed bit set, or if it has been marked accessed explicitly by the kernel
+(see mark_page_accessed()). The latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+   or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+   because a process needs filesystem metadata stored in it (e.g. lists a
+   directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list; other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many isolated pages, this should not affect the
+overall result noticeably. In order not to stall scanning of the idle page
+bitmap, locked pages may be skipped too.
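
The following is an illustrative userspace sketch, not part of this patch set,
of the procedure described in the USER API section above: it marks one page
idle through /sys/kernel/mm/page_idle/bitmap and later re-reads the same
8-byte word to see whether the page was referenced in the meantime.  The sysfs
path and bit layout follow the documentation; the PFN value is a made-up
example and error handling is minimal.

/* Hedged example only -- the path and bit layout follow the text above,
 * the PFN is hypothetical. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/mm/page_idle/bitmap";
	uint64_t pfn = 0x12345;             /* hypothetical page frame number */
	off_t off = (pfn / 64) * 8;         /* 8-byte word holding this PFN's bit */
	uint64_t word = 1ULL << (pfn % 64); /* bit #pfn%64 of element #pfn/64 */
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return 1;

	/* Mark the page idle: the written value is OR-ed into the bitmap. */
	if (pwrite(fd, &word, sizeof(word), off) != sizeof(word))
		return 1;

	/* ... wait for the workload to touch its working set ... */

	if (pread(fd, &word, sizeof(word), off) != sizeof(word))
		return 1;
	printf("pfn %#llx is %s\n", (unsigned long long)pfn,
	       (word & (1ULL << (pfn % 64))) ? "still idle" : "referenced");
	close(fd);
	return 0;
}
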
index 3cd38438242aef330a6b2e67e82dfe52ee63f7b2..0e1e55588b598b45a60e3e476bad0dc8a9a53266 100644 (file)
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are three components to pagemap:
+There are four components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
    physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -70,6 +70,11 @@ There are three components to pagemap:
     22. THP
     23. BALLOON
     24. ZERO_PAGE
+    25. IDLE
+
+ * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
 
 Short descriptions to the page flags:
 
@@ -116,6 +121,12 @@ Short descriptions to the page flags:
 24. ZERO_PAGE
     zero page for pfn_zero or huge_zero page
 
+25. IDLE
+    page has not been accessed since it was marked idle (see
+    Documentation/vm/idle_page_tracking.txt). Note that this flag may be
+    stale in case the page was accessed via a PTE. To make sure the flag
+    is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first.
+
     [IO related page flags]
  1. ERROR     IO error occurred
  3. UPTODATE  page has up-to-date data
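
To show how the new IDLE flag surfaces through /proc/kpageflags, here is a
small hedged sketch (not part of the patch set) that reads the 64-bit flags
word for one PFN and tests bit 25; the PFN value is an arbitrary example.

/* Hedged example: /proc/kpageflags holds one 64-bit flags word per PFN,
 * and IDLE is bit 25 as documented above. The PFN is hypothetical. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define KPF_IDLE 25

int main(void)
{
	uint64_t pfn = 0x12345;	/* hypothetical page frame number */
	uint64_t flags;
	int fd = open("/proc/kpageflags", O_RDONLY);

	if (fd < 0)
		return 1;
	if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) != sizeof(flags))
		return 1;
	printf("pfn %#llx: idle=%llu\n", (unsigned long long)pfn,
	       (unsigned long long)((flags >> KPF_IDLE) & 1));
	close(fd);
	return 0;
}
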
index 8458c0861e4e6e01fcd2aac4aa6721e88a6a1f37..89fff7d611ccb533a5c3d375bc94fecf3c2e0687 100644 (file)
@@ -32,7 +32,7 @@ can also be enabled and disabled at runtime using the sysfs interface.
 An example command to enable zswap at runtime, assuming sysfs is mounted
 at /sys, is:
 
-echo 1 > /sys/modules/zswap/parameters/enabled
+echo 1 > /sys/module/zswap/parameters/enabled
 
 When zswap is disabled at runtime it will stop storing pages that are
 being swapped out.  However, it will _not_ immediately write out or fault
@@ -49,14 +49,26 @@ Zswap receives pages for compression through the Frontswap API and is able to
 evict pages from its own compressed pool on an LRU basis and write them back to
 the backing swap device in the case that the compressed pool is full.
 
-Zswap makes use of zbud for the managing the compressed memory pool.  Each
-allocation in zbud is not directly accessible by address.  Rather, a handle is
+Zswap makes use of zpool for managing the compressed memory pool.  Each
+allocation in zpool is not directly accessible by address.  Rather, a handle is
 returned by the allocation routine and that handle must be mapped before being
 accessed.  The compressed memory pool grows on demand and shrinks as compressed
-pages are freed.  The pool is not preallocated.
+pages are freed.  The pool is not preallocated.  By default, a zpool of type
+zbud is created, but it can be selected at boot time by setting the "zpool"
+attribute, e.g. zswap.zpool=zbud.  It can also be changed at runtime using the
+sysfs "zpool" attribute, e.g.
+
+echo zbud > /sys/module/zswap/parameters/zpool
+
+The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
+means the compression ratio will always be 2:1 or worse (because of half-full
+zbud pages).  The zsmalloc type zpool has a more complex compressed page
+storage method, and it can achieve greater storage densities.  However,
+zsmalloc does not implement compressed page eviction, so once zswap fills it
+cannot evict the oldest page; it can only reject new pages.
 
 When a swap page is passed from frontswap to zswap, zswap maintains a mapping
-of the swap entry, a combination of the swap type and swap offset, to the zbud
+of the swap entry, a combination of the swap type and swap offset, to the zpool
 handle that references that compressed swap page.  This mapping is achieved
 with a red-black tree per swap type.  The swap offset is the search key for the
 tree nodes.
@@ -74,9 +86,17 @@ controlled policy:
 * max_pool_percent - The maximum percentage of memory that the compressed
     pool can occupy.
 
-Zswap allows the compressor to be selected at kernel boot time by setting the
-“compressor” attribute.  The default compressor is lzo.  e.g.
-zswap.compressor=deflate
+The default compressor is lzo, but it can be selected at boot time by setting
+the “compressor” attribute, e.g. zswap.compressor=lzo.  It can also be changed
+at runtime using the sysfs "compressor" attribute, e.g.
+
+echo lzo > /sys/module/zswap/parameters/compressor
+
+When the zpool and/or compressor parameter is changed at runtime, any existing
+compressed pages are not modified; they are left in their own zpool.  When a
+request is made for a page in an old zpool, it is uncompressed using its
+original compressor.  Once all pages are removed from an old zpool, the zpool
+and its compressor are freed.
 
 A debugfs interface is provided for various statistic about pool size, number
 of pages stored, and various counters for the reasons pages are rejected.
index 67a4443daed937f8fffa0218e37d111b84ce2e86..310da4295c7026e27698e9f8b980adcc1893b774 100644 (file)
@@ -8199,10 +8199,9 @@ F:       drivers/hwmon/pmbus/
 F:     include/linux/i2c/pmbus.h
 
 PMC SIERRA MaxRAID DRIVER
-M:     Anil Ravindranath <anil_ravindranath@pmc-sierra.com>
 L:     linux-scsi@vger.kernel.org
 W:     http://www.pmc-sierra.com/
-S:     Supported
+S:     Orphan
 F:     drivers/scsi/pmcraid.*
 
 PMC SIERRA PM8001 DRIVER
index 8f35649305804c913efe1501e486ddb465e12810..4e949e58b1928363232abac3a69a25413e90652e 100644 (file)
@@ -2,6 +2,9 @@
 # General architecture dependent options
 #
 
+config KEXEC_CORE
+       bool
+
 config OPROFILE
        tristate "OProfile system profiling"
        depends on PROFILING
index dfa32f0613201a92c5c6e5bf81443302deff46ac..72a8ca7796d91a2d2a92d696ce507650678c3998 100644 (file)
@@ -12,42 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       return get_dma_ops(dev)->alloc(dev, size, dma_handle, gfp, attrs);
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       get_dma_ops(dev)->free(dev, size, vaddr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return get_dma_ops(dev)->mapping_error(dev, dma_addr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->set_dma_mask(dev, mask);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f)      dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h)       dma_free_coherent(d, s, v, h)
-
 #define dma_cache_sync(dev, va, size, dir)               ((void)0)
 
 #endif /* _ALPHA_DMA_MAPPING_H */
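
The alpha hunk above is representative of the whole dma-mapping series: each
architecture's private copies of dma_alloc_attrs(), dma_free_attrs(),
dma_mapping_error(), dma_supported() and dma_set_mask() are removed in favour
of the common definitions in asm-generic/dma-mapping-common.h, while the
driver-facing API stays the same.  A minimal, hypothetical driver fragment
showing that unchanged usage follows; all names and sizes are invented for
illustration.

/* Hypothetical driver fragment -- the calls below now resolve to the
 * consolidated helpers in asm-generic/dma-mapping-common.h on every arch. */
#include <linux/dma-mapping.h>
#include <linux/device.h>

static void *example_ring;
static dma_addr_t example_ring_dma;

static int example_alloc_ring(struct device *dev)
{
	/* dma_alloc_coherent() expands to the generic dma_alloc_attrs() */
	example_ring = dma_alloc_coherent(dev, PAGE_SIZE, &example_ring_dma,
					  GFP_KERNEL);
	if (!example_ring)
		return -ENOMEM;
	return 0;
}

static void example_free_ring(struct device *dev)
{
	dma_free_coherent(dev, PAGE_SIZE, example_ring, example_ring_dma);
}
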
index df24b76f92461a5df780059eea1488e3b50945e7..2b1f4a1e92723bf67450c3d3ba4006e792971c70 100644 (file)
@@ -166,15 +166,6 @@ static int alpha_noop_supported(struct device *dev, u64 mask)
        return mask < 0x00ffffffUL ? 0 : 1;
 }
 
-static int alpha_noop_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 struct dma_map_ops alpha_noop_ops = {
        .alloc                  = alpha_noop_alloc_coherent,
        .free                   = alpha_noop_free_coherent,
@@ -182,7 +173,6 @@ struct dma_map_ops alpha_noop_ops = {
        .map_sg                 = alpha_noop_map_sg,
        .mapping_error          = alpha_noop_mapping_error,
        .dma_supported          = alpha_noop_supported,
-       .set_dma_mask           = alpha_noop_set_mask,
 };
 
 struct dma_map_ops *dma_ops = &alpha_noop_ops;
index eddee77203431fb9d50d7c0feb313f26d5985d8e..8969bf2dfe3a0d4ff797888d2ce0a4a8785103dc 100644 (file)
@@ -939,16 +939,6 @@ static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr)
        return dma_addr == 0;
 }
 
-static int alpha_pci_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask ||
-           !pci_dma_supported(alpha_gendev_to_pci(dev), mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 struct dma_map_ops alpha_pci_ops = {
        .alloc                  = alpha_pci_alloc_coherent,
        .free                   = alpha_pci_free_coherent,
@@ -958,7 +948,6 @@ struct dma_map_ops alpha_pci_ops = {
        .unmap_sg               = alpha_pci_unmap_sg,
        .mapping_error          = alpha_pci_mapping_error,
        .dma_supported          = alpha_pci_supported,
-       .set_dma_mask           = alpha_pci_set_mask,
 };
 
 struct dma_map_ops *dma_ops = &alpha_pci_ops;
index 0d1b717e1eca6754672294777f9d7f17b9f08364..72ad724c67ae94cd6682ec15f3834966dd7028c0 100644 (file)
@@ -2020,6 +2020,7 @@ config KEXEC
        bool "Kexec system call (EXPERIMENTAL)"
        depends on (!SMP || PM_SLEEP_SMP)
        depends on !CPU_V7M
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index bd245d34952d2ad2392e9f9399654ac479c431f1..a0765e7ed6c7dd2166b2cb95874fc076532d526c 100644 (file)
@@ -57,5 +57,5 @@ extern char * strstr(const char * s1, const char *s2);
 
 int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
 {
-       return decompress(input, len, NULL, NULL, output, NULL, error);
+       return __decompress(input, len, NULL, NULL, output, 0, NULL, error);
 }
index a68b9d8a71fed8ee2357d833a023d2bed9696d69..ccb3aa64640dc350da9de1319d65b2a46c44e2ee 100644 (file)
@@ -8,7 +8,6 @@
 #include <linux/dma-attrs.h>
 #include <linux/dma-debug.h>
 
-#include <asm-generic/dma-coherent.h>
 #include <asm/memory.h>
 
 #include <xen/xen.h>
@@ -39,12 +38,15 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
        dev->archdata.dma_ops = ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
+#define HAVE_ARCH_DMA_SUPPORTED 1
+extern int dma_supported(struct device *dev, u64 mask);
 
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->set_dma_mask(dev, mask);
-}
+/*
+ * Note that while the generic code provides dummy dma_{alloc,free}_noncoherent
+ * implementations, we don't provide a dma_cache_sync function so drivers using
+ * this API are highlighted with build warnings.
+ */
+#include <asm-generic/dma-mapping-common.h>
 
 #ifdef __arch_page_to_dma
 #error Please update to __arch_pfn_to_dma
@@ -167,32 +169,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 
 static inline void dma_mark_clean(void *addr, size_t size) { }
 
-/*
- * DMA errors are defined by all-bits-set in the DMA address.
- */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return dma_addr == DMA_ERROR_CODE;
-}
-
-/*
- * Dummy noncoherent implementation.  We don't provide a dma_cache_sync
- * function so drivers using this API are highlighted with build warnings.
- */
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-               dma_addr_t *handle, gfp_t gfp)
-{
-       return NULL;
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-               void *cpu_addr, dma_addr_t handle)
-{
-}
-
-extern int dma_supported(struct device *dev, u64 mask);
-
 extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
 
 /**
@@ -209,21 +185,6 @@ extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
 extern void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                           gfp_t gfp, struct dma_attrs *attrs);
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                      dma_addr_t *dma_handle, gfp_t flag,
-                                      struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-       BUG_ON(!ops);
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-       return cpu_addr;
-}
-
 /**
  * arm_dma_free - free memory allocated by arm_dma_alloc
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -241,19 +202,6 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 extern void arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
                         dma_addr_t handle, struct dma_attrs *attrs);
 
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                    void *cpu_addr, dma_addr_t dma_handle,
-                                    struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       BUG_ON(!ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 /**
  * arm_dma_mmap - map a coherent DMA allocation into user space
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
index bf35abcc7d598c6cf7c44e5e2e13fd044893cff0..e62604384945e513a9b1ed14a2a5a2e3d8950630 100644 (file)
@@ -676,10 +676,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                    gfp_t gfp, struct dma_attrs *attrs)
 {
        pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, handle, &memory))
-               return memory;
 
        return __dma_alloc(dev, size, handle, gfp, prot, false,
                           attrs, __builtin_return_address(0));
@@ -688,11 +684,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
        dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, handle, &memory))
-               return memory;
-
        return __dma_alloc(dev, size, handle, gfp, PAGE_KERNEL, true,
                           attrs, __builtin_return_address(0));
 }
@@ -752,9 +743,6 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
        struct page *page = pfn_to_page(dma_to_pfn(dev, handle));
        bool want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs);
 
-       if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
-               return;
-
        size = PAGE_ALIGN(size);
 
        if (nommu()) {
index f0d6d0bfe55ceceba3339bc9044bed31a159a9cf..cfdb34bedbcd8adeb0a6f012a02459042726030b 100644 (file)
@@ -22,8 +22,6 @@
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 
-#include <asm-generic/dma-coherent.h>
-
 #include <xen/xen.h>
 #include <asm/xen/hypervisor.h>
 
@@ -86,28 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr)
        return (phys_addr_t)dev_addr;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dev_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       debug_dma_mapping_error(dev, dev_addr);
-       return ops->mapping_error(dev, dev_addr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
@@ -120,50 +96,5 @@ static inline void dma_mark_clean(void *addr, size_t size)
 {
 }
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, h, f)  dma_free_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flags,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *vaddr;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &vaddr))
-               return vaddr;
-
-       vaddr = ops->alloc(dev, size, dma_handle, flags, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, vaddr);
-       return vaddr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dev_addr,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       debug_dma_free_coherent(dev, size, vaddr, dev_addr);
-       ops->free(dev, size, vaddr, dev_addr, attrs);
-}
-
-/*
- * There is no dma_cache_sync() implementation, so just return NULL here.
- */
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                                         dma_addr_t *handle, gfp_t flags)
-{
-       return NULL;
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-                                       void *cpu_addr, dma_addr_t handle)
-{
-}
-
 #endif /* __KERNEL__ */
 #endif /* __ASM_DMA_MAPPING_H */
index 704274127c07e9f5c67e074264d82bf08018ddbf..c4f2cfcb117bd6a6b9f1844c3a3313fa5e6be94a 100644 (file)
@@ -70,5 +70,5 @@ void decompress_kernel(void)
        free_mem_ptr = (unsigned long)&_end;
        free_mem_end_ptr = free_mem_ptr + HEAP_SIZE;
 
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
 }
index 6e67a90902f2894293281d2c9df4070128a880bb..d9b5b806afe6fcecfaf6c727c1a09310ed8867a1 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _H8300_DMA_MAPPING_H
 #define _H8300_DMA_MAPPING_H
 
-#include <asm-generic/dma-coherent.h>
-
 extern struct dma_map_ops h8300_dma_map_ops;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
@@ -12,46 +10,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       return 0;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return 0;
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       memory = ops->alloc(dev, size, dma_handle, flag, attrs);
-       return memory;
-}
-
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return 0;
-}
-
 #endif
index 16965427f6b4827d1b9807fe98248f80540e910b..268fde8a45756e580ef06da3a2b051c4fb2e9f58 100644 (file)
 
 struct device;
 extern int bad_dma_address;
+#define DMA_ERROR_CODE bad_dma_address
 
 extern struct dma_map_ops *dma_ops;
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
        if (unlikely(dev == NULL))
@@ -45,8 +43,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *dev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
 extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle);
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                           enum dma_data_direction direction);
@@ -60,47 +58,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == bad_dma_address);
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       ret = ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-}
-
 #endif
index b74f9bae31a3b9e81204d3c86b51acd3dcfe65c6..9e3ddf792bd3e00afc44afc567e52cf290b8c54f 100644 (file)
@@ -44,17 +44,6 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 static struct gen_pool *coherent_pool;
 
 
index 42a91a7aa2b08fa3a9ba4f1de06e07fdb47bfa9d..eb0249e3798112615fd5774d6f30229aa6241e53 100644 (file)
@@ -518,6 +518,7 @@ source "drivers/sn/Kconfig"
 config KEXEC
        bool "kexec system call"
        depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index cf3ab7e784b5474be705c6e091431f5538b1ee00..9beccf8010bd6bf8eaa64ab292a753449a0ca609 100644 (file)
@@ -23,60 +23,10 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t,
 extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int,
                                enum dma_data_direction);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *daddr, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       void *caddr;
-
-       caddr = ops->alloc(dev, size, daddr, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *daddr, caddr);
-       return caddr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *caddr, dma_addr_t daddr,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       debug_dma_free_coherent(dev, size, caddr, daddr);
-       ops->free(dev, size, caddr, daddr, attrs);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #define get_dma_ops(dev) platform_dma_get_ops(dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       debug_dma_mapping_error(dev, daddr);
-       return ops->mapping_error(dev, daddr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int
-dma_set_mask (struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
index 28a09529f206915fd00633bf846dde48342b8b36..3a76927458681d3785e2ed558a88961ba5b18532 100644 (file)
@@ -86,6 +86,7 @@ decompress_kernel(int mmu_on, unsigned char *zimage_data,
        free_mem_end_ptr = free_mem_ptr + BOOT_HEAP_SIZE;
 
        puts("\nDecompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output_data, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output_data, 0,
+                       NULL, error);
        puts("done.\nBooting the kernel.\n");
 }
index 2dd8f63bfbbb7850e7e797eb108f708eb1fc6d54..498b567f007b0a80d1905dc21e8fb503101b3724 100644 (file)
@@ -95,6 +95,7 @@ config MMU_SUN3
 config KEXEC
        bool "kexec system call"
        depends on M68KCLASSIC
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index ab353723076a8d1dde001295cf49cb7ddb5f4195..24b12970c9cff772d24d8b581aeb945b8d917b6b 100644 (file)
@@ -27,7 +27,6 @@
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
-#include <asm-generic/dma-coherent.h>
 #include <asm/cacheflush.h>
 
 #define DMA_ERROR_CODE         (~(dma_addr_t)0x0)
@@ -45,31 +44,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &dma_direct_ops;
 }
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (unlikely(!ops))
-               return 0;
-       if (!ops->dma_supported)
-               return 1;
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (unlikely(ops == NULL))
-               return -EIO;
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, dma_mask);
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-       *dev->dma_mask = dma_mask;
-       return 0;
-}
-
 #include <asm-generic/dma-mapping-common.h>
 
 static inline void __dma_sync(unsigned long paddr,
@@ -88,50 +62,6 @@ static inline void __dma_sync(unsigned long paddr,
        }
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       BUG_ON(!ops);
-
-       memory = ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!ops);
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                enum dma_data_direction direction)
 {
index 752acca8de1fa9f6f73aaf1d218840c04cd1abc7..e3aa5b0b4ef17771fbd2afa1557f29ee6a7a2b3d 100644 (file)
@@ -2597,6 +2597,7 @@ source "kernel/Kconfig.preempt"
 
 config KEXEC
        bool "Kexec system call"
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 54831069a206249444b31e40c4c749a37a667aa2..080cd53bac369158481785fd285733e7d5372e8c 100644 (file)
@@ -111,8 +111,8 @@ void decompress_kernel(unsigned long boot_heap_start)
        puts("\n");
 
        /* Decompress the kernel with according algorithm */
-       decompress((char *)zimage_start, zimage_size, 0, 0,
-                  (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, error);
+       __decompress((char *)zimage_start, zimage_size, 0, 0,
+                  (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, 0, error);
 
        /* FIXME: should we flush cache here? */
        puts("Now, booting the kernel...\n");
index d8960d46417b07ec5e3b0603b2f934efe4c1f54c..2cd45f5f9481cec75b8e32e384bdcc1396ffeb9b 100644 (file)
@@ -161,9 +161,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
 {
        void *ret;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -194,11 +191,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
 static void octeon_dma_free_coherent(struct device *dev, size_t size,
        void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
index 360b3387182af251713106cb304ac85ca78c5ca5..e604f760c4a076b44255b312b6f45180f06200c4 100644 (file)
@@ -4,7 +4,6 @@
 #include <linux/scatterlist.h>
 #include <asm/dma-coherence.h>
 #include <asm/cache.h>
-#include <asm-generic/dma-coherent.h>
 
 #ifndef CONFIG_SGI_IP27 /* Kludge to fix 2.6.39 build for IP27 */
 #include <dma-coherence.h>
@@ -32,73 +31,7 @@ static inline void dma_mark_clean(void *addr, size_t size) {}
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_mapping_error(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, mask);
-       return ops->mapping_error(dev, mask);
-}
-
-static inline int
-dma_set_mask(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if(!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, mask);
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
               enum dma_data_direction direction);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ops->free(dev, size, vaddr, dma_handle, attrs);
-
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-}
-
-
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                          dma_addr_t *dma_handle, gfp_t flag);
-
-void dma_free_noncoherent(struct device *dev, size_t size,
-                        void *vaddr, dma_addr_t dma_handle);
-
 #endif /* _ASM_DMA_MAPPING_H */
index 2c6b989c1bc4054c354b65fc0fbded4ac8c52b66..4ffa6fc81c8f78acaf24ae15849ee475a814d37a 100644 (file)
@@ -14,9 +14,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
 {
        void *ret;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -46,11 +43,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
 static void loongson_dma_free_coherent(struct device *dev, size_t size,
                void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
@@ -93,6 +85,9 @@ static void loongson_dma_sync_sg_for_device(struct device *dev,
 
 static int loongson_dma_set_mask(struct device *dev, u64 mask)
 {
+       if (!dev->dma_mask || !dma_supported(dev, mask))
+               return -EIO;
+
        if (mask > DMA_BIT_MASK(loongson_sysconf.dma_mask_bits)) {
                *dev->dma_mask = DMA_BIT_MASK(loongson_sysconf.dma_mask_bits);
                return -EIO;
index 8f23cf08f4baa68d4d73b93b62a396867085f877..a914dc1cb6d1bc339cf44cc0c5aeac887a2e5f74 100644 (file)
@@ -112,7 +112,7 @@ static gfp_t massage_gfp_flags(const struct device *dev, gfp_t gfp)
        return gfp | dma_flag;
 }
 
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
+static void *mips_dma_alloc_noncoherent(struct device *dev, size_t size,
        dma_addr_t * dma_handle, gfp_t gfp)
 {
        void *ret;
@@ -128,7 +128,6 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size,
 
        return ret;
 }
-EXPORT_SYMBOL(dma_alloc_noncoherent);
 
 static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
        dma_addr_t * dma_handle, gfp_t gfp, struct dma_attrs *attrs)
@@ -137,8 +136,12 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
        struct page *page = NULL;
        unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
+       /*
+        * XXX: seems like the coherent and non-coherent implementations could
+        * be consolidated.
+        */
+       if (dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs))
+               return mips_dma_alloc_noncoherent(dev, size, dma_handle, gfp);
 
        gfp = massage_gfp_flags(dev, gfp);
 
@@ -164,24 +167,24 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
 }
 
 
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
-       dma_addr_t dma_handle)
+static void mips_dma_free_noncoherent(struct device *dev, size_t size,
+               void *vaddr, dma_addr_t dma_handle)
 {
        plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
        free_pages((unsigned long) vaddr, get_order(size));
 }
-EXPORT_SYMBOL(dma_free_noncoherent);
 
 static void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
        dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
        unsigned long addr = (unsigned long) vaddr;
-       int order = get_order(size);
        unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
        struct page *page = NULL;
 
-       if (dma_release_from_coherent(dev, order, vaddr))
+       if (dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs)) {
+               mips_dma_free_noncoherent(dev, size, vaddr, dma_handle);
                return;
+       }
 
        plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
 
index f3d4ae87abc7ffebcb2dc1f3827da069b8d12c02..3758715d4ab671af54399d8e2cfe1eeade8a41ac 100644 (file)
@@ -47,11 +47,6 @@ static char *nlm_swiotlb;
 static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
        dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-       void *ret;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -69,11 +64,6 @@ static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
 static void nlm_dma_free_coherent(struct device *dev, size_t size,
        void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
index fab8628e1b6e70d5c8159ca652992f2556e7eeb7..413bfcf863848fba556078d8ad6ea1cba59cf770 100644 (file)
@@ -23,7 +23,6 @@
  */
 
 #include <linux/dma-debug.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/kmemcheck.h>
 #include <linux/dma-mapping.h>
 
@@ -36,75 +35,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &or1k_dma_map_ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) 
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                                         dma_addr_t *dma_handle, gfp_t gfp)
-{
-       struct dma_attrs attrs;
-
-       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
-
-       return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs);
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-                                        void *cpu_addr, dma_addr_t dma_handle)
-{
-       struct dma_attrs attrs;
-
-       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
-
-       dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs);
-}
-
+#define HAVE_ARCH_DMA_SUPPORTED 1
 static inline int dma_supported(struct device *dev, u64 dma_mask)
 {
        /* Support 32 bit DMA mask exclusively */
        return dma_mask == DMA_BIT_MASK(32);
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return 0;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
-       *dev->dma_mask = dma_mask;
+#include <asm-generic/dma-mapping-common.h>
 
-       return 0;
-}
 #endif /* __ASM_OPENRISC_DMA_MAPPING_H */
index b447918b9e2c8bd289372d5bcf69d0f3bb1f2aad..9a7057ec21541a09af3cedc4e49350852cba1791 100644 (file)
@@ -420,6 +420,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
 config KEXEC
        bool "kexec system call"
        depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 710f60e380e07dbcd55dd6a239cb78f5e3a09c60..7f522c021dc3087af2393b40e603a054f992952b 100644 (file)
@@ -18,7 +18,9 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 
+#ifdef CONFIG_PPC64
 #define DMA_ERROR_CODE         (~(dma_addr_t)0x0)
+#endif
 
 /* Some dma direct funcs must be visible for use in other dma_ops */
 extern void *__dma_direct_alloc_coherent(struct device *dev, size_t size,
@@ -120,71 +122,14 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
 /* this will be removed soon */
 #define flush_write_buffers()
 
-#include <asm-generic/dma-mapping-common.h>
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
+#define HAVE_ARCH_DMA_SET_MASK 1
+extern int dma_set_mask(struct device *dev, u64 dma_mask);
 
-       if (unlikely(dma_ops == NULL))
-               return 0;
-       if (dma_ops->dma_supported == NULL)
-               return 1;
-       return dma_ops->dma_supported(dev, mask);
-}
+#include <asm-generic/dma-mapping-common.h>
 
-extern int dma_set_mask(struct device *dev, u64 dma_mask);
 extern int __dma_set_mask(struct device *dev, u64 dma_mask);
 extern u64 __dma_get_required_mask(struct device *dev);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       BUG_ON(!dma_ops);
-
-       cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-#ifdef CONFIG_PPC64
-       return (dma_addr == DMA_ERROR_CODE);
-#else
-       return 0;
-#endif
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 #ifdef CONFIG_SWIOTLB
@@ -210,9 +155,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
        return daddr - get_dma_offset(dev);
 }
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #define ARCH_HAS_DMA_MMAP_COHERENT
 
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
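The powerpc hunk above shows the shape of the dma-mapping consolidation: the per-architecture copies of dma_supported(), dma_set_mask(), dma_mapping_error() and dma_{alloc,free}_{attrs,coherent} are deleted, and an architecture only keeps its own implementation by defining HAVE_ARCH_DMA_SET_MASK (or HAVE_ARCH_DMA_SUPPORTED) before including <asm-generic/dma-mapping-common.h>. A minimal sketch of what such an arch header reduces to -- the arch name and ops table are hypothetical, only the pattern is taken from the hunks in this series:

/* asm/dma-mapping.h for a hypothetical architecture (sketch only) */
#ifndef _ASM_EXAMPLE_DMA_MAPPING_H
#define _ASM_EXAMPLE_DMA_MAPPING_H

#define DMA_ERROR_CODE		(~(dma_addr_t)0x0)

extern struct dma_map_ops example_dma_ops;	/* hypothetical ops table */

static inline struct dma_map_ops *get_dma_ops(struct device *dev)
{
	return &example_dma_ops;
}

/* keep an arch-private dma_set_mask(); everything else comes from the
 * generic header included below */
#define HAVE_ARCH_DMA_SET_MASK 1
extern int dma_set_mask(struct device *dev, u64 dma_mask);

#include <asm-generic/dma-mapping-common.h>

#endif /* _ASM_EXAMPLE_DMA_MAPPING_H */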
index 4827870f7a6d8c00925b7d052ed68efda20f364e..1d57000b1b24ad6c6946f67ea821385e436391b6 100644 (file)
@@ -48,6 +48,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 
 config KEXEC
        def_bool y
+       select KEXEC_CORE
 
 config AUDIT_ARCH
        def_bool y
index 42506b371b74144886e42a9ec21d43edcf680566..4da604ebf6fd8edd75eb01951913c79991d0eca5 100644 (file)
@@ -167,7 +167,7 @@ unsigned long decompress_kernel(void)
 #endif
 
        puts("Uncompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
        puts("Ok, booting the kernel.\n");
        return (unsigned long) output;
 }
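All of the boot decompressors in this series switch from decompress() to __decompress(), which gains an output-length argument after the output buffer. Call sites that know the real size pass it (x86 passes output_len below); the others pass 0 to keep the old unbounded behaviour. A sketch of the two forms, assuming only the argument order visible in these call sites:

	/* output buffer size known: let the decompressor bound its window */
	__decompress(input_data, input_len, NULL, NULL, output, output_len,
		     NULL, error);

	/* size not known at the call site: pass 0, as the s390 code above does */
	__decompress(input_data, input_len, NULL, NULL, output, 0,
		     NULL, error);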
index 9d395961e71380484e7813f42240e6dacf5e6714..b3fd54d93dd20f85147c9e1e884e08e7358ec914 100644 (file)
@@ -18,27 +18,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &s390_dma_ops;
 }
 
-extern int dma_set_mask(struct device *dev, u64 mask);
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                                  enum dma_data_direction direction)
 {
 }
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->dma_supported == NULL)
-               return 1;
-       return dma_ops->dma_supported(dev, mask);
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
@@ -46,45 +32,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-       return dma_addr == DMA_ERROR_CODE;
-}
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flags,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       BUG_ON(!ops);
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flags, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 #endif /* _ASM_S390_DMA_MAPPING_H */
index 42b76580c8b8a6fa155da58f106d83096089389b..37505b8b4093782bae7e7062e39ea8e8fa50dbc7 100644 (file)
@@ -262,16 +262,6 @@ out:
        spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
 }
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-EXPORT_SYMBOL_GPL(dma_set_mask);
-
 static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
                                     unsigned long offset, size_t size,
                                     enum dma_data_direction direction,
index 50057fed819ddf3c07a8e16841c65d63cbaa5168..d514df7e04dd4c866597bd0772bfdc6a7dc000f9 100644 (file)
@@ -602,6 +602,7 @@ source kernel/Kconfig.hz
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
        depends on SUPERH32 && MMU
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 95470a472d2cf793ddad131f55805385a514e65f..208a9753ab38cd0a532c63b19c379cae8a8494e7 100644 (file)
@@ -132,7 +132,7 @@ void decompress_kernel(void)
 
        puts("Uncompressing Linux... ");
        cache_control(CACHE_ENABLE);
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
        cache_control(CACHE_DISABLE);
        puts("Ok, booting the kernel.\n");
 }
index b437f2c780b83f6c84d427f18af4416fa9e42fd7..a3745a3fe0290896a2a14450e6e47e8caf30a793 100644 (file)
@@ -9,86 +9,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
-#include <asm-generic/dma-coherent.h>
-#include <asm-generic/dma-mapping-common.h>
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (ops->dma_supported)
-               return ops->dma_supported(dev, mask);
-
-       return 1;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
+#define DMA_ERROR_CODE 0
 
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, mask);
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
+#include <asm-generic/dma-mapping-common.h>
 
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                    enum dma_data_direction dir);
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return dma_addr == 0;
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-               return memory;
-       if (!ops->alloc)
-               return NULL;
-
-       memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-       if (ops->free)
-               ops->free(dev, size, vaddr, dma_handle, attrs);
-}
-
 /* arch/sh/mm/consistent.c */
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                        dma_addr_t *dma_addr, gfp_t flag,
index 7e064c68c5ec8a0ab538a15947d5c44b2db0a322..a21da597b0b59d49cce2f5fcca68a4274bccbfc6 100644 (file)
@@ -7,11 +7,9 @@
 
 #define DMA_ERROR_CODE (~(dma_addr_t)0x0)
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 int dma_supported(struct device *dev, u64 mask);
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                                  enum dma_data_direction dir)
 {
@@ -39,39 +37,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return (dma_addr == DMA_ERROR_CODE);
-}
+#define HAVE_ARCH_DMA_SET_MASK 1
 
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
@@ -86,4 +52,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
        return -EINVAL;
 }
 
+#include <asm-generic/dma-mapping-common.h>
+
 #endif
index 2ba12d7617234417c4bec81988ae64f688e34704..106c21bd7f449d947094db5fdefce8a9a6e1b142 100644 (file)
@@ -205,6 +205,7 @@ source "kernel/Kconfig.hz"
 
 config KEXEC
        bool "kexec system call"
+       select KEXEC_CORE
        ---help---
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 1eae359d83150662a0c5d0ea7b477bfe35f13c2b..96ac6cce4a32c03ead94166ac1190b91ac5b032d 100644 (file)
@@ -59,8 +59,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-#include <asm-generic/dma-mapping-common.h>
-
 static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
 {
        dev->archdata.dma_ops = ops;
@@ -74,18 +72,9 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int
-dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return get_dma_ops(dev)->mapping_error(dev, dma_addr);
-}
+#define HAVE_ARCH_DMA_SET_MASK 1
 
-static inline int
-dma_supported(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->dma_supported(dev, mask);
-}
+#include <asm-generic/dma-mapping-common.h>
 
 static inline int
 dma_set_mask(struct device *dev, u64 mask)
@@ -116,36 +105,6 @@ dma_set_mask(struct device *dev, u64 mask)
        return 0;
 }
 
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-
 /*
  * dma_alloc_noncoherent() is #defined to return coherent memory,
  * so there's no need to do any flushing here.
index 176d5bda3559de3ffc23bdc667c0ee7be3f6d5bb..5c65dfee278c0319bcd6059a95249431f836d278 100644 (file)
@@ -119,8 +119,8 @@ unsigned long decompress_kernel(unsigned long output_start,
        output_ptr = get_unaligned_le32(tmp);
 
        arch_decomp_puts("Uncompressing Linux...");
-       decompress(input_data, input_data_end - input_data, NULL, NULL,
-                       output_data, NULL, error);
+       __decompress(input_data, input_data_end - input_data, NULL, NULL,
+                       output_data, 0, NULL, error);
        arch_decomp_puts(" done, booting the kernel.\n");
        return output_ptr;
 }
index 366460a817965d2ed7d1d4dceb987784067e87e6..8140e053ccd351332f33d3df8decccfdeb86074b 100644 (file)
@@ -18,8 +18,6 @@
 #include <linux/scatterlist.h>
 #include <linux/swiotlb.h>
 
-#include <asm-generic/dma-coherent.h>
-
 #include <asm/memory.h>
 #include <asm/cacheflush.h>
 
@@ -30,26 +28,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &swiotlb_dma_map_ops;
 }
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (unlikely(dma_ops == NULL))
-               return 0;
-
-       return dma_ops->dma_supported(dev, mask);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-       return 0;
-}
-
 #include <asm-generic/dma-mapping-common.h>
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
@@ -72,41 +50,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
-       *dev->dma_mask = dma_mask;
-
-       return 0;
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       return dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr,
                size_t size, enum dma_data_direction direction)
 {
index cc0d73eac047920f6845767123554df9c8deee7f..7aef2d52daa0d8ea8b55a683a11eb2c2e204eaef 100644 (file)
@@ -1754,6 +1754,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
        bool "kexec system call"
+       select KEXEC_CORE
        ---help---
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
@@ -1770,8 +1771,8 @@ config KEXEC
 
 config KEXEC_FILE
        bool "kexec file based system call"
+       select KEXEC_CORE
        select BUILD_BIN2C
-       depends on KEXEC
        depends on X86_64
        depends on CRYPTO=y
        depends on CRYPTO_SHA256=y
index f63797942bb5951adc91bf5bd4d355ff5e48db6e..79dac1758e7c00d8c062be2e3c2b054bc4dfc475 100644 (file)
@@ -448,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 #endif
 
        debug_putstr("\nDecompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, output_len,
+                       NULL, error);
        parse_elf(output);
        /*
         * 32-bit always performs relocations. 64-bit relocations are only
index 16ef02596db2daf1fa8eadd9c17fd994ec3c21b3..2d6b309c8e9a12ac67ddf9d9cb429cb1fe8a7eae 100644 (file)
@@ -414,7 +414,7 @@ xloadflags:
 # define XLF23 0
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
 # define XLF4 XLF_EFI_KEXEC
 #else
 # define XLF4 0
index 26a46f44e29819c17ba5f5aef2eb9f896b98f878..b160c0c6baed54c38cc0efbf15ff67075ec869c3 100644 (file)
@@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
 {
        return "[vsyscall]";
 }
-static struct vm_operations_struct gate_vma_ops = {
+static const struct vm_operations_struct gate_vma_ops = {
        .name = gate_vma_name,
 };
 static struct vm_area_struct gate_vma = {
index 1f5b7287d1ad8df92f789003018fec3913b03e1c..953b7263f84466f463d416814be709d22971c701 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
 #include <asm/swiotlb.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/dma-contiguous.h>
 
 #ifdef CONFIG_ISA
@@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 #endif
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-/* Make sure we keep the same behaviour */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+#define arch_dma_alloc_attrs arch_dma_alloc_attrs
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *hwdev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
+
+#include <asm-generic/dma-mapping-common.h>
 
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                        dma_addr_t *dma_addr, gfp_t flag,
@@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
        return gfp;
 }
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-void *
-dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-               gfp_t gfp, struct dma_attrs *attrs);
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-void dma_free_attrs(struct device *dev, size_t size,
-                   void *vaddr, dma_addr_t bus,
-                   struct dma_attrs *attrs);
-
 #endif
index 32ce71375b212cd0fc8fa5d847d09c1d7aa5bc6b..b130d59406fb12ab3a75d5a2a8631b202be50ab3 100644 (file)
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
 extern void __show_regs(struct pt_regs *regs, int all);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 extern int in_crash_kexec;
 #else
 /* no crash dump is ever in progress if no crash kernel can be kexec'd */
index 9ffdf25e5b86843e94e29b3361a86f34349341c9..b1b78ffe01d060a38c93c3c7486393702edd4ffc 100644 (file)
@@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH)               += livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)  += ftrace.o
 obj-$(CONFIG_X86_TSC)          += trace_clock.o
-obj-$(CONFIG_KEXEC)            += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC)            += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE)       += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE)       += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)       += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)       += crash_dump_$(BITS).o
 obj-y                          += kprobes/
index 49487b4880616a225427c99d8eb7c498da36bae3..2c7aafa7070274420a909f4f804e964d2ddc473d 100644 (file)
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
  * kind of shutdown from our side, we unregister the clock by writting anything
  * that does not have the 'enable' bit set in the msr
  */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
        native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
        x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
        x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
        machine_ops.shutdown  = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
        kvm_get_preset_lpj();
index 353972c1946cd35f378054439a05bed8200f92c9..84b8ef82a159bc7756914b40518d7fa00ed968ff 100644 (file)
@@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev);
 /* Number of entries preallocated for DMA-API debugging */
 #define PREALLOC_DMA_DEBUG_ENTRIES       65536
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 void __init pci_iommu_alloc(void)
 {
        struct iommu_table_entry *p;
@@ -140,50 +129,19 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
                free_pages((unsigned long)vaddr, get_order(size));
 }
 
-void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-                     gfp_t gfp, struct dma_attrs *attrs)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
 {
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-               return memory;
-
-       if (!dev)
-               dev = &x86_dma_fallback_dev;
-
-       if (!is_device_dma_capable(dev))
-               return NULL;
-
-       if (!ops->alloc)
-               return NULL;
-
-       memory = ops->alloc(dev, size, dma_handle,
-                           dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-EXPORT_SYMBOL(dma_alloc_attrs);
-
-void dma_free_attrs(struct device *dev, size_t size,
-                   void *vaddr, dma_addr_t bus,
-                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       WARN_ON(irqs_disabled());       /* for portability */
+       *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
+       *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
 
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
+       if (!*dev)
+               *dev = &x86_dma_fallback_dev;
+       if (!is_device_dma_capable(*dev))
+               return false;
+       return true;
 
-       debug_dma_free_coherent(dev, size, vaddr, bus);
-       if (ops->free)
-               ops->free(dev, size, vaddr, bus, attrs);
 }
-EXPORT_SYMBOL(dma_free_attrs);
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
 
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
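With the open-coded x86 dma_alloc_attrs()/dma_free_attrs() removed above, the only arch-specific piece left is the arch_dma_alloc_attrs() hook, which fixes up the gfp flags, substitutes the fallback device and can veto the allocation. The consolidated allocator itself lives in <asm-generic/dma-mapping-common.h>; a rough sketch of its shape, reconstructed from the per-arch copies deleted in this series (not verbatim):

static inline void *dma_alloc_attrs(struct device *dev, size_t size,
				    dma_addr_t *dma_handle, gfp_t flag,
				    struct dma_attrs *attrs)
{
	struct dma_map_ops *ops = get_dma_ops(dev);
	void *cpu_addr;

	BUG_ON(!ops);

	if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr))
		return cpu_addr;

	/* arch hook: may rewrite dev/flag or refuse the allocation */
	if (!arch_dma_alloc_attrs(&dev, &flag))
		return NULL;
	if (!ops->alloc)
		return NULL;

	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
	return cpu_addr;
}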
index 86db4bcd7ce52bcb74a5bf42efcd8e7152488cf1..02693dd9a0790b804a515294714d59ed68688ba8 100644 (file)
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
        .emergency_restart = native_machine_emergency_restart,
        .restart = native_machine_restart,
        .halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        .crash_shutdown = native_machine_crash_shutdown,
 #endif
 };
@@ -703,7 +703,7 @@ void machine_halt(void)
        machine_ops.halt();
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 void machine_crash_shutdown(struct pt_regs *regs)
 {
        machine_ops.crash_shutdown(regs);
index baadbf90a7c59f4aafc6faa8abb0409344fa5244..fdb7f2a2d3286013a7ea41d392e48596c90fc672 100644 (file)
@@ -478,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
  * --------- Crashkernel reservation ------------------------------
  */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
index 00bf300fd8468db0e5bcd2fd9e32fc4f80e48adb..74e4bf11f562e0354c227518421e2375ec16fafa 100644 (file)
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
 
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <asm/kexec.h>
 
 . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
index 148ea20160222fa70a30ddb8c54a97f1cbcc8e32..d01986832afc28ed225b2f414ccb2742e528169c 100644 (file)
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
                       vmcs, phys_addr);
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This bitmap is used to indicate whether the vmclear
  * operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
 #else
 static inline void crash_enable_local_vmclear(int cpu) { }
 static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
 {
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
        if (r)
                return r;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
 #endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
 
 static void __exit vmx_exit(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
        synchronize_rcu();
 #endif
index db1b0bc5017c9f01b456a97b5b84d6d40a2c03b2..134948b0926f521afe63f9e8c3679835f72cccee 100644 (file)
@@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
  */
 static unsigned long mpx_mmap(unsigned long len)
 {
-       unsigned long ret;
-       unsigned long addr, pgoff;
        struct mm_struct *mm = current->mm;
-       vm_flags_t vm_flags;
-       struct vm_area_struct *vma;
+       unsigned long addr, populate;
 
        /* Only bounds table can be allocated here */
        if (len != mpx_bt_size_bytes(mm))
                return -EINVAL;
 
        down_write(&mm->mmap_sem);
-
-       /* Too many mappings? */
-       if (mm->map_count > sysctl_max_map_count) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       /* Obtain the address to map to. we verify (or select) it and ensure
-        * that it represents a valid section of the address space.
-        */
-       addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
-       if (addr & ~PAGE_MASK) {
-               ret = addr;
-               goto out;
-       }
-
-       vm_flags = VM_READ | VM_WRITE | VM_MPX |
-                       mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
-       /* Set pgoff according to addr for anon_vma */
-       pgoff = addr >> PAGE_SHIFT;
-
-       ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
-       if (IS_ERR_VALUE(ret))
-               goto out;
-
-       vma = find_vma(mm, ret);
-       if (!vma) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       if (vm_flags & VM_LOCKED) {
-               up_write(&mm->mmap_sem);
-               mm_populate(ret, len);
-               return ret;
-       }
-
-out:
+       addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+                       MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
        up_write(&mm->mmap_sem);
-       return ret;
+       if (populate)
+               mm_populate(addr, populate);
+
+       return addr;
 }
 
 enum reg_type {
index e4308fe6afe81e4d8be5a42a6cc682174761fe1f..1db84c0758b732b3465fcc896ef98862dabe0f16 100644 (file)
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
 
 static void __init save_runtime_map(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        efi_memory_desc_t *md;
        void *tmp, *p, *q = NULL;
        int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
 
 static void __init kexec_enter_virtual_mode(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        efi_memory_desc_t *md;
        void *p;
 
index 020c101c255fec8386ba36c13c82ac8ddaf715b3..5c9f63fa6abf24ed575005d7ffb0c3118a728505 100644 (file)
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
        touch_nmi_watchdog();
 }
 
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
 static atomic_t uv_nmi_kexec_failed;
 static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
        uv_nmi_sync_exit(0);
 }
 
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
 static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
        if (master)
                pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
 }
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_KGDB
 #ifdef CONFIG_KGDB_KDB
index f01cb3044e50d310112a94abb597b8ac7aed1b35..4427f38b634e62a90b629fc11d68b25d70c34ba6 100644 (file)
@@ -32,66 +32,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
-       ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       ops->free(dev, size, vaddr, dma_handle, attrs);
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-}
-
-static inline int
-dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       return ops->mapping_error(dev, dma_addr);
-}
-
-static inline int
-dma_supported(struct device *dev, u64 mask)
-{
-       return 1;
-}
-
-static inline int
-dma_set_mask(struct device *dev, u64 mask)
-{
-       if(!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                    enum dma_data_direction direction);
 
index 6607f3c6ace1033fd4ca449d579bd2b7638e3963..a39e85f9efa98854768f39b01502274401673957 100644 (file)
@@ -2834,7 +2834,7 @@ static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-static struct vm_operations_struct binder_vm_ops = {
+static const struct vm_operations_struct binder_vm_ops = {
        .open = binder_vma_open,
        .close = binder_vma_close,
        .fault = binder_vm_fault,
index e41986967294164f0e226f39faff1e817fb3c590..52340b9bb3873bbf1a327c7b41bbe3f2ffa5b222 100644 (file)
@@ -86,9 +86,7 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
 {
        struct adf_etr_ring_data *ring = sfile->private;
        struct adf_etr_bank_data *bank = ring->bank;
-       uint32_t *msg = v;
        void __iomem *csr = ring->bank->csr_addr;
-       int i, x;
 
        if (v == SEQ_START_TOKEN) {
                int head, tail, empty;
@@ -113,18 +111,8 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
                seq_puts(sfile, "----------- Ring data ------------\n");
                return 0;
        }
-       seq_printf(sfile, "%p:", msg);
-       x = 0;
-       i = 0;
-       for (; i < (ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2); i++) {
-               seq_printf(sfile, " %08X", *(msg + i));
-               if ((ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2) != i + 1 &&
-                   (++x == 8)) {
-                       seq_printf(sfile, "\n%p:", msg + i + 1);
-                       x = 0;
-               }
-       }
-       seq_puts(sfile, "\n");
+       seq_hex_dump(sfile, "", DUMP_PREFIX_ADDRESS, 32, 4,
+                    v, ADF_MSG_SIZE_TO_BYTES(ring->msg_size), false);
        return 0;
 }
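This hunk (and the parisc, s390 zcrypt and wil6210 ones below) replaces hand-rolled hex-dump loops in seq_file show routines with the new seq_hex_dump() helper; the arguments, as used above, are the seq_file, a line prefix, the prefix type, row size, group size, the buffer, its length, and whether to append ASCII. A small sketch of a show() callback using it -- the data array and function names are illustrative only:

static const u32 example_regs[8];	/* illustrative data */

static int example_regs_show(struct seq_file *m, void *v)
{
	seq_puts(m, "register dump:\n");
	seq_hex_dump(m, "    ", DUMP_PREFIX_NONE, 32, 4,
		     example_regs, sizeof(example_regs), false);
	return 0;
}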
 
index 54071c1483400d41e214c0f83512ca1f4600814a..84533e02fbf8ba292cddf960fde5881e1898821c 100644 (file)
@@ -43,7 +43,7 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE
 
 config EFI_RUNTIME_MAP
        bool "Export efi runtime maps to sysfs"
-       depends on X86 && EFI && KEXEC
+       depends on X86 && EFI && KEXEC_CORE
        default y
        help
          Export efi runtime memory maps to /sys/firmware/efi/runtime-map.
index 6394547cf67a0b3de9662105bc9e4b604d2f77c6..860062ef88144e6fe4351b386328954be2293201 100644 (file)
@@ -125,7 +125,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        }
 }
 
-static struct vm_operations_struct vgem_gem_vm_ops = {
+static const struct vm_operations_struct vgem_gem_vm_ops = {
        .fault = vgem_gem_fault,
        .open = drm_gem_vm_open,
        .close = drm_gem_vm_close,
index d04643f9548bbca84edee48659cfe7fac2600bfa..95638df73d1c328c88a57c077fccf271731b5a4e 100644 (file)
@@ -1110,7 +1110,7 @@ static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return 0;
 }
 
-static struct vm_operations_struct cs_char_vm_ops = {
+static const struct vm_operations_struct cs_char_vm_ops = {
        .fault  = cs_char_vma_fault,
 };
 
index 725881890c4a217247993f9fbb933ff11bb27e27..e449e394963f00d42cd11ecafbca6081f9011bcd 100644 (file)
@@ -908,7 +908,7 @@ static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return 0;
 }
 
-static struct vm_operations_struct qib_file_vm_ops = {
+static const struct vm_operations_struct qib_file_vm_ops = {
        .fault = qib_file_vma_fault,
 };
 
index 146cf29a2e1db19a8293f2ecbf3f8348ce1732c2..34927b700b0e67f5ebbe605aceefd205c9651660 100644 (file)
@@ -75,7 +75,7 @@ static void qib_vma_close(struct vm_area_struct *vma)
        kref_put(&ip->ref, qib_release_mmap_info);
 }
 
-static struct vm_operations_struct qib_vm_ops = {
+static const struct vm_operations_struct qib_vm_ops = {
        .open =     qib_vma_open,
        .close =    qib_vma_close,
 };
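The vm_operations_struct constifications in this series all follow the same pattern: the ops table is never written at runtime, so it can be declared const and placed in read-only data while still being assigned to vma->vm_ops. A minimal sketch with hypothetical names:

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;		/* placeholder handler */
}

static const struct vm_operations_struct example_vm_ops = {
	.fault = example_fault,
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &example_vm_ops;	/* vm_ops points to const, so this is fine */
	return 0;
}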
index f09c5f17a42f35a37e7b4ae80eb746c98f9a755b..de2474e1132de486ae73f5581555f814a299b390 100644 (file)
@@ -872,7 +872,7 @@ static void omap_vout_vm_close(struct vm_area_struct *vma)
        vout->mmap_count--;
 }
 
-static struct vm_operations_struct omap_vout_vm_ops = {
+static const struct vm_operations_struct omap_vout_vm_ops = {
        .open   = omap_vout_vm_open,
        .close  = omap_vout_vm_close,
 };
index c49d244265eccd6c4017a8bcd73a7f0749fdc3dc..70e62d6a3231fd7ce4f42ea2cc8c2065163f083c 100644 (file)
@@ -418,7 +418,7 @@ static void genwqe_vma_close(struct vm_area_struct *vma)
        kfree(dma_map);
 }
 
-static struct vm_operations_struct genwqe_vma_ops = {
+static const struct vm_operations_struct genwqe_vma_ops = {
        .open   = genwqe_vma_open,
        .close  = genwqe_vma_close,
 };
index 613ca2b2527be25a0c4a51329acdab30bb0e3ab3..d1a1e160ef31132f35d56ace1e7dd12a13b267e6 100644 (file)
@@ -156,6 +156,12 @@ static const struct file_operations fops_vring = {
        .llseek         = seq_lseek,
 };
 
+static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
+                           const char *prefix)
+{
+       seq_hex_dump(s, prefix, DUMP_PREFIX_NONE, 16, 1, p, len, false);
+}
+
 static void wil_print_ring(struct seq_file *s, const char *prefix,
                           void __iomem *off)
 {
@@ -212,8 +218,6 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
                                   le16_to_cpu(hdr.seq), len,
                                   le16_to_cpu(hdr.type), hdr.flags);
                        if (len <= MAX_MBOXITEM_SIZE) {
-                               int n = 0;
-                               char printbuf[16 * 3 + 2];
                                unsigned char databuf[MAX_MBOXITEM_SIZE];
                                void __iomem *src = wmi_buffer(wil, d.addr) +
                                        sizeof(struct wil6210_mbox_hdr);
@@ -223,16 +227,7 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
                                 * reading header
                                 */
                                wil_memcpy_fromio_32(databuf, src, len);
-                               while (n < len) {
-                                       int l = min(len - n, 16);
-
-                                       hex_dump_to_buffer(databuf + n, l,
-                                                          16, 1, printbuf,
-                                                          sizeof(printbuf),
-                                                          false);
-                                       seq_printf(s, "      : %s\n", printbuf);
-                                       n += l;
-                               }
+                               wil_seq_hexdump(s, databuf, len, "      : ");
                        }
                } else {
                        seq_puts(s, "\n");
@@ -867,22 +862,6 @@ static const struct file_operations fops_wmi = {
        .open  = simple_open,
 };
 
-static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
-                           const char *prefix)
-{
-       char printbuf[16 * 3 + 2];
-       int i = 0;
-
-       while (i < len) {
-               int l = min(len - i, 16);
-
-               hex_dump_to_buffer(p + i, l, 16, 1, printbuf,
-                                  sizeof(printbuf), false);
-               seq_printf(s, "%s%s\n", prefix, printbuf);
-               i += l;
-       }
-}
-
 static void wil_seq_print_skb(struct seq_file *s, struct sk_buff *skb)
 {
        int i = 0;
index 02ff84fcfa61289f33d7a53f8f1eb16c6b871082..957b42198328f74809cf944ef33a023cb5745b41 100644 (file)
@@ -1103,16 +1103,9 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
        struct ioc *ioc = ioc_list;
 
        while (ioc != NULL) {
-               u32 *res_ptr = (u32 *)ioc->res_map;
-               int j;
-
-               for (j = 0; j < (ioc->res_size / sizeof(u32)); j++) {
-                       if ((j & 7) == 0)
-                               seq_puts(m, "\n   ");
-                       seq_printf(m, "%08x", *res_ptr);
-                       res_ptr++;
-               }
-               seq_puts(m, "\n\n");
+               seq_hex_dump(m, "   ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+                            ioc->res_size, false);
+               seq_putc(m, '\n');
                ioc = ioc->next;
                break; /* XXX - remove me */
        }
index f1441e466c06cd12218d01f6f527c5485a78cdac..225049b492e535f7bf30ac8ef00f110d4647c0c2 100644 (file)
@@ -1854,14 +1854,9 @@ sba_proc_bitmap_info(struct seq_file *m, void *p)
 {
        struct sba_device *sba_dev = sba_list;
        struct ioc *ioc = &sba_dev->ioc[0];     /* FIXME: Multi-IOC support! */
-       unsigned int *res_ptr = (unsigned int *)ioc->res_map;
-       int i;
 
-       for (i = 0; i < (ioc->res_size/sizeof(unsigned int)); ++i, ++res_ptr) {
-               if ((i & 7) == 0)
-                       seq_puts(m, "\n   ");
-               seq_printf(m, " %08x", *res_ptr);
-       }
+       seq_hex_dump(m, "   ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+                    ioc->res_size, false);
        seq_putc(m, '\n');
 
        return 0;
index 52a880ca1768362ec41399e8df11c3504a28a107..dd652f2ae03db964ed539c5d369092173ab9ab33 100644 (file)
@@ -467,7 +467,7 @@ static void pci_device_shutdown(struct device *dev)
        pci_msi_shutdown(pci_dev);
        pci_msix_shutdown(pci_dev);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        /*
         * If this is a kexec reboot, turn off Bus Master bit on the
         * device to tell it to not continue to do DMA. Don't touch
index 01bf1f5cf2e95a7f40f722f51e4df87ea257e724..4eb45546a3aaf39421e6434a181890f6145fd604 100644 (file)
@@ -1206,16 +1206,8 @@ static void sprinthx(unsigned char *title, struct seq_file *m,
 static void sprinthx4(unsigned char *title, struct seq_file *m,
                      unsigned int *array, unsigned int len)
 {
-       int r;
-
        seq_printf(m, "\n%s\n", title);
-       for (r = 0; r < len; r++) {
-               if ((r % 8) == 0)
-                       seq_printf(m, "    ");
-               seq_printf(m, "%08X ", array[r]);
-               if ((r % 8) == 7)
-                       seq_putc(m, '\n');
-       }
+       seq_hex_dump(m, "    ", DUMP_PREFIX_NONE, 32, 4, array, len, false);
        seq_putc(m, '\n');
 }
 
index eec878e183f5d81e51649efad85a470805fb0080..217aa537c4eb9770a0ca0abf34626a8146f5049b 100644 (file)
@@ -997,7 +997,7 @@ static void ion_vm_close(struct vm_area_struct *vma)
        mutex_unlock(&buffer->lock);
 }
 
-static struct vm_operations_struct ion_vma_ops = {
+static const struct vm_operations_struct ion_vma_ops = {
        .open = ion_vm_open,
        .close = ion_vm_close,
        .fault = ion_vm_fault,
index fd54d098ab02248eabff1b250afc123b27433260..0e8a45102933ea0c0a103581959ad54c01918abc 100644 (file)
@@ -2156,7 +2156,7 @@ static void comedi_vm_close(struct vm_area_struct *area)
        comedi_buf_map_put(bm);
 }
 
-static struct vm_operations_struct comedi_vm_ops = {
+static const struct vm_operations_struct comedi_vm_ops = {
        .open = comedi_vm_open,
        .close = comedi_vm_close,
 };
index 4f0cbb54d4dbdf1d43eb013d8fbdcc6cb3813389..d3af01c94a58d07f45b6e55c95c91af44755b987 100644 (file)
@@ -1091,7 +1091,7 @@ static void mmap_user_close(struct vm_area_struct *vma)
        omapfb_put_mem_region(rg);
 }
 
-static struct vm_operations_struct mmap_user_ops = {
+static const struct vm_operations_struct mmap_user_ops = {
        .open = mmap_user_open,
        .close = mmap_user_close,
 };
index 14370df9ac1cc17eeb554f16176fa5c5288d5338..4547a91bca67a1005c95c478aef929d121655d00 100644 (file)
@@ -494,7 +494,7 @@ static void gntalloc_vma_close(struct vm_area_struct *vma)
        mutex_unlock(&gref_mutex);
 }
 
-static struct vm_operations_struct gntalloc_vmops = {
+static const struct vm_operations_struct gntalloc_vmops = {
        .open = gntalloc_vma_open,
        .close = gntalloc_vma_close,
 };
index 0dbb222daaf1c694b1f073f3e206f755f5f77cc6..2ea0b3b2a91d2585a2d37f8ead07f08f32c79826 100644 (file)
@@ -433,7 +433,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
        return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT];
 }
 
-static struct vm_operations_struct gntdev_vmops = {
+static const struct vm_operations_struct gntdev_vmops = {
        .open = gntdev_vma_open,
        .close = gntdev_vma_close,
        .find_special_page = gntdev_vma_find_special_page,
index c6deb87c5c69704ed600cd9b5ead7b8443d4a77c..5e9adac928e694d6701b5c59ef5300144226a0ab 100644 (file)
@@ -414,7 +414,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
        return 0;
 }
 
-static struct vm_operations_struct privcmd_vm_ops;
+static const struct vm_operations_struct privcmd_vm_ops;
 
 static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 {
@@ -605,7 +605,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-static struct vm_operations_struct privcmd_vm_ops = {
+static const struct vm_operations_struct privcmd_vm_ops = {
        .close = privcmd_close,
        .fault = privcmd_fault
 };
index d757a3e610c6c773692f79655aa4fa4c5a25dfa0..79bc4933b13e05c220325bb3ad39f7f6c0e243dd 100644 (file)
@@ -311,9 +311,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        */
        flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
 
-       if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
-               return ret;
-
        /* On ARM this function returns an ioremap'ped virtual address for
         * which virt_to_phys doesn't return the corresponding physical
         * address. In fact on ARM virt_to_phys only works for kernel direct
@@ -356,9 +353,6 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
        phys_addr_t phys;
        u64 dma_mask = DMA_BIT_MASK(32);
 
-       if (dma_release_from_coherent(hwdev, order, vaddr))
-               return;
-
        if (hwdev && hwdev->coherent_dma_mask)
                dma_mask = hwdev->coherent_dma_mask;
 
index 3f89c9e05b4077b5db9b728435713d04d5ed05ae..5b50c4ca43a7dde6ce48049f365a9988da23449b 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include "affs.h"
 
 static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
         * blocks, we will have to change it.
         */
 
-       size = sb->s_bdev->bd_inode->i_size >> 9;
+       size = i_size_read(sb->s_bdev->bd_inode) >> 9;
        pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
 
        affs_set_blocksize(sb, PAGE_SIZE);
        /* Try to find root block. Its location depends on the block size. */
 
-       i = 512;
-       j = 4096;
+       i = bdev_logical_block_size(sb->s_bdev);
+       j = PAGE_SIZE;
        if (blocksize > 0) {
                i = j = blocksize;
                size = size / (blocksize / 512);
        }
+
        for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
                sbi->s_root_block = root_block;
                if (root_block < 0)
index 890c50971a690472f6dc795b00fd0bcfbcdf1a39..a268abfe60acd53d034b89d53139064ecaa687ac 100644 (file)
@@ -1593,7 +1593,7 @@ out:
        return err;
 }
 
-static struct vm_operations_struct ceph_vmops = {
+static const struct vm_operations_struct ceph_vmops = {
        .fault          = ceph_filemap_fault,
        .page_mkwrite   = ceph_page_mkwrite,
 };
index 3f50cee79df9d3318209e19281acef536b34af37..e2a6af1508af2aef789d0caab21fedfa91d49c60 100644 (file)
@@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_LOCKED;
 }
 
-static struct vm_operations_struct cifs_file_vm_ops = {
+static const struct vm_operations_struct cifs_file_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = cifs_page_mkwrite,
index 9b1ffaa0572e5825d8c617880bfbc19be166313b..f6c6c8adbc01efd495de1d6e914dcf0f5b62c10a 100644 (file)
@@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
         char *result;
         
        insize = max_t(unsigned int,
-                    INSIZE(readlink), OUTSIZE(readlink)+ *length + 1);
+                    INSIZE(readlink), OUTSIZE(readlink)+ *length);
        UPARG(CODA_READLINK);
 
         inp->coda_readlink.VFid = *fid;
@@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
        error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
        if (!error) {
                retlen = outp->coda_readlink.count;
-               if ( retlen > *length )
-                       retlen = *length;
+               if (retlen >= *length)
+                       retlen = *length - 1;
                *length = retlen;
                result =  (char *)outp + (long)outp->coda_readlink.data;
                memcpy(buffer, result, retlen);
index c5ecde6f3eed975af7756c17cec4f3b1748dbc83..a8f75640ac86ec2d29cd55a253b2c3c12c7bac9b 100644 (file)
@@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo)
        const struct cred *old_cred;
        struct cred *cred;
        int retval = 0;
-       int flag = 0;
        int ispipe;
        struct files_struct *displaced;
-       bool need_nonrelative = false;
+       /* require nonrelative corefile path and be extra careful */
+       bool need_suid_safe = false;
        bool core_dumped = false;
        static atomic_t core_dump_count = ATOMIC_INIT(0);
        struct coredump_params cprm = {
@@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo)
         */
        if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
                /* Setuid core dump mode */
-               flag = O_EXCL;          /* Stop rewrite attacks */
                cred->fsuid = GLOBAL_ROOT_UID;  /* Dump root private */
-               need_nonrelative = true;
+               need_suid_safe = true;
        }
 
        retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -633,7 +632,7 @@ void do_coredump(const siginfo_t *siginfo)
                if (cprm.limit < binfmt->min_coredump)
                        goto fail_unlock;
 
-               if (need_nonrelative && cn.corename[0] != '/') {
+               if (need_suid_safe && cn.corename[0] != '/') {
                        printk(KERN_WARNING "Pid %d(%s) can only dump core "\
                                "to fully qualified path!\n",
                                task_tgid_vnr(current), current->comm);
@@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo)
                        goto fail_unlock;
                }
 
+               /*
+                * Unlink the file if it exists unless this is a SUID
+                * binary - in that case, we're running around with root
+                * privs and don't want to unlink another user's coredump.
+                */
+               if (!need_suid_safe) {
+                       mm_segment_t old_fs;
+
+                       old_fs = get_fs();
+                       set_fs(KERNEL_DS);
+                       /*
+                        * If it doesn't exist, that's fine. If there's some
+                        * other problem, we'll catch it at the filp_open().
+                        */
+                       (void) sys_unlink((const char __user *)cn.corename);
+                       set_fs(old_fs);
+               }
+
+               /*
+                * There is a race between unlinking and creating the
+                * file, but if that causes an EEXIST here, that's
+                * fine - another process raced with us while creating
+                * the corefile, and the other process won. To userspace,
+                * what matters is that at least one of the two processes
+                * writes its coredump successfully, not which one.
+                */
                cprm.file = filp_open(cn.corename,
-                                O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+                                O_CREAT | 2 | O_NOFOLLOW |
+                                O_LARGEFILE | O_EXCL,
                                 0600);
                if (IS_ERR(cprm.file))
                        goto fail_unlock;
@@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo)
                if (!S_ISREG(inode->i_mode))
                        goto close_fail;
                /*
-                * Dont allow local users get cute and trick others to coredump
-                * into their pre-created files.
+                * Don't dump core if the filesystem changed owner or mode
+                * of the file during file creation. This is an issue when
+                * a process dumps core while its cwd is e.g. on a vfat
+                * filesystem.
                 */
                if (!uid_eq(inode->i_uid, current_fsuid()))
                        goto close_fail;
+               if ((inode->i_mode & 0677) != 0600)
+                       goto close_fail;
                if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
                        goto close_fail;
                if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
index d3fa6bd9503e762c861debdd4fe64bef546bb78f..221719eac5de667c1d6044697605148fca6b87e8 100644 (file)
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
                        page_cache_release(page);
                        goto fail;
                }
-               page_cache_release(page);
                node->page[i] = page;
        }
 
@@ -398,11 +397,11 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-       //int i;
+       int i;
 
-       //for (i = 0; i < node->tree->pages_per_bnode; i++)
-       //      if (node->page[i])
-       //              page_cache_release(node->page[i]);
+       for (i = 0; i < node->tree->pages_per_bnode; i++)
+               if (node->page[i])
+                       page_cache_release(node->page[i]);
        kfree(node);
 }
 
index 9f4ee7f5202615ba41b41be76d12de3bbe1f5676..6fc766df04617a3f4abbdb0ba44f76ec71a468de 100644 (file)
@@ -131,13 +131,16 @@ skip:
        hfs_bnode_write(node, entry, data_off + key_len, entry_len);
        hfs_bnode_dump(node);
 
-       if (new_node) {
-               /* update parent key if we inserted a key
-                * at the start of the first node
-                */
-               if (!rec && new_node != node)
-                       hfs_brec_update_parent(fd);
+       /*
+        * update parent key if we inserted a key
+        * at the start of the node and it is not the new node
+        */
+       if (!rec && new_node != node) {
+               hfs_bnode_read_key(node, fd->search_key, data_off + size);
+               hfs_brec_update_parent(fd);
+       }
 
+       if (new_node) {
                hfs_bnode_put(fd->bnode);
                if (!new_node->parent) {
                        hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
                goto again;
        }
 
-       if (!rec)
-               hfs_brec_update_parent(fd);
-
        return 0;
 }
 
@@ -366,6 +366,8 @@ again:
        if (IS_ERR(parent))
                return PTR_ERR(parent);
        __hfs_brec_find(parent, fd);
+       if (fd->record < 0)
+               return -ENOENT;
        hfs_bnode_dump(parent);
        rec = fd->record;
 
index 759708fd9331cc37a6775c31e068117eddbec11d..63924662aaf3efa3b80cb732e409499e5cb2f87e 100644 (file)
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
                        page_cache_release(page);
                        goto fail;
                }
-               page_cache_release(page);
                node->page[i] = page;
        }
 
@@ -566,13 +565,11 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-#if 0
        int i;
 
        for (i = 0; i < node->tree->pages_per_bnode; i++)
                if (node->page[i])
                        page_cache_release(node->page[i]);
-#endif
        kfree(node);
 }
 
index 29b927938b8ce9c78af99d0f9c8f5af5a294cefd..726d211db4842715f71e1911f6940c93b19fe57f 100644 (file)
@@ -2438,7 +2438,7 @@ done:
 
 /**
  * path_mountpoint - look up a path to be umounted
- * @nameidata: lookup context
+ * @nd:                lookup context
  * @flags:     lookup flags
  * @path:      pointer to container for result
  *
index aa50d1ac28fc6189a9489d1b679fcf86115e633c..b25eee4cead5398b69889c95c29480ba9862c397 100644 (file)
@@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
 {
        struct inode * inode = file_inode(file);
-       char *page, *tmp;
-       ssize_t length;
        uid_t loginuid;
        kuid_t kloginuid;
+       int rv;
 
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        }
        rcu_read_unlock();
 
-       if (count >= PAGE_SIZE)
-               count = PAGE_SIZE - 1;
-
        if (*ppos != 0) {
                /* No partial writes. */
                return -EINVAL;
        }
-       page = (char*)__get_free_page(GFP_TEMPORARY);
-       if (!page)
-               return -ENOMEM;
-       length = -EFAULT;
-       if (copy_from_user(page, buf, count))
-               goto out_free_page;
-
-       page[count] = '\0';
-       loginuid = simple_strtoul(page, &tmp, 10);
-       if (tmp == page) {
-               length = -EINVAL;
-               goto out_free_page;
 
-       }
+       rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+       if (rv < 0)
+               return rv;
 
 	/* is userspace trying to explicitly UNSET the loginuid? */
        if (loginuid == AUDIT_UID_UNSET) {
                kloginuid = INVALID_UID;
        } else {
                kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
-               if (!uid_valid(kloginuid)) {
-                       length = -EINVAL;
-                       goto out_free_page;
-               }
+               if (!uid_valid(kloginuid))
+                       return -EINVAL;
        }
 
-       length = audit_set_loginuid(kloginuid);
-       if (likely(length == 0))
-               length = count;
-
-out_free_page:
-       free_page((unsigned long) page);
-       return length;
+       rv = audit_set_loginuid(kloginuid);
+       if (rv < 0)
+               return rv;
+       return count;
 }
 
 static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
                        const char __user * buf, size_t count, loff_t *ppos)
 {
        struct task_struct *task;
-       char buffer[PROC_NUMBUF], *end;
+       char buffer[PROC_NUMBUF];
        int make_it_fail;
+       int rv;
 
        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;
@@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;
-       make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
-       if (*end)
-               return -EINVAL;
+       rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+       if (rv < 0)
+               return rv;
        if (make_it_fail < 0 || make_it_fail > 1)
                return -EINVAL;
 
@@ -1836,8 +1818,6 @@ end_instantiate:
        return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
 /*
  * dname_to_vma_addr - maps a dentry name into two unsigned longs
  * which represent vma start and end addresses.
@@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
        if (flags & LOOKUP_RCU)
                return -ECHILD;
 
-       if (!capable(CAP_SYS_ADMIN)) {
-               status = -EPERM;
-               goto out_notask;
-       }
-
        inode = d_inode(dentry);
        task = get_proc_task(inode);
        if (!task)
@@ -1957,6 +1932,29 @@ struct map_files_info {
        unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
 
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static const char *
+proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       return proc_pid_follow_link(dentry, NULL);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+       .readlink       = proc_pid_readlink,
+       .follow_link    = proc_map_files_follow_link,
+       .setattr        = proc_setattr,
+};
+
 static int
 proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
@@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
        ei = PROC_I(inode);
        ei->op.proc_get_link = proc_map_files_get_link;
 
-       inode->i_op = &proc_pid_link_inode_operations;
+       inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;
        inode->i_mode = S_IFLNK;
 
@@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
        int result;
        struct mm_struct *mm;
 
-       result = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        result = -ENOENT;
        task = get_proc_task(dir);
        if (!task)
@@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
        struct map_files_info *p;
        int ret;
 
-       ret = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        ret = -ENOENT;
        task = get_proc_task(file_inode(file));
        if (!task)
@@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = {
        .llseek         = seq_lseek,
        .release        = seq_release_private,
 };
-#endif /* CONFIG_CHECKPOINT_RESTORE */
 
 static int proc_pident_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 {
        struct task_struct *task;
        struct mm_struct *mm;
-       char buffer[PROC_NUMBUF], *end;
        unsigned int val;
        int ret;
        int i;
        unsigned long mask;
 
-       ret = -EFAULT;
-       memset(buffer, 0, sizeof(buffer));
-       if (count > sizeof(buffer) - 1)
-               count = sizeof(buffer) - 1;
-       if (copy_from_user(buffer, buf, count))
-               goto out_no_task;
-
-       ret = -EINVAL;
-       val = (unsigned int)simple_strtoul(buffer, &end, 0);
-       if (*end == '\n')
-               end++;
-       if (end - buffer == 0)
-               goto out_no_task;
+       ret = kstrtouint_from_user(buf, count, 0, &val);
+       if (ret < 0)
+               return ret;
 
        ret = -ESRCH;
        task = get_proc_task(file_inode(file));
        if (!task)
                goto out_no_task;
 
-       ret = end - buffer;
        mm = get_task_mm(task);
        if (!mm)
                goto out_no_mm;
@@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
  out_no_mm:
        put_task_struct(task);
  out_no_task:
-       return ret;
+       if (ret < 0)
+               return ret;
+       return count;
 }
 
 static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
index e5dee5c3188eb10e94742fbb57bb3b3564fa61bb..ff3ffc76a93795b6662cb80d85f2a404d8f20693 100644 (file)
@@ -26,7 +26,7 @@
 
 #include "internal.h"
 
-static DEFINE_SPINLOCK(proc_subdir_lock);
+static DEFINE_RWLOCK(proc_subdir_lock);
 
 static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
 {
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
 {
        int rv;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        rv = __xlate_proc_name(name, ret, residual);
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return rv;
 }
 
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 {
        struct inode *inode;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
        if (de) {
                pde_get(de);
-               spin_unlock(&proc_subdir_lock);
+               read_unlock(&proc_subdir_lock);
                inode = proc_get_inode(dir->i_sb, de);
                if (!inode)
                        return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
                d_add(dentry, inode);
                return NULL;
        }
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return ERR_PTR(-ENOENT);
 }
 
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
        if (!dir_emit_dots(file, ctx))
                return 0;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        de = pde_subdir_first(de);
        i = ctx->pos - 2;
        for (;;) {
                if (!de) {
-                       spin_unlock(&proc_subdir_lock);
+                       read_unlock(&proc_subdir_lock);
                        return 0;
                }
                if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
        do {
                struct proc_dir_entry *next;
                pde_get(de);
-               spin_unlock(&proc_subdir_lock);
+               read_unlock(&proc_subdir_lock);
                if (!dir_emit(ctx, de->name, de->namelen,
                            de->low_ino, de->mode >> 12)) {
                        pde_put(de);
                        return 0;
                }
-               spin_lock(&proc_subdir_lock);
+               read_lock(&proc_subdir_lock);
                ctx->pos++;
                next = pde_subdir_next(de);
                pde_put(de);
                de = next;
        } while (de);
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return 1;
 }
 
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
        if (ret)
                return ret;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        dp->parent = dir;
        if (pde_subdir_insert(dir, dp) == false) {
                WARN(1, "proc_dir_entry '%s/%s' already registered\n",
                     dir->name, dp->name);
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                proc_free_inum(dp->low_ino);
                return -EEXIST;
        }
-       spin_unlock(&proc_subdir_lock);
+       write_unlock(&proc_subdir_lock);
 
        return 0;
 }
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
        const char *fn = name;
        unsigned int len;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return;
        }
        len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
        de = pde_subdir_find(parent, fn, len);
        if (de)
                rb_erase(&de->subdir_node, &parent->subdir);
-       spin_unlock(&proc_subdir_lock);
+       write_unlock(&proc_subdir_lock);
        if (!de) {
                WARN(1, "name '%s'\n", name);
                return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
        const char *fn = name;
        unsigned int len;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        len = strlen(fn);
 
        root = pde_subdir_find(parent, fn, len);
        if (!root) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
                        de = next;
                        continue;
                }
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
 
                proc_entry_rundown(de);
                next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
                        break;
                pde_put(de);
 
-               spin_lock(&proc_subdir_lock);
+               write_lock(&proc_subdir_lock);
                de = next;
        }
        pde_put(root);
index 7eee2d8b97d9786b7c05ca1078d477c4db6af5e1..93484034a03d04c38cc5ff7779fb95e7611fbd09 100644 (file)
@@ -9,12 +9,16 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 #include <linux/kernel-page-flags.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
 
 /* /proc/kpagecount - an array exposing page counts
  *
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
                pfn++;
                out++;
                count -= KPMSIZE;
+
+               cond_resched();
        }
 
        *ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
        if (PageBalloon(page))
                u |= 1 << KPF_BALLOON;
 
+       if (page_is_idle(page))
+               u |= 1 << KPF_IDLE;
+
        u |= kpf_copy_bit(k, KPF_LOCKED,        PG_locked);
 
        u |= kpf_copy_bit(k, KPF_SLAB,          PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
                pfn++;
                out++;
                count -= KPMSIZE;
+
+               cond_resched();
        }
 
        *ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
        .read = kpageflags_read,
 };
 
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       u64 __user *out = (u64 __user *)buf;
+       struct page *ppage;
+       unsigned long src = *ppos;
+       unsigned long pfn;
+       ssize_t ret = 0;
+       u64 ino;
+
+       pfn = src / KPMSIZE;
+       count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+       if (src & KPMMASK || count & KPMMASK)
+               return -EINVAL;
+
+       while (count > 0) {
+               if (pfn_valid(pfn))
+                       ppage = pfn_to_page(pfn);
+               else
+                       ppage = NULL;
+
+               if (ppage)
+                       ino = page_cgroup_ino(ppage);
+               else
+                       ino = 0;
+
+               if (put_user(ino, out)) {
+                       ret = -EFAULT;
+                       break;
+               }
+
+               pfn++;
+               out++;
+               count -= KPMSIZE;
+
+               cond_resched();
+       }
+
+       *ppos += (char __user *)out - buf;
+       if (!ret)
+               ret = (char __user *)out - buf;
+       return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+       .llseek = mem_lseek,
+       .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
 static int __init proc_page_init(void)
 {
        proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
        proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+       proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
        return 0;
 }
 fs_initcall(proc_page_init);
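A minimal userspace sketch of reading the new /proc/kpagecgroup interface, which exposes one u64 memcg inode number per pfn as implemented above; the pfn value, error handling and program name are assumptions (the file is root-readable only, matching S_IRUSR above).

/* sketch: look up the memory cgroup inode for a single (assumed) pfn */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long pfn = 0x1000;		/* arbitrary example pfn */
	uint64_t ino;
	int fd = open("/proc/kpagecgroup", O_RDONLY);

	if (fd < 0)
		return 1;
	/* each pfn occupies sizeof(u64) bytes, so seek by pfn * 8 */
	if (pread(fd, &ino, sizeof(ino), pfn * sizeof(ino)) != sizeof(ino)) {
		close(fd);
		return 1;
	}
	printf("pfn %#lx -> memcg inode %llu\n", pfn, (unsigned long long)ino);
	close(fd);
	return 0;
}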
index 41f1a50c10c9e171c0138b3282cef92aa112e1c4..e2d46adb54b42a76608a0cbf938d55529bd6c851 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -459,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 
        mss->resident += size;
        /* Accumulate the size in pages that have been accessed. */
-       if (young || PageReferenced(page))
+       if (young || page_is_young(page) || PageReferenced(page))
                mss->referenced += size;
        mapcount = page_mapcount(page);
        if (mapcount >= 2) {
@@ -807,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 
                /* Clear accessed and referenced bits. */
                pmdp_test_and_clear_young(vma, addr, pmd);
+               test_and_clear_page_young(page);
                ClearPageReferenced(page);
 out:
                spin_unlock(ptl);
@@ -834,6 +836,7 @@ out:
 
                /* Clear accessed and referenced bits. */
                ptep_test_and_clear_young(vma, addr, pte);
+               test_and_clear_page_young(page);
                ClearPageReferenced(page);
        }
        pte_unmap_unlock(pte - 1, ptl);
index ce9e39fd5dafc768c27b2ceaa4e69a02c3ed1e6e..263b125dbcf4d1e68787032bb0827074fcb075d5 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/cred.h>
 #include <linux/mm.h>
+#include <linux/printk.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -773,6 +774,47 @@ void seq_pad(struct seq_file *m, char c)
 }
 EXPORT_SYMBOL(seq_pad);
 
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+                 int rowsize, int groupsize, const void *buf, size_t len,
+                 bool ascii)
+{
+       const u8 *ptr = buf;
+       int i, linelen, remaining = len;
+       int ret;
+
+       if (rowsize != 16 && rowsize != 32)
+               rowsize = 16;
+
+       for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+               linelen = min(remaining, rowsize);
+               remaining -= rowsize;
+
+               switch (prefix_type) {
+               case DUMP_PREFIX_ADDRESS:
+                       seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+                       break;
+               case DUMP_PREFIX_OFFSET:
+                       seq_printf(m, "%s%.8x: ", prefix_str, i);
+                       break;
+               default:
+                       seq_printf(m, "%s", prefix_str);
+                       break;
+               }
+
+               ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+                                        m->buf + m->count, m->size - m->count,
+                                        ascii);
+               if (ret >= m->size - m->count) {
+                       seq_set_overflow(m);
+               } else {
+                       m->count += ret;
+                       seq_putc(m, '\n');
+               }
+       }
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
        struct list_head *lh;
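A hedged usage sketch for the seq_hex_dump() helper added above, mirroring the print_hex_dump() calling convention; the show() callback, its private state and buffer fields are assumptions, not part of this patch.

#include <linux/seq_file.h>
#include <linux/printk.h>	/* DUMP_PREFIX_* */

/* hypothetical seq_file show() callback dumping a driver buffer */
static int my_regs_show(struct seq_file *m, void *v)
{
	struct my_state *st = m->private;	/* assumed driver state */

	seq_puts(m, "register window:\n");
	seq_hex_dump(m, "  ", DUMP_PREFIX_OFFSET, 16, 4,
		     st->regs, st->regs_len, false);
	return 0;
}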
index 940d5ec122c96e5a72173b9db2d0bcff44d39405..b1bc954eccf37438213d6744fe69ff1bc4d71365 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
+#include <asm-generic/dma-coherent.h>
 
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
                                              size_t size,
@@ -237,4 +238,121 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 
 #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL)
 
+#ifndef arch_dma_alloc_attrs
+#define arch_dma_alloc_attrs(dev, flag)        (true)
+#endif
+
+static inline void *dma_alloc_attrs(struct device *dev, size_t size,
+                                      dma_addr_t *dma_handle, gfp_t flag,
+                                      struct dma_attrs *attrs)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+       void *cpu_addr;
+
+       BUG_ON(!ops);
+
+       if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr))
+               return cpu_addr;
+
+       if (!arch_dma_alloc_attrs(&dev, &flag))
+               return NULL;
+       if (!ops->alloc)
+               return NULL;
+
+       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+       return cpu_addr;
+}
+
+static inline void dma_free_attrs(struct device *dev, size_t size,
+                                    void *cpu_addr, dma_addr_t dma_handle,
+                                    struct dma_attrs *attrs)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       BUG_ON(!ops);
+       WARN_ON(irqs_disabled());
+
+       if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
+               return;
+
+       if (!ops->free)
+               return;
+
+       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+       ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+
+static inline void *dma_alloc_coherent(struct device *dev, size_t size,
+               dma_addr_t *dma_handle, gfp_t flag)
+{
+       return dma_alloc_attrs(dev, size, dma_handle, flag, NULL);
+}
+
+static inline void dma_free_coherent(struct device *dev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle)
+{
+       return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL);
+}
+
+static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
+               dma_addr_t *dma_handle, gfp_t gfp)
+{
+       DEFINE_DMA_ATTRS(attrs);
+
+       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
+       return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs);
+}
+
+static inline void dma_free_noncoherent(struct device *dev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle)
+{
+       DEFINE_DMA_ATTRS(attrs);
+
+       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
+       dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs);
+}
+
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+       debug_dma_mapping_error(dev, dma_addr);
+
+       if (get_dma_ops(dev)->mapping_error)
+               return get_dma_ops(dev)->mapping_error(dev, dma_addr);
+
+#ifdef DMA_ERROR_CODE
+       return dma_addr == DMA_ERROR_CODE;
+#else
+       return 0;
+#endif
+}
+
+#ifndef HAVE_ARCH_DMA_SUPPORTED
+static inline int dma_supported(struct device *dev, u64 mask)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       if (!ops)
+               return 0;
+       if (!ops->dma_supported)
+               return 1;
+       return ops->dma_supported(dev, mask);
+}
+#endif
+
+#ifndef HAVE_ARCH_DMA_SET_MASK
+static inline int dma_set_mask(struct device *dev, u64 mask)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       if (ops->set_dma_mask)
+               return ops->set_dma_mask(dev, mask);
+
+       if (!dev->dma_mask || !dma_supported(dev, mask))
+               return -EIO;
+       *dev->dma_mask = mask;
+       return 0;
+}
+#endif
+
 #endif
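A brief driver-side sketch of the consolidated helpers above; the probe function, device pointer and buffer size are assumptions, not part of this header.

#include <linux/dma-mapping.h>

/* assumed probe path: set the DMA mask, then grab and release a coherent buffer */
static int my_probe(struct device *dev)
{
	dma_addr_t bus;
	void *cpu;

	if (dma_set_mask(dev, DMA_BIT_MASK(32)))
		return -EIO;

	cpu = dma_alloc_coherent(dev, PAGE_SIZE, &bus, GFP_KERNEL);
	if (!cpu)
		return -ENOMEM;

	/* ... hand "bus" to the device, touch the buffer through "cpu" ... */

	dma_free_coherent(dev, PAGE_SIZE, cpu, bus);
	return 0;
}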
index b63218f68c4b5a2c2862b082f62a1fe6caa5d128..d140b1e9faa71791264d6439bd8429810fff3ddd 100644 (file)
@@ -16,7 +16,7 @@
 
 #include <uapi/linux/kexec.h>
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <linux/list.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
@@ -318,13 +318,24 @@ int crash_shrink_memory(unsigned long new_size);
 size_t crash_get_memory_size(void);
 void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
 
-#else /* !CONFIG_KEXEC */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+                                        unsigned long buf_len);
+void * __weak arch_kexec_kernel_image_load(struct kimage *image);
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image);
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+                                       unsigned long buf_len);
+int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
+                                       Elf_Shdr *sechdrs, unsigned int relsec);
+int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                                       unsigned int relsec);
+
+#else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #define kexec_in_progress false
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 #endif /* !defined(__ASSEMBLY__) */
 
index 0555cc66a15b27dfa7fd70c47e1a519ad49afef1..fcfd2bf14d3f0ef2dc113e5d77fd42c6dc233798 100644 (file)
@@ -85,8 +85,6 @@ enum umh_disable_depth {
        UMH_DISABLED,
 };
 
-extern void usermodehelper_init(void);
-
 extern int __usermodehelper_disable(enum umh_disable_depth depth);
 extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
 
index d92b80b63c5ca5cdac24f794abc59f1e6790688e..ad800e62cb7a603fdd5f7fb1a752edb03417dbb4 100644 (file)
@@ -305,11 +305,9 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
-
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-
 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
        return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -345,6 +343,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+ino_t page_cgroup_ino(struct page *page);
 
 static inline bool mem_cgroup_disabled(void)
 {
@@ -555,11 +554,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
        return &zone->lruvec;
 }
 
-static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-       return NULL;
-}
-
 static inline bool mm_match_cgroup(struct mm_struct *mm,
                struct mem_cgroup *memcg)
 {
index f25a957bf0ab68967cc57d5f7d9472831f65cad2..fda728e3c27d000d952bfccc8c0850dcd649aa68 100644 (file)
@@ -1873,11 +1873,19 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
-       unsigned long pgoff, unsigned long *populate);
+       vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+static inline unsigned long
+do_mmap_pgoff(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot, unsigned long flags,
+       unsigned long pgoff, unsigned long *populate)
+{
+       return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
+}
+
 #ifdef CONFIG_MMU
 extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);
index 61cd67f4d7881cbbd8eba481729c06b31d45e9c6..a1a210d59961a855964b03ee9a7eae7c74d7c86f 100644 (file)
@@ -65,6 +65,16 @@ struct mmu_notifier_ops {
                                 unsigned long start,
                                 unsigned long end);
 
+       /*
+        * clear_young is a lightweight version of clear_flush_young. Like the
+        * latter, it is supposed to test-and-clear the young/accessed bitflag
+        * in the secondary pte, but it may omit flushing the secondary tlb.
+        */
+       int (*clear_young)(struct mmu_notifier *mn,
+                          struct mm_struct *mm,
+                          unsigned long start,
+                          unsigned long end);
+
        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+                                     unsigned long start,
+                                     unsigned long end);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+                                          unsigned long start,
+                                          unsigned long end)
+{
+       if (mm_has_notifiers(mm))
+               return __mmu_notifier_clear_young(mm, start, end);
+       return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
 {
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        __young;                                                        \
 })
 
+#define ptep_clear_young_notify(__vma, __address, __ptep)              \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+       __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,  \
+                                           ___address + PAGE_SIZE);    \
+       __young;                                                        \
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp)              \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+       __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,  \
+                                           ___address + PMD_SIZE);     \
+       __young;                                                        \
+})
+
 #define        ptep_clear_flush_notify(__vma, __address, __ptep)               \
 ({                                                                     \
        unsigned long ___addr = __address & PAGE_MASK;                  \
@@ -427,6 +471,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
+#define ptep_clear_young_notify ptep_test_and_clear_young
+#define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define        ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
 #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
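A minimal sketch of a secondary-MMU user wiring up the new clear_young callback; the ops name and the my_clear_accessed() helper are assumptions.

#include <linux/mmu_notifier.h>

/* hypothetical notifier: test-and-clear accessed bits without a TLB flush */
static int my_mn_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
			     unsigned long start, unsigned long end)
{
	return my_clear_accessed(mn, start, end);	/* assumed driver helper */
}

static const struct mmu_notifier_ops my_mn_ops = {
	/* .clear_flush_young, .test_young, ... as before */
	.clear_young	= my_mn_clear_young,
};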
index 41c93844fb1d1ed5c0dbad77fe5a409557d66067..416509e26d6d16bfa0f75ef793b7e32e6b5fb090 100644 (file)
@@ -108,6 +108,10 @@ enum pageflags {
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        PG_compound_lock,
+#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+       PG_young,
+       PG_idle,
 #endif
        __NR_PAGEFLAGS,
 
@@ -289,6 +293,13 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+TESTPAGEFLAG(Young, young)
+SETPAGEFLAG(Young, young)
+TESTCLEARFLAG(Young, young)
+PAGEFLAG(Idle, idle)
+#endif
+
 /*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
index c42981cd99aae91b33d21a0deb4bc9f925a7fbd1..17f118a82854960701dac4de69dfc1af43713989 100644 (file)
@@ -26,6 +26,10 @@ enum page_ext_flags {
        PAGE_EXT_DEBUG_POISON,          /* Page is poisoned */
        PAGE_EXT_DEBUG_GUARD,
        PAGE_EXT_OWNER,
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+       PAGE_EXT_YOUNG,
+       PAGE_EXT_IDLE,
+#endif
 };
 
 /*
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
new file mode 100644 (file)
index 0000000..bf268fa
--- /dev/null
@@ -0,0 +1,110 @@
+#ifndef _LINUX_MM_PAGE_IDLE_H
+#define _LINUX_MM_PAGE_IDLE_H
+
+#include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/page_ext.h>
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+
+#ifdef CONFIG_64BIT
+static inline bool page_is_young(struct page *page)
+{
+       return PageYoung(page);
+}
+
+static inline void set_page_young(struct page *page)
+{
+       SetPageYoung(page);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return TestClearPageYoung(page);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return PageIdle(page);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+       SetPageIdle(page);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+       ClearPageIdle(page);
+}
+#else /* !CONFIG_64BIT */
+/*
+ * If there is not enough space to store Idle and Young bits in page flags, use
+ * page ext flags instead.
+ */
+extern struct page_ext_operations page_idle_ops;
+
+static inline bool page_is_young(struct page *page)
+{
+       return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_young(struct page *page)
+{
+       set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return test_and_clear_bit(PAGE_EXT_YOUNG,
+                                 &lookup_page_ext(page)->flags);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+       set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+       clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+#endif /* CONFIG_64BIT */
+
+#else /* !CONFIG_IDLE_PAGE_TRACKING */
+
+static inline bool page_is_young(struct page *page)
+{
+       return false;
+}
+
+static inline void set_page_young(struct page *page)
+{
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return false;
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return false;
+}
+
+static inline void set_page_idle(struct page *page)
+{
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+}
+
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
+#endif /* _LINUX_MM_PAGE_IDLE_H */
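A small sketch of how the accessors above are meant to be used: mark pages idle, then treat a cleared idle state (or a set young state) on a later scan as evidence of access; the function name and pfn loop are assumptions.

#include <linux/mm.h>
#include <linux/page_idle.h>

/* assumed scanner: mark valid pages idle so later accesses become observable */
static void mark_pfn_range_idle(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		set_page_idle(page);
		/* an access afterwards clears idle (and may set young), so
		 * page_is_idle(page) on the next pass means "untouched since" */
	}
}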
index 2110a81c5e2afaab47ec5cb107cf17503d731317..317e16de09e508ed64b87dae6c5006b1efcb129d 100644 (file)
@@ -19,8 +19,8 @@
  * under normal circumstances, used to verify that nobody uses
  * non-initialized list entries.
  */
-#define LIST_POISON1  ((void *) 0x00100100 + POISON_POINTER_DELTA)
-#define LIST_POISON2  ((void *) 0x00200200 + POISON_POINTER_DELTA)
+#define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x200 + POISON_POINTER_DELTA)
 
 /********** include/linux/timer.h **********/
 /*
 #define ATM_POISON_FREE                0x12
 #define ATM_POISON             0xdeadbeef
 
-/********** net/ **********/
-#define NEIGHBOR_DEAD          0xdeadbeef
-#define NETFILTER_LINK_POISON  0xdead57ac
-
 /********** kernel/mutexes **********/
 #define MUTEX_DEBUG_INIT       0x11
 #define MUTEX_DEBUG_FREE       0x22
@@ -83,7 +79,4 @@
 /********** security/ **********/
 #define KEY_DESTROY            0xbd
 
-/********** sound/oss/ **********/
-#define OSS_POISON_FREE                0xAB
-
 #endif
index a6298b27ac99d9197ccd4dac6c04b09f2c5011da..9729565c25ff19accc05ca6419bbc6f50f8cdb53 100644 (file)
@@ -404,10 +404,10 @@ do {                                                                      \
        static DEFINE_RATELIMIT_STATE(_rs,                              \
                                      DEFAULT_RATELIMIT_INTERVAL,       \
                                      DEFAULT_RATELIMIT_BURST);         \
-       DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);                 \
+       DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));         \
        if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) &&        \
            __ratelimit(&_rs))                                          \
-               __dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__);    \
+               __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);    \
 } while (0)
 #elif defined(DEBUG)
 #define pr_debug_ratelimited(fmt, ...)                                 \
@@ -456,11 +456,17 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                             groupsize, buf, len, ascii)        \
        dynamic_hex_dump(prefix_str, prefix_type, rowsize,      \
                         groupsize, buf, len, ascii)
-#else
+#elif defined(DEBUG)
 #define print_hex_dump_debug(prefix_str, prefix_type, rowsize,         \
                             groupsize, buf, len, ascii)                \
        print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,    \
                       groupsize, buf, len, ascii)
-#endif /* defined(CONFIG_DYNAMIC_DEBUG) */
+#else
+static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
+                                       int rowsize, int groupsize,
+                                       const void *buf, size_t len, bool ascii)
+{
+}
+#endif
 
 #endif
index d4c7271382cb310edc3d2bf4ffd5ef997e5bef87..adeadbd6d7bfa4909db409aaa3f10e90a9e8ca00 100644 (file)
@@ -122,6 +122,10 @@ int seq_write(struct seq_file *seq, const void *data, size_t len);
 __printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
 __printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
 
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+                 int rowsize, int groupsize, const void *buf, size_t len,
+                 bool ascii);
+
 int seq_path(struct seq_file *, const struct path *, const char *);
 int seq_file_path(struct seq_file *, struct file *, const char *);
 int seq_dentry(struct seq_file *, struct dentry *, const char *);
index 71f711db450067fbcba192999c010b49dbfeb408..dabe643eb5fadcde8d00a02df9aba03faa9c2592 100644 (file)
@@ -48,24 +48,24 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_HEX             0x20
 
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
-               unsigned int flags, const char *esc);
+               unsigned int flags, const char *only);
 
 static inline int string_escape_mem_any_np(const char *src, size_t isz,
-               char *dst, size_t osz, const char *esc)
+               char *dst, size_t osz, const char *only)
 {
-       return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc);
+       return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only);
 }
 
 static inline int string_escape_str(const char *src, char *dst, size_t sz,
-               unsigned int flags, const char *esc)
+               unsigned int flags, const char *only)
 {
-       return string_escape_mem(src, strlen(src), dst, sz, flags, esc);
+       return string_escape_mem(src, strlen(src), dst, sz, flags, only);
 }
 
 static inline int string_escape_str_any_np(const char *src, char *dst,
-               size_t sz, const char *esc)
+               size_t sz, const char *only)
 {
-       return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc);
+       return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only);
 }
 
 #endif
index c924a28d9805010f49f9009e76459ab805de280b..42f8ec9924523aa1436ca72ef606ef6030e360c8 100644 (file)
@@ -36,6 +36,8 @@ enum zpool_mapmode {
        ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
 };
 
+bool zpool_has_pool(char *type);
+
 struct zpool *zpool_create_pool(char *type, char *name,
                        gfp_t gfp, const struct zpool_ops *ops);
 
index a6c4962e5d4623912adb21f4c240b352de223481..5da5f8751ce7dc082a4a99cfe29e7d2eb7f6ba5a 100644 (file)
@@ -33,6 +33,7 @@
 #define KPF_THP                        22
 #define KPF_BALLOON            23
 #define KPF_ZERO_PAGE          24
+#define KPF_IDLE               25
 
 
 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
index ad1bd7787bbb0c3298e2f9790b0edd5322227639..b32ad7d97ac94f52a0c50acd2a904e8a0c2f888d 100644 (file)
@@ -526,14 +526,14 @@ extern unsigned long __initramfs_size;
 
 static void __init free_initrd(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
        unsigned long crashk_end   = (unsigned long)__va(crashk_res.end);
 #endif
        if (do_retain_initrd)
                goto skip;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        /*
         * If the initrd region is overlapped with crashkernel reserved region,
         * free only memory that is not part of crashkernel region.
index 56506553d4d80dff814b75f45db6db280fd0dea7..9e64d7097f1ad4d5744755c977cac583debbaf38 100644 (file)
@@ -877,7 +877,6 @@ static void __init do_initcalls(void)
 static void __init do_basic_setup(void)
 {
        cpuset_init_smp();
-       usermodehelper_init();
        shmem_init();
        driver_init();
        init_irq_proc();
index 2b491590ebab1f7fac5b6fecaf632a9a5aebada7..71f448e5e927aed0ccd8f5af24a928e82cfe616f 100644 (file)
@@ -123,7 +123,7 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
        size_t len = src->m_ts;
        size_t alen;
 
-       BUG_ON(dst == NULL);
+       WARN_ON(dst == NULL);
        if (src->m_ts > dst->m_ts)
                return ERR_PTR(-EINVAL);
 
index 4aef24d91b633e12275cea64a380df4543fc796b..222131e8e38f334547004bf0830b26bf808cc6a2 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -159,7 +159,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
         * We raced in the idr lookup or with shm_destroy().  Either way, the
         * ID is busted.
         */
-       BUG_ON(IS_ERR(ipcp));
+       WARN_ON(IS_ERR(ipcp));
 
        return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
@@ -393,7 +393,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
                return ret;
        sfd->vm_ops = vma->vm_ops;
 #ifdef CONFIG_MMU
-       BUG_ON(!sfd->vm_ops->fault);
+       WARN_ON(!sfd->vm_ops->fault);
 #endif
        vma->vm_ops = &shm_vm_ops;
        shm_open(vma);
index e0d7587e7684e7b4408b69a189870eca3f9bce13..d4988410b410a6ae802b5d796b99e73d80393dc1 100644 (file)
@@ -49,7 +49,9 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
index ec1c07667ec1d5daec2820d6b30cda2bf99818c4..71179a09c1d6a3240fd9c1a4ead58d89646da24e 100644 (file)
 #include <linux/cn_proc.h>
 
 #if 0
-#define kdebug(FMT, ...) \
-       printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...)                                               \
+       printk("[%-5.5s%5u] " FMT "\n",                                 \
+              current->comm, current->pid, ##__VA_ARGS__)
 #else
-#define kdebug(FMT, ...) \
-       no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...)                                               \
+do {                                                                   \
+       if (0)                                                          \
+               no_printk("[%-5.5s%5u] " FMT "\n",                      \
+                         current->comm, current->pid, ##__VA_ARGS__);  \
+} while (0)
 #endif
 
 static struct kmem_cache *cred_jar;
index e8183895691c61f021e9dc9a7e0aab5f6cc709fc..f548f69c4299dd1ee44bfdc1f84d79d655d0d6d7 100644 (file)
@@ -9094,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
        mutex_unlock(&swhash->hlist_mutex);
 }
 
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
        struct remove_event re = { .detach_group = true };
index c98f926277a8de676335eba7507eebc0513e93a9..e820ccee984673e77b23b09ff42a0db254aba762 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/ftrace.h>
 #include <linux/memory.h>
 #include <linux/module.h>
-#include <linux/ftrace.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
 
index a785c1015e25bf1ecacd3a6d92956e3e630e7f37..4c5edc357923a1b6198c9f8122b90b73b9a5e38f 100644 (file)
 /*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
  *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
 
-#define pr_fmt(fmt)    "kexec: " fmt
-
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/kexec.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-#include <linux/highmem.h>
 #include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
 #include <linux/vmalloc.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
-
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
-       .name  = "Crash kernel",
-       .start = 0,
-       .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
-       .name  = "Crash kernel",
-       .start = 0,
-       .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
-       /*
-        * If crash_kexec_post_notifiers is enabled, don't run
-        * crash_kexec() here yet, which must be run after panic
-        * notifiers in panic().
-        */
-       if (crash_kexec_post_notifiers)
-               return 0;
-       /*
-        * There are 4 panic() calls in do_exit() path, each of which
-        * corresponds to each of these 4 conditions.
-        */
-       if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
-               return 1;
-       return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses.  On processors
- * where you can disable the MMU this is trivial, and easy.  For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place.  This means I can only support memory whose
- * physical address can fit in an unsigned long.  In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages.  As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it).  The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- *  - allocating a page table with the control code buffer identity
- *    mapped, to simplify machine_kexec and make kexec_on_panic more
- *    reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
+#include <linux/slab.h>
 
-static int kimage_is_destination_range(struct kimage *image,
-                                      unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image,
-                                      gfp_t gfp_mask,
-                                      unsigned long dest);
+#include "kexec_internal.h"
 
 static int copy_user_segment_list(struct kimage *image,
                                  unsigned long nr_segments,
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
        return ret;
 }
 
-static int sanity_check_segment_list(struct kimage *image)
-{
-       int result, i;
-       unsigned long nr_segments = image->nr_segments;
-
-       /*
-        * Verify we have good destination addresses.  The caller is
-        * responsible for making certain we don't attempt to load
-        * the new image into invalid or reserved areas of RAM.  This
-        * just verifies it is an address we can use.
-        *
-        * Since the kernel does everything in page size chunks ensure
-        * the destination addresses are page aligned.  Too many
-        * special cases crop of when we don't do this.  The most
-        * insidious is getting overlapping destination addresses
-        * simply because addresses are changed to page size
-        * granularity.
-        */
-       result = -EADDRNOTAVAIL;
-       for (i = 0; i < nr_segments; i++) {
-               unsigned long mstart, mend;
-
-               mstart = image->segment[i].mem;
-               mend   = mstart + image->segment[i].memsz;
-               if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
-                       return result;
-               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-                       return result;
-       }
-
-       /* Verify our destination addresses do not overlap.
-        * If we alloed overlapping destination addresses
-        * through very weird things can happen with no
-        * easy explanation as one segment stops on another.
-        */
-       result = -EINVAL;
-       for (i = 0; i < nr_segments; i++) {
-               unsigned long mstart, mend;
-               unsigned long j;
-
-               mstart = image->segment[i].mem;
-               mend   = mstart + image->segment[i].memsz;
-               for (j = 0; j < i; j++) {
-                       unsigned long pstart, pend;
-                       pstart = image->segment[j].mem;
-                       pend   = pstart + image->segment[j].memsz;
-                       /* Do the segments overlap ? */
-                       if ((mend > pstart) && (mstart < pend))
-                               return result;
-               }
-       }
-
-       /* Ensure our buffer sizes are strictly less than
-        * our memory sizes.  This should always be the case,
-        * and it is easier to check up front than to be surprised
-        * later on.
-        */
-       result = -EINVAL;
-       for (i = 0; i < nr_segments; i++) {
-               if (image->segment[i].bufsz > image->segment[i].memsz)
-                       return result;
-       }
-
-       /*
-        * Verify we have good destination addresses.  Normally
-        * the caller is responsible for making certain we don't
-        * attempt to load the new image into invalid or reserved
-        * areas of RAM.  But crash kernels are preloaded into a
-        * reserved area of ram.  We must ensure the addresses
-        * are in the reserved area otherwise preloading the
-        * kernel could corrupt things.
-        */
-
-       if (image->type == KEXEC_TYPE_CRASH) {
-               result = -EADDRNOTAVAIL;
-               for (i = 0; i < nr_segments; i++) {
-                       unsigned long mstart, mend;
-
-                       mstart = image->segment[i].mem;
-                       mend = mstart + image->segment[i].memsz - 1;
-                       /* Ensure we are within the crash kernel limits */
-                       if ((mstart < crashk_res.start) ||
-                           (mend > crashk_res.end))
-                               return result;
-               }
-       }
-
-       return 0;
-}
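
The overlap test above treats each segment as a half-open interval [mstart, mend): two segments collide exactly when each one starts before the other ends. A minimal stand-alone sketch of that test (the names are illustrative, not taken from the kernel):

        /* Half-open interval overlap check, mirroring the test above.
         * Returns 1 when [a_start, a_end) and [b_start, b_end) intersect.
         */
        static int ranges_overlap(unsigned long a_start, unsigned long a_end,
                                  unsigned long b_start, unsigned long b_end)
        {
                return (a_end > b_start) && (a_start < b_end);
        }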
-
-static struct kimage *do_kimage_alloc_init(void)
-{
-       struct kimage *image;
-
-       /* Allocate a controlling structure */
-       image = kzalloc(sizeof(*image), GFP_KERNEL);
-       if (!image)
-               return NULL;
-
-       image->head = 0;
-       image->entry = &image->head;
-       image->last_entry = &image->head;
-       image->control_page = ~0; /* By default this does not apply */
-       image->type = KEXEC_TYPE_DEFAULT;
-
-       /* Initialize the list of control pages */
-       INIT_LIST_HEAD(&image->control_pages);
-
-       /* Initialize the list of destination pages */
-       INIT_LIST_HEAD(&image->dest_pages);
-
-       /* Initialize the list of unusable pages */
-       INIT_LIST_HEAD(&image->unusable_pages);
-
-       return image;
-}
-
-static void kimage_free_page_list(struct list_head *list);
-
 static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
                             unsigned long nr_segments,
                             struct kexec_segment __user *segments,
@@ -354,2427 +101,155 @@ out_free_image:
        return ret;
 }
 
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
-       struct fd f = fdget(fd);
-       int ret;
-       struct kstat stat;
-       loff_t pos;
-       ssize_t bytes = 0;
-
-       if (!f.file)
-               return -EBADF;
-
-       ret = vfs_getattr(&f.file->f_path, &stat);
-       if (ret)
-               goto out;
-
-       if (stat.size > INT_MAX) {
-               ret = -EFBIG;
-               goto out;
-       }
-
-       /* Don't hand 0 to vmalloc, it whines. */
-       if (stat.size == 0) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       *buf = vmalloc(stat.size);
-       if (!*buf) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       pos = 0;
-       while (pos < stat.size) {
-               bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
-                                   stat.size - pos);
-               if (bytes < 0) {
-                       vfree(*buf);
-                       ret = bytes;
-                       goto out;
-               }
-
-               if (bytes == 0)
-                       break;
-               pos += bytes;
-       }
-
-       if (pos != stat.size) {
-               ret = -EBADF;
-               vfree(*buf);
-               goto out;
-       }
-
-       *buf_len = pos;
-out:
-       fdput(f);
-       return ret;
-}
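
kernel_read() may legitimately return fewer bytes than requested, which is why the loop above keeps reading until pos reaches stat.size and treats anything short of that as an error. A rough user-space analogue of the same read-until-complete pattern, for illustration only:

        #include <unistd.h>

        /* Illustrative only: keep reading until 'len' bytes arrive, tolerating
         * short reads the same way the copy_file_from_fd() loop does.
         */
        static ssize_t read_full(int fd, char *buf, size_t len)
        {
                size_t pos = 0;

                while (pos < len) {
                        ssize_t n = read(fd, buf + pos, len - pos);

                        if (n < 0)
                                return n;       /* propagate the error */
                        if (n == 0)
                                break;          /* unexpected end of file */
                        pos += n;
                }
                return pos == len ? (ssize_t)pos : -1;
        }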
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-                                        unsigned long buf_len)
-{
-       return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
-       return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
-                                       unsigned long buf_len)
-{
-       return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-                                unsigned int relsec)
-{
-       pr_err("RELA relocation unsupported.\n");
-       return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-                            unsigned int relsec)
-{
-       pr_err("REL relocation unsupported.\n");
-       return -ENOEXEC;
-}
-
 /*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down, preventing on-going DMAs and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to its final destination, and
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync or unmount filesystems, so if you need
+ * that to happen you need to do it yourself.
  */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-
-       vfree(image->kernel_buf);
-       image->kernel_buf = NULL;
 
-       vfree(image->initrd_buf);
-       image->initrd_buf = NULL;
-
-       kfree(image->cmdline_buf);
-       image->cmdline_buf = NULL;
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+               struct kexec_segment __user *, segments, unsigned long, flags)
+{
+       struct kimage **dest_image, *image;
+       int result;
 
-       vfree(pi->purgatory_buf);
-       pi->purgatory_buf = NULL;
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+               return -EPERM;
 
-       vfree(pi->sechdrs);
-       pi->sechdrs = NULL;
+       /*
+        * Verify we have a legal set of flags.
+        * This leaves us room for future extensions.
+        */
+       if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+               return -EINVAL;
 
-       /* See if architecture has anything to cleanup post load */
-       arch_kimage_file_post_load_cleanup(image);
+       /* Verify we are on the appropriate architecture */
+       if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+               ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+               return -EINVAL;
 
-       /*
-        * Above call should have called into bootloader to free up
-        * any data stored in kimage->image_loader_data. It should
-        * be ok now to free it up.
+       /* Put an artificial cap on the number
+        * of segments passed to kexec_load.
         */
-       kfree(image->image_loader_data);
-       image->image_loader_data = NULL;
-}
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
 
-/*
- * In file mode, the list of segments is prepared by the kernel. Copy relevant
- * data from user space, do error checking, and prepare the segment list.
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
-                            const char __user *cmdline_ptr,
-                            unsigned long cmdline_len, unsigned flags)
-{
-       int ret = 0;
-       void *ldata;
+       image = NULL;
+       result = 0;
 
-       ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
-                               &image->kernel_buf_len);
-       if (ret)
-               return ret;
+       /* Because we write directly to the reserved memory
+        * region when loading crash kernels, we need a mutex here to
+        * prevent multiple crash kernels from attempting to load
+        * simultaneously, and to prevent a crash kernel from loading
+        * over the top of an in-use crash kernel.
+        *
+        * KISS: always take the mutex.
+        */
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
 
-       /* Call arch image probe handlers */
-       ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
-                                           image->kernel_buf_len);
+       dest_image = &kexec_image;
+       if (flags & KEXEC_ON_CRASH)
+               dest_image = &kexec_crash_image;
+       if (nr_segments > 0) {
+               unsigned long i;
 
-       if (ret)
-               goto out;
+               if (flags & KEXEC_ON_CRASH) {
+                       /*
+                        * Loading another kernel to switch to if this one
+                        * crashes.  Free any current crash dump kernel before
+                        * we corrupt it.
+                        */
 
-#ifdef CONFIG_KEXEC_VERIFY_SIG
-       ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
-                                          image->kernel_buf_len);
-       if (ret) {
-               pr_debug("kernel signature verification failed.\n");
-               goto out;
-       }
-       pr_debug("kernel signature verification successful.\n");
-#endif
-       /* It is possible that no initramfs is being loaded */
-       if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
-               ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
-                                       &image->initrd_buf_len);
-               if (ret)
-                       goto out;
-       }
+                       kimage_free(xchg(&kexec_crash_image, NULL));
+                       result = kimage_alloc_init(&image, entry, nr_segments,
+                                                  segments, flags);
+                       crash_map_reserved_pages();
+               } else {
+                       /* Loading another kernel to reboot into. */
 
-       if (cmdline_len) {
-               image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
-               if (!image->cmdline_buf) {
-                       ret = -ENOMEM;
-                       goto out;
+                       result = kimage_alloc_init(&image, entry, nr_segments,
+                                                  segments, flags);
                }
-
-               ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
-                                    cmdline_len);
-               if (ret) {
-                       ret = -EFAULT;
+               if (result)
                        goto out;
-               }
-
-               image->cmdline_buf_len = cmdline_len;
 
-               /* command line should be a string with last byte null */
-               if (image->cmdline_buf[cmdline_len - 1] != '\0') {
-                       ret = -EINVAL;
+               if (flags & KEXEC_PRESERVE_CONTEXT)
+                       image->preserve_context = 1;
+               result = machine_kexec_prepare(image);
+               if (result)
                        goto out;
-               }
-       }
 
-       /* Call arch image load handlers */
-       ldata = arch_kexec_kernel_image_load(image);
-
-       if (IS_ERR(ldata)) {
-               ret = PTR_ERR(ldata);
-               goto out;
+               for (i = 0; i < nr_segments; i++) {
+                       result = kimage_load_segment(image, &image->segment[i]);
+                       if (result)
+                               goto out;
+               }
+               kimage_terminate(image);
+               if (flags & KEXEC_ON_CRASH)
+                       crash_unmap_reserved_pages();
        }
+       /* Install the new kernel and uninstall the old. */
+       image = xchg(dest_image, image);
 
-       image->image_loader_data = ldata;
 out:
-       /* In case of error, free up all allocated memory in this function */
-       if (ret)
-               kimage_file_post_load_cleanup(image);
-       return ret;
+       mutex_unlock(&kexec_mutex);
+       kimage_free(image);
+
+       return result;
 }
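
Note that with nr_segments == 0 the whole load branch is skipped, image stays NULL, and the final xchg() installs NULL, i.e. any previously loaded kernel is dropped. A hedged user-space sketch of driving the raw syscall that way (constants from <linux/kexec.h>; error handling omitted):

        #include <linux/kexec.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Sketch: zero segments asks the kernel to unload any image that was
         * previously loaded for a normal (non-crash) kexec.
         */
        static long kexec_unload(void)
        {
                return syscall(SYS_kexec_load, 0UL, 0UL, NULL,
                               (unsigned long)KEXEC_ARCH_DEFAULT);
        }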
 
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
-                      int initrd_fd, const char __user *cmdline_ptr,
-                      unsigned long cmdline_len, unsigned long flags)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
+                      compat_ulong_t, nr_segments,
+                      struct compat_kexec_segment __user *, segments,
+                      compat_ulong_t, flags)
 {
-       int ret;
-       struct kimage *image;
-       bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
-       image = do_kimage_alloc_init();
-       if (!image)
-               return -ENOMEM;
+       struct compat_kexec_segment in;
+       struct kexec_segment out, __user *ksegments;
+       unsigned long i, result;
 
-       image->file_mode = 1;
+       /* Don't allow clients that don't understand the native
+        * architecture to do anything.
+        */
+       if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+               return -EINVAL;
 
-       if (kexec_on_panic) {
-               /* Enable special crash kernel control page alloc policy. */
-               image->control_page = crashk_res.start;
-               image->type = KEXEC_TYPE_CRASH;
-       }
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
 
-       ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
-                                          cmdline_ptr, cmdline_len, flags);
-       if (ret)
-               goto out_free_image;
-
-       ret = sanity_check_segment_list(image);
-       if (ret)
-               goto out_free_post_load_bufs;
-
-       ret = -ENOMEM;
-       image->control_code_page = kimage_alloc_control_pages(image,
-                                          get_order(KEXEC_CONTROL_PAGE_SIZE));
-       if (!image->control_code_page) {
-               pr_err("Could not allocate control_code_buffer\n");
-               goto out_free_post_load_bufs;
-       }
-
-       if (!kexec_on_panic) {
-               image->swap_page = kimage_alloc_control_pages(image, 0);
-               if (!image->swap_page) {
-                       pr_err("Could not allocate swap buffer\n");
-                       goto out_free_control_pages;
-               }
-       }
-
-       *rimage = image;
-       return 0;
-out_free_control_pages:
-       kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
-       kimage_file_post_load_cleanup(image);
-out_free_image:
-       kfree(image);
-       return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
-                                       unsigned long start,
-                                       unsigned long end)
-{
-       unsigned long i;
-
-       for (i = 0; i < image->nr_segments; i++) {
-               unsigned long mstart, mend;
-
-               mstart = image->segment[i].mem;
-               mend = mstart + image->segment[i].memsz;
-               if ((end > mstart) && (start < mend))
-                       return 1;
-       }
-
-       return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
-       struct page *pages;
-
-       pages = alloc_pages(gfp_mask, order);
-       if (pages) {
-               unsigned int count, i;
-               pages->mapping = NULL;
-               set_page_private(pages, order);
-               count = 1 << order;
-               for (i = 0; i < count; i++)
-                       SetPageReserved(pages + i);
-       }
-
-       return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
-       unsigned int order, count, i;
-
-       order = page_private(page);
-       count = 1 << order;
-       for (i = 0; i < count; i++)
-               ClearPageReserved(page + i);
-       __free_pages(page, order);
-}
-
-static void kimage_free_page_list(struct list_head *list)
-{
-       struct list_head *pos, *next;
-
-       list_for_each_safe(pos, next, list) {
-               struct page *page;
-
-               page = list_entry(pos, struct page, lru);
-               list_del(&page->lru);
-               kimage_free_pages(page);
-       }
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
-                                                       unsigned int order)
-{
-       /* Control pages are special: they are the intermediaries
-        * that are needed while we copy the rest of the pages
-        * to their final resting place.  As such they must
-        * not conflict with either the destination addresses
-        * or memory the kernel is already using.
-        *
-        * The only case where we really need more than one of
-        * these is for architectures where we cannot disable
-        * the MMU and must instead generate an identity mapped
-        * page table for all of the memory.
-        *
-        * At worst this runs in O(N) of the image size.
-        */
-       struct list_head extra_pages;
-       struct page *pages;
-       unsigned int count;
-
-       count = 1 << order;
-       INIT_LIST_HEAD(&extra_pages);
-
-       /* Loop while I can allocate a page and the page allocated
-        * is a destination page.
-        */
-       do {
-               unsigned long pfn, epfn, addr, eaddr;
-
-               pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
-               if (!pages)
-                       break;
-               pfn   = page_to_pfn(pages);
-               epfn  = pfn + count;
-               addr  = pfn << PAGE_SHIFT;
-               eaddr = epfn << PAGE_SHIFT;
-               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
-                             kimage_is_destination_range(image, addr, eaddr)) {
-                       list_add(&pages->lru, &extra_pages);
-                       pages = NULL;
-               }
-       } while (!pages);
-
-       if (pages) {
-               /* Remember the allocated page... */
-               list_add(&pages->lru, &image->control_pages);
-
-               /* Because the page is already in its destination
-                * location we will never allocate another page at
-                * that address.  Therefore kimage_alloc_pages
-                * will not return it (again) and we don't need
-                * to give it an entry in image->segment[].
-                */
-       }
-       /* Deal with the destination pages I have inadvertently allocated.
-        *
-        * Ideally I would convert multi-page allocations into single
-        * page allocations, and add everything to image->dest_pages.
-        *
-        * For now it is simpler to just free the pages.
-        */
-       kimage_free_page_list(&extra_pages);
-
-       return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
-                                                     unsigned int order)
-{
-       /* Control pages are special: they are the intermediaries
-        * that are needed while we copy the rest of the pages
-        * to their final resting place.  As such they must
-        * not conflict with either the destination addresses
-        * or memory the kernel is already using.
-        *
-        * Control pages are also the only pages we must allocate
-        * when loading a crash kernel.  All of the other pages
-        * are specified by the segments and we just memcpy
-        * into them directly.
-        *
-        * The only case where we really need more than one of
-        * these is for architectures where we cannot disable
-        * the MMU and must instead generate an identity mapped
-        * page table for all of the memory.
-        *
-        * Given the low demand this implements a very simple
-        * allocator that finds the first hole of the appropriate
-        * size in the reserved memory region, and allocates all
-        * of the memory up to and including the hole.
-        */
-       unsigned long hole_start, hole_end, size;
-       struct page *pages;
-
-       pages = NULL;
-       size = (1 << order) << PAGE_SHIFT;
-       hole_start = (image->control_page + (size - 1)) & ~(size - 1);
-       hole_end   = hole_start + size - 1;
-       while (hole_end <= crashk_res.end) {
-               unsigned long i;
-
-               if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
-                       break;
-               /* See if I overlap any of the segments */
-               for (i = 0; i < image->nr_segments; i++) {
-                       unsigned long mstart, mend;
-
-                       mstart = image->segment[i].mem;
-                       mend   = mstart + image->segment[i].memsz - 1;
-                       if ((hole_end >= mstart) && (hole_start <= mend)) {
-                               /* Advance the hole to the end of the segment */
-                               hole_start = (mend + (size - 1)) & ~(size - 1);
-                               hole_end   = hole_start + size - 1;
-                               break;
-                       }
-               }
-               /* If I don't overlap any segments I have found my hole! */
-               if (i == image->nr_segments) {
-                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
-                       break;
-               }
-       }
-       if (pages)
-               image->control_page = hole_end;
-
-       return pages;
-}
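
The hole search relies on rounding an address up to the allocation size, which only works because the size is a power of two. A tiny worked illustration of that rounding (names illustrative):

        /* Round 'addr' up to the next multiple of 'size' (a power of two),
         * exactly how hole_start is computed above.
         * Example: round_up_pow2(0x12345, 0x1000) == 0x13000.
         */
        static unsigned long round_up_pow2(unsigned long addr, unsigned long size)
        {
                return (addr + (size - 1)) & ~(size - 1);
        }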
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
-                                        unsigned int order)
-{
-       struct page *pages = NULL;
-
-       switch (image->type) {
-       case KEXEC_TYPE_DEFAULT:
-               pages = kimage_alloc_normal_control_pages(image, order);
-               break;
-       case KEXEC_TYPE_CRASH:
-               pages = kimage_alloc_crash_control_pages(image, order);
-               break;
-       }
-
-       return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
-       if (*image->entry != 0)
-               image->entry++;
-
-       if (image->entry == image->last_entry) {
-               kimage_entry_t *ind_page;
-               struct page *page;
-
-               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
-               if (!page)
-                       return -ENOMEM;
-
-               ind_page = page_address(page);
-               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
-               image->entry = ind_page;
-               image->last_entry = ind_page +
-                                     ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
-       }
-       *image->entry = entry;
-       image->entry++;
-       *image->entry = 0;
-
-       return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
-                                  unsigned long destination)
-{
-       int result;
-
-       destination &= PAGE_MASK;
-       result = kimage_add_entry(image, destination | IND_DESTINATION);
-
-       return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
-       int result;
-
-       page &= PAGE_MASK;
-       result = kimage_add_entry(image, page | IND_SOURCE);
-
-       return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
-       /* Walk through and free any extra destination pages I may have */
-       kimage_free_page_list(&image->dest_pages);
-
-       /* Walk through and free any unusable pages I have cached */
-       kimage_free_page_list(&image->unusable_pages);
-
-}
-static void kimage_terminate(struct kimage *image)
-{
-       if (*image->entry != 0)
-               image->entry++;
-
-       *image->entry = IND_DONE;
-}
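
kimage_add_entry(), kimage_set_destination(), kimage_add_page() and kimage_terminate() together build a flat list of kimage_entry_t values: page-aligned physical addresses with flag bits in the low-order bits. A schematic of a very small list, purely for illustration (the addresses are made up):

        /* Illustrative entry list, as walked by for_each_kimage_entry() below:
         *
         *   0x01000000 | IND_DESTINATION   copies start landing at 16 MiB
         *   0x7f432000 | IND_SOURCE        copy this page to 0x01000000
         *   0x7f433000 | IND_SOURCE        copy this page to 0x01001000
         *   0x7e000000 | IND_INDIRECTION   entries continue in this page
         *   ...
         *   IND_DONE                       end of the list
         *
         * The implicit destination advances by PAGE_SIZE after every
         * IND_SOURCE entry, which is how kimage_dst_used() interprets it.
         */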
-
-#define for_each_kimage_entry(image, ptr, entry) \
-       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
-               ptr = (entry & IND_INDIRECTION) ? \
-                       phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
-       struct page *page;
-
-       page = pfn_to_page(entry >> PAGE_SHIFT);
-       kimage_free_pages(page);
-}
-
-static void kimage_free(struct kimage *image)
-{
-       kimage_entry_t *ptr, entry;
-       kimage_entry_t ind = 0;
-
-       if (!image)
-               return;
-
-       kimage_free_extra_pages(image);
-       for_each_kimage_entry(image, ptr, entry) {
-               if (entry & IND_INDIRECTION) {
-                       /* Free the previous indirection page */
-                       if (ind & IND_INDIRECTION)
-                               kimage_free_entry(ind);
-                       /* Save this indirection page until we are
-                        * done with it.
-                        */
-                       ind = entry;
-               } else if (entry & IND_SOURCE)
-                       kimage_free_entry(entry);
-       }
-       /* Free the final indirection page */
-       if (ind & IND_INDIRECTION)
-               kimage_free_entry(ind);
-
-       /* Handle any machine specific cleanup */
-       machine_kexec_cleanup(image);
-
-       /* Free the kexec control pages... */
-       kimage_free_page_list(&image->control_pages);
-
-       /*
-        * Free up any temporary buffers allocated. This might hit if
-        * an error occurred much later after buffer allocation.
-        */
-       if (image->file_mode)
-               kimage_file_post_load_cleanup(image);
-
-       kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
-                                       unsigned long page)
-{
-       kimage_entry_t *ptr, entry;
-       unsigned long destination = 0;
-
-       for_each_kimage_entry(image, ptr, entry) {
-               if (entry & IND_DESTINATION)
-                       destination = entry & PAGE_MASK;
-               else if (entry & IND_SOURCE) {
-                       if (page == destination)
-                               return ptr;
-                       destination += PAGE_SIZE;
-               }
-       }
-
-       return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
-                                       gfp_t gfp_mask,
-                                       unsigned long destination)
-{
-       /*
-        * Here we implement safeguards to ensure that a source page
-        * is not copied to its destination page before the data on
-        * the destination page is no longer useful.
-        *
-        * To do this we maintain the invariant that a source page is
-        * either its own destination page, or it is not a
-        * destination page at all.
-        *
-        * That is slightly stronger than required, but the proof
-        * that no problems will occur is trivial, and the
-        * implementation is simple to verify.
-        *
-        * When allocating all pages normally this algorithm will run
-        * in O(N) time, but in the worst case it will run in O(N^2)
-        * time.   If the runtime is a problem the data structures can
-        * be fixed.
-        */
-       struct page *page;
-       unsigned long addr;
-
-       /*
-        * Walk through the list of destination pages, and see if I
-        * have a match.
-        */
-       list_for_each_entry(page, &image->dest_pages, lru) {
-               addr = page_to_pfn(page) << PAGE_SHIFT;
-               if (addr == destination) {
-                       list_del(&page->lru);
-                       return page;
-               }
-       }
-       page = NULL;
-       while (1) {
-               kimage_entry_t *old;
-
-               /* Allocate a page, if we run out of memory give up */
-               page = kimage_alloc_pages(gfp_mask, 0);
-               if (!page)
-                       return NULL;
-               /* If the page cannot be used, file it away */
-               if (page_to_pfn(page) >
-                               (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
-                       list_add(&page->lru, &image->unusable_pages);
-                       continue;
-               }
-               addr = page_to_pfn(page) << PAGE_SHIFT;
-
-               /* If it is the destination page we want, use it */
-               if (addr == destination)
-                       break;
-
-               /* If the page is not a destination page use it */
-               if (!kimage_is_destination_range(image, addr,
-                                                 addr + PAGE_SIZE))
-                       break;
-
-               /*
-                * I know that the page is someone's destination page.
-                * See if there is already a source page for this
-                * destination page.  And if so swap the source pages.
-                */
-               old = kimage_dst_used(image, addr);
-               if (old) {
-                       /* If so move it */
-                       unsigned long old_addr;
-                       struct page *old_page;
-
-                       old_addr = *old & PAGE_MASK;
-                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
-                       copy_highpage(page, old_page);
-                       *old = addr | (*old & ~PAGE_MASK);
-
-                       /* The old page I have found cannot be a
-                        * destination page, so return it if its
-                        * gfp_flags honor the ones passed in.
-                        */
-                       if (!(gfp_mask & __GFP_HIGHMEM) &&
-                           PageHighMem(old_page)) {
-                               kimage_free_pages(old_page);
-                               continue;
-                       }
-                       addr = old_addr;
-                       page = old_page;
-                       break;
-               } else {
-                       /* Place the page on the destination list; I
-                        * will use it later.
-                        */
-                       list_add(&page->lru, &image->dest_pages);
-               }
-       }
-
-       return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
-                                        struct kexec_segment *segment)
-{
-       unsigned long maddr;
-       size_t ubytes, mbytes;
-       int result;
-       unsigned char __user *buf = NULL;
-       unsigned char *kbuf = NULL;
-
-       result = 0;
-       if (image->file_mode)
-               kbuf = segment->kbuf;
-       else
-               buf = segment->buf;
-       ubytes = segment->bufsz;
-       mbytes = segment->memsz;
-       maddr = segment->mem;
-
-       result = kimage_set_destination(image, maddr);
-       if (result < 0)
-               goto out;
-
-       while (mbytes) {
-               struct page *page;
-               char *ptr;
-               size_t uchunk, mchunk;
-
-               page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
-               if (!page) {
-                       result  = -ENOMEM;
-                       goto out;
-               }
-               result = kimage_add_page(image, page_to_pfn(page)
-                                                               << PAGE_SHIFT);
-               if (result < 0)
-                       goto out;
-
-               ptr = kmap(page);
-               /* Start with a clear page */
-               clear_page(ptr);
-               ptr += maddr & ~PAGE_MASK;
-               mchunk = min_t(size_t, mbytes,
-                               PAGE_SIZE - (maddr & ~PAGE_MASK));
-               uchunk = min(ubytes, mchunk);
-
-               /* For file based kexec, source pages are in kernel memory */
-               if (image->file_mode)
-                       memcpy(ptr, kbuf, uchunk);
-               else
-                       result = copy_from_user(ptr, buf, uchunk);
-               kunmap(page);
-               if (result) {
-                       result = -EFAULT;
-                       goto out;
-               }
-               ubytes -= uchunk;
-               maddr  += mchunk;
-               if (image->file_mode)
-                       kbuf += mchunk;
-               else
-                       buf += mchunk;
-               mbytes -= mchunk;
-       }
-out:
-       return result;
-}
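
The chunking keeps every copy within one page: mchunk is capped at the bytes left in the current page, and uchunk shrinks to zero once bufsz is exhausted so the rest of memsz stays as freshly cleared pages. A small worked example of that arithmetic (illustrative numbers, 4 KiB pages):

        /* maddr  = 0x100ffc0            offset into the page = 0xfc0
         * mchunk = min(mbytes, PAGE_SIZE - 0xfc0) = min(0x2000, 0x40) = 0x40
         * uchunk = min(ubytes, mchunk)            = min(0x30,  0x40)  = 0x30
         *
         * So 0x30 user bytes land at the tail of the first page, and every
         * following iteration starts on a page boundary; once ubytes hits 0
         * the loop only clears destination pages.
         */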
-
-static int kimage_load_crash_segment(struct kimage *image,
-                                       struct kexec_segment *segment)
-{
-       /* For crash dump kernels we simply copy the data from
-        * user space to its destination.
-        * We do things a page at a time for the sake of kmap.
-        */
-       unsigned long maddr;
-       size_t ubytes, mbytes;
-       int result;
-       unsigned char __user *buf = NULL;
-       unsigned char *kbuf = NULL;
-
-       result = 0;
-       if (image->file_mode)
-               kbuf = segment->kbuf;
-       else
-               buf = segment->buf;
-       ubytes = segment->bufsz;
-       mbytes = segment->memsz;
-       maddr = segment->mem;
-       while (mbytes) {
-               struct page *page;
-               char *ptr;
-               size_t uchunk, mchunk;
-
-               page = pfn_to_page(maddr >> PAGE_SHIFT);
-               if (!page) {
-                       result  = -ENOMEM;
-                       goto out;
-               }
-               ptr = kmap(page);
-               ptr += maddr & ~PAGE_MASK;
-               mchunk = min_t(size_t, mbytes,
-                               PAGE_SIZE - (maddr & ~PAGE_MASK));
-               uchunk = min(ubytes, mchunk);
-               if (mchunk > uchunk) {
-                       /* Zero the trailing part of the page */
-                       memset(ptr + uchunk, 0, mchunk - uchunk);
-               }
-
-               /* For file based kexec, source pages are in kernel memory */
-               if (image->file_mode)
-                       memcpy(ptr, kbuf, uchunk);
-               else
-                       result = copy_from_user(ptr, buf, uchunk);
-               kexec_flush_icache_page(page);
-               kunmap(page);
-               if (result) {
-                       result = -EFAULT;
-                       goto out;
-               }
-               ubytes -= uchunk;
-               maddr  += mchunk;
-               if (image->file_mode)
-                       kbuf += mchunk;
-               else
-                       buf += mchunk;
-               mbytes -= mchunk;
-       }
-out:
-       return result;
-}
-
-static int kimage_load_segment(struct kimage *image,
-                               struct kexec_segment *segment)
-{
-       int result = -ENOMEM;
-
-       switch (image->type) {
-       case KEXEC_TYPE_DEFAULT:
-               result = kimage_load_normal_segment(image, segment);
-               break;
-       case KEXEC_TYPE_CRASH:
-               result = kimage_load_crash_segment(image, segment);
-               break;
-       }
-
-       return result;
-}
-
-/*
- * Exec Kernel system call: for obvious reasons only root may call it.
- *
- * This call breaks up into three pieces.
- * - A generic part which loads the new kernel from the current
- *   address space, and very carefully places the data in the
- *   allocated pages.
- *
- * - A generic part that interacts with the kernel and tells all of
- *   the devices to shut down, preventing on-going DMAs and placing
- *   the devices in a consistent state so a later kernel can
- *   reinitialize them.
- *
- * - A machine specific part that includes the syscall number
- *   and then copies the image to its final destination, and
- *   jumps into the image at entry.
- *
- * kexec does not sync or unmount filesystems, so if you need
- * that to happen you need to do it yourself.
- */
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-
-static DEFINE_MUTEX(kexec_mutex);
-
-SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
-               struct kexec_segment __user *, segments, unsigned long, flags)
-{
-       struct kimage **dest_image, *image;
-       int result;
-
-       /* We only trust the superuser with rebooting the system. */
-       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
-               return -EPERM;
-
-       /*
-        * Verify we have a legal set of flags.
-        * This leaves us room for future extensions.
-        */
-       if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
-               return -EINVAL;
-
-       /* Verify we are on the appropriate architecture */
-       if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
-               ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
-               return -EINVAL;
-
-       /* Put an artificial cap on the number
-        * of segments passed to kexec_load.
-        */
-       if (nr_segments > KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       image = NULL;
-       result = 0;
-
-       /* Because we write directly to the reserved memory
-        * region when loading crash kernels, we need a mutex here to
-        * prevent multiple crash kernels from attempting to load
-        * simultaneously, and to prevent a crash kernel from loading
-        * over the top of an in-use crash kernel.
-        *
-        * KISS: always take the mutex.
-        */
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-
-       dest_image = &kexec_image;
-       if (flags & KEXEC_ON_CRASH)
-               dest_image = &kexec_crash_image;
-       if (nr_segments > 0) {
-               unsigned long i;
-
-               if (flags & KEXEC_ON_CRASH) {
-                       /*
-                        * Loading another kernel to switch to if this one
-                        * crashes.  Free any current crash dump kernel before
-                        * we corrupt it.
-                        */
-
-                       kimage_free(xchg(&kexec_crash_image, NULL));
-                       result = kimage_alloc_init(&image, entry, nr_segments,
-                                                  segments, flags);
-                       crash_map_reserved_pages();
-               } else {
-                       /* Loading another kernel to reboot into. */
-
-                       result = kimage_alloc_init(&image, entry, nr_segments,
-                                                  segments, flags);
-               }
-               if (result)
-                       goto out;
-
-               if (flags & KEXEC_PRESERVE_CONTEXT)
-                       image->preserve_context = 1;
-               result = machine_kexec_prepare(image);
-               if (result)
-                       goto out;
-
-               for (i = 0; i < nr_segments; i++) {
-                       result = kimage_load_segment(image, &image->segment[i]);
-                       if (result)
-                               goto out;
-               }
-               kimage_terminate(image);
-               if (flags & KEXEC_ON_CRASH)
-                       crash_unmap_reserved_pages();
-       }
-       /* Install the new kernel and uninstall the old. */
-       image = xchg(dest_image, image);
-
-out:
-       mutex_unlock(&kexec_mutex);
-       kimage_free(image);
-
-       return result;
-}
-
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
-                      compat_ulong_t, nr_segments,
-                      struct compat_kexec_segment __user *, segments,
-                      compat_ulong_t, flags)
-{
-       struct compat_kexec_segment in;
-       struct kexec_segment out, __user *ksegments;
-       unsigned long i, result;
-
-       /* Don't allow clients that don't understand the native
-        * architecture to do anything.
-        */
-       if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
-               return -EINVAL;
-
-       if (nr_segments > KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
-       for (i = 0; i < nr_segments; i++) {
-               result = copy_from_user(&in, &segments[i], sizeof(in));
-               if (result)
-                       return -EFAULT;
-
-               out.buf   = compat_ptr(in.buf);
-               out.bufsz = in.bufsz;
-               out.mem   = in.mem;
-               out.memsz = in.memsz;
-
-               result = copy_to_user(&ksegments[i], &out, sizeof(out));
-               if (result)
-                       return -EFAULT;
-       }
-
-       return sys_kexec_load(entry, nr_segments, ksegments, flags);
-}
-#endif
-
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
-               unsigned long, cmdline_len, const char __user *, cmdline_ptr,
-               unsigned long, flags)
-{
-       int ret = 0, i;
-       struct kimage **dest_image, *image;
-
-       /* We only trust the superuser with rebooting the system. */
-       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
-               return -EPERM;
-
-       /* Make sure we have a legal set of flags */
-       if (flags != (flags & KEXEC_FILE_FLAGS))
-               return -EINVAL;
-
-       image = NULL;
-
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-
-       dest_image = &kexec_image;
-       if (flags & KEXEC_FILE_ON_CRASH)
-               dest_image = &kexec_crash_image;
-
-       if (flags & KEXEC_FILE_UNLOAD)
-               goto exchange;
-
-       /*
-        * In case of a crash, the new kernel gets loaded into the reserved
-        * region, which is the same memory where an old crash kernel might be
-        * loaded. Free any current crash dump kernel before we corrupt it.
-        */
-       if (flags & KEXEC_FILE_ON_CRASH)
-               kimage_free(xchg(&kexec_crash_image, NULL));
-
-       ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
-                                    cmdline_len, flags);
-       if (ret)
-               goto out;
-
-       ret = machine_kexec_prepare(image);
-       if (ret)
-               goto out;
-
-       ret = kexec_calculate_store_digests(image);
-       if (ret)
-               goto out;
-
-       for (i = 0; i < image->nr_segments; i++) {
-               struct kexec_segment *ksegment;
-
-               ksegment = &image->segment[i];
-               pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
-                        i, ksegment->buf, ksegment->bufsz, ksegment->mem,
-                        ksegment->memsz);
-
-               ret = kimage_load_segment(image, &image->segment[i]);
-               if (ret)
-                       goto out;
-       }
-
-       kimage_terminate(image);
-
-       /*
-        * Free up any temporary buffers allocated which are not needed
-        * after the image has been loaded.
-        */
-       kimage_file_post_load_cleanup(image);
-exchange:
-       image = xchg(dest_image, image);
-out:
-       mutex_unlock(&kexec_mutex);
-       kimage_free(image);
-       return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
-void crash_kexec(struct pt_regs *regs)
-{
-       /* Take the kexec_mutex here to prevent sys_kexec_load
-        * running on one cpu from replacing the crash kernel
-        * we are using after a panic on a different cpu.
-        *
-        * If the crash kernel was not located in a fixed area
-        * of memory the xchg(&kexec_crash_image) would be
-        * sufficient.  But since I reuse the memory...
-        */
-       if (mutex_trylock(&kexec_mutex)) {
-               if (kexec_crash_image) {
-                       struct pt_regs fixed_regs;
-
-                       crash_setup_regs(&fixed_regs, regs);
-                       crash_save_vmcoreinfo();
-                       machine_crash_shutdown(&fixed_regs);
-                       machine_kexec(kexec_crash_image);
-               }
-               mutex_unlock(&kexec_mutex);
-       }
-}
-
-size_t crash_get_memory_size(void)
-{
-       size_t size = 0;
-       mutex_lock(&kexec_mutex);
-       if (crashk_res.end != crashk_res.start)
-               size = resource_size(&crashk_res);
-       mutex_unlock(&kexec_mutex);
-       return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
-                                          unsigned long end)
-{
-       unsigned long addr;
-
-       for (addr = begin; addr < end; addr += PAGE_SIZE)
-               free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
-       int ret = 0;
-       unsigned long start, end;
-       unsigned long old_size;
-       struct resource *ram_res;
-
-       mutex_lock(&kexec_mutex);
-
-       if (kexec_crash_image) {
-               ret = -ENOENT;
-               goto unlock;
-       }
-       start = crashk_res.start;
-       end = crashk_res.end;
-       old_size = (end == 0) ? 0 : end - start + 1;
-       if (new_size >= old_size) {
-               ret = (new_size == old_size) ? 0 : -EINVAL;
-               goto unlock;
-       }
-
-       ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
-       if (!ram_res) {
-               ret = -ENOMEM;
-               goto unlock;
-       }
-
-       start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
-       end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
-       crash_map_reserved_pages();
-       crash_free_reserved_phys_range(end, crashk_res.end);
-
-       if ((start == end) && (crashk_res.parent != NULL))
-               release_resource(&crashk_res);
-
-       ram_res->start = end;
-       ram_res->end = crashk_res.end;
-       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
-       ram_res->name = "System RAM";
-
-       crashk_res.end = end - 1;
-
-       insert_resource(&iomem_resource, ram_res);
-       crash_unmap_reserved_pages();
-
-unlock:
-       mutex_unlock(&kexec_mutex);
-       return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-                           size_t data_len)
-{
-       struct elf_note note;
-
-       note.n_namesz = strlen(name) + 1;
-       note.n_descsz = data_len;
-       note.n_type   = type;
-       memcpy(buf, &note, sizeof(note));
-       buf += (sizeof(note) + 3)/4;
-       memcpy(buf, name, note.n_namesz);
-       buf += (note.n_namesz + 3)/4;
-       memcpy(buf, data, note.n_descsz);
-       buf += (note.n_descsz + 3)/4;
-
-       return buf;
-}
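
Each note is the fixed elf_note header followed by the name and the payload, with every piece padded to a 4-byte boundary; that is what the repeated (len + 3)/4 advances of the u32 cursor implement. A worked example, assuming the usual 12-byte elf_note header:

        /* Appending a note named "CORE" (5 bytes including the NUL) with a
         * 12-byte payload advances the u32 cursor by:
         *   (sizeof(struct elf_note) + 3)/4 = 3 words   (12-byte header)
         *   ("CORE\0": 5 + 3)/4             = 2 words   (name padded to 8)
         *   (12 + 3)/4                      = 3 words   (payload, already aligned)
         * i.e. 8 words (32 bytes) in total.
         */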
-
-static void final_note(u32 *buf)
-{
-       struct elf_note note;
-
-       note.n_namesz = 0;
-       note.n_descsz = 0;
-       note.n_type   = 0;
-       memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
-       struct elf_prstatus prstatus;
-       u32 *buf;
-
-       if ((cpu < 0) || (cpu >= nr_cpu_ids))
-               return;
-
-       /* Using ELF notes here is opportunistic.
-        * I need a well defined structure format
-        * for the data I pass, and I need tags
-        * on the data to indicate what information I have
-        * squirrelled away.  ELF notes happen to provide
-        * all of that, so there is no need to invent something new.
-        */
-       buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
-       if (!buf)
-               return;
-       memset(&prstatus, 0, sizeof(prstatus));
-       prstatus.pr_pid = current->pid;
-       elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
-       buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
-                             &prstatus, sizeof(prstatus));
-       final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
-       /* Allocate memory for saving cpu registers. */
-       crash_notes = alloc_percpu(note_buf_t);
-       if (!crash_notes) {
-               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
-               return -ENOMEM;
-       }
-       return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- *   crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
-                                       unsigned long long system_ram,
-                                       unsigned long long *crash_size,
-                                       unsigned long long *crash_base)
-{
-       char *cur = cmdline, *tmp;
-
-       /* for each entry of the comma-separated list */
-       do {
-               unsigned long long start, end = ULLONG_MAX, size;
-
-               /* get the start of the range */
-               start = memparse(cur, &tmp);
-               if (cur == tmp) {
-                       pr_warn("crashkernel: Memory value expected\n");
-                       return -EINVAL;
-               }
-               cur = tmp;
-               if (*cur != '-') {
-                       pr_warn("crashkernel: '-' expected\n");
-                       return -EINVAL;
-               }
-               cur++;
-
-               /* if no ':' is here, then we read the end */
-               if (*cur != ':') {
-                       end = memparse(cur, &tmp);
-                       if (cur == tmp) {
-                               pr_warn("crashkernel: Memory value expected\n");
-                               return -EINVAL;
-                       }
-                       cur = tmp;
-                       if (end <= start) {
-                               pr_warn("crashkernel: end <= start\n");
-                               return -EINVAL;
-                       }
-               }
-
-               if (*cur != ':') {
-                       pr_warn("crashkernel: ':' expected\n");
-                       return -EINVAL;
-               }
-               cur++;
-
-               size = memparse(cur, &tmp);
-               if (cur == tmp) {
-                       pr_warn("Memory value expected\n");
-                       return -EINVAL;
-               }
-               cur = tmp;
-               if (size >= system_ram) {
-                       pr_warn("crashkernel: invalid size\n");
-                       return -EINVAL;
-               }
-
-               /* match ? */
-               if (system_ram >= start && system_ram < end) {
-                       *crash_size = size;
-                       break;
-               }
-       } while (*cur++ == ',');
-
-       if (*crash_size > 0) {
-               while (*cur && *cur != ' ' && *cur != '@')
-                       cur++;
-               if (*cur == '@') {
-                       cur++;
-                       *crash_base = memparse(cur, &tmp);
-                       if (cur == tmp) {
-                               pr_warn("Memory value expected after '@'\n");
-                               return -EINVAL;
-                       }
-               }
-       }
-
-       return 0;
-}
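
In the extended syntax each comma-separated range keys the reservation size off the amount of system RAM, the first matching range wins, and an optional trailing @offset pins the physical base. For example (the sizes here are purely illustrative):

        crashkernel=512M-2G:64M,2G-:128M@16M

        system_ram = 1G  ->  crash_size = 64M,  crash_base = 16M
        system_ram = 4G  ->  crash_size = 128M, crash_base = 16M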
-
-/*
- * This function parses "simple" (old) crashkernel command lines like
- *
- *     crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
-                                          unsigned long long *crash_size,
-                                          unsigned long long *crash_base)
-{
-       char *cur = cmdline;
-
-       *crash_size = memparse(cmdline, &cur);
-       if (cmdline == cur) {
-               pr_warn("crashkernel: memory value expected\n");
-               return -EINVAL;
-       }
-
-       if (*cur == '@')
-               *crash_base = memparse(cur+1, &cur);
-       else if (*cur != ' ' && *cur != '\0') {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
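
The classic syntax is just a size with an optional base address, e.g.:

        crashkernel=128M         reserve 128M, kernel picks the base
        crashkernel=128M@16M     reserve 128M starting at physical 16M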
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
-       [SUFFIX_HIGH] = ",high",
-       [SUFFIX_LOW]  = ",low",
-       [SUFFIX_NULL] = NULL,
-};
-
-/*
- * This function parses "suffix" crashkernel command lines like
- *
- *     crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
-                                          unsigned long long   *crash_size,
-                                          const char *suffix)
-{
-       char *cur = cmdline;
-
-       *crash_size = memparse(cmdline, &cur);
-       if (cmdline == cur) {
-               pr_warn("crashkernel: memory value expected\n");
-               return -EINVAL;
-       }
-
-       /* check with suffix */
-       if (strncmp(cur, suffix, strlen(suffix))) {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-       cur += strlen(suffix);
-       if (*cur != ' ' && *cur != '\0') {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
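
The suffix forms select between the main reservation and its low-memory companion; broadly, ",high" lets the reservation live above 4G while ",low" sets aside a small region below 4G for things like DMA buffers. Illustrative examples:

        crashkernel=256M,high    main reservation, may be placed above 4G
        crashkernel=72M,low      companion low-memory reservation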
-
-static __init char *get_last_crashkernel(char *cmdline,
-                            const char *name,
-                            const char *suffix)
-{
-       char *p = cmdline, *ck_cmdline = NULL;
-
-       /* find crashkernel and use the last one if there are more */
-       p = strstr(p, name);
-       while (p) {
-               char *end_p = strchr(p, ' ');
-               char *q;
-
-               if (!end_p)
-                       end_p = p + strlen(p);
-
-               if (!suffix) {
-                       int i;
-
-                       /* skip the one with any known suffix */
-                       for (i = 0; suffix_tbl[i]; i++) {
-                               q = end_p - strlen(suffix_tbl[i]);
-                               if (!strncmp(q, suffix_tbl[i],
-                                            strlen(suffix_tbl[i])))
-                                       goto next;
-                       }
-                       ck_cmdline = p;
-               } else {
-                       q = end_p - strlen(suffix);
-                       if (!strncmp(q, suffix, strlen(suffix)))
-                               ck_cmdline = p;
-               }
-next:
-               p = strstr(p+1, name);
-       }
-
-       if (!ck_cmdline)
-               return NULL;
-
-       return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base,
-                            const char *name,
-                            const char *suffix)
-{
-       char    *first_colon, *first_space;
-       char    *ck_cmdline;
-
-       BUG_ON(!crash_size || !crash_base);
-       *crash_size = 0;
-       *crash_base = 0;
-
-       ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
-       if (!ck_cmdline)
-               return -EINVAL;
-
-       ck_cmdline += strlen(name);
-
-       if (suffix)
-               return parse_crashkernel_suffix(ck_cmdline, crash_size,
-                               suffix);
-       /*
-        * if the commandline contains a ':', then that's the extended
-        * syntax -- if not, it must be the classic syntax
-        */
-       first_colon = strchr(ck_cmdline, ':');
-       first_space = strchr(ck_cmdline, ' ');
-       if (first_colon && (!first_space || first_colon < first_space))
-               return parse_crashkernel_mem(ck_cmdline, system_ram,
-                               crash_size, crash_base);
-
-       return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * This function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                                       "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                               "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                               "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
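/*
 * Illustrative sketch (not part of this patch): the three crashkernel=
 * forms the parsers above distinguish.  The rule mirrors
 * __parse_crashkernel(): a ':' before any ' ' selects the extended
 * range syntax, otherwise it is the classic size[@offset] form; the
 * ",high"/",low" suffix forms are selected by the dedicated
 * parse_crashkernel_high()/_low() entry points, and are detected here
 * by scanning only for demonstration.  The sample strings are made up.
 */
#include <stdio.h>
#include <string.h>

static const char *classify(const char *arg)
{
	const char *colon = strchr(arg, ':');
	const char *space = strchr(arg, ' ');

	if (strstr(arg, ",high") || strstr(arg, ",low"))
		return "suffix syntax";
	if (colon && (!space || colon < space))
		return "extended (range:size) syntax";
	return "classic size[@offset] syntax";
}

int main(void)
{
	const char *samples[] = {
		"256M",			/* classic */
		"512M@0x1000000",	/* classic with a fixed base */
		"512M-2G:64M,2G-:128M",	/* extended, size depends on RAM */
		"256M,high",		/* suffix form */
	};

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("crashkernel=%s -> %s\n", samples[i], classify(samples[i]));
	return 0;
}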
-
-static void update_vmcoreinfo_note(void)
-{
-       u32 *buf = vmcoreinfo_note;
-
-       if (!vmcoreinfo_size)
-               return;
-       buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
-                             vmcoreinfo_size);
-       final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
-       vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
-       update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
-       va_list args;
-       char buf[0x50];
-       size_t r;
-
-       va_start(args, fmt);
-       r = vscnprintf(buf, sizeof(buf), fmt, args);
-       va_end(args);
-
-       r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
-       memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
-       vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
-       return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
-       VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
-       VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
-       VMCOREINFO_SYMBOL(init_uts_ns);
-       VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
-       VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
-       VMCOREINFO_SYMBOL(_stext);
-       VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-       VMCOREINFO_SYMBOL(mem_map);
-       VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
-       VMCOREINFO_SYMBOL(mem_section);
-       VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
-       VMCOREINFO_STRUCT_SIZE(mem_section);
-       VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
-       VMCOREINFO_STRUCT_SIZE(page);
-       VMCOREINFO_STRUCT_SIZE(pglist_data);
-       VMCOREINFO_STRUCT_SIZE(zone);
-       VMCOREINFO_STRUCT_SIZE(free_area);
-       VMCOREINFO_STRUCT_SIZE(list_head);
-       VMCOREINFO_SIZE(nodemask_t);
-       VMCOREINFO_OFFSET(page, flags);
-       VMCOREINFO_OFFSET(page, _count);
-       VMCOREINFO_OFFSET(page, mapping);
-       VMCOREINFO_OFFSET(page, lru);
-       VMCOREINFO_OFFSET(page, _mapcount);
-       VMCOREINFO_OFFSET(page, private);
-       VMCOREINFO_OFFSET(pglist_data, node_zones);
-       VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-       VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
-       VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
-       VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
-       VMCOREINFO_OFFSET(pglist_data, node_id);
-       VMCOREINFO_OFFSET(zone, free_area);
-       VMCOREINFO_OFFSET(zone, vm_stat);
-       VMCOREINFO_OFFSET(zone, spanned_pages);
-       VMCOREINFO_OFFSET(free_area, free_list);
-       VMCOREINFO_OFFSET(list_head, next);
-       VMCOREINFO_OFFSET(list_head, prev);
-       VMCOREINFO_OFFSET(vmap_area, va_start);
-       VMCOREINFO_OFFSET(vmap_area, list);
-       VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
-       log_buf_kexec_setup();
-       VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
-       VMCOREINFO_NUMBER(NR_FREE_PAGES);
-       VMCOREINFO_NUMBER(PG_lru);
-       VMCOREINFO_NUMBER(PG_private);
-       VMCOREINFO_NUMBER(PG_swapcache);
-       VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
-       VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
-       VMCOREINFO_NUMBER(PG_head_mask);
-       VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
-       VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
-       arch_crash_save_vmcoreinfo();
-       update_vmcoreinfo_note();
-
-       return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
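/*
 * Illustrative sketch (not part of this patch): the vmcoreinfo note
 * assembled above is a flat text blob of "KEY=value" lines that dump
 * tools parse out of /proc/vmcore.  The line shapes follow the
 * VMCOREINFO_* macros; the values below are invented purely to show
 * what vmcoreinfo_append_str() accumulates.
 */
#include <stdio.h>

int main(void)
{
	char buf[4096];
	int n = 0;

	n += snprintf(buf + n, sizeof(buf) - n, "OSRELEASE=4.3.0-rc1\n");
	n += snprintf(buf + n, sizeof(buf) - n, "PAGESIZE=4096\n");
	n += snprintf(buf + n, sizeof(buf) - n, "SYMBOL(init_uts_ns)=%lx\n",
		      0xffffffff81a0f5a0UL);
	n += snprintf(buf + n, sizeof(buf) - n, "SIZE(page)=%lu\n", 64UL);
	n += snprintf(buf + n, sizeof(buf) - n, "OFFSET(page.flags)=%lu\n", 0UL);
	n += snprintf(buf + n, sizeof(buf) - n, "NUMBER(NR_FREE_PAGES)=%ld\n", 0L);
	n += snprintf(buf + n, sizeof(buf) - n, "CRASHTIME=%ld\n", 1441930782L);

	fputs(buf, stdout);
	return 0;
}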
-
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
-                                   struct kexec_buf *kbuf)
-{
-       struct kimage *image = kbuf->image;
-       unsigned long temp_start, temp_end;
-
-       temp_end = min(end, kbuf->buf_max);
-       temp_start = temp_end - kbuf->memsz;
-
-       do {
-               /* align down start */
-               temp_start = temp_start & (~(kbuf->buf_align - 1));
-
-               if (temp_start < start || temp_start < kbuf->buf_min)
-                       return 0;
-
-               temp_end = temp_start + kbuf->memsz - 1;
-
-               /*
-                * Make sure this does not conflict with any of the existing
-                * segments
-                */
-               if (kimage_is_destination_range(image, temp_start, temp_end)) {
-                       temp_start = temp_start - PAGE_SIZE;
-                       continue;
-               }
-
-               /* We found a suitable memory range */
-               break;
-       } while (1);
-
-       /* If we are here, we found a suitable memory range */
-       kbuf->mem = temp_start;
-
-       /* Success, stop navigating through remaining System RAM ranges */
-       return 1;
-}
-
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
-                                    struct kexec_buf *kbuf)
-{
-       struct kimage *image = kbuf->image;
-       unsigned long temp_start, temp_end;
-
-       temp_start = max(start, kbuf->buf_min);
-
-       do {
-               temp_start = ALIGN(temp_start, kbuf->buf_align);
-               temp_end = temp_start + kbuf->memsz - 1;
-
-               if (temp_end > end || temp_end > kbuf->buf_max)
-                       return 0;
-               /*
-                * Make sure this does not conflict with any of the existing
-                * segments
-                */
-               if (kimage_is_destination_range(image, temp_start, temp_end)) {
-                       temp_start = temp_start + PAGE_SIZE;
-                       continue;
-               }
-
-               /* We found a suitable memory range */
-               break;
-       } while (1);
-
-       /* If we are here, we found a suitable memory range */
-       kbuf->mem = temp_start;
-
-       /* Success, stop navigating through remaining System RAM ranges */
-       return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
-       struct kexec_buf *kbuf = (struct kexec_buf *)arg;
-       unsigned long sz = end - start + 1;
-
-               /* Returning 0 moves on to the next memory range */
-       if (sz < kbuf->memsz)
-               return 0;
-
-       if (end < kbuf->buf_min || start > kbuf->buf_max)
-               return 0;
-
-       /*
-        * Allocate memory top down within the RAM range. Otherwise allocate
-        * bottom up.
-        */
-       if (kbuf->top_down)
-               return locate_mem_hole_top_down(start, end, kbuf);
-       return locate_mem_hole_bottom_up(start, end, kbuf);
-}
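/*
 * Illustrative sketch (not part of this patch) of the top-down search
 * above: take the highest candidate that fits, align its start down to
 * buf_align, and on a conflict slide down one page and retry.  The
 * conflict check stands in for kimage_is_destination_range(); all
 * addresses and sizes are invented.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Pretend one existing segment occupies [0x7e00000, 0x7ffffff]. */
static int conflicts(unsigned long start, unsigned long end)
{
	return end >= 0x7e00000UL && start <= 0x7ffffffUL;
}

static unsigned long find_hole_top_down(unsigned long range_start,
					unsigned long range_end,
					unsigned long memsz,
					unsigned long buf_align)
{
	unsigned long start = range_end + 1 - memsz;

	for (;;) {
		start &= ~(buf_align - 1);	/* align the start down */
		if (start < range_start)
			return 0;		/* no hole found */
		if (!conflicts(start, start + memsz - 1))
			return start;		/* suitable hole found */
		start -= PAGE_SIZE;		/* slide down and retry */
	}
}

int main(void)
{
	unsigned long addr = find_hole_top_down(0x1000000UL, 0x7ffffffUL,
						0x200000UL, 0x100000UL);

	printf("hole found at %#lx\n", addr);	/* expect 0x7c00000 */
	return 0;
}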
-
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
-                    unsigned long memsz, unsigned long buf_align,
-                    unsigned long buf_min, unsigned long buf_max,
-                    bool top_down, unsigned long *load_addr)
-{
-
-       struct kexec_segment *ksegment;
-       struct kexec_buf buf, *kbuf;
-       int ret;
-
-       /* Currently adding segment this way is allowed only in file mode */
-       if (!image->file_mode)
-               return -EINVAL;
-
-       if (image->nr_segments >= KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       /*
-        * Make sure we are not trying to add a buffer after allocating
-        * control pages. All segments need to be placed first before
-        * any control pages are allocated, as the control page allocation
-        * logic goes through the list of segments to make sure there are
-        * no destination overlaps.
-        */
-       if (!list_empty(&image->control_pages)) {
-               WARN_ON(1);
-               return -EINVAL;
-       }
-
-       memset(&buf, 0, sizeof(struct kexec_buf));
-       kbuf = &buf;
-       kbuf->image = image;
-       kbuf->buffer = buffer;
-       kbuf->bufsz = bufsz;
-
-       kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
-       kbuf->buf_align = max(buf_align, PAGE_SIZE);
-       kbuf->buf_min = buf_min;
-       kbuf->buf_max = buf_max;
-       kbuf->top_down = top_down;
-
-       /* Walk the RAM ranges and allocate a suitable range for the buffer */
-       if (image->type == KEXEC_TYPE_CRASH)
-               ret = walk_iomem_res("Crash kernel",
-                                    IORESOURCE_MEM | IORESOURCE_BUSY,
-                                    crashk_res.start, crashk_res.end, kbuf,
-                                    locate_mem_hole_callback);
-       else
-               ret = walk_system_ram_res(0, -1, kbuf,
-                                         locate_mem_hole_callback);
-       if (ret != 1) {
-               /* A suitable memory range could not be found for buffer */
-               return -EADDRNOTAVAIL;
-       }
-
-       /* Found a suitable memory range */
-       ksegment = &image->segment[image->nr_segments];
-       ksegment->kbuf = kbuf->buffer;
-       ksegment->bufsz = kbuf->bufsz;
-       ksegment->mem = kbuf->mem;
-       ksegment->memsz = kbuf->memsz;
-       image->nr_segments++;
-       *load_addr = ksegment->mem;
-       return 0;
-}
-
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
-       struct crypto_shash *tfm;
-       struct shash_desc *desc;
-       int ret = 0, i, j, zero_buf_sz, sha_region_sz;
-       size_t desc_size, nullsz;
-       char *digest;
-       void *zero_buf;
-       struct kexec_sha_region *sha_regions;
-       struct purgatory_info *pi = &image->purgatory_info;
-
-       zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
-       zero_buf_sz = PAGE_SIZE;
-
-       tfm = crypto_alloc_shash("sha256", 0, 0);
-       if (IS_ERR(tfm)) {
-               ret = PTR_ERR(tfm);
-               goto out;
-       }
-
-       desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
-       desc = kzalloc(desc_size, GFP_KERNEL);
-       if (!desc) {
-               ret = -ENOMEM;
-               goto out_free_tfm;
-       }
-
-       sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
-       sha_regions = vzalloc(sha_region_sz);
-       if (!sha_regions)
-               goto out_free_desc;
-
-       desc->tfm   = tfm;
-       desc->flags = 0;
-
-       ret = crypto_shash_init(desc);
-       if (ret < 0)
-               goto out_free_sha_regions;
-
-       digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
-       if (!digest) {
-               ret = -ENOMEM;
-               goto out_free_sha_regions;
-       }
-
-       for (j = i = 0; i < image->nr_segments; i++) {
-               struct kexec_segment *ksegment;
-
-               ksegment = &image->segment[i];
-               /*
-                * Skip purgatory as it will be modified once we put digest
-                * info in purgatory.
-                */
-               if (ksegment->kbuf == pi->purgatory_buf)
-                       continue;
-
-               ret = crypto_shash_update(desc, ksegment->kbuf,
-                                         ksegment->bufsz);
-               if (ret)
-                       break;
-
-               /*
-                * Assume rest of the buffer is filled with zero and
-                * update digest accordingly.
-                */
-               nullsz = ksegment->memsz - ksegment->bufsz;
-               while (nullsz) {
-                       unsigned long bytes = nullsz;
-
-                       if (bytes > zero_buf_sz)
-                               bytes = zero_buf_sz;
-                       ret = crypto_shash_update(desc, zero_buf, bytes);
-                       if (ret)
-                               break;
-                       nullsz -= bytes;
-               }
-
-               if (ret)
-                       break;
-
-               sha_regions[j].start = ksegment->mem;
-               sha_regions[j].len = ksegment->memsz;
-               j++;
-       }
-
-       if (!ret) {
-               ret = crypto_shash_final(desc, digest);
-               if (ret)
-                       goto out_free_digest;
-               ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
-                                               sha_regions, sha_region_sz, 0);
-               if (ret)
-                       goto out_free_digest;
-
-               ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
-                                               digest, SHA256_DIGEST_SIZE, 0);
-               if (ret)
-                       goto out_free_digest;
-       }
-
-out_free_digest:
-       kfree(digest);
-out_free_sha_regions:
-       vfree(sha_regions);
-out_free_desc:
-       kfree(desc);
-out_free_tfm:
-       kfree(tfm);
-out:
-       return ret;
-}
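/*
 * Illustrative sketch (not part of this patch): the digest above covers
 * each segment's full memsz, not just its bufsz, by feeding the hash
 * page-sized runs of zeroes for the tail.  A trivial additive checksum
 * stands in for the kernel's sha256 shash; the sizes are invented.
 */
#include <stdio.h>
#include <string.h>

#define ZERO_BUF_SZ 4096UL

static unsigned long sum;	/* stand-in for the hash state */

static void fake_update(const unsigned char *buf, unsigned long len)
{
	for (unsigned long i = 0; i < len; i++)
		sum += buf[i];
}

int main(void)
{
	unsigned char payload[100];
	unsigned char zero_buf[ZERO_BUF_SZ];
	unsigned long bufsz = sizeof(payload);
	unsigned long memsz = 3 * ZERO_BUF_SZ;	/* memsz padded to pages */
	unsigned long nullsz;

	memset(payload, 0xab, sizeof(payload));
	memset(zero_buf, 0, sizeof(zero_buf));

	fake_update(payload, bufsz);		/* the real data */
	nullsz = memsz - bufsz;
	while (nullsz) {			/* the implied zero tail */
		unsigned long bytes = nullsz > ZERO_BUF_SZ ? ZERO_BUF_SZ : nullsz;

		fake_update(zero_buf, bytes);
		nullsz -= bytes;
	}
	printf("digest stand-in: %lu over %lu bytes\n", sum, memsz);
	return 0;
}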
-
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
-                                 unsigned long max, int top_down)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
-       unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
-       unsigned char *buf_addr, *src;
-       int i, ret = 0, entry_sidx = -1;
-       const Elf_Shdr *sechdrs_c;
-       Elf_Shdr *sechdrs = NULL;
-       void *purgatory_buf = NULL;
-
-       /*
-        * sechdrs_c points to section headers in purgatory and is read
-        * only. No modifications are allowed.
-        */
-       sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
-       /*
-        * We can not modify sechdrs_c[] and its fields. It is read only.
-        * Copy it over to a local copy where one can store some temporary
-        * data and free it at the end. We need to modify ->sh_addr and
-        * ->sh_offset fields to keep track of permanent and temporary
-        * locations of sections.
-        */
-       sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-       if (!sechdrs)
-               return -ENOMEM;
-
-       memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
-       /*
-        * We seem to have multiple copies of sections. The first copy is the
-        * one embedded in the kernel's read-only section. Some of these sections
-        * will be copied to a temporary buffer and relocated. And these
-        * sections will finally be copied to their final destination at
-        * segment load time.
-        *
-        * Use ->sh_offset to reflect section address in memory. It will
-        * point to original read only copy if section is not allocatable.
-        * Otherwise it will point to temporary copy which will be relocated.
-        *
-        * Use ->sh_addr to contain final address of the section where it
-        * will go during execution time.
-        */
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (sechdrs[i].sh_type == SHT_NOBITS)
-                       continue;
-
-               sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
-                                               sechdrs[i].sh_offset;
-       }
-
-       /*
-        * Identify entry point section and make entry relative to section
-        * start.
-        */
-       entry = pi->ehdr->e_entry;
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
-                       continue;
-
-               /* Make entry section relative */
-               if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
-                   ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
-                    pi->ehdr->e_entry)) {
-                       entry_sidx = i;
-                       entry -= sechdrs[i].sh_addr;
-                       break;
-               }
-       }
-
-       /* Determine how much memory is needed to load relocatable object. */
-       buf_align = 1;
-       bss_align = 1;
-       buf_sz = 0;
-       bss_sz = 0;
-
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               align = sechdrs[i].sh_addralign;
-               if (sechdrs[i].sh_type != SHT_NOBITS) {
-                       if (buf_align < align)
-                               buf_align = align;
-                       buf_sz = ALIGN(buf_sz, align);
-                       buf_sz += sechdrs[i].sh_size;
-               } else {
-                       /* bss section */
-                       if (bss_align < align)
-                               bss_align = align;
-                       bss_sz = ALIGN(bss_sz, align);
-                       bss_sz += sechdrs[i].sh_size;
-               }
-       }
-
-       /* Determine the bss padding required to align bss properly */
-       bss_pad = 0;
-       if (buf_sz & (bss_align - 1))
-               bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
-       memsz = buf_sz + bss_pad + bss_sz;
-
-       /* Allocate buffer for purgatory */
-       purgatory_buf = vzalloc(buf_sz);
-       if (!purgatory_buf) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       if (buf_align < bss_align)
-               buf_align = bss_align;
-
-       /* Add buffer to segment list */
-       ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
-                               buf_align, min, max, top_down,
-                               &pi->purgatory_load_addr);
-       if (ret)
-               goto out;
-
-       /* Load SHF_ALLOC sections */
-       buf_addr = purgatory_buf;
-       load_addr = curr_load_addr = pi->purgatory_load_addr;
-       bss_addr = load_addr + buf_sz + bss_pad;
-
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               align = sechdrs[i].sh_addralign;
-               if (sechdrs[i].sh_type != SHT_NOBITS) {
-                       curr_load_addr = ALIGN(curr_load_addr, align);
-                       offset = curr_load_addr - load_addr;
-                       /* We already modified ->sh_offset to keep src addr */
-                       src = (char *) sechdrs[i].sh_offset;
-                       memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
-                       /* Store load address and source address of section */
-                       sechdrs[i].sh_addr = curr_load_addr;
-
-                       /*
-                        * This section got copied to temporary buffer. Update
-                        * ->sh_offset accordingly.
-                        */
-                       sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
-                       /* Advance to the next address */
-                       curr_load_addr += sechdrs[i].sh_size;
-               } else {
-                       bss_addr = ALIGN(bss_addr, align);
-                       sechdrs[i].sh_addr = bss_addr;
-                       bss_addr += sechdrs[i].sh_size;
-               }
-       }
-
-       /* Update entry point based on load address of text section */
-       if (entry_sidx >= 0)
-               entry += sechdrs[entry_sidx].sh_addr;
-
-       /* Make kernel jump to purgatory after shutdown */
-       image->start = entry;
-
-       /* Used later to get/set symbol values */
-       pi->sechdrs = sechdrs;
-
-       /*
-        * Used later to identify which section is purgatory and skip it
-        * from checksumming.
-        */
-       pi->purgatory_buf = purgatory_buf;
-       return ret;
-out:
-       vfree(sechdrs);
-       vfree(purgatory_buf);
-       return ret;
-}
-
-static int kexec_apply_relocations(struct kimage *image)
-{
-       int i, ret;
-       struct purgatory_info *pi = &image->purgatory_info;
-       Elf_Shdr *sechdrs = pi->sechdrs;
-
-       /* Apply relocations */
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               Elf_Shdr *section, *symtab;
-
-               if (sechdrs[i].sh_type != SHT_RELA &&
-                   sechdrs[i].sh_type != SHT_REL)
-                       continue;
-
-               /*
-                * For section of type SHT_RELA/SHT_REL,
-                * ->sh_link contains section header index of associated
-                * symbol table. And ->sh_info contains section header
-                * index of section to which relocations apply.
-                */
-               if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
-                   sechdrs[i].sh_link >= pi->ehdr->e_shnum)
-                       return -ENOEXEC;
-
-               section = &sechdrs[sechdrs[i].sh_info];
-               symtab = &sechdrs[sechdrs[i].sh_link];
-
-               if (!(section->sh_flags & SHF_ALLOC))
-                       continue;
-
-               /*
-                * symtab->sh_link contains the section header index of the associated
-                * string table.
-                */
-               if (symtab->sh_link >= pi->ehdr->e_shnum)
-                       /* Invalid section number? */
-                       continue;
-
-               /*
-                * Respective architecture needs to provide support for applying
-                * relocations of type SHT_RELA/SHT_REL.
-                */
-               if (sechdrs[i].sh_type == SHT_RELA)
-                       ret = arch_kexec_apply_relocations_add(pi->ehdr,
-                                                              sechdrs, i);
-               else if (sechdrs[i].sh_type == SHT_REL)
-                       ret = arch_kexec_apply_relocations(pi->ehdr,
-                                                          sechdrs, i);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
-                        unsigned long max, int top_down,
-                        unsigned long *load_addr)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       int ret;
-
-       if (kexec_purgatory_size <= 0)
-               return -EINVAL;
-
-       if (kexec_purgatory_size < sizeof(Elf_Ehdr))
-               return -ENOEXEC;
-
-       pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
-       if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
-           || pi->ehdr->e_type != ET_REL
-           || !elf_check_arch(pi->ehdr)
-           || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
-               return -ENOEXEC;
-
-       if (pi->ehdr->e_shoff >= kexec_purgatory_size
-           || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
-           kexec_purgatory_size - pi->ehdr->e_shoff))
-               return -ENOEXEC;
-
-       ret = __kexec_load_purgatory(image, min, max, top_down);
-       if (ret)
-               return ret;
-
-       ret = kexec_apply_relocations(image);
-       if (ret)
-               goto out;
-
-       *load_addr = pi->purgatory_load_addr;
-       return 0;
-out:
-       vfree(pi->sechdrs);
-       vfree(pi->purgatory_buf);
-       return ret;
-}
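/*
 * Illustrative sketch (not part of this patch): the ELF sanity checks
 * applied to the purgatory blob above, redone in user space against an
 * in-memory buffer.  The buffer is just a hand-built header, so only
 * the checks themselves are meaningful; the kernel-only
 * elf_check_arch() test is omitted.
 */
#include <stdio.h>
#include <string.h>
#include <elf.h>

static int looks_like_relocatable(const void *buf, size_t size)
{
	const Elf64_Ehdr *ehdr = buf;

	if (size < sizeof(*ehdr))
		return 0;
	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
		return 0;
	if (ehdr->e_type != ET_REL)
		return 0;
	if (ehdr->e_shentsize != sizeof(Elf64_Shdr))
		return 0;
	if (ehdr->e_shoff >= size ||
	    ehdr->e_shnum * sizeof(Elf64_Shdr) > size - ehdr->e_shoff)
		return 0;
	return 1;
}

int main(void)
{
	union {
		Elf64_Ehdr ehdr;
		unsigned char raw[sizeof(Elf64_Ehdr) + 2 * sizeof(Elf64_Shdr)];
	} blob = { 0 };

	memcpy(blob.ehdr.e_ident, ELFMAG, SELFMAG);
	blob.ehdr.e_type = ET_REL;
	blob.ehdr.e_shentsize = sizeof(Elf64_Shdr);
	blob.ehdr.e_shoff = sizeof(Elf64_Ehdr);
	blob.ehdr.e_shnum = 2;

	printf("relocatable object: %s\n",
	       looks_like_relocatable(&blob, sizeof(blob)) ? "yes" : "no");
	return 0;
}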
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
-                                           const char *name)
-{
-       Elf_Sym *syms;
-       Elf_Shdr *sechdrs;
-       Elf_Ehdr *ehdr;
-       int i, k;
-       const char *strtab;
-
-       if (!pi->sechdrs || !pi->ehdr)
-               return NULL;
-
-       sechdrs = pi->sechdrs;
-       ehdr = pi->ehdr;
-
-       for (i = 0; i < ehdr->e_shnum; i++) {
-               if (sechdrs[i].sh_type != SHT_SYMTAB)
-                       continue;
-
-               if (sechdrs[i].sh_link >= ehdr->e_shnum)
-                       /* Invalid strtab section number */
-                       continue;
-               strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
-               syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
-               /* Go through symbols for a match */
-               for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
-                       if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
-                               continue;
-
-                       if (strcmp(strtab + syms[k].st_name, name) != 0)
-                               continue;
-
-                       if (syms[k].st_shndx == SHN_UNDEF ||
-                           syms[k].st_shndx >= ehdr->e_shnum) {
-                               pr_debug("Symbol: %s has bad section index %d.\n",
-                                               name, syms[k].st_shndx);
-                               return NULL;
-                       }
-
-                       /* Found the symbol we are looking for */
-                       return &syms[k];
-               }
-       }
-
-       return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       Elf_Sym *sym;
-       Elf_Shdr *sechdr;
-
-       sym = kexec_purgatory_find_symbol(pi, name);
-       if (!sym)
-               return ERR_PTR(-EINVAL);
-
-       sechdr = &pi->sechdrs[sym->st_shndx];
-
-       /*
-        * Returns the address where symbol will finally be loaded after
-        * kexec_load_segment()
-        */
-       return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set value of a symbol. If "get_value" is true, symbol value is
- * returned in buf otherwise symbol value is set based on value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
-                                  void *buf, unsigned int size, bool get_value)
-{
-       Elf_Sym *sym;
-       Elf_Shdr *sechdrs;
-       struct purgatory_info *pi = &image->purgatory_info;
-       char *sym_buf;
-
-       sym = kexec_purgatory_find_symbol(pi, name);
-       if (!sym)
-               return -EINVAL;
-
-       if (sym->st_size != size) {
-               pr_err("symbol %s size mismatch: expected %lu actual %u\n",
-                      name, (unsigned long)sym->st_size, size);
-               return -EINVAL;
-       }
+       ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+       for (i = 0; i < nr_segments; i++) {
+               result = copy_from_user(&in, &segments[i], sizeof(in));
+               if (result)
+                       return -EFAULT;
 
-       sechdrs = pi->sechdrs;
+               out.buf   = compat_ptr(in.buf);
+               out.bufsz = in.bufsz;
+               out.mem   = in.mem;
+               out.memsz = in.memsz;
 
-       if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
-               pr_err("symbol %s is in a bss section. Cannot %s\n", name,
-                      get_value ? "get" : "set");
-               return -EINVAL;
+               result = copy_to_user(&ksegments[i], &out, sizeof(out));
+               if (result)
+                       return -EFAULT;
        }
 
-       sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
-                                       sym->st_value;
-
-       if (get_value)
-               memcpy((void *)buf, sym_buf, size);
-       else
-               memcpy((void *)sym_buf, buf, size);
-
-       return 0;
+       return sys_kexec_load(entry, nr_segments, ksegments, flags);
 }
-#endif /* CONFIG_KEXEC_FILE */
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable.  If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
-       int error = 0;
-
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-       if (!kexec_image) {
-               error = -EINVAL;
-               goto Unlock;
-       }
-
-#ifdef CONFIG_KEXEC_JUMP
-       if (kexec_image->preserve_context) {
-               lock_system_sleep();
-               pm_prepare_console();
-               error = freeze_processes();
-               if (error) {
-                       error = -EBUSY;
-                       goto Restore_console;
-               }
-               suspend_console();
-               error = dpm_suspend_start(PMSG_FREEZE);
-               if (error)
-                       goto Resume_console;
-               /* At this point, dpm_suspend_start() has been called,
-                * but *not* dpm_suspend_end(). We *must* call
-                * dpm_suspend_end() now.  Otherwise, drivers for
-                * some devices (e.g. interrupt controllers) become
-                * desynchronized with the actual state of the
-                * hardware at resume time, and evil weirdness ensues.
-                */
-               error = dpm_suspend_end(PMSG_FREEZE);
-               if (error)
-                       goto Resume_devices;
-               error = disable_nonboot_cpus();
-               if (error)
-                       goto Enable_cpus;
-               local_irq_disable();
-               error = syscore_suspend();
-               if (error)
-                       goto Enable_irqs;
-       } else
-#endif
-       {
-               kexec_in_progress = true;
-               kernel_restart_prepare(NULL);
-               migrate_to_reboot_cpu();
-
-               /*
-                * migrate_to_reboot_cpu() disables CPU hotplug assuming that
-                * no further code needs to use CPU hotplug (which is true in
-                * the reboot case). However, the kexec path depends on using
-                * CPU hotplug again; so re-enable it here.
-                */
-               cpu_hotplug_enable();
-               pr_emerg("Starting new kernel\n");
-               machine_shutdown();
-       }
-
-       machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
-       if (kexec_image->preserve_context) {
-               syscore_resume();
- Enable_irqs:
-               local_irq_enable();
- Enable_cpus:
-               enable_nonboot_cpus();
-               dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
-               dpm_resume_end(PMSG_RESTORE);
- Resume_console:
-               resume_console();
-               thaw_processes();
- Restore_console:
-               pm_restore_console();
-               unlock_system_sleep();
-       }
 #endif
-
- Unlock:
-       mutex_unlock(&kexec_mutex);
-       return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644 (file)
index 0000000..201b453
--- /dev/null
@@ -0,0 +1,1534 @@
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt)    "kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+       .name  = "Crash kernel",
+       .start = 0,
+       .end   = 0,
+       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+       .name  = "Crash kernel",
+       .start = 0,
+       .end   = 0,
+       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+       /*
+        * If crash_kexec_post_notifiers is enabled, don't run
+        * crash_kexec() here yet, which must be run after panic
+        * notifiers in panic().
+        */
+       if (crash_kexec_post_notifiers)
+               return 0;
+       /*
+        * There are 4 panic() calls in do_exit() path, each of which
+        * corresponds to each of these 4 conditions.
+        */
+       if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+               return 1;
+       return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address, used for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+                                      gfp_t gfp_mask,
+                                      unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+       int result, i;
+       unsigned long nr_segments = image->nr_segments;
+
+       /*
+        * Verify we have good destination addresses.  The caller is
+        * responsible for making certain we don't attempt to load
+        * the new image into invalid or reserved areas of RAM.  This
+        * just verifies it is an address we can use.
+        *
+        * Since the kernel does everything in page size chunks ensure
+        * the destination addresses are page aligned.  Too many
+        * special cases crop up when we don't do this.  The most
+        * insidious is getting overlapping destination addresses
+        * simply because addresses are changed to page size
+        * granularity.
+        */
+       result = -EADDRNOTAVAIL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+                       return result;
+               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+                       return result;
+       }
+
+       /* Verify our destination addresses do not overlap.
+        * If we allowed overlapping destination addresses,
+        * very weird things could happen with no
+        * easy explanation as one segment stops on another.
+        */
+       result = -EINVAL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+               unsigned long j;
+
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               for (j = 0; j < i; j++) {
+                       unsigned long pstart, pend;
+
+                       pstart = image->segment[j].mem;
+                       pend   = pstart + image->segment[j].memsz;
+                       /* Do the segments overlap? */
+                       if ((mend > pstart) && (mstart < pend))
+                               return result;
+               }
+       }
+
+       /* Ensure our buffer sizes do not exceed
+        * our memory sizes.  This should always be the case,
+        * and it is easier to check up front than to be surprised
+        * later on.
+        */
+       result = -EINVAL;
+       for (i = 0; i < nr_segments; i++) {
+               if (image->segment[i].bufsz > image->segment[i].memsz)
+                       return result;
+       }
+
+       /*
+        * Verify we have good destination addresses.  Normally
+        * the caller is responsible for making certain we don't
+        * attempt to load the new image into invalid or reserved
+        * areas of RAM.  But crash kernels are preloaded into a
+        * reserved area of RAM.  We must ensure the addresses
+        * are in the reserved area otherwise preloading the
+        * kernel could corrupt things.
+        */
+
+       if (image->type == KEXEC_TYPE_CRASH) {
+               result = -EADDRNOTAVAIL;
+               for (i = 0; i < nr_segments; i++) {
+                       unsigned long mstart, mend;
+
+                       mstart = image->segment[i].mem;
+                       mend = mstart + image->segment[i].memsz - 1;
+                       /* Ensure we are within the crash kernel limits */
+                       if ((mstart < crashk_res.start) ||
+                           (mend > crashk_res.end))
+                               return result;
+               }
+       }
+
+       return 0;
+}
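/*
 * Illustrative sketch (not part of this patch): the pairwise overlap
 * test used above.  With half-open ranges [mem, mem + memsz) two
 * segments collide exactly when each one starts before the other ends.
 * The sample segments are made up.
 */
#include <stdio.h>

struct seg { unsigned long mem, memsz; };

static int overlaps(const struct seg *a, const struct seg *b)
{
	unsigned long aend = a->mem + a->memsz;
	unsigned long bend = b->mem + b->memsz;

	return (aend > b->mem) && (a->mem < bend);
}

int main(void)
{
	struct seg a = { 0x100000, 0x4000 };	/* [0x100000, 0x104000) */
	struct seg b = { 0x103000, 0x2000 };	/* [0x103000, 0x105000) */
	struct seg c = { 0x104000, 0x1000 };	/* [0x104000, 0x105000) */

	printf("a/b overlap: %d\n", overlaps(&a, &b));	/* 1 */
	printf("a/c overlap: %d\n", overlaps(&a, &c));	/* 0: they only touch */
	return 0;
}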
+
+struct kimage *do_kimage_alloc_init(void)
+{
+       struct kimage *image;
+
+       /* Allocate a controlling structure */
+       image = kzalloc(sizeof(*image), GFP_KERNEL);
+       if (!image)
+               return NULL;
+
+       image->head = 0;
+       image->entry = &image->head;
+       image->last_entry = &image->head;
+       image->control_page = ~0; /* By default this does not apply */
+       image->type = KEXEC_TYPE_DEFAULT;
+
+       /* Initialize the list of control pages */
+       INIT_LIST_HEAD(&image->control_pages);
+
+       /* Initialize the list of destination pages */
+       INIT_LIST_HEAD(&image->dest_pages);
+
+       /* Initialize the list of unusable pages */
+       INIT_LIST_HEAD(&image->unusable_pages);
+
+       return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+                                       unsigned long start,
+                                       unsigned long end)
+{
+       unsigned long i;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend = mstart + image->segment[i].memsz;
+               if ((end > mstart) && (start < mend))
+                       return 1;
+       }
+
+       return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+       struct page *pages;
+
+       pages = alloc_pages(gfp_mask, order);
+       if (pages) {
+               unsigned int count, i;
+
+               pages->mapping = NULL;
+               set_page_private(pages, order);
+               count = 1 << order;
+               for (i = 0; i < count; i++)
+                       SetPageReserved(pages + i);
+       }
+
+       return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+       unsigned int order, count, i;
+
+       order = page_private(page);
+       count = 1 << order;
+       for (i = 0; i < count; i++)
+               ClearPageReserved(page + i);
+       __free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+       struct list_head *pos, *next;
+
+       list_for_each_safe(pos, next, list) {
+               struct page *page;
+
+               page = list_entry(pos, struct page, lru);
+               list_del(&page->lru);
+               kimage_free_pages(page);
+       }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+                                                       unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * The only case where we really need more than one of
+        * these is for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * At worst this runs in O(N) of the image size.
+        */
+       struct list_head extra_pages;
+       struct page *pages;
+       unsigned int count;
+
+       count = 1 << order;
+       INIT_LIST_HEAD(&extra_pages);
+
+       /* Loop while I can allocate a page and the page allocated
+        * is a destination page.
+        */
+       do {
+               unsigned long pfn, epfn, addr, eaddr;
+
+               pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+               if (!pages)
+                       break;
+               pfn   = page_to_pfn(pages);
+               epfn  = pfn + count;
+               addr  = pfn << PAGE_SHIFT;
+               eaddr = epfn << PAGE_SHIFT;
+               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+                             kimage_is_destination_range(image, addr, eaddr)) {
+                       list_add(&pages->lru, &extra_pages);
+                       pages = NULL;
+               }
+       } while (!pages);
+
+       if (pages) {
+               /* Remember the allocated page... */
+               list_add(&pages->lru, &image->control_pages);
+
+               /* Because the page is already in its destination
+                * location we will never allocate another page at
+                * that address.  Therefore kimage_alloc_pages
+                * will not return it (again) and we don't need
+                * to give it an entry in image->segment[].
+                */
+       }
+       /* Deal with the destination pages I have inadvertently allocated.
+        *
+        * Ideally I would convert multi-page allocations into single
+        * page allocations, and add everything to image->dest_pages.
+        *
+        * For now it is simpler to just free the pages.
+        */
+       kimage_free_page_list(&extra_pages);
+
+       return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+                                                     unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * Control pages are also the only pages we must allocate
+        * when loading a crash kernel.  All of the other pages
+        * are specified by the segments and we just memcpy
+        * into them directly.
+        *
+        * The only case where we really need more than one of
+        * these is for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * Given the low demand this implements a very simple
+        * allocator that finds the first hole of the appropriate
+        * size in the reserved memory region, and allocates all
+        * of the memory up to and including the hole.
+        */
+       unsigned long hole_start, hole_end, size;
+       struct page *pages;
+
+       pages = NULL;
+       size = (1 << order) << PAGE_SHIFT;
+       hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+       hole_end   = hole_start + size - 1;
+       while (hole_end <= crashk_res.end) {
+               unsigned long i;
+
+               if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+                       break;
+               /* See if I overlap any of the segments */
+               for (i = 0; i < image->nr_segments; i++) {
+                       unsigned long mstart, mend;
+
+                       mstart = image->segment[i].mem;
+                       mend   = mstart + image->segment[i].memsz - 1;
+                       if ((hole_end >= mstart) && (hole_start <= mend)) {
+                               /* Advance the hole to the end of the segment */
+                               hole_start = (mend + (size - 1)) & ~(size - 1);
+                               hole_end   = hole_start + size - 1;
+                               break;
+                       }
+               }
+               /* If I don't overlap any segments I have found my hole! */
+               if (i == image->nr_segments) {
+                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       image->control_page = hole_end;
+                       break;
+               }
+       }
+
+       return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       struct page *pages = NULL;
+
+       switch (image->type) {
+       case KEXEC_TYPE_DEFAULT:
+               pages = kimage_alloc_normal_control_pages(image, order);
+               break;
+       case KEXEC_TYPE_CRASH:
+               pages = kimage_alloc_crash_control_pages(image, order);
+               break;
+       }
+
+       return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+       if (*image->entry != 0)
+               image->entry++;
+
+       if (image->entry == image->last_entry) {
+               kimage_entry_t *ind_page;
+               struct page *page;
+
+               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+               if (!page)
+                       return -ENOMEM;
+
+               ind_page = page_address(page);
+               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               image->entry = ind_page;
+               image->last_entry = ind_page +
+                                     ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+       }
+       *image->entry = entry;
+       image->entry++;
+       *image->entry = 0;
+
+       return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+                                  unsigned long destination)
+{
+       int result;
+
+       destination &= PAGE_MASK;
+       result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+       return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+       int result;
+
+       page &= PAGE_MASK;
+       result = kimage_add_entry(image, page | IND_SOURCE);
+
+       return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+       /* Walk through and free any extra destination pages I may have */
+       kimage_free_page_list(&image->dest_pages);
+
+       /* Walk through and free any unusable pages I have cached */
+       kimage_free_page_list(&image->unusable_pages);
+
+}
+void kimage_terminate(struct kimage *image)
+{
+       if (*image->entry != 0)
+               image->entry++;
+
+       *image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+               ptr = (entry & IND_INDIRECTION) ? \
+                       phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
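/*
 * Illustrative sketch (not part of this patch): how a kimage entry list
 * is decoded.  An IND_DESTINATION entry sets the running destination,
 * each IND_SOURCE entry names a source page copied there (and the
 * destination then advances by one page), and IND_DONE terminates the
 * list.  IND_INDIRECTION chaining is omitted; the flag values and
 * addresses below are assumptions made only for this example.
 */
#include <stdio.h>

#define PAGE_SIZE	0x1000UL
#define IND_DESTINATION	0x1UL
#define IND_INDIRECTION	0x2UL
#define IND_DONE	0x4UL
#define IND_SOURCE	0x8UL
#define ADDR_MASK	(~0xfffUL)

int main(void)
{
	unsigned long list[] = {
		0x10000000UL | IND_DESTINATION,
		0x00234000UL | IND_SOURCE,
		0x00567000UL | IND_SOURCE,
		0x20000000UL | IND_DESTINATION,
		0x00891000UL | IND_SOURCE,
		IND_DONE,
	};
	unsigned long destination = 0;

	for (unsigned i = 0; !(list[i] & IND_DONE); i++) {
		unsigned long entry = list[i];

		if (entry & IND_DESTINATION) {
			destination = entry & ADDR_MASK;
		} else if (entry & IND_SOURCE) {
			printf("copy page %#lx -> %#lx\n",
			       entry & ADDR_MASK, destination);
			destination += PAGE_SIZE;
		}
	}
	return 0;
}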
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+       struct page *page;
+
+       page = pfn_to_page(entry >> PAGE_SHIFT);
+       kimage_free_pages(page);
+}
+
+void kimage_free(struct kimage *image)
+{
+       kimage_entry_t *ptr, entry;
+       kimage_entry_t ind = 0;
+
+       if (!image)
+               return;
+
+       kimage_free_extra_pages(image);
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_INDIRECTION) {
+                       /* Free the previous indirection page */
+                       if (ind & IND_INDIRECTION)
+                               kimage_free_entry(ind);
+                       /* Save this indirection page until we are
+                        * done with it.
+                        */
+                       ind = entry;
+               } else if (entry & IND_SOURCE)
+                       kimage_free_entry(entry);
+       }
+       /* Free the final indirection page */
+       if (ind & IND_INDIRECTION)
+               kimage_free_entry(ind);
+
+       /* Handle any machine specific cleanup */
+       machine_kexec_cleanup(image);
+
+       /* Free the kexec control pages... */
+       kimage_free_page_list(&image->control_pages);
+
+       /*
+        * Free up any temporary buffers allocated. This might hit if an
+        * error occurred long after buffer allocation.
+        */
+       if (image->file_mode)
+               kimage_file_post_load_cleanup(image);
+
+       kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+                                       unsigned long page)
+{
+       kimage_entry_t *ptr, entry;
+       unsigned long destination = 0;
+
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_DESTINATION)
+                       destination = entry & PAGE_MASK;
+               else if (entry & IND_SOURCE) {
+                       if (page == destination)
+                               return ptr;
+                       destination += PAGE_SIZE;
+               }
+       }
+
+       return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+                                       gfp_t gfp_mask,
+                                       unsigned long destination)
+{
+       /*
+        * Here we implement safeguards to ensure that a source page
+        * is not copied to its destination page before the data on
+        * the destination page is no longer useful.
+        *
+        * To do this we maintain the invariant that a source page is
+        * either its own destination page, or it is not a
+        * destination page at all.
+        *
+        * That is slightly stronger than required, but the proof
+        * that no problems will occur is trivial, and the
+        * implementation is simple to verify.
+        *
+        * When allocating all pages normally this algorithm will run
+        * in O(N) time, but in the worst case it will run in O(N^2)
+        * time.   If the runtime is a problem the data structures can
+        * be fixed.
+        */
+       struct page *page;
+       unsigned long addr;
+
+       /*
+        * Walk through the list of destination pages, and see if I
+        * have a match.
+        */
+       list_for_each_entry(page, &image->dest_pages, lru) {
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+               if (addr == destination) {
+                       list_del(&page->lru);
+                       return page;
+               }
+       }
+       page = NULL;
+       while (1) {
+               kimage_entry_t *old;
+
+               /* Allocate a page, if we run out of memory give up */
+               page = kimage_alloc_pages(gfp_mask, 0);
+               if (!page)
+                       return NULL;
+               /* If the page cannot be used file it away */
+               if (page_to_pfn(page) >
+                               (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+                       list_add(&page->lru, &image->unusable_pages);
+                       continue;
+               }
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+
+               /* If it is the destination page we want, use it */
+               if (addr == destination)
+                       break;
+
+               /* If the page is not a destination page use it */
+               if (!kimage_is_destination_range(image, addr,
+                                                 addr + PAGE_SIZE))
+                       break;
+
+               /*
+                * I know that the page is someone's destination page.
+                * See if there is already a source page for this
+                * destination page.  And if so swap the source pages.
+                */
+               old = kimage_dst_used(image, addr);
+               if (old) {
+                       /* If so move it */
+                       unsigned long old_addr;
+                       struct page *old_page;
+
+                       old_addr = *old & PAGE_MASK;
+                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       copy_highpage(page, old_page);
+                       *old = addr | (*old & ~PAGE_MASK);
+
+                       /* The old page I have found cannot be a
+                        * destination page, so return it if its
+                        * gfp_flags honor the ones passed in.
+                        */
+                       if (!(gfp_mask & __GFP_HIGHMEM) &&
+                           PageHighMem(old_page)) {
+                               kimage_free_pages(old_page);
+                               continue;
+                       }
+                       addr = old_addr;
+                       page = old_page;
+                       break;
+               }
+               /* Place the page on the destination list, to be used later */
+               list_add(&page->lru, &image->dest_pages);
+       }
+
+       return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+                                        struct kexec_segment *segment)
+{
+       unsigned long maddr;
+       size_t ubytes, mbytes;
+       int result;
+       unsigned char __user *buf = NULL;
+       unsigned char *kbuf = NULL;
+
+       result = 0;
+       if (image->file_mode)
+               kbuf = segment->kbuf;
+       else
+               buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+
+       result = kimage_set_destination(image, maddr);
+       if (result < 0)
+               goto out;
+
+       while (mbytes) {
+               struct page *page;
+               char *ptr;
+               size_t uchunk, mchunk;
+
+               page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+               if (!page) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               result = kimage_add_page(image, page_to_pfn(page)
+                                                               << PAGE_SHIFT);
+               if (result < 0)
+                       goto out;
+
+               ptr = kmap(page);
+               /* Start with a clear page */
+               clear_page(ptr);
+               ptr += maddr & ~PAGE_MASK;
+               mchunk = min_t(size_t, mbytes,
+                               PAGE_SIZE - (maddr & ~PAGE_MASK));
+               uchunk = min(ubytes, mchunk);
+
+               /* For file based kexec, source pages are in kernel memory */
+               if (image->file_mode)
+                       memcpy(ptr, kbuf, uchunk);
+               else
+                       result = copy_from_user(ptr, buf, uchunk);
+               kunmap(page);
+               if (result) {
+                       result = -EFAULT;
+                       goto out;
+               }
+               ubytes -= uchunk;
+               maddr  += mchunk;
+               if (image->file_mode)
+                       kbuf += mchunk;
+               else
+                       buf += mchunk;
+               mbytes -= mchunk;
+       }
+out:
+       return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+                                       struct kexec_segment *segment)
+{
+       /* For crash dump kernels we simply copy the data from
+        * user space to its destination.
+        * We do things a page at a time for the sake of kmap.
+        */
+       unsigned long maddr;
+       size_t ubytes, mbytes;
+       int result;
+       unsigned char __user *buf = NULL;
+       unsigned char *kbuf = NULL;
+
+       result = 0;
+       if (image->file_mode)
+               kbuf = segment->kbuf;
+       else
+               buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+       while (mbytes) {
+               struct page *page;
+               char *ptr;
+               size_t uchunk, mchunk;
+
+               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               if (!page) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               ptr = kmap(page);
+               ptr += maddr & ~PAGE_MASK;
+               mchunk = min_t(size_t, mbytes,
+                               PAGE_SIZE - (maddr & ~PAGE_MASK));
+               uchunk = min(ubytes, mchunk);
+               if (mchunk > uchunk) {
+                       /* Zero the trailing part of the page */
+                       memset(ptr + uchunk, 0, mchunk - uchunk);
+               }
+
+               /* For file based kexec, source pages are in kernel memory */
+               if (image->file_mode)
+                       memcpy(ptr, kbuf, uchunk);
+               else
+                       result = copy_from_user(ptr, buf, uchunk);
+               kexec_flush_icache_page(page);
+               kunmap(page);
+               if (result) {
+                       result = -EFAULT;
+                       goto out;
+               }
+               ubytes -= uchunk;
+               maddr  += mchunk;
+               if (image->file_mode)
+                       kbuf += mchunk;
+               else
+                       buf += mchunk;
+               mbytes -= mchunk;
+       }
+out:
+       return result;
+}
+
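+/* Dispatch to the normal or crash segment loader based on the image type. */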
+int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       int result = -ENOMEM;
+
+       switch (image->type) {
+       case KEXEC_TYPE_DEFAULT:
+               result = kimage_load_normal_segment(image, segment);
+               break;
+       case KEXEC_TYPE_CRASH:
+               result = kimage_load_crash_segment(image, segment);
+               break;
+       }
+
+       return result;
+}
+
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+int kexec_load_disabled;
+
+void crash_kexec(struct pt_regs *regs)
+{
+       /* Take the kexec_mutex here to prevent sys_kexec_load
+        * running on one cpu from replacing the crash kernel
+        * we are using after a panic on a different cpu.
+        *
+        * If the crash kernel was not located in a fixed area
+        * of memory the xchg(&kexec_crash_image) would be
+        * sufficient.  But since I reuse the memory...
+        */
+       if (mutex_trylock(&kexec_mutex)) {
+               if (kexec_crash_image) {
+                       struct pt_regs fixed_regs;
+
+                       crash_setup_regs(&fixed_regs, regs);
+                       crash_save_vmcoreinfo();
+                       machine_crash_shutdown(&fixed_regs);
+                       machine_kexec(kexec_crash_image);
+               }
+               mutex_unlock(&kexec_mutex);
+       }
+}
+
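+/* Return the size of the memory currently reserved for the crash kernel. */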
+size_t crash_get_memory_size(void)
+{
+       size_t size = 0;
+
+       mutex_lock(&kexec_mutex);
+       if (crashk_res.end != crashk_res.start)
+               size = resource_size(&crashk_res);
+       mutex_unlock(&kexec_mutex);
+       return size;
+}
+
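+/*
+ * Default implementation: hand the reserved crash kernel pages in
+ * [begin, end) back to the page allocator.
+ */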
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+                                          unsigned long end)
+{
+       unsigned long addr;
+
+       for (addr = begin; addr < end; addr += PAGE_SIZE)
+               free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
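+/*
+ * Shrink the reserved crash kernel region to new_size and return the freed
+ * tail to the system as "System RAM".  Fails with -ENOENT once a crash
+ * kernel image has been loaded.
+ */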
+int crash_shrink_memory(unsigned long new_size)
+{
+       int ret = 0;
+       unsigned long start, end;
+       unsigned long old_size;
+       struct resource *ram_res;
+
+       mutex_lock(&kexec_mutex);
+
+       if (kexec_crash_image) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+       start = crashk_res.start;
+       end = crashk_res.end;
+       old_size = (end == 0) ? 0 : end - start + 1;
+       if (new_size >= old_size) {
+               ret = (new_size == old_size) ? 0 : -EINVAL;
+               goto unlock;
+       }
+
+       ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+       if (!ram_res) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+       end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+       crash_map_reserved_pages();
+       crash_free_reserved_phys_range(end, crashk_res.end);
+
+       if ((start == end) && (crashk_res.parent != NULL))
+               release_resource(&crashk_res);
+
+       ram_res->start = end;
+       ram_res->end = crashk_res.end;
+       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       ram_res->name = "System RAM";
+
+       crashk_res.end = end - 1;
+
+       insert_resource(&iomem_resource, ram_res);
+       crash_unmap_reserved_pages();
+
+unlock:
+       mutex_unlock(&kexec_mutex);
+       return ret;
+}
+
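+/*
+ * Append one ELF note (header, name, descriptor) to buf, padding each part
+ * to a 4-byte boundary, and return the advanced buffer position.
+ */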
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+                           size_t data_len)
+{
+       struct elf_note note;
+
+       note.n_namesz = strlen(name) + 1;
+       note.n_descsz = data_len;
+       note.n_type   = type;
+       memcpy(buf, &note, sizeof(note));
+       buf += (sizeof(note) + 3)/4;
+       memcpy(buf, name, note.n_namesz);
+       buf += (note.n_namesz + 3)/4;
+       memcpy(buf, data, note.n_descsz);
+       buf += (note.n_descsz + 3)/4;
+
+       return buf;
+}
+
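+/* Terminate the note buffer with an empty (all-zero) note header. */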
+static void final_note(u32 *buf)
+{
+       struct elf_note note;
+
+       note.n_namesz = 0;
+       note.n_descsz = 0;
+       note.n_type   = 0;
+       memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+       struct elf_prstatus prstatus;
+       u32 *buf;
+
+       if ((cpu < 0) || (cpu >= nr_cpu_ids))
+               return;
+
+       /* Using ELF notes here is opportunistic.
+        * I need a well-defined structure format
+        * for the data I pass, and I need tags
+        * on the data to indicate what information I have
+        * squirrelled away.  ELF notes happen to provide
+        * all of that, so there is no need to invent something new.
+        */
+       buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+       if (!buf)
+               return;
+       memset(&prstatus, 0, sizeof(prstatus));
+       prstatus.pr_pid = current->pid;
+       elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+       buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+                             &prstatus, sizeof(prstatus));
+       final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+       /* Allocate memory for saving cpu registers. */
+       size_t size, align;
+
+       /*
+        * crash_notes could be allocated across 2 vmalloc pages when percpu
+        * is vmalloc based.  vmalloc doesn't guarantee that 2 contiguous
+        * vmalloc pages are also on 2 contiguous physical pages.  In that
+        * case the 2nd part of crash_notes in the 2nd page could be lost,
+        * since only the starting address and size of crash_notes are
+        * exported through sysfs.  Here we round up the size of crash_notes
+        * to the nearest power of two and pass it to __alloc_percpu as the
+        * align value, which makes sure crash_notes is allocated inside one
+        * physical page.
+        */
+       size = sizeof(note_buf_t);
+       align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+       /*
+        * Break the compile if size is bigger than PAGE_SIZE, since
+        * crash_notes would then definitely span 2 pages.
+        */
+       BUILD_BUG_ON(size > PAGE_SIZE);
+
+       crash_notes = __alloc_percpu(size, align);
+       if (!crash_notes) {
+               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+               return -ENOMEM;
+       }
+       return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * Parsing the "crashkernel" command line.
+ *
+ * This code is intended to be called from architecture-specific code.
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ *   crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+                                       unsigned long long system_ram,
+                                       unsigned long long *crash_size,
+                                       unsigned long long *crash_base)
+{
+       char *cur = cmdline, *tmp;
+
+       /* for each entry of the comma-separated list */
+       do {
+               unsigned long long start, end = ULLONG_MAX, size;
+
+               /* get the start of the range */
+               start = memparse(cur, &tmp);
+               if (cur == tmp) {
+                       pr_warn("crashkernel: Memory value expected\n");
+                       return -EINVAL;
+               }
+               cur = tmp;
+               if (*cur != '-') {
+                       pr_warn("crashkernel: '-' expected\n");
+                       return -EINVAL;
+               }
+               cur++;
+
+               /* if no ':' is here, then we read the end */
+               if (*cur != ':') {
+                       end = memparse(cur, &tmp);
+                       if (cur == tmp) {
+                               pr_warn("crashkernel: Memory value expected\n");
+                               return -EINVAL;
+                       }
+                       cur = tmp;
+                       if (end <= start) {
+                               pr_warn("crashkernel: end <= start\n");
+                               return -EINVAL;
+                       }
+               }
+
+               if (*cur != ':') {
+                       pr_warn("crashkernel: ':' expected\n");
+                       return -EINVAL;
+               }
+               cur++;
+
+               size = memparse(cur, &tmp);
+               if (cur == tmp) {
+                       pr_warn("Memory value expected\n");
+                       return -EINVAL;
+               }
+               cur = tmp;
+               if (size >= system_ram) {
+                       pr_warn("crashkernel: invalid size\n");
+                       return -EINVAL;
+               }
+
+               /* does this range match? */
+               if (system_ram >= start && system_ram < end) {
+                       *crash_size = size;
+                       break;
+               }
+       } while (*cur++ == ',');
+
+       if (*crash_size > 0) {
+               while (*cur && *cur != ' ' && *cur != '@')
+                       cur++;
+               if (*cur == '@') {
+                       cur++;
+                       *crash_base = memparse(cur, &tmp);
+                       if (cur == tmp) {
+                               pr_warn("Memory value expected after '@'\n");
+                               return -EINVAL;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * This function parses "simple" (old) crashkernel command lines like
+ *
+ *     crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+                                          unsigned long long *crash_size,
+                                          unsigned long long *crash_base)
+{
+       char *cur = cmdline;
+
+       *crash_size = memparse(cmdline, &cur);
+       if (cmdline == cur) {
+               pr_warn("crashkernel: memory value expected\n");
+               return -EINVAL;
+       }
+
+       if (*cur == '@')
+               *crash_base = memparse(cur+1, &cur);
+       else if (*cur != ' ' && *cur != '\0') {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW  1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+       [SUFFIX_HIGH] = ",high",
+       [SUFFIX_LOW]  = ",low",
+       [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * This function parses "suffix" crashkernel command lines like
+ *
+ *     crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+                                          unsigned long long   *crash_size,
+                                          const char *suffix)
+{
+       char *cur = cmdline;
+
+       *crash_size = memparse(cmdline, &cur);
+       if (cmdline == cur) {
+               pr_warn("crashkernel: memory value expected\n");
+               return -EINVAL;
+       }
+
+       /* check with suffix */
+       if (strncmp(cur, suffix, strlen(suffix))) {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+       cur += strlen(suffix);
+       if (*cur != ' ' && *cur != '\0') {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
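+/*
+ * Find the last "crashkernel=" entry on the command line that matches the
+ * requested suffix, or, if no suffix is requested, the last entry without
+ * any known suffix.
+ */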
+static __init char *get_last_crashkernel(char *cmdline,
+                            const char *name,
+                            const char *suffix)
+{
+       char *p = cmdline, *ck_cmdline = NULL;
+
+       /* find crashkernel and use the last one if there are more */
+       p = strstr(p, name);
+       while (p) {
+               char *end_p = strchr(p, ' ');
+               char *q;
+
+               if (!end_p)
+                       end_p = p + strlen(p);
+
+               if (!suffix) {
+                       int i;
+
+                       /* skip the one with any known suffix */
+                       for (i = 0; suffix_tbl[i]; i++) {
+                               q = end_p - strlen(suffix_tbl[i]);
+                               if (!strncmp(q, suffix_tbl[i],
+                                            strlen(suffix_tbl[i])))
+                                       goto next;
+                       }
+                       ck_cmdline = p;
+               } else {
+                       q = end_p - strlen(suffix);
+                       if (!strncmp(q, suffix, strlen(suffix)))
+                               ck_cmdline = p;
+               }
+next:
+               p = strstr(p+1, name);
+       }
+
+       if (!ck_cmdline)
+               return NULL;
+
+       return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base,
+                            const char *name,
+                            const char *suffix)
+{
+       char    *first_colon, *first_space;
+       char    *ck_cmdline;
+
+       BUG_ON(!crash_size || !crash_base);
+       *crash_size = 0;
+       *crash_base = 0;
+
+       ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+       if (!ck_cmdline)
+               return -EINVAL;
+
+       ck_cmdline += strlen(name);
+
+       if (suffix)
+               return parse_crashkernel_suffix(ck_cmdline, crash_size,
+                               suffix);
+       /*
+        * if the commandline contains a ':', then that's the extended
+        * syntax -- if not, it must be the classic syntax
+        */
+       first_colon = strchr(ck_cmdline, ':');
+       first_space = strchr(ck_cmdline, ' ');
+       if (first_colon && (!first_space || first_colon < first_space))
+               return parse_crashkernel_mem(ck_cmdline, system_ram,
+                               crash_size, crash_base);
+
+       return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                                       "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                               "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                               "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
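+/* Regenerate the vmcoreinfo ELF note from the current vmcoreinfo data. */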
+static void update_vmcoreinfo_note(void)
+{
+       u32 *buf = vmcoreinfo_note;
+
+       if (!vmcoreinfo_size)
+               return;
+       buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+                             vmcoreinfo_size);
+       final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+       vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+       update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+       va_list args;
+       char buf[0x50];
+       size_t r;
+
+       va_start(args, fmt);
+       r = vscnprintf(buf, sizeof(buf), fmt, args);
+       va_end(args);
+
+       r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+       memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+       vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+       return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+       VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+       VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+       VMCOREINFO_SYMBOL(init_uts_ns);
+       VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+       VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+       VMCOREINFO_SYMBOL(_stext);
+       VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+       VMCOREINFO_SYMBOL(mem_map);
+       VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+       VMCOREINFO_SYMBOL(mem_section);
+       VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+       VMCOREINFO_STRUCT_SIZE(mem_section);
+       VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+       VMCOREINFO_STRUCT_SIZE(page);
+       VMCOREINFO_STRUCT_SIZE(pglist_data);
+       VMCOREINFO_STRUCT_SIZE(zone);
+       VMCOREINFO_STRUCT_SIZE(free_area);
+       VMCOREINFO_STRUCT_SIZE(list_head);
+       VMCOREINFO_SIZE(nodemask_t);
+       VMCOREINFO_OFFSET(page, flags);
+       VMCOREINFO_OFFSET(page, _count);
+       VMCOREINFO_OFFSET(page, mapping);
+       VMCOREINFO_OFFSET(page, lru);
+       VMCOREINFO_OFFSET(page, _mapcount);
+       VMCOREINFO_OFFSET(page, private);
+       VMCOREINFO_OFFSET(pglist_data, node_zones);
+       VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+       VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+       VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+       VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+       VMCOREINFO_OFFSET(pglist_data, node_id);
+       VMCOREINFO_OFFSET(zone, free_area);
+       VMCOREINFO_OFFSET(zone, vm_stat);
+       VMCOREINFO_OFFSET(zone, spanned_pages);
+       VMCOREINFO_OFFSET(free_area, free_list);
+       VMCOREINFO_OFFSET(list_head, next);
+       VMCOREINFO_OFFSET(list_head, prev);
+       VMCOREINFO_OFFSET(vmap_area, va_start);
+       VMCOREINFO_OFFSET(vmap_area, list);
+       VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+       log_buf_kexec_setup();
+       VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+       VMCOREINFO_NUMBER(NR_FREE_PAGES);
+       VMCOREINFO_NUMBER(PG_lru);
+       VMCOREINFO_NUMBER(PG_private);
+       VMCOREINFO_NUMBER(PG_swapcache);
+       VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+       VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+       VMCOREINFO_NUMBER(PG_head_mask);
+       VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_X86
+       VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+#endif
+#ifdef CONFIG_HUGETLBFS
+       VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+       arch_crash_save_vmcoreinfo();
+       update_vmcoreinfo_note();
+
+       return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable.  If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+       int error = 0;
+
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
+       if (!kexec_image) {
+               error = -EINVAL;
+               goto Unlock;
+       }
+
+#ifdef CONFIG_KEXEC_JUMP
+       if (kexec_image->preserve_context) {
+               lock_system_sleep();
+               pm_prepare_console();
+               error = freeze_processes();
+               if (error) {
+                       error = -EBUSY;
+                       goto Restore_console;
+               }
+               suspend_console();
+               error = dpm_suspend_start(PMSG_FREEZE);
+               if (error)
+                       goto Resume_console;
+               /* At this point, dpm_suspend_start() has been called,
+                * but *not* dpm_suspend_end(). We *must* call
+                * dpm_suspend_end() now.  Otherwise, drivers for
+                * some devices (e.g. interrupt controllers) become
+                * desynchronized with the actual state of the
+                * hardware at resume time, and evil weirdness ensues.
+                */
+               error = dpm_suspend_end(PMSG_FREEZE);
+               if (error)
+                       goto Resume_devices;
+               error = disable_nonboot_cpus();
+               if (error)
+                       goto Enable_cpus;
+               local_irq_disable();
+               error = syscore_suspend();
+               if (error)
+                       goto Enable_irqs;
+       } else
+#endif
+       {
+               kexec_in_progress = true;
+               kernel_restart_prepare(NULL);
+               migrate_to_reboot_cpu();
+
+               /*
+                * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+                * no further code needs to use CPU hotplug (which is true in
+                * the reboot case). However, the kexec path depends on using
+                * CPU hotplug again; so re-enable it here.
+                */
+               cpu_hotplug_enable();
+               pr_emerg("Starting new kernel\n");
+               machine_shutdown();
+       }
+
+       machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+       if (kexec_image->preserve_context) {
+               syscore_resume();
+ Enable_irqs:
+               local_irq_enable();
+ Enable_cpus:
+               enable_nonboot_cpus();
+               dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+               dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+               resume_console();
+               thaw_processes();
+ Restore_console:
+               pm_restore_console();
+               unlock_system_sleep();
+       }
+#endif
+
+ Unlock:
+       mutex_unlock(&kexec_mutex);
+       return error;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644 (file)
index 0000000..6a9a3f2
--- /dev/null
@@ -0,0 +1,1045 @@
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ *      Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include "kexec_internal.h"
+
+/*
+ * Declare these symbols weak so that if the architecture provides a
+ * purgatory, these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
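+/*
+ * Read the whole file referenced by fd into a freshly vmalloc()ed buffer.
+ * On success *buf and *buf_len describe the copy, which the caller must
+ * eventually vfree().
+ */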
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+       struct fd f = fdget(fd);
+       int ret;
+       struct kstat stat;
+       loff_t pos;
+       ssize_t bytes = 0;
+
+       if (!f.file)
+               return -EBADF;
+
+       ret = vfs_getattr(&f.file->f_path, &stat);
+       if (ret)
+               goto out;
+
+       if (stat.size > INT_MAX) {
+               ret = -EFBIG;
+               goto out;
+       }
+
+       /* Don't hand 0 to vmalloc, it whines. */
+       if (stat.size == 0) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       *buf = vmalloc(stat.size);
+       if (!*buf) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       pos = 0;
+       while (pos < stat.size) {
+               bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+                                   stat.size - pos);
+               if (bytes < 0) {
+                       vfree(*buf);
+                       ret = bytes;
+                       goto out;
+               }
+
+               if (bytes == 0)
+                       break;
+               pos += bytes;
+       }
+
+       if (pos != stat.size) {
+               ret = -EBADF;
+               vfree(*buf);
+               goto out;
+       }
+
+       *buf_len = pos;
+out:
+       fdput(f);
+       return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+                                        unsigned long buf_len)
+{
+       return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+       return ERR_PTR(-ENOEXEC);
+}
+
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+       return -EINVAL;
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+                                       unsigned long buf_len)
+{
+       return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                                unsigned int relsec)
+{
+       pr_err("RELA relocation unsupported.\n");
+       return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                            unsigned int relsec)
+{
+       pr_err("REL relocation unsupported.\n");
+       return -ENOEXEC;
+}
+
+/*
+ * Free up the memory used by the kernel, initrd, and command line.  These
+ * are temporary memory allocations which are no longer needed after the
+ * buffers have been loaded into separate segments and copied elsewhere.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+
+       vfree(image->kernel_buf);
+       image->kernel_buf = NULL;
+
+       vfree(image->initrd_buf);
+       image->initrd_buf = NULL;
+
+       kfree(image->cmdline_buf);
+       image->cmdline_buf = NULL;
+
+       vfree(pi->purgatory_buf);
+       pi->purgatory_buf = NULL;
+
+       vfree(pi->sechdrs);
+       pi->sechdrs = NULL;
+
+       /* See if the architecture has anything to clean up post load */
+       arch_kimage_file_post_load_cleanup(image);
+
+       /*
+        * The above call should have called into the bootloader to free up
+        * any data stored in kimage->image_loader_data.  It should be OK
+        * now to free it up.
+        */
+       kfree(image->image_loader_data);
+       image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode the list of segments is prepared by the kernel.  Copy the
+ * relevant data from user space, do error checking, and prepare the
+ * segment list.
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+                            const char __user *cmdline_ptr,
+                            unsigned long cmdline_len, unsigned flags)
+{
+       int ret = 0;
+       void *ldata;
+
+       ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+                               &image->kernel_buf_len);
+       if (ret)
+               return ret;
+
+       /* Call arch image probe handlers */
+       ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+                                           image->kernel_buf_len);
+
+       if (ret)
+               goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+       ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+                                          image->kernel_buf_len);
+       if (ret) {
+               pr_debug("kernel signature verification failed.\n");
+               goto out;
+       }
+       pr_debug("kernel signature verification successful.\n");
+#endif
+       /* It is possible that no initramfs is being loaded */
+       if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+               ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+                                       &image->initrd_buf_len);
+               if (ret)
+                       goto out;
+       }
+
+       if (cmdline_len) {
+               image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+               if (!image->cmdline_buf) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+
+               ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+                                    cmdline_len);
+               if (ret) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+
+               image->cmdline_buf_len = cmdline_len;
+
+               /* The command line should be a string whose last byte is NUL */
+               if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /* Call arch image load handlers */
+       ldata = arch_kexec_kernel_image_load(image);
+
+       if (IS_ERR(ldata)) {
+               ret = PTR_ERR(ldata);
+               goto out;
+       }
+
+       image->image_loader_data = ldata;
+out:
+       /* In case of error, free up all allocated memory in this function */
+       if (ret)
+               kimage_file_post_load_cleanup(image);
+       return ret;
+}
+
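+/*
+ * Allocate and initialize a kimage for the file based syscall: prepare the
+ * segments from the supplied file descriptors and command line, then
+ * allocate the control pages (and the swap page in the non-crash case).
+ */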
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+                      int initrd_fd, const char __user *cmdline_ptr,
+                      unsigned long cmdline_len, unsigned long flags)
+{
+       int ret;
+       struct kimage *image;
+       bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+       image = do_kimage_alloc_init();
+       if (!image)
+               return -ENOMEM;
+
+       image->file_mode = 1;
+
+       if (kexec_on_panic) {
+               /* Enable special crash kernel control page alloc policy. */
+               image->control_page = crashk_res.start;
+               image->type = KEXEC_TYPE_CRASH;
+       }
+
+       ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+                                          cmdline_ptr, cmdline_len, flags);
+       if (ret)
+               goto out_free_image;
+
+       ret = sanity_check_segment_list(image);
+       if (ret)
+               goto out_free_post_load_bufs;
+
+       ret = -ENOMEM;
+       image->control_code_page = kimage_alloc_control_pages(image,
+                                          get_order(KEXEC_CONTROL_PAGE_SIZE));
+       if (!image->control_code_page) {
+               pr_err("Could not allocate control_code_buffer\n");
+               goto out_free_post_load_bufs;
+       }
+
+       if (!kexec_on_panic) {
+               image->swap_page = kimage_alloc_control_pages(image, 0);
+               if (!image->swap_page) {
+                       pr_err("Could not allocate swap buffer\n");
+                       goto out_free_control_pages;
+               }
+       }
+
+       *rimage = image;
+       return 0;
+out_free_control_pages:
+       kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+       kimage_file_post_load_cleanup(image);
+out_free_image:
+       kfree(image);
+       return ret;
+}
+
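+/*
+ * kexec_file_load syscall: load (or, with KEXEC_FILE_UNLOAD, unload) a kexec
+ * image described by kernel and initrd file descriptors plus a command line,
+ * optionally into the crash kernel region.
+ */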
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+               unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+               unsigned long, flags)
+{
+       int ret = 0, i;
+       struct kimage **dest_image, *image;
+
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+               return -EPERM;
+
+       /* Make sure we have a legal set of flags */
+       if (flags != (flags & KEXEC_FILE_FLAGS))
+               return -EINVAL;
+
+       image = NULL;
+
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
+
+       dest_image = &kexec_image;
+       if (flags & KEXEC_FILE_ON_CRASH)
+               dest_image = &kexec_crash_image;
+
+       if (flags & KEXEC_FILE_UNLOAD)
+               goto exchange;
+
+       /*
+        * In case of crash, the new kernel gets loaded in the reserved
+        * region.  It is the same memory where the old crash kernel might be
+        * loaded.  Free any current crash dump kernel before we corrupt it.
+        */
+       if (flags & KEXEC_FILE_ON_CRASH)
+               kimage_free(xchg(&kexec_crash_image, NULL));
+
+       ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+                                    cmdline_len, flags);
+       if (ret)
+               goto out;
+
+       ret = machine_kexec_prepare(image);
+       if (ret)
+               goto out;
+
+       ret = kexec_calculate_store_digests(image);
+       if (ret)
+               goto out;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               struct kexec_segment *ksegment;
+
+               ksegment = &image->segment[i];
+               pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+                        i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+                        ksegment->memsz);
+
+               ret = kimage_load_segment(image, &image->segment[i]);
+               if (ret)
+                       goto out;
+       }
+
+       kimage_terminate(image);
+
+       /*
+        * Free up any temporary buffers allocated which are not needed
+        * after the image has been loaded
+        */
+       kimage_file_post_load_cleanup(image);
+exchange:
+       image = xchg(dest_image, image);
+out:
+       mutex_unlock(&kexec_mutex);
+       kimage_free(image);
+       return ret;
+}
+
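+/*
+ * Search downwards from the top of the given range for an aligned hole of
+ * kbuf->memsz bytes that does not overlap any existing segment.  Returns 1
+ * to stop the resource walk on success, 0 to continue with the next range.
+ */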
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+                                   struct kexec_buf *kbuf)
+{
+       struct kimage *image = kbuf->image;
+       unsigned long temp_start, temp_end;
+
+       temp_end = min(end, kbuf->buf_max);
+       temp_start = temp_end - kbuf->memsz;
+
+       do {
+               /* align down start */
+               temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+               if (temp_start < start || temp_start < kbuf->buf_min)
+                       return 0;
+
+               temp_end = temp_start + kbuf->memsz - 1;
+
+               /*
+                * Make sure this does not conflict with any of existing
+                * segments
+                */
+               if (kimage_is_destination_range(image, temp_start, temp_end)) {
+                       temp_start = temp_start - PAGE_SIZE;
+                       continue;
+               }
+
+               /* We found a suitable memory range */
+               break;
+       } while (1);
+
+       /* If we are here, we found a suitable memory range */
+       kbuf->mem = temp_start;
+
+       /* Success, stop navigating through remaining System RAM ranges */
+       return 1;
+}
+
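+/*
+ * Mirror of locate_mem_hole_top_down(): search upwards from the bottom of
+ * the range for an aligned, non-conflicting hole of kbuf->memsz bytes.
+ */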
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+                                    struct kexec_buf *kbuf)
+{
+       struct kimage *image = kbuf->image;
+       unsigned long temp_start, temp_end;
+
+       temp_start = max(start, kbuf->buf_min);
+
+       do {
+               temp_start = ALIGN(temp_start, kbuf->buf_align);
+               temp_end = temp_start + kbuf->memsz - 1;
+
+               if (temp_end > end || temp_end > kbuf->buf_max)
+                       return 0;
+               /*
+                * Make sure this does not conflict with any of existing
+                * segments
+                */
+               if (kimage_is_destination_range(image, temp_start, temp_end)) {
+                       temp_start = temp_start + PAGE_SIZE;
+                       continue;
+               }
+
+               /* We found a suitable memory range */
+               break;
+       } while (1);
+
+       /* If we are here, we found a suitable memory range */
+       kbuf->mem = temp_start;
+
+       /* Success, stop navigating through remaining System RAM ranges */
+       return 1;
+}
+
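+/*
+ * Callback for the RAM resource walkers: skip ranges that are too small or
+ * outside [buf_min, buf_max], otherwise search the range top-down or
+ * bottom-up as requested by the caller.
+ */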
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+       struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+       unsigned long sz = end - start + 1;
+
+       /* Returning 0 will take to next memory range */
+       if (sz < kbuf->memsz)
+               return 0;
+
+       if (end < kbuf->buf_min || start > kbuf->buf_max)
+               return 0;
+
+       /*
+        * Allocate memory top-down within the RAM range; otherwise, allocate
+        * bottom-up.
+        */
+       if (kbuf->top_down)
+               return locate_mem_hole_top_down(start, end, kbuf);
+       return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+                    unsigned long memsz, unsigned long buf_align,
+                    unsigned long buf_min, unsigned long buf_max,
+                    bool top_down, unsigned long *load_addr)
+{
+
+       struct kexec_segment *ksegment;
+       struct kexec_buf buf, *kbuf;
+       int ret;
+
+       /* Currently, adding a segment this way is allowed only in file mode */
+       if (!image->file_mode)
+               return -EINVAL;
+
+       if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+               return -EINVAL;
+
+       /*
+        * Make sure we are not trying to add a buffer after allocating
+        * control pages.  All segments need to be placed before any control
+        * pages are allocated, because the control page allocation logic
+        * goes through the list of segments to make sure there are no
+        * destination overlaps.
+        */
+       if (!list_empty(&image->control_pages)) {
+               WARN_ON(1);
+               return -EINVAL;
+       }
+
+       memset(&buf, 0, sizeof(struct kexec_buf));
+       kbuf = &buf;
+       kbuf->image = image;
+       kbuf->buffer = buffer;
+       kbuf->bufsz = bufsz;
+
+       kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+       kbuf->buf_align = max(buf_align, PAGE_SIZE);
+       kbuf->buf_min = buf_min;
+       kbuf->buf_max = buf_max;
+       kbuf->top_down = top_down;
+
+       /* Walk the RAM ranges and allocate a suitable range for the buffer */
+       if (image->type == KEXEC_TYPE_CRASH)
+               ret = walk_iomem_res("Crash kernel",
+                                    IORESOURCE_MEM | IORESOURCE_BUSY,
+                                    crashk_res.start, crashk_res.end, kbuf,
+                                    locate_mem_hole_callback);
+       else
+               ret = walk_system_ram_res(0, -1, kbuf,
+                                         locate_mem_hole_callback);
+       if (ret != 1) {
+               /* A suitable memory range could not be found for buffer */
+               return -EADDRNOTAVAIL;
+       }
+
+       /* Found a suitable memory range */
+       ksegment = &image->segment[image->nr_segments];
+       ksegment->kbuf = kbuf->buffer;
+       ksegment->bufsz = kbuf->bufsz;
+       ksegment->mem = kbuf->mem;
+       ksegment->memsz = kbuf->memsz;
+       image->nr_segments++;
+       *load_addr = ksegment->mem;
+       return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+       struct crypto_shash *tfm;
+       struct shash_desc *desc;
+       int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+       size_t desc_size, nullsz;
+       char *digest;
+       void *zero_buf;
+       struct kexec_sha_region *sha_regions;
+       struct purgatory_info *pi = &image->purgatory_info;
+
+       zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+       zero_buf_sz = PAGE_SIZE;
+
+       tfm = crypto_alloc_shash("sha256", 0, 0);
+       if (IS_ERR(tfm)) {
+               ret = PTR_ERR(tfm);
+               goto out;
+       }
+
+       desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+       desc = kzalloc(desc_size, GFP_KERNEL);
+       if (!desc) {
+               ret = -ENOMEM;
+               goto out_free_tfm;
+       }
+
+       sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+       sha_regions = vzalloc(sha_region_sz);
+       if (!sha_regions)
+               goto out_free_desc;
+
+       desc->tfm   = tfm;
+       desc->flags = 0;
+
+       ret = crypto_shash_init(desc);
+       if (ret < 0)
+               goto out_free_sha_regions;
+
+       digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+       if (!digest) {
+               ret = -ENOMEM;
+               goto out_free_sha_regions;
+       }
+
+       for (j = i = 0; i < image->nr_segments; i++) {
+               struct kexec_segment *ksegment;
+
+               ksegment = &image->segment[i];
+               /*
+                * Skip the purgatory segment, as it will be modified once we
+                * put the digest info into it.
+                */
+               if (ksegment->kbuf == pi->purgatory_buf)
+                       continue;
+
+               ret = crypto_shash_update(desc, ksegment->kbuf,
+                                         ksegment->bufsz);
+               if (ret)
+                       break;
+
+               /*
+                * Assume the rest of the buffer is filled with zeroes and
+                * update the digest accordingly.
+                */
+               nullsz = ksegment->memsz - ksegment->bufsz;
+               while (nullsz) {
+                       unsigned long bytes = nullsz;
+
+                       if (bytes > zero_buf_sz)
+                               bytes = zero_buf_sz;
+                       ret = crypto_shash_update(desc, zero_buf, bytes);
+                       if (ret)
+                               break;
+                       nullsz -= bytes;
+               }
+
+               if (ret)
+                       break;
+
+               sha_regions[j].start = ksegment->mem;
+               sha_regions[j].len = ksegment->memsz;
+               j++;
+       }
+
+       if (!ret) {
+               ret = crypto_shash_final(desc, digest);
+               if (ret)
+                       goto out_free_digest;
+               ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+                                               sha_regions, sha_region_sz, 0);
+               if (ret)
+                       goto out_free_digest;
+
+               ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+                                               digest, SHA256_DIGEST_SIZE, 0);
+               if (ret)
+                       goto out_free_digest;
+       }
+
+out_free_digest:
+       kfree(digest);
+out_free_sha_regions:
+       vfree(sha_regions);
+out_free_desc:
+       kfree(desc);
+out_free_tfm:
+       kfree(tfm);
+out:
+       return ret;
+}
+
+/* Actually load purgatory.  A lot of this code is taken from kexec-tools. */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+                                 unsigned long max, int top_down)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+       unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+       unsigned char *buf_addr, *src;
+       int i, ret = 0, entry_sidx = -1;
+       const Elf_Shdr *sechdrs_c;
+       Elf_Shdr *sechdrs = NULL;
+       void *purgatory_buf = NULL;
+
+       /*
+        * sechdrs_c points to the section headers in purgatory, which are
+        * read-only.  No modifications allowed.
+        */
+       sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+       /*
+        * We cannot modify sechdrs_c[] and its fields; it is read-only.
+        * Copy it over to a local copy where we can store some temporary
+        * data and free it at the end.  We need to modify the ->sh_addr and
+        * ->sh_offset fields to keep track of the permanent and temporary
+        * locations of sections.
+        */
+       sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+       if (!sechdrs)
+               return -ENOMEM;
+
+       memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+       /*
+        * We end up with multiple copies of sections.  The first copy is the
+        * one embedded in the kernel in a read-only section.  Some of these
+        * sections will be copied to a temporary buffer and relocated, and
+        * these sections will finally be copied to their final destination
+        * at segment load time.
+        *
+        * Use ->sh_offset to reflect the section address in memory.  It will
+        * point to the original read-only copy if the section is not
+        * allocatable; otherwise it will point to the temporary copy which
+        * will be relocated.
+        *
+        * Use ->sh_addr to contain the final address of the section where it
+        * will go during execution time.
+        */
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (sechdrs[i].sh_type == SHT_NOBITS)
+                       continue;
+
+               sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+                                               sechdrs[i].sh_offset;
+       }
+
+       /*
+        * Identify the entry point section and make the entry relative to
+        * the section start.
+        */
+       entry = pi->ehdr->e_entry;
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+                       continue;
+
+               /* Make entry section relative */
+               if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+                   ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+                    pi->ehdr->e_entry)) {
+                       entry_sidx = i;
+                       entry -= sechdrs[i].sh_addr;
+                       break;
+               }
+       }
+
+       /* Determine how much memory is needed to load relocatable object. */
+       buf_align = 1;
+       bss_align = 1;
+       buf_sz = 0;
+       bss_sz = 0;
+
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               align = sechdrs[i].sh_addralign;
+               if (sechdrs[i].sh_type != SHT_NOBITS) {
+                       if (buf_align < align)
+                               buf_align = align;
+                       buf_sz = ALIGN(buf_sz, align);
+                       buf_sz += sechdrs[i].sh_size;
+               } else {
+                       /* bss section */
+                       if (bss_align < align)
+                               bss_align = align;
+                       bss_sz = ALIGN(bss_sz, align);
+                       bss_sz += sechdrs[i].sh_size;
+               }
+       }
+
+       /* Determine the bss padding required to align bss properly */
+       bss_pad = 0;
+       if (buf_sz & (bss_align - 1))
+               bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+       memsz = buf_sz + bss_pad + bss_sz;
+
+       /* Allocate buffer for purgatory */
+       purgatory_buf = vzalloc(buf_sz);
+       if (!purgatory_buf) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       if (buf_align < bss_align)
+               buf_align = bss_align;
+
+       /* Add buffer to segment list */
+       ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+                               buf_align, min, max, top_down,
+                               &pi->purgatory_load_addr);
+       if (ret)
+               goto out;
+
+       /* Load SHF_ALLOC sections */
+       buf_addr = purgatory_buf;
+       load_addr = curr_load_addr = pi->purgatory_load_addr;
+       bss_addr = load_addr + buf_sz + bss_pad;
+
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               align = sechdrs[i].sh_addralign;
+               if (sechdrs[i].sh_type != SHT_NOBITS) {
+                       curr_load_addr = ALIGN(curr_load_addr, align);
+                       offset = curr_load_addr - load_addr;
+                       /* We already modified ->sh_offset to keep the source address */
+                       src = (char *) sechdrs[i].sh_offset;
+                       memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+                       /* Store load address and source address of section */
+                       sechdrs[i].sh_addr = curr_load_addr;
+
+                       /*
+                        * This section got copied to temporary buffer. Update
+                        * ->sh_offset accordingly.
+                        */
+                       sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+                       /* Advance to the next address */
+                       curr_load_addr += sechdrs[i].sh_size;
+               } else {
+                       bss_addr = ALIGN(bss_addr, align);
+                       sechdrs[i].sh_addr = bss_addr;
+                       bss_addr += sechdrs[i].sh_size;
+               }
+       }
+
+       /* Update entry point based on load address of text section */
+       if (entry_sidx >= 0)
+               entry += sechdrs[entry_sidx].sh_addr;
+
+       /* Make kernel jump to purgatory after shutdown */
+       image->start = entry;
+
+       /* Used later to get/set symbol values */
+       pi->sechdrs = sechdrs;
+
+       /*
+        * Used later to identify the purgatory buffer and skip it when
+        * checksumming.
+        */
+       pi->purgatory_buf = purgatory_buf;
+       return ret;
+out:
+       vfree(sechdrs);
+       vfree(purgatory_buf);
+       return ret;
+}
+
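+/*
+ * Walk the purgatory section headers and hand every SHT_RELA/SHT_REL section
+ * that targets an allocated section to the architecture relocation helpers.
+ */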
+static int kexec_apply_relocations(struct kimage *image)
+{
+       int i, ret;
+       struct purgatory_info *pi = &image->purgatory_info;
+       Elf_Shdr *sechdrs = pi->sechdrs;
+
+       /* Apply relocations */
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               Elf_Shdr *section, *symtab;
+
+               if (sechdrs[i].sh_type != SHT_RELA &&
+                   sechdrs[i].sh_type != SHT_REL)
+                       continue;
+
+               /*
+                * For a section of type SHT_RELA/SHT_REL,
+                * ->sh_link contains the section header index of the
+                * associated symbol table, and ->sh_info contains the section
+                * header index of the section to which the relocations apply.
+                */
+               if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+                   sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+                       return -ENOEXEC;
+
+               section = &sechdrs[sechdrs[i].sh_info];
+               symtab = &sechdrs[sechdrs[i].sh_link];
+
+               if (!(section->sh_flags & SHF_ALLOC))
+                       continue;
+
+               /*
+                * symtab->sh_link contains the section header index of the
+                * associated string table.
+                */
+               if (symtab->sh_link >= pi->ehdr->e_shnum)
+                       /* Invalid section number? */
+                       continue;
+
+               /*
+                * The respective architecture needs to provide support for
+                * applying relocations of type SHT_RELA/SHT_REL.
+                */
+               if (sechdrs[i].sh_type == SHT_RELA)
+                       ret = arch_kexec_apply_relocations_add(pi->ehdr,
+                                                              sechdrs, i);
+               else if (sechdrs[i].sh_type == SHT_REL)
+                       ret = arch_kexec_apply_relocations(pi->ehdr,
+                                                          sechdrs, i);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+                        unsigned long max, int top_down,
+                        unsigned long *load_addr)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       int ret;
+
+       if (kexec_purgatory_size <= 0)
+               return -EINVAL;
+
+       if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+               return -ENOEXEC;
+
+       pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+       if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+           || pi->ehdr->e_type != ET_REL
+           || !elf_check_arch(pi->ehdr)
+           || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+               return -ENOEXEC;
+
+       if (pi->ehdr->e_shoff >= kexec_purgatory_size
+           || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+           kexec_purgatory_size - pi->ehdr->e_shoff))
+               return -ENOEXEC;
+
+       ret = __kexec_load_purgatory(image, min, max, top_down);
+       if (ret)
+               return ret;
+
+       ret = kexec_apply_relocations(image);
+       if (ret)
+               goto out;
+
+       *load_addr = pi->purgatory_load_addr;
+       return 0;
+out:
+       vfree(pi->sechdrs);
+       vfree(pi->purgatory_buf);
+       return ret;
+}
+
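+/*
+ * Look up a global symbol by name in the purgatory symbol tables.  Returns
+ * NULL if the symbol is not found or refers to an invalid section.
+ */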
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+                                           const char *name)
+{
+       Elf_Sym *syms;
+       Elf_Shdr *sechdrs;
+       Elf_Ehdr *ehdr;
+       int i, k;
+       const char *strtab;
+
+       if (!pi->sechdrs || !pi->ehdr)
+               return NULL;
+
+       sechdrs = pi->sechdrs;
+       ehdr = pi->ehdr;
+
+       for (i = 0; i < ehdr->e_shnum; i++) {
+               if (sechdrs[i].sh_type != SHT_SYMTAB)
+                       continue;
+
+               if (sechdrs[i].sh_link >= ehdr->e_shnum)
+                       /* Invalid strtab section number */
+                       continue;
+               strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+               syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+               /* Go through symbols for a match */
+               for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+                       if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+                               continue;
+
+                       if (strcmp(strtab + syms[k].st_name, name) != 0)
+                               continue;
+
+                       if (syms[k].st_shndx == SHN_UNDEF ||
+                           syms[k].st_shndx >= ehdr->e_shnum) {
+                               pr_debug("Symbol: %s has bad section index %d.\n",
+                                               name, syms[k].st_shndx);
+                               return NULL;
+                       }
+
+                       /* Found the symbol we are looking for */
+                       return &syms[k];
+               }
+       }
+
+       return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       Elf_Sym *sym;
+       Elf_Shdr *sechdr;
+
+       sym = kexec_purgatory_find_symbol(pi, name);
+       if (!sym)
+               return ERR_PTR(-EINVAL);
+
+       sechdr = &pi->sechdrs[sym->st_shndx];
+
+       /*
+        * Return the address where the symbol will finally be loaded after
+        * kexec_load_segment().
+        */
+       return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+                                  void *buf, unsigned int size, bool get_value)
+{
+       Elf_Sym *sym;
+       Elf_Shdr *sechdrs;
+       struct purgatory_info *pi = &image->purgatory_info;
+       char *sym_buf;
+
+       sym = kexec_purgatory_find_symbol(pi, name);
+       if (!sym)
+               return -EINVAL;
+
+       if (sym->st_size != size) {
+               pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+                      name, (unsigned long)sym->st_size, size);
+               return -EINVAL;
+       }
+
+       sechdrs = pi->sechdrs;
+
+       if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+               pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+                      get_value ? "get" : "set");
+               return -EINVAL;
+       }
+
+       sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+                                       sym->st_value;
+
+       if (get_value)
+               memcpy((void *)buf, sym_buf, size);
+       else
+               memcpy((void *)sym_buf, buf, size);
+
+       return 0;
+}
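
Not part of the patch, but for orientation: a minimal sketch of how an architecture's kexec_file loader could drive the two helpers above. The symbol name "backup_start", the 1GB search window and the stored value are illustrative assumptions, not taken from any in-tree user.

/* Illustrative only: load purgatory, then patch one of its globals. */
static int example_arch_load_purgatory(struct kimage *image)
{
        unsigned long purgatory_load_addr;
        unsigned long backup_start = 0x100000;  /* hypothetical value */
        int ret;

        /* Search top-down for a slot below an assumed 1GB limit. */
        ret = kexec_load_purgatory(image, 0, 1UL << 30, 1, &purgatory_load_addr);
        if (ret)
                return ret;

        /* get_value == false: write the value into the purgatory blob. */
        return kexec_purgatory_get_set_symbol(image, "backup_start",
                                              &backup_start,
                                              sizeof(backup_start), false);
}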
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644 (file)
index 0000000..e4392a6
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+#include <linux/kexec.h>
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+void kimage_terminate(struct kimage *image);
+int kimage_is_destination_range(struct kimage *image,
+                               unsigned long start, unsigned long end);
+
+extern struct mutex kexec_mutex;
+
+#ifdef CONFIG_KEXEC_FILE
+void kimage_file_post_load_cleanup(struct kimage *image);
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+#endif /* LINUX_KEXEC_INTERNAL_H */
index 2777f40a9c7be84c60da316960a012e982d67c35..da98d0593de24206d68222d787d059a5a2b025a1 100644 (file)
@@ -45,8 +45,6 @@
 
 extern int max_threads;
 
-static struct workqueue_struct *khelper_wq;
-
 #define CAP_BSET       (void *)1
 #define CAP_PI         (void *)2
 
@@ -114,10 +112,11 @@ out:
  * @...: arguments as specified in the format string
  *
  * Load a module using the user mode module loader. The function returns
- * zero on success or a negative errno code on failure. Note that a
- * successful module load does not mean the module did not then unload
- * and exit on an error of its own. Callers must check that the service
- * they requested is now available not blindly invoke it.
+ * zero on success, or a negative errno code or a positive exit code from
+ * "modprobe" on failure. Note that a successful module load does not mean
+ * the module did not then unload and exit on an error of its own. Callers
+ * must check that the service they requested is now available, not blindly
+ * invoke it.
  *
  * If module auto-loading support is disabled then this function
  * becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
 /*
  * This is the task which runs the usermode application
  */
-static int ____call_usermodehelper(void *data)
+static int call_usermodehelper_exec_async(void *data)
 {
        struct subprocess_info *sub_info = data;
        struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
        flush_signal_handlers(current, 1);
        spin_unlock_irq(&current->sighand->siglock);
 
-       /* We can run anywhere, unlike our parent keventd(). */
-       set_cpus_allowed_ptr(current, cpu_all_mask);
-
        /*
-        * Our parent is keventd, which runs with elevated scheduling priority.
-        * Avoid propagating that into the userspace child.
+        * Our parent (unbound workqueue) runs with elevated scheduling
+        * priority. Avoid propagating that into the userspace child.
         */
        set_user_nice(current, 0);
 
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
                           (const char __user *const __user *)sub_info->envp);
 out:
        sub_info->retval = retval;
-       /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
+       /*
+        * call_usermodehelper_exec_sync() will call umh_complete
+        * if UMH_WAIT_PROC.
+        */
        if (!(sub_info->wait & UMH_WAIT_PROC))
                umh_complete(sub_info);
        if (!retval)
@@ -266,15 +265,14 @@ out:
        do_exit(0);
 }
 
-/* Keventd can't block, but this (a child) can. */
-static int wait_for_helper(void *data)
+/* Handles UMH_WAIT_PROC.  */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
 {
-       struct subprocess_info *sub_info = data;
        pid_t pid;
 
        /* If SIGCLD is ignored sys_wait4 won't populate the status. */
        kernel_sigaction(SIGCHLD, SIG_DFL);
-       pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+       pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
        if (pid < 0) {
                sub_info->retval = pid;
        } else {
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data)
                /*
                 * Normally it is bogus to call wait4() from in-kernel because
                 * wait4() wants to write the exit code to a userspace address.
-                * But wait_for_helper() always runs as keventd, and put_user()
-                * to a kernel address works OK for kernel threads, due to their
-                * having an mm_segment_t which spans the entire address space.
+                * But call_usermodehelper_exec_sync() always runs as a kernel
+                * thread (workqueue) and put_user() to a kernel address works
+                * OK for kernel threads, due to their having an mm_segment_t
+                * which spans the entire address space.
                 *
                 * Thus the __user pointer cast is valid here.
                 */
                sys_wait4(pid, (int __user *)&ret, 0, NULL);
 
                /*
-                * If ret is 0, either ____call_usermodehelper failed and the
-                * real error code is already in sub_info->retval or
+                * If ret is 0, either call_usermodehelper_exec_async failed and
+                * the real error code is already in sub_info->retval or
                 * sub_info->retval is 0 anyway, so don't mess with it then.
                 */
                if (ret)
                        sub_info->retval = ret;
        }
 
+       /* Restore default kernel sig handler */
+       kernel_sigaction(SIGCHLD, SIG_IGN);
+
        umh_complete(sub_info);
-       do_exit(0);
 }
 
-/* This is run by khelper thread  */
-static void __call_usermodehelper(struct work_struct *work)
+/*
+ * We need to create the usermodehelper kernel thread from a task with a wide
+ * CPU affinity (or the nohz housekeeping CPUs) so that the helper inherits
+ * that wide affinity regardless of the call_usermodehelper() caller's own,
+ * possibly reduced, affinity (eg: per-cpu workqueues). We don't want
+ * usermodehelper targets contending with a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow blocking on
+ * UMH_WAIT_PROC requests without blocking pending requests (up to some limit).
+ *
+ * Besides, workqueues provide the privilege level that the caller might not
+ * have in order to perform the usermodehelper request.
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
 {
        struct subprocess_info *sub_info =
                container_of(work, struct subprocess_info, work);
-       pid_t pid;
 
-       if (sub_info->wait & UMH_WAIT_PROC)
-               pid = kernel_thread(wait_for_helper, sub_info,
-                                   CLONE_FS | CLONE_FILES | SIGCHLD);
-       else
-               pid = kernel_thread(____call_usermodehelper, sub_info,
-                                   SIGCHLD);
+       if (sub_info->wait & UMH_WAIT_PROC) {
+               call_usermodehelper_exec_sync(sub_info);
+       } else {
+               pid_t pid;
 
-       if (pid < 0) {
-               sub_info->retval = pid;
-               umh_complete(sub_info);
+               pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+                                   SIGCHLD);
+               if (pid < 0) {
+                       sub_info->retval = pid;
+                       umh_complete(sub_info);
+               }
        }
 }
 
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
        if (!sub_info)
                goto out;
 
-       INIT_WORK(&sub_info->work, __call_usermodehelper);
+       INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
        sub_info->path = path;
        sub_info->argv = argv;
        sub_info->envp = envp;
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
  *        from interrupt context.
  *
  * Runs a user-space application.  The application is started
- * asynchronously if wait is not set, and runs as a child of keventd.
- * (ie. it runs with full root capabilities).
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
  */
 int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 {
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
                return -EINVAL;
        }
        helper_lock();
-       if (!khelper_wq || usermodehelper_disabled) {
+       if (usermodehelper_disabled) {
                retval = -EBUSY;
                goto out;
        }
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
        sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
        sub_info->wait = wait;
 
-       queue_work(khelper_wq, &sub_info->work);
+       queue_work(system_unbound_wq, &sub_info->work);
        if (wait == UMH_NO_WAIT)        /* task has freed sub_info */
                goto unlock;
 
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = {
        },
        { }
 };
-
-void __init usermodehelper_init(void)
-{
-       khelper_wq = create_singlethread_workqueue("khelper");
-       BUG_ON(!khelper_wq);
-}
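
As a usage illustration (not part of this diff), callers are unaffected by the switch from khelper to system_unbound_wq: they still build a subprocess_info and hand it to call_usermodehelper_exec(). The helper path and environment below are made up for the sketch, and the call_usermodehelper_setup() init/cleanup/data arguments are assumed to be the usual NULLs.

/* Illustrative caller; "/sbin/example-helper" is a made-up path. */
static int run_example_helper(char *arg)
{
        char *argv[] = { "/sbin/example-helper", arg, NULL };
        char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
        struct subprocess_info *info;

        info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
                                         NULL, NULL, NULL);
        if (!info)
                return -ENOMEM;

        /* UMH_WAIT_PROC: block until the helper exits; its status is returned. */
        return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}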
index 6683ccef9fffb2de28b6a4d6d01393b080811d6a..e83b264640615c47c31cce539f31014dc11b0776 100644 (file)
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
 KERNEL_ATTR_RW(profiling);
 #endif
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static ssize_t kexec_loaded_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
 {
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_PROFILING
        &profiling_attr.attr,
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        &kexec_loaded_attr.attr,
        &kexec_crash_loaded_attr.attr,
        &kexec_crash_size_attr.attr,
index cf8c24203368651af417eba7525a053e9cc8ff93..8f0324ef72ab374925badb5454aa0a79ae731c61 100644 (file)
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
        .release = devkmsg_release,
 };
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This appends the listed symbols to /proc/vmcore
  *
index d20c85d9f8c0d71df00a2ac7297d4ace5ed18323..bd30a973fe946b03916a1eeb873928adfe1b32b0 100644 (file)
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                kernel_restart(buffer);
                break;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        case LINUX_REBOOT_CMD_KEXEC:
                ret = kernel_kexec();
                break;
index 19b62b522158acb6414cd7440b25e64bd16add35..e69201d8094eb8bed747329afc17528c4315a6b7 100644 (file)
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = proc_dointvec,
        },
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        {
                .procname       = "kexec_load_disabled",
                .data           = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
                int val = *valp;
                if (val < 0) {
                        *negp = true;
-                       *lvalp = (unsigned long)-val;
+                       *lvalp = -(unsigned long)val;
                } else {
                        *negp = false;
                        *lvalp = (unsigned long)val;
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
                int val = *valp;
                if (val < 0) {
                        *negp = true;
-                       *lvalp = (unsigned long)-val;
+                       *lvalp = -(unsigned long)val;
                } else {
                        *negp = false;
                        *lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
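
The reason for flipping the cast and the negation in the hunks above: with val == INT_MIN, the old expression negates a signed int first, which overflows; negating after the conversion to unsigned long is well defined and yields the intended magnitude. A standalone illustration (userspace C, assuming a 64-bit unsigned long):

/* Standalone sketch of the INT_MIN case handled above. */
#include <limits.h>
#include <stdio.h>

int main(void)
{
        int val = INT_MIN;

        /* Old: (unsigned long)-val negates the signed int first (overflow). */
        /* New: convert first, then negate in unsigned arithmetic.           */
        unsigned long lval = -(unsigned long)val;

        printf("%lu\n", lval);  /* 2147483648 with a 64-bit unsigned long */
        return 0;
}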
index a578a018919977579063bb599d4fd462bae6a54b..814814397cce39b5b0a4aafa3571062a6468e8cf 100644 (file)
@@ -367,7 +367,8 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
 
        nchunks = nbits = totaldigits = c = 0;
        do {
-               chunk = ndigits = 0;
+               chunk = 0;
+               ndigits = totaldigits;
 
                /* Get the next chunk of the bitmap */
                while (buflen) {
@@ -406,9 +407,9 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
                                return -EOVERFLOW;
 
                        chunk = (chunk << 4) | hex_to_bin(c);
-                       ndigits++; totaldigits++;
+                       totaldigits++;
                }
-               if (ndigits == 0)
+               if (ndigits == totaldigits)
                        return -EINVAL;
                if (nchunks == 0 && chunk == 0)
                        continue;
@@ -505,7 +506,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                int nmaskbits)
 {
        unsigned a, b;
-       int c, old_c, totaldigits;
+       int c, old_c, totaldigits, ndigits;
        const char __user __force *ubuf = (const char __user __force *)buf;
        int at_start, in_range;
 
@@ -515,6 +516,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                at_start = 1;
                in_range = 0;
                a = b = 0;
+               ndigits = totaldigits;
 
                /* Get the next cpu# or a range of cpu#'s */
                while (buflen) {
@@ -528,23 +530,27 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                        if (isspace(c))
                                continue;
 
-                       /*
-                        * If the last character was a space and the current
-                        * character isn't '\0', we've got embedded whitespace.
-                        * This is a no-no, so throw an error.
-                        */
-                       if (totaldigits && c && isspace(old_c))
-                               return -EINVAL;
-
                        /* A '\0' or a ',' signal the end of a cpu# or range */
                        if (c == '\0' || c == ',')
                                break;
+                       /*
+                        * Whitespace between digits is not allowed, but it is
+                        * fine at the head or tail of a chunk. When old_c is
+                        * whitespace and totaldigits == ndigits, the whitespace
+                        * was at the head. Tail whitespace never reaches this
+                        * point: c would be ',' or '\0' and the check above
+                        * would already have broken out of the loop.
+                        */
+                       if ((totaldigits != ndigits) && isspace(old_c))
+                               return -EINVAL;
 
                        if (c == '-') {
                                if (at_start || in_range)
                                        return -EINVAL;
                                b = 0;
                                in_range = 1;
+                               at_start = 1;
                                continue;
                        }
 
@@ -557,15 +563,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                        at_start = 0;
                        totaldigits++;
                }
+               if (ndigits == totaldigits)
+                       continue;
+               /* if there is no digit after '-', it is an error */
+               if (at_start && in_range)
+                       return -EINVAL;
                if (!(a <= b))
                        return -EINVAL;
                if (b >= nmaskbits)
                        return -ERANGE;
-               if (!at_start) {
-                       while (a <= b) {
-                               set_bit(a, maskp);
-                               a++;
-                       }
+               while (a <= b) {
+                       set_bit(a, maskp);
+                       a++;
                }
        } while (buflen && c == ',');
        return 0;
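
A few concrete inputs may help pin down the rules enforced above (leading/trailing whitespace tolerated, whitespace between digits rejected, a dangling '-' rejected, empty fields skipped). The sketch assumes the usual bitmap_parselist() wrapper around __bitmap_parselist():

/* Illustrative outcomes after this change. */
static void example_parselist(void)
{
        DECLARE_BITMAP(mask, 128);

        bitmap_parselist("  2,4-6\n", mask, 128); /* 0: bits 2,4,5,6; head/tail space ok */
        bitmap_parselist(",5,,7", mask, 128);     /* 0: empty fields are skipped         */
        bitmap_parselist("4-", mask, 128);        /* -EINVAL: no digit after '-'         */
        bitmap_parselist("1 3", mask, 128);       /* -EINVAL: whitespace between digits  */
}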
index 6dd0335ea61b296b5dfd77818e58e294fb2e805c..0234361b24b89ee09dc452e397bd950a43f69025 100644 (file)
@@ -743,12 +743,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long len,
+STATIC int INIT __decompress(unsigned char *buf, long len,
                        long (*fill)(void*, unsigned long),
                        long (*flush)(void*, unsigned long),
-                       unsigned char *outbuf,
+                       unsigned char *outbuf, long olen,
                        long *pos,
-                       void(*error)(char *x))
+                       void (*error)(char *x))
 {
        return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error);
 }
index d4c7891635ecc2b1fb70b9f4fc75a216c782fe0d..555c06bf20daa83190139392597c4622a00a0e5d 100644 (file)
@@ -1,4 +1,5 @@
 #ifdef STATIC
+#define PREBOOT
 /* Pre-boot environment: included */
 
 /* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots
@@ -33,23 +34,23 @@ static long INIT nofill(void *buffer, unsigned long len)
 }
 
 /* Included from initramfs et al code */
-STATIC int INIT gunzip(unsigned char *buf, long len,
+STATIC int INIT __gunzip(unsigned char *buf, long len,
                       long (*fill)(void*, unsigned long),
                       long (*flush)(void*, unsigned long),
-                      unsigned char *out_buf,
+                      unsigned char *out_buf, long out_len,
                       long *pos,
                       void(*error)(char *x)) {
        u8 *zbuf;
        struct z_stream_s *strm;
        int rc;
-       size_t out_len;
 
        rc = -1;
        if (flush) {
                out_len = 0x8000; /* 32 K */
                out_buf = malloc(out_len);
        } else {
-               out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
+               if (!out_len)
+                       out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
        }
        if (!out_buf) {
                error("Out of memory while allocating output buffer");
@@ -181,4 +182,24 @@ gunzip_nomem1:
        return rc; /* returns Z_OK (0) if successful */
 }
 
-#define decompress gunzip
+#ifndef PREBOOT
+STATIC int INIT gunzip(unsigned char *buf, long len,
+                      long (*fill)(void*, unsigned long),
+                      long (*flush)(void*, unsigned long),
+                      unsigned char *out_buf,
+                      long *pos,
+                      void (*error)(char *x))
+{
+       return __gunzip(buf, len, fill, flush, out_buf, 0, pos, error);
+}
+#else
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long out_len,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return __gunzip(buf, len, fill, flush, out_buf, out_len, pos, error);
+}
+#endif
index 40f66ebe57b77a0566460a2407bdd713d6e0b3fc..036fc882cd72561a2a96b39314078676fe94ff90 100644 (file)
@@ -196,12 +196,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long in_len,
+STATIC int INIT __decompress(unsigned char *buf, long in_len,
                              long (*fill)(void*, unsigned long),
                              long (*flush)(void*, unsigned long),
-                             unsigned char *output,
+                             unsigned char *output, long out_len,
                              long *posp,
-                             void(*error)(char *x)
+                             void (*error)(char *x)
        )
 {
        return unlz4(buf, in_len - 4, fill, flush, output, posp, error);
index 0be83af62b884c3dbfa9f29f9630a5ecfd605e2f..ed7a1fd819f2fbc86b3ad0f238fce01739a8e07f 100644 (file)
@@ -620,7 +620,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, long in_len,
 
        num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp));
        p = (uint16_t *) large_malloc(num_probs * sizeof(*p));
-       if (p == 0)
+       if (p == NULL)
                goto exit_2;
        num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp));
        for (i = 0; i < num_probs; i++)
@@ -667,13 +667,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long in_len,
+STATIC int INIT __decompress(unsigned char *buf, long in_len,
                              long (*fill)(void*, unsigned long),
                              long (*flush)(void*, unsigned long),
-                             unsigned char *output,
+                             unsigned char *output, long out_len,
                              long *posp,
-                             void(*error)(char *x)
-       )
+                             void (*error)(char *x))
 {
        return unlzma(buf, in_len - 4, fill, flush, output, posp, error);
 }
index b94a31bdd87d15f34a7f4902eb6170f800c01206..f4c158e3a022aa1af35a6937190d5e0bc90e31ce 100644 (file)
@@ -31,6 +31,7 @@
  */
 
 #ifdef STATIC
+#define PREBOOT
 #include "lzo/lzo1x_decompress_safe.c"
 #else
 #include <linux/decompress/unlzo.h>
@@ -287,4 +288,14 @@ exit:
        return ret;
 }
 
-#define decompress unlzo
+#ifdef PREBOOT
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long olen,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return unlzo(buf, len, fill, flush, out_buf, pos, error);
+}
+#endif
index b07a78340e9d315006a97194fbcccae0c4c16509..25d59a95bd6681465d9e57af06f77c3d641b0649 100644 (file)
@@ -394,4 +394,14 @@ error_alloc_state:
  * This macro is used by architecture-specific files to decompress
  * the kernel image.
  */
-#define decompress unxz
+#ifdef XZ_PREBOOT
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long olen,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return unxz(buf, len, fill, flush, out_buf, pos, error);
+}
+#endif
index ec8da78df9be9f4ea245ff398193bd1d90210573..94be244e844103d0fb6ed20c7bee905ca908fa12 100644 (file)
@@ -152,7 +152,7 @@ int kstrtoll(const char *s, unsigned int base, long long *res)
                rv = _kstrtoull(s + 1, base, &tmp);
                if (rv < 0)
                        return rv;
-               if ((long long)(-tmp) >= 0)
+               if ((long long)-tmp > 0)
                        return -ERANGE;
                *res = -tmp;
        } else {
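
Concretely, the relaxed comparison means "-0" now parses as 0 instead of returning -ERANGE, while values below LLONG_MIN are still rejected; a short sketch of the expected results:

/* Illustrative results after the comparison change. */
static void example_kstrtoll(void)
{
        long long v;

        kstrtoll("-0", 10, &v);                   /* 0; v == 0         */
        kstrtoll("-9223372036854775808", 10, &v); /* 0; v == LLONG_MIN */
        kstrtoll("-9223372036854775809", 10, &v); /* -ERANGE           */
}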
index c98ae818eb4eed802119133ca9d1c809d6b9f155..54036ce2e2dd0a4ab042c9c19b2d54fa41a34115 100644 (file)
@@ -410,7 +410,7 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * @dst:       destination buffer (escaped)
  * @osz:       destination buffer size
  * @flags:     combination of the flags (bitwise OR):
- *     %ESCAPE_SPACE:
+ *     %ESCAPE_SPACE: (special white space, not space itself)
  *             '\f' - form feed
  *             '\n' - new line
  *             '\r' - carriage return
@@ -432,16 +432,18 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *             all previous together
  *     %ESCAPE_HEX:
  *             '\xHH' - byte with hexadecimal value HH (2 digits)
- * @esc:       NULL-terminated string of characters any of which, if found in
- *             the source, has to be escaped
+ * @only:      NULL-terminated string containing characters used to limit
+ *             the selected escape class. If characters are included in @only
+ *             that would not normally be escaped by the classes selected
+ *             in @flags, they will be copied to @dst unescaped.
  *
  * Description:
  * The process of escaping byte buffer includes several parts. They are applied
  * in the following sequence.
  *     1. The character is matched to the printable class, if asked, and in
  *        case of match it passes through to the output.
- *     2. The character is not matched to the one from @esc string and thus
- *        must go as is to the output.
+ *     2. The character is not matched to the one from @only string and thus
+ *        must go as-is to the output.
  *     3. The character is checked if it falls into the class given by @flags.
  *        %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
  *        character. Note that they actually can't go together, otherwise
@@ -458,11 +460,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * dst for a '\0' terminator if and only if ret < osz.
  */
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
-                     unsigned int flags, const char *esc)
+                     unsigned int flags, const char *only)
 {
        char *p = dst;
        char *end = p + osz;
-       bool is_dict = esc && *esc;
+       bool is_dict = only && *only;
 
        while (isz--) {
                unsigned char c = *src++;
@@ -471,7 +473,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
                 * Apply rules in the following sequence:
                 *      - the character is printable, when @flags has
                 *        %ESCAPE_NP bit set
-                *      - the @esc string is supplied and does not contain a
+                *      - the @only string is supplied and does not contain a
                 *        character under question
                 *      - the character doesn't fall into a class of symbols
                 *        defined by given @flags
@@ -479,7 +481,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
                 * output buffer.
                 */
                if ((flags & ESCAPE_NP && isprint(c)) ||
-                   (is_dict && !strchr(esc, c))) {
+                   (is_dict && !strchr(only, c))) {
                        /* do nothing */
                } else {
                        if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
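
A hedged usage sketch of the renamed @only parameter: with a non-empty @only string, only bytes that both fall in the selected class and appear in @only are escaped; everything else is copied through. The wrapper name and buffer handling below are illustrative.

/* Illustrative: escape only '\n' and '\t' out of the ESCAPE_SPACE class. */
static int example_escape(const char *src, size_t len, char *dst, size_t dsz)
{
        return string_escape_mem(src, len, dst, dsz, ESCAPE_SPACE, "\n\t");
}

With this, a '\r' in the input is passed through unescaped even though ESCAPE_SPACE covers it, because it is not listed in @only.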
index 4137bca5f8e8e5008ca88b7fcb801096856eaf0c..f355f67169b6a32fbb7162b46691a45de4291cd2 100644 (file)
@@ -260,6 +260,7 @@ static void __init test_kstrtoll_ok(void)
                {"4294967297",  10,     4294967297LL},
                {"9223372036854775807", 10,     9223372036854775807LL},
 
+               {"-0",  10,     0LL},
                {"-1",  10,     -1LL},
                {"-2",  10,     -2LL},
                {"-9223372036854775808",        10,     LLONG_MIN},
@@ -277,11 +278,6 @@ static void __init test_kstrtoll_fail(void)
                {"-9223372036854775809",        10},
                {"-18446744073709551614",       10},
                {"-18446744073709551615",       10},
-               /* negative zero isn't an integer in Linux */
-               {"-0",  0},
-               {"-0",  8},
-               {"-0",  10},
-               {"-0",  16},
                /* sign is first character if any */
                {"-+1", 0},
                {"-+1", 8},
index 098c08eddfab715e98a0c428ef69f3012b58d5d0..c1efb1b610179013baf5d662f40f739a5f9abc60 100644 (file)
@@ -65,7 +65,7 @@ static noinline void __init kmalloc_node_oob_right(void)
        kfree(ptr);
 }
 
-static noinline void __init kmalloc_large_oob_rigth(void)
+static noinline void __init kmalloc_large_oob_right(void)
 {
        char *ptr;
        size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
@@ -114,7 +114,7 @@ static noinline void __init kmalloc_oob_krealloc_less(void)
                kfree(ptr1);
                return;
        }
-       ptr2[size1] = 'x';
+       ptr2[size2] = 'x';
        kfree(ptr2);
 }
 
@@ -259,7 +259,7 @@ static int __init kmalloc_tests_init(void)
        kmalloc_oob_right();
        kmalloc_oob_left();
        kmalloc_node_oob_right();
-       kmalloc_large_oob_rigth();
+       kmalloc_large_oob_right();
        kmalloc_oob_krealloc_more();
        kmalloc_oob_krealloc_less();
        kmalloc_oob_16();
index ddf348299f244a9db0b5f430b2df3b173298b4ac..9b1756b12743fad143f4c07ae866dfed5fa5bf74 100644 (file)
@@ -35,6 +35,7 @@
 /* #include "deflate.h" */
 
 #include <linux/zutil.h>
+#include <linux/bitrev.h>
 #include "defutil.h"
 
 #ifdef DEBUG_ZLIB
@@ -146,7 +147,6 @@ static void send_all_trees (deflate_state *s, int lcodes, int dcodes,
 static void compress_block (deflate_state *s, ct_data *ltree,
                            ct_data *dtree);
 static void set_data_type  (deflate_state *s);
-static unsigned bi_reverse (unsigned value, int length);
 static void bi_windup      (deflate_state *s);
 static void bi_flush       (deflate_state *s);
 static void copy_block     (deflate_state *s, char *buf, unsigned len,
@@ -284,7 +284,7 @@ static void tr_static_init(void)
     /* The static distance tree is trivial: */
     for (n = 0; n < D_CODES; n++) {
         static_dtree[n].Len = 5;
-        static_dtree[n].Code = bi_reverse((unsigned)n, 5);
+        static_dtree[n].Code = bitrev32((u32)n) >> (32 - 5);
     }
     static_init_done = 1;
 }
@@ -520,7 +520,7 @@ static void gen_codes(
         int len = tree[n].Len;
         if (len == 0) continue;
         /* Now reverse the bits */
-        tree[n].Code = bi_reverse(next_code[len]++, len);
+        tree[n].Code = bitrev32((u32)(next_code[len]++)) >> (32 - len);
 
         Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
              n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
index b640b6402e99e3a74db809012a4138c4cef48457..a8c370897c9f4ee641373ba9245a5d833de07683 100644 (file)
@@ -292,22 +292,6 @@ void zlib_tr_stored_type_only (deflate_state *);
     put_byte(s, (uch)((ush)(w) >> 8)); \
 }
 
-/* ===========================================================================
- * Reverse the first len bits of a code, using straightforward code (a faster
- * method would use a table)
- * IN assertion: 1 <= len <= 15
- */
-static inline unsigned bi_reverse(unsigned code, /* the value to invert */
-                                 int len)       /* its bit length */
-{
-    register unsigned res = 0;
-    do {
-        res |= code & 1;
-        code >>= 1, res <<= 1;
-    } while (--len > 0);
-    return res >> 1;
-}
-
 /* ===========================================================================
  * Flush the bit buffer, keeping at most 7 bits in it.
  */
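
The replacement works because reversing all 32 bits and shifting right by (32 - len) equals reversing just the low len bits, which is what the removed bi_reverse() computed; both only ever look at the low len bits. A standalone self-check (plain C, using a local reference reversal rather than the kernel's bitrev32()):

#include <assert.h>
#include <stdint.h>

/* Copy of the removed helper, for comparison. */
static unsigned bi_reverse_old(unsigned code, int len)
{
        unsigned res = 0;
        do {
                res |= code & 1;
                code >>= 1, res <<= 1;
        } while (--len > 0);
        return res >> 1;
}

/* Reference 32-bit reversal, standing in for bitrev32(). */
static uint32_t rev32(uint32_t x)
{
        uint32_t r = 0;
        int i;

        for (i = 0; i < 32; i++)
                r |= ((x >> i) & 1u) << (31 - i);
        return r;
}

int main(void)
{
        unsigned code;
        int len;

        for (code = 0; code < (1u << 15); code++)
                for (len = 1; len <= 15; len++)
                        assert(bi_reverse_old(code, len) ==
                               (rev32(code) >> (32 - len)));
        return 0;
}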
index 3a4070f5ab7941b768b09919e17753acb8022db2..6413d027c0b2ea18acb3c9c1a35149ce7af4bd2c 100644 (file)
@@ -649,6 +649,18 @@ config DEFERRED_STRUCT_PAGE_INIT
           processes running early in the lifetime of the system until kswapd
          finishes the initialisation.
 
+config IDLE_PAGE_TRACKING
+       bool "Enable idle page tracking"
+       depends on SYSFS && MMU
+       select PAGE_EXTENSION if !64BIT
+       help
+         This feature allows estimating the number of user pages that have
+         not been touched during a given period of time. This information can
+         be useful to tune memory cgroup limits and/or for job placement
+         within a compute cluster.
+
+         See Documentation/vm/idle_page_tracking.txt for more details.
+
 config ZONE_DEVICE
        bool "Device memory (pmem, etc...) hotplug support" if EXPERT
        default !ZONE_DMA
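
For orientation, the feature added here is consumed from userspace through a per-pfn bitmap. A hedged sketch of a reader follows, assuming the attribute created later in this series is exposed as /sys/kernel/mm/page_idle/bitmap as described in Documentation/vm/idle_page_tracking.txt; each u64 chunk covers 64 consecutive pfns and reads must be multiples of 8 bytes, matching the mm/page_idle.c code below.

/* Hedged userspace sketch: test whether a given pfn is still idle. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        unsigned long pfn = 0x1000;     /* hypothetical page frame number */
        uint64_t chunk;
        int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDONLY);

        if (fd < 0)
                return 1;
        /* Offset is in bytes; 8 bytes describe 64 consecutive pfns. */
        if (pread(fd, &chunk, sizeof(chunk), (pfn / 64) * 8) == sizeof(chunk))
                printf("pfn %#lx is %s\n", pfn,
                       (chunk >> (pfn % 64)) & 1 ? "idle" : "not idle");
        close(fd);
        return 0;
}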
index b424d5e5b6ff5b1dec8f95fdd089451dba4dd19c..56f8eed73f1a6f658d90d156b5b4f2ddd0030eae 100644 (file)
@@ -79,3 +79,4 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
+obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
index 76089ddf99ea1cba96695010cbcbe9e5a4a90331..6c1b3ea61bfddfe4f042a6ef067e53e34f82792b 100644 (file)
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        {1UL << PG_compound_lock,       "compound_lock" },
 #endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+       {1UL << PG_young,               "young"         },
+       {1UL << PG_idle,                "idle"          },
+#endif
 };
 
 static void dump_flags(unsigned long flags,
index b16279cbd91df6abe691c56af537e52cc8d8c34a..4b06b8db9df23c8f33406586507bbaecf7f5444c 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -1757,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page,
                /* clear PageTail before overwriting first_page */
                smp_wmb();
 
+               if (page_is_young(page))
+                       set_page_young(page_tail);
+               if (page_is_idle(page))
+                       set_page_idle(page_tail);
+
                /*
                 * __split_huge_page_splitting() already set the
                 * splitting bit in all pmd that could map this
@@ -2262,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                VM_BUG_ON_PAGE(PageLRU(page), page);
 
                /* If there is no mapped pte young don't collapse the page */
-               if (pte_young(pteval) || PageReferenced(page) ||
+               if (pte_young(pteval) ||
+                   page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
@@ -2693,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 */
                if (page_count(page) != 1 + !!PageSwapCache(page))
                        goto out_unmap;
-               if (pte_young(pteval) || PageReferenced(page) ||
+               if (pte_young(pteval) ||
+                   page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
index aeba0edd6e447b1e853fdf2eb46547dae363f6f1..9d26fd9fefe4a1f4ec78279455c3b27f4ddbb875 100644 (file)
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val)
        /*
         * do a racy check with elevated page count, to make sure PG_hwpoison
         * will only be set for the targeted owner (or on a free page).
-        * We temporarily take page lock for try_get_mem_cgroup_from_page().
         * memory_failure() will redo the check reliably inside page lock.
         */
-       lock_page(hpage);
        err = hwpoison_filter(hpage);
-       unlock_page(hpage);
        if (err)
                goto put_out;
 
@@ -126,7 +123,7 @@ static int pfn_inject_init(void)
        if (!dentry)
                goto fail;
 
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
        dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
                                    hwpoison_dir, &hwpoison_filter_memcg);
        if (!dentry)
index f532f6a37b553bb0555fc2f6bf3bd32882d09140..77191eccdc6f6c372e84e2f49750f7f95e56c324 100644 (file)
@@ -302,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq,
                            struct kmemleak_object *object)
 {
        const u8 *ptr = (const u8 *)object->pointer;
-       int i, len, remaining;
-       unsigned char linebuf[HEX_ROW_SIZE * 5];
+       size_t len;
 
        /* limit the number of lines to HEX_MAX_LINES */
-       remaining = len =
-               min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
-
-       seq_printf(seq, "  hex dump (first %d bytes):\n", len);
-       for (i = 0; i < len; i += HEX_ROW_SIZE) {
-               int linelen = min(remaining, HEX_ROW_SIZE);
-
-               remaining -= HEX_ROW_SIZE;
-               hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
-                                  HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
-                                  HEX_ASCII);
-               seq_printf(seq, "    %s\n", linebuf);
-       }
+       len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
+
+       seq_printf(seq, "  hex dump (first %zu bytes):\n", len);
+       seq_hex_dump(seq, "    ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+                    HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
 }
 
 /*
index 1742a2db89c7beaa70bef4fc90340d25e25ebc88..6ddaeba34e097a7553d33b8add26e26a27d3c81a 100644 (file)
@@ -441,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
        return &memcg->css;
 }
 
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it should only be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+       struct mem_cgroup *memcg;
+       unsigned long ino = 0;
+
+       rcu_read_lock();
+       memcg = READ_ONCE(page->mem_cgroup);
+       while (memcg && !(memcg->css.flags & CSS_ONLINE))
+               memcg = parent_mem_cgroup(memcg);
+       if (memcg)
+               ino = cgroup_ino(memcg->css.cgroup);
+       rcu_read_unlock();
+       return ino;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -2071,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
        css_put_many(&memcg->css, nr_pages);
 }
 
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges.  If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-       struct mem_cgroup *memcg;
-       unsigned short id;
-       swp_entry_t ent;
-
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-
-       memcg = page->mem_cgroup;
-       if (memcg) {
-               if (!css_tryget_online(&memcg->css))
-                       memcg = NULL;
-       } else if (PageSwapCache(page)) {
-               ent.val = page_private(page);
-               id = lookup_swap_cgroup_id(ent);
-               rcu_read_lock();
-               memcg = mem_cgroup_from_id(id);
-               if (memcg && !css_tryget_online(&memcg->css))
-                       memcg = NULL;
-               rcu_read_unlock();
-       }
-       return memcg;
-}
-
 static void lock_page_lru(struct page *page, int *isolated)
 {
        struct zone *zone = page_zone(page);
@@ -5301,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                 * the page lock, which serializes swap cache removal, which
                 * in turn serializes uncharging.
                 */
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
                if (page->mem_cgroup)
                        goto out;
+
+               if (do_swap_account) {
+                       swp_entry_t ent = { .val = page_private(page), };
+                       unsigned short id = lookup_swap_cgroup_id(ent);
+
+                       rcu_read_lock();
+                       memcg = mem_cgroup_from_id(id);
+                       if (memcg && !css_tryget_online(&memcg->css))
+                               memcg = NULL;
+                       rcu_read_unlock();
+               }
        }
 
        if (PageTransHuge(page)) {
@@ -5310,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
        }
 
-       if (do_swap_account && PageSwapCache(page))
-               memcg = try_get_mem_cgroup_from_page(page);
        if (!memcg)
                memcg = get_mem_cgroup_from_mm(mm);
 
index eeda6485e76c27074ffb4462e1f6bd55d696b5fc..95882692e747c2a534488190287e5954fba35d39 100644 (file)
@@ -130,27 +130,15 @@ static int hwpoison_filter_flags(struct page *p)
  * can only guarantee that the page either belongs to the memcg tasks, or is
  * a freed page.
  */
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
 u64 hwpoison_filter_memcg;
 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 static int hwpoison_filter_task(struct page *p)
 {
-       struct mem_cgroup *mem;
-       struct cgroup_subsys_state *css;
-       unsigned long ino;
-
        if (!hwpoison_filter_memcg)
                return 0;
 
-       mem = try_get_mem_cgroup_from_page(p);
-       if (!mem)
-               return -EINVAL;
-
-       css = &mem->css;
-       ino = cgroup_ino(css->cgroup);
-       css_put(css);
-
-       if (ino != hwpoison_filter_memcg)
+       if (page_cgroup_ino(p) != hwpoison_filter_memcg)
                return -EINVAL;
 
        return 0;
index 6cd0b216040190f74087f61f4d8650f970176438..9cb27470fee991cb874676bb0cbc0f694b5e1d36 100644 (file)
@@ -3233,7 +3233,7 @@ out:
 static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pmd_t *pmd, unsigned int flags)
 {
-       if (!vma->vm_ops)
+       if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
        if (vma->vm_ops->pmd_fault)
                return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
@@ -3244,7 +3244,7 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
                        unsigned int flags)
 {
-       if (!vma->vm_ops)
+       if (vma_is_anonymous(vma))
                return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
        if (vma->vm_ops->pmd_fault)
                return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
index 02ce25df16c26476dd6a44528e6ff765a8442f5e..c3cb566af3e273a92e8353835b1cd6d03d64c7e3 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
                        __set_page_dirty_nobuffers(newpage);
        }
 
+       if (page_is_young(page))
+               set_page_young(newpage);
+       if (page_is_idle(page))
+               set_page_idle(newpage);
+
        /*
         * Copy NUMA information to the new page, to prevent over-eager
         * future migrations of this same page.
index b6be3249f0a923cf1735405726f17cc4de8085a7..971dd2cb77d227b792f29f0f0cdc5d3d8aa16634 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,6 +612,8 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
                struct rb_node **rb_link, struct rb_node *rb_parent)
 {
+       WARN_ONCE(vma->vm_file && !vma->vm_ops, "missing vma->vm_ops");
+
        /* Update tracking information for the gap following the new vma. */
        if (vma->vm_next)
                vma_gap_update(vma->vm_next);
@@ -1260,14 +1262,12 @@ static inline int mlock_future_check(struct mm_struct *mm,
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
-
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
-                       unsigned long flags, unsigned long pgoff,
-                       unsigned long *populate)
+                       unsigned long flags, vm_flags_t vm_flags,
+                       unsigned long pgoff, unsigned long *populate)
 {
        struct mm_struct *mm = current->mm;
-       vm_flags_t vm_flags;
 
        *populate = 0;
 
@@ -1311,7 +1311,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
-       vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+       vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
        if (flags & MAP_LOCKED)
@@ -1638,6 +1638,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
                 */
                WARN_ON_ONCE(addr != vma->vm_start);
 
+               /* All file mapping must have ->vm_ops set */
+               if (!vma->vm_ops) {
+                       static const struct vm_operations_struct dummy_ops = {};
+                       vma->vm_ops = &dummy_ops;
+               }
+
                addr = vma->vm_start;
                vm_flags = vma->vm_flags;
        } else if (vm_flags & VM_SHARED) {
index 3b9b3d0741b2a1546837761d90f7eec2c0b3b18b..5fbdd367bbed9c57bc9ffd600293a5913b44e752 100644 (file)
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return young;
 }
 
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+                              unsigned long start,
+                              unsigned long end)
+{
+       struct mmu_notifier *mn;
+       int young = 0, id;
+
+       id = srcu_read_lock(&srcu);
+       hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+               if (mn->ops->clear_young)
+                       young |= mn->ops->clear_young(mn, mm, start, end);
+       }
+       srcu_read_unlock(&srcu, id);
+
+       return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
                              unsigned long address)
 {
index 1cc0709fcaa5312351ce69d548afadda5ce80097..ab14a2014dea76b62e77b0176d810037e1c76788 100644 (file)
@@ -1233,18 +1233,19 @@ enomem:
 /*
  * handle mapping creation for uClinux
  */
-unsigned long do_mmap_pgoff(struct file *file,
-                           unsigned long addr,
-                           unsigned long len,
-                           unsigned long prot,
-                           unsigned long flags,
-                           unsigned long pgoff,
-                           unsigned long *populate)
+unsigned long do_mmap(struct file *file,
+                       unsigned long addr,
+                       unsigned long len,
+                       unsigned long prot,
+                       unsigned long flags,
+                       vm_flags_t vm_flags,
+                       unsigned long pgoff,
+                       unsigned long *populate)
 {
        struct vm_area_struct *vma;
        struct vm_region *region;
        struct rb_node *rb;
-       unsigned long capabilities, vm_flags, result;
+       unsigned long capabilities, result;
        int ret;
 
        *populate = 0;
@@ -1262,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 
        /* we've determined that we can make the mapping, now translate what we
         * now know into VMA flags */
-       vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+       vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
 
        /* we're going to need to record the mapping */
        region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
index d86fd2f5353fcb05d39c27887f64b81e148a2bc6..292ca7b8debd2c27c87d056e1ea4872d3094c756 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
 #include <linux/page_owner.h>
+#include <linux/page_idle.h>
 
 /*
  * struct page extension
@@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = {
 #ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
 #endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+       &page_idle_ops,
+#endif
 };
 
 static unsigned long total_usage;
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644 (file)
index 0000000..d5dd790
--- /dev/null
@@ -0,0 +1,232 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_ext.h>
+#include <linux/page_idle.h>
+
+#define BITMAP_CHUNK_SIZE      sizeof(u64)
+#define BITMAP_CHUNK_BITS      (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page(unsigned long pfn)
+{
+       struct page *page;
+       struct zone *zone;
+
+       if (!pfn_valid(pfn))
+               return NULL;
+
+       page = pfn_to_page(pfn);
+       if (!page || !PageLRU(page) ||
+           !get_page_unless_zero(page))
+               return NULL;
+
+       zone = page_zone(page);
+       spin_lock_irq(&zone->lru_lock);
+       if (unlikely(!PageLRU(page))) {
+               put_page(page);
+               page = NULL;
+       }
+       spin_unlock_irq(&zone->lru_lock);
+       return page;
+}
+
+static int page_idle_clear_pte_refs_one(struct page *page,
+                                       struct vm_area_struct *vma,
+                                       unsigned long addr, void *arg)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       spinlock_t *ptl;
+       pmd_t *pmd;
+       pte_t *pte;
+       bool referenced = false;
+
+       if (unlikely(PageTransHuge(page))) {
+               pmd = page_check_address_pmd(page, mm, addr,
+                                            PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+               if (pmd) {
+                       referenced = pmdp_clear_young_notify(vma, addr, pmd);
+                       spin_unlock(ptl);
+               }
+       } else {
+               pte = page_check_address(page, mm, addr, &ptl, 0);
+               if (pte) {
+                       referenced = ptep_clear_young_notify(vma, addr, pte);
+                       pte_unmap_unlock(pte, ptl);
+               }
+       }
+       if (referenced) {
+               clear_page_idle(page);
+               /*
+                * We cleared the referenced bit in a mapping to this page. To
+                * avoid interference with page reclaim, mark it young so that
+                * page_referenced() will return > 0.
+                */
+               set_page_young(page);
+       }
+       return SWAP_AGAIN;
+}
+
+static void page_idle_clear_pte_refs(struct page *page)
+{
+       /*
+        * Since rwc.arg is unused, rwc is effectively immutable, so we
+        * can make it static const to save some cycles and stack.
+        */
+       static const struct rmap_walk_control rwc = {
+               .rmap_one = page_idle_clear_pte_refs_one,
+               .anon_lock = page_lock_anon_vma_read,
+       };
+       bool need_lock;
+
+       if (!page_mapped(page) ||
+           !page_rmapping(page))
+               return;
+
+       need_lock = !PageAnon(page) || PageKsm(page);
+       if (need_lock && !trylock_page(page))
+               return;
+
+       rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+       if (need_lock)
+               unlock_page(page);
+}
+
+static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
+                                    struct bin_attribute *attr, char *buf,
+                                    loff_t pos, size_t count)
+{
+       u64 *out = (u64 *)buf;
+       struct page *page;
+       unsigned long pfn, end_pfn;
+       int bit;
+
+       if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+               return -EINVAL;
+
+       pfn = pos * BITS_PER_BYTE;
+       if (pfn >= max_pfn)
+               return 0;
+
+       end_pfn = pfn + count * BITS_PER_BYTE;
+       if (end_pfn > max_pfn)
+               end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+       for (; pfn < end_pfn; pfn++) {
+               bit = pfn % BITMAP_CHUNK_BITS;
+               if (!bit)
+                       *out = 0ULL;
+               page = page_idle_get_page(pfn);
+               if (page) {
+                       if (page_is_idle(page)) {
+                               /*
+                                * The page might have been referenced via a
+                                * pte, in which case it is not idle. Clear
+                                * refs and recheck.
+                                */
+                               page_idle_clear_pte_refs(page);
+                               if (page_is_idle(page))
+                                       *out |= 1ULL << bit;
+                       }
+                       put_page(page);
+               }
+               if (bit == BITMAP_CHUNK_BITS - 1)
+                       out++;
+               cond_resched();
+       }
+       return (char *)out - buf;
+}
+
+static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
+                                     struct bin_attribute *attr, char *buf,
+                                     loff_t pos, size_t count)
+{
+       const u64 *in = (u64 *)buf;
+       struct page *page;
+       unsigned long pfn, end_pfn;
+       int bit;
+
+       if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+               return -EINVAL;
+
+       pfn = pos * BITS_PER_BYTE;
+       if (pfn >= max_pfn)
+               return -ENXIO;
+
+       end_pfn = pfn + count * BITS_PER_BYTE;
+       if (end_pfn > max_pfn)
+               end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+       for (; pfn < end_pfn; pfn++) {
+               bit = pfn % BITMAP_CHUNK_BITS;
+               if ((*in >> bit) & 1) {
+                       page = page_idle_get_page(pfn);
+                       if (page) {
+                               page_idle_clear_pte_refs(page);
+                               set_page_idle(page);
+                               put_page(page);
+                       }
+               }
+               if (bit == BITMAP_CHUNK_BITS - 1)
+                       in++;
+               cond_resched();
+       }
+       return (char *)in - buf;
+}
+
+static struct bin_attribute page_idle_bitmap_attr =
+               __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
+                          page_idle_bitmap_read, page_idle_bitmap_write, 0);
+
+static struct bin_attribute *page_idle_bin_attrs[] = {
+       &page_idle_bitmap_attr,
+       NULL,
+};
+
+static struct attribute_group page_idle_attr_group = {
+       .bin_attrs = page_idle_bin_attrs,
+       .name = "page_idle",
+};
+
+#ifndef CONFIG_64BIT
+static bool need_page_idle(void)
+{
+       return true;
+}
+struct page_ext_operations page_idle_ops = {
+       .need = need_page_idle,
+};
+#endif
+
+static int __init page_idle_init(void)
+{
+       int err;
+
+       err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
+       if (err) {
+               pr_err("page_idle: register sysfs failed\n");
+               return err;
+       }
+       return 0;
+}
+subsys_initcall(page_idle_init);
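
Editor's note: the two handlers above export the idle flags as a binary sysfs file. Byte offset pos maps to pfn pos * BITS_PER_BYTE, each u64 chunk covers 64 pfns, and both offset and length must be multiples of 8 bytes or -EINVAL is returned. A minimal userspace sketch of the intended usage, assuming the /sys/kernel/mm/page_idle/bitmap path created by page_idle_init() above and a suitably privileged caller (illustrative only, not part of the patch):

/* Hypothetical userspace sketch: mark 64 pfns idle, run a workload, re-read.
 * Offsets and sizes must be multiples of 8 bytes (BITMAP_CHUNK_SIZE),
 * matching the -EINVAL checks in page_idle_bitmap_{read,write}() above.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define BITMAP_PATH "/sys/kernel/mm/page_idle/bitmap"

int main(void)
{
	uint64_t chunk = ~0ULL;		/* all 64 bits set -> mark 64 pfns idle */
	unsigned long pfn = 0x10000;	/* example pfn, multiple of 64 here */
	off_t off = pfn / 64 * 8;	/* byte offset of the chunk holding pfn */
	int fd = open(BITMAP_PATH, O_RDWR);

	if (fd < 0)
		return 1;
	if (pwrite(fd, &chunk, sizeof(chunk), off) != sizeof(chunk))
		return 1;
	/* ... run the workload of interest here ... */
	if (pread(fd, &chunk, sizeof(chunk), off) != sizeof(chunk))
		return 1;
	printf("pfns %lx-%lx idle mask: %llx\n", pfn, pfn + 63,
	       (unsigned long long)chunk);
	close(fd);
	return 0;
}
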
index 0db38e7d0a72b20ce63a6653ba24934ac3ce7825..f5b5c1f3dcd755ae313bba1404f2c9b079d5c18f 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -59,6 +59,7 @@
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                pte_unmap_unlock(pte, ptl);
        }
 
+       if (referenced)
+               clear_page_idle(page);
+       if (test_and_clear_page_young(page))
+               referenced++;
+
        if (referenced) {
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags;
index a3a0a2f1f7c3dc48c43494b949af6aee66adcf8f..983f692a47fdfbb80505fa77f673b9af37d08739 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
 #include <linux/gfp.h>
 #include <linux/uio.h>
 #include <linux/hugetlb.h>
+#include <linux/page_idle.h>
 
 #include "internal.h"
 
@@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page)
        } else if (!PageReferenced(page)) {
                SetPageReferenced(page);
        }
+       if (page_is_idle(page))
+               clear_page_idle(page);
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
index 68d2dd8ed2d8c6b43b6ed8e3f2f9cb29a287049f..8f670d3e87060f6277f5651a79cfaa8d27a30713 100644 (file)
@@ -99,6 +99,39 @@ static void zpool_put_driver(struct zpool_driver *driver)
        module_put(driver->owner);
 }
 
+/**
+ * zpool_has_pool() - Check if the pool driver is available
+ * @type       The type of the zpool to check (e.g. zbud, zsmalloc)
+ *
+ * This checks if the @type pool driver is available.  It will try to load
+ * the requested module, if needed, but there is no guarantee the module will
+ * still be loaded and available immediately after calling.  If this returns
+ * true, the caller should assume the pool is available, but must still be
+ * prepared to handle zpool_create_pool() failing.  If this returns false,
+ * the requested pool type is not available: its module either does not
+ * exist or could not be loaded, and calling zpool_create_pool() with that
+ * type will fail.
+ *
+ * Returns: true if @type pool is available, false if not
+ */
+bool zpool_has_pool(char *type)
+{
+       struct zpool_driver *driver = zpool_get_driver(type);
+
+       if (!driver) {
+               request_module("zpool-%s", type);
+               driver = zpool_get_driver(type);
+       }
+
+       if (!driver)
+               return false;
+
+       zpool_put_driver(driver);
+       return true;
+}
+EXPORT_SYMBOL(zpool_has_pool);
+
 /**
  * zpool_create_pool() - Create a new zpool
  * @type       The type of the zpool to create (e.g. zbud, zsmalloc)
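
Editor's note: zpool_has_pool() lets a caller probe for a backend before committing to it; the zswap changes further down use it exactly this way in __zswap_pool_create_fallback(). A hedged sketch of the calling pattern (the function and fallback name are illustrative, not from this patch):

#include <linux/printk.h>
#include <linux/string.h>
#include <linux/zpool.h>

/* Illustrative caller: probe the requested backend and fall back to zbud
 * if it cannot be loaded.  Creation can still fail later, so the caller
 * must still check the result of zpool_create_pool().
 */
static void example_pick_zpool(char *type, size_t len)
{
	if (zpool_has_pool(type))
		return;
	pr_info("zpool %s not available, falling back to zbud\n", type);
	strlcpy(type, "zbud", len);
}
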
index 48a1d081e2a5f1ccba77570b8f7215a8f69c6302..4043df7c672fb6f5b1be298b8d510fd17a3bbf42 100644 (file)
@@ -80,85 +80,54 @@ static u64 zswap_duplicate_entry;
 static bool zswap_enabled;
 module_param_named(enabled, zswap_enabled, bool, 0644);
 
-/* Compressor to be used by zswap (fixed at boot for now) */
+/* Crypto compressor to use */
 #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-module_param_named(compressor, zswap_compressor, charp, 0444);
-
-/* The maximum percentage of memory that the compressed pool can occupy */
-static unsigned int zswap_max_pool_percent = 20;
-module_param_named(max_pool_percent,
-                       zswap_max_pool_percent, uint, 0644);
+static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
+static struct kparam_string zswap_compressor_kparam = {
+       .string =       zswap_compressor,
+       .maxlen =       sizeof(zswap_compressor),
+};
+static int zswap_compressor_param_set(const char *,
+                                     const struct kernel_param *);
+static struct kernel_param_ops zswap_compressor_param_ops = {
+       .set =          zswap_compressor_param_set,
+       .get =          param_get_string,
+};
+module_param_cb(compressor, &zswap_compressor_param_ops,
+               &zswap_compressor_kparam, 0644);
 
-/* Compressed storage to use */
+/* Compressed storage zpool to use */
 #define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
-module_param_named(zpool, zswap_zpool_type, charp, 0444);
+static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
+static struct kparam_string zswap_zpool_kparam = {
+       .string =       zswap_zpool_type,
+       .maxlen =       sizeof(zswap_zpool_type),
+};
+static int zswap_zpool_param_set(const char *, const struct kernel_param *);
+static struct kernel_param_ops zswap_zpool_param_ops = {
+       .set =  zswap_zpool_param_set,
+       .get =  param_get_string,
+};
+module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
 
-/* zpool is shared by all of zswap backend  */
-static struct zpool *zswap_pool;
+/* The maximum percentage of memory that the compressed pool can occupy */
+static unsigned int zswap_max_pool_percent = 20;
+module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
 
 /*********************************
-* compression functions
+* data structures
 **********************************/
-/* per-cpu compression transforms */
-static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
 
-enum comp_op {
-       ZSWAP_COMPOP_COMPRESS,
-       ZSWAP_COMPOP_DECOMPRESS
+struct zswap_pool {
+       struct zpool *zpool;
+       struct crypto_comp * __percpu *tfm;
+       struct kref kref;
+       struct list_head list;
+       struct rcu_head rcu_head;
+       struct notifier_block notifier;
+       char tfm_name[CRYPTO_MAX_ALG_NAME];
 };
 
-static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
-                               u8 *dst, unsigned int *dlen)
-{
-       struct crypto_comp *tfm;
-       int ret;
-
-       tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
-       switch (op) {
-       case ZSWAP_COMPOP_COMPRESS:
-               ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
-               break;
-       case ZSWAP_COMPOP_DECOMPRESS:
-               ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-       put_cpu();
-       return ret;
-}
-
-static int __init zswap_comp_init(void)
-{
-       if (!crypto_has_comp(zswap_compressor, 0, 0)) {
-               pr_info("%s compressor not available\n", zswap_compressor);
-               /* fall back to default compressor */
-               zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-               if (!crypto_has_comp(zswap_compressor, 0, 0))
-                       /* can't even load the default compressor */
-                       return -ENODEV;
-       }
-       pr_info("using %s compressor\n", zswap_compressor);
-
-       /* alloc percpu transforms */
-       zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
-       if (!zswap_comp_pcpu_tfms)
-               return -ENOMEM;
-       return 0;
-}
-
-static void __init zswap_comp_exit(void)
-{
-       /* free percpu transforms */
-       free_percpu(zswap_comp_pcpu_tfms);
-}
-
-/*********************************
-* data structures
-**********************************/
 /*
  * struct zswap_entry
  *
@@ -166,22 +135,24 @@ static void __init zswap_comp_exit(void)
  * page within zswap.
  *
  * rbnode - links the entry into red-black tree for the appropriate swap type
+ * offset - the swap offset for the entry.  Index into the red-black tree.
  * refcount - the number of outstanding reference to the entry. This is needed
  *            to protect against premature freeing of the entry by code
  *            concurrent calls to load, invalidate, and writeback.  The lock
  *            for the zswap_tree structure that contains the entry must
  *            be held while changing the refcount.  Since the lock must
  *            be held, there is no reason to also make refcount atomic.
- * offset - the swap offset for the entry.  Index into the red-black tree.
- * handle - zpool allocation handle that stores the compressed page data
  * length - the length in bytes of the compressed page data.  Needed during
  *          decompression
+ * pool - the zswap_pool the entry's data is in
+ * handle - zpool allocation handle that stores the compressed page data
  */
 struct zswap_entry {
        struct rb_node rbnode;
        pgoff_t offset;
        int refcount;
        unsigned int length;
+       struct zswap_pool *pool;
        unsigned long handle;
 };
 
@@ -201,6 +172,51 @@ struct zswap_tree {
 
 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
 
+/* RCU-protected iteration */
+static LIST_HEAD(zswap_pools);
+/* protects zswap_pools list modification */
+static DEFINE_SPINLOCK(zswap_pools_lock);
+
+/* used by param callback function */
+static bool zswap_init_started;
+
+/*********************************
+* helpers and fwd declarations
+**********************************/
+
+#define zswap_pool_debug(msg, p)                               \
+       pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
+                zpool_get_type((p)->zpool))
+
+static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_pool_get(struct zswap_pool *pool);
+static void zswap_pool_put(struct zswap_pool *pool);
+
+static const struct zpool_ops zswap_zpool_ops = {
+       .evict = zswap_writeback_entry
+};
+
+static bool zswap_is_full(void)
+{
+       return totalram_pages * zswap_max_pool_percent / 100 <
+               DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+}
+
+static void zswap_update_total_size(void)
+{
+       struct zswap_pool *pool;
+       u64 total = 0;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list)
+               total += zpool_get_total_size(pool->zpool);
+
+       rcu_read_unlock();
+
+       zswap_pool_total_size = total;
+}
+
 /*********************************
 * zswap entry functions
 **********************************/
@@ -294,10 +310,11 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
  */
 static void zswap_free_entry(struct zswap_entry *entry)
 {
-       zpool_free(zswap_pool, entry->handle);
+       zpool_free(entry->pool->zpool, entry->handle);
+       zswap_pool_put(entry->pool);
        zswap_entry_cache_free(entry);
        atomic_dec(&zswap_stored_pages);
-       zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+       zswap_update_total_size();
 }
 
 /* caller must hold the tree lock */
@@ -339,35 +356,21 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
 **********************************/
 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
 
-static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
+static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
 {
-       struct crypto_comp *tfm;
        u8 *dst;
 
        switch (action) {
        case CPU_UP_PREPARE:
-               tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
-               if (IS_ERR(tfm)) {
-                       pr_err("can't allocate compressor transform\n");
-                       return NOTIFY_BAD;
-               }
-               *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
                dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
                if (!dst) {
                        pr_err("can't allocate compressor buffer\n");
-                       crypto_free_comp(tfm);
-                       *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
                        return NOTIFY_BAD;
                }
                per_cpu(zswap_dstmem, cpu) = dst;
                break;
        case CPU_DEAD:
        case CPU_UP_CANCELED:
-               tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
-               if (tfm) {
-                       crypto_free_comp(tfm);
-                       *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
-               }
                dst = per_cpu(zswap_dstmem, cpu);
                kfree(dst);
                per_cpu(zswap_dstmem, cpu) = NULL;
@@ -378,43 +381,398 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
        return NOTIFY_OK;
 }
 
-static int zswap_cpu_notifier(struct notifier_block *nb,
-                               unsigned long action, void *pcpu)
+static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
+                                    unsigned long action, void *pcpu)
 {
-       unsigned long cpu = (unsigned long)pcpu;
-       return __zswap_cpu_notifier(action, cpu);
+       return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
 }
 
-static struct notifier_block zswap_cpu_notifier_block = {
-       .notifier_call = zswap_cpu_notifier
+static struct notifier_block zswap_dstmem_notifier = {
+       .notifier_call =        zswap_cpu_dstmem_notifier,
 };
 
-static int __init zswap_cpu_init(void)
+static int __init zswap_cpu_dstmem_init(void)
+{
+       unsigned long cpu;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
+                   NOTIFY_BAD)
+                       goto cleanup;
+       __register_cpu_notifier(&zswap_dstmem_notifier);
+       cpu_notifier_register_done();
+       return 0;
+
+cleanup:
+       for_each_online_cpu(cpu)
+               __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
+       cpu_notifier_register_done();
+       return -ENOMEM;
+}
+
+static void zswap_cpu_dstmem_destroy(void)
+{
+       unsigned long cpu;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
+       __unregister_cpu_notifier(&zswap_dstmem_notifier);
+       cpu_notifier_register_done();
+}
+
+static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
+                                    unsigned long action, unsigned long cpu)
+{
+       struct crypto_comp *tfm;
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+               if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
+                       break;
+               tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
+               if (IS_ERR_OR_NULL(tfm)) {
+                       pr_err("could not alloc crypto comp %s : %ld\n",
+                              pool->tfm_name, PTR_ERR(tfm));
+                       return NOTIFY_BAD;
+               }
+               *per_cpu_ptr(pool->tfm, cpu) = tfm;
+               break;
+       case CPU_DEAD:
+       case CPU_UP_CANCELED:
+               tfm = *per_cpu_ptr(pool->tfm, cpu);
+               if (!IS_ERR_OR_NULL(tfm))
+                       crypto_free_comp(tfm);
+               *per_cpu_ptr(pool->tfm, cpu) = NULL;
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static int zswap_cpu_comp_notifier(struct notifier_block *nb,
+                                  unsigned long action, void *pcpu)
+{
+       unsigned long cpu = (unsigned long)pcpu;
+       struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
+
+       return __zswap_cpu_comp_notifier(pool, action, cpu);
+}
+
+static int zswap_cpu_comp_init(struct zswap_pool *pool)
 {
        unsigned long cpu;
 
+       memset(&pool->notifier, 0, sizeof(pool->notifier));
+       pool->notifier.notifier_call = zswap_cpu_comp_notifier;
+
        cpu_notifier_register_begin();
        for_each_online_cpu(cpu)
-               if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
+               if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
+                   NOTIFY_BAD)
                        goto cleanup;
-       __register_cpu_notifier(&zswap_cpu_notifier_block);
+       __register_cpu_notifier(&pool->notifier);
        cpu_notifier_register_done();
        return 0;
 
 cleanup:
        for_each_online_cpu(cpu)
-               __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
+               __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
        cpu_notifier_register_done();
        return -ENOMEM;
 }
 
+static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
+{
+       unsigned long cpu;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
+       __unregister_cpu_notifier(&pool->notifier);
+       cpu_notifier_register_done();
+}
+
 /*********************************
-* helpers
+* pool functions
 **********************************/
-static bool zswap_is_full(void)
+
+static struct zswap_pool *__zswap_pool_current(void)
 {
-       return totalram_pages * zswap_max_pool_percent / 100 <
-               DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+       struct zswap_pool *pool;
+
+       pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+       WARN_ON(!pool);
+
+       return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+       assert_spin_locked(&zswap_pools_lock);
+
+       return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+       struct zswap_pool *pool;
+
+       rcu_read_lock();
+
+       pool = __zswap_pool_current();
+       if (!pool || !zswap_pool_get(pool))
+               pool = NULL;
+
+       rcu_read_unlock();
+
+       return pool;
+}
+
+static struct zswap_pool *zswap_pool_last_get(void)
+{
+       struct zswap_pool *pool, *last = NULL;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list)
+               last = pool;
+       if (!WARN_ON(!last) && !zswap_pool_get(last))
+               last = NULL;
+
+       rcu_read_unlock();
+
+       return last;
+}
+
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+       struct zswap_pool *pool;
+
+       assert_spin_locked(&zswap_pools_lock);
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list) {
+               if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
+                       continue;
+               if (strncmp(zpool_get_type(pool->zpool), type,
+                           sizeof(zswap_zpool_type)))
+                       continue;
+               /* if we can't get it, it's about to be destroyed */
+               if (!zswap_pool_get(pool))
+                       continue;
+               return pool;
+       }
+
+       return NULL;
+}
+
+static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
+{
+       struct zswap_pool *pool;
+       gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+
+       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+       if (!pool) {
+               pr_err("pool alloc failed\n");
+               return NULL;
+       }
+
+       pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops);
+       if (!pool->zpool) {
+               pr_err("%s zpool not available\n", type);
+               goto error;
+       }
+       pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+
+       strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+       pool->tfm = alloc_percpu(struct crypto_comp *);
+       if (!pool->tfm) {
+               pr_err("percpu alloc failed\n");
+               goto error;
+       }
+
+       if (zswap_cpu_comp_init(pool))
+               goto error;
+       pr_debug("using %s compressor\n", pool->tfm_name);
+
+       /* being the current pool takes 1 ref; this func expects the
+        * caller to always add the new pool as the current pool
+        */
+       kref_init(&pool->kref);
+       INIT_LIST_HEAD(&pool->list);
+
+       zswap_pool_debug("created", pool);
+
+       return pool;
+
+error:
+       free_percpu(pool->tfm);
+       if (pool->zpool)
+               zpool_destroy_pool(pool->zpool);
+       kfree(pool);
+       return NULL;
+}
+
+static struct zswap_pool *__zswap_pool_create_fallback(void)
+{
+       if (!crypto_has_comp(zswap_compressor, 0, 0)) {
+               pr_err("compressor %s not available, using default %s\n",
+                      zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
+               strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
+                       sizeof(zswap_compressor));
+       }
+       if (!zpool_has_pool(zswap_zpool_type)) {
+               pr_err("zpool %s not available, using default %s\n",
+                      zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
+               strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
+                       sizeof(zswap_zpool_type));
+       }
+
+       return zswap_pool_create(zswap_zpool_type, zswap_compressor);
+}
+
+static void zswap_pool_destroy(struct zswap_pool *pool)
+{
+       zswap_pool_debug("destroying", pool);
+
+       zswap_cpu_comp_destroy(pool);
+       free_percpu(pool->tfm);
+       zpool_destroy_pool(pool->zpool);
+       kfree(pool);
+}
+
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+       return kref_get_unless_zero(&pool->kref);
+}
+
+static void __zswap_pool_release(struct rcu_head *head)
+{
+       struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
+
+       /* nobody should have been able to get a kref... */
+       WARN_ON(kref_get_unless_zero(&pool->kref));
+
+       /* pool is now off zswap_pools list and has no references. */
+       zswap_pool_destroy(pool);
+}
+
+static void __zswap_pool_empty(struct kref *kref)
+{
+       struct zswap_pool *pool;
+
+       pool = container_of(kref, typeof(*pool), kref);
+
+       spin_lock(&zswap_pools_lock);
+
+       WARN_ON(pool == zswap_pool_current());
+
+       list_del_rcu(&pool->list);
+       call_rcu(&pool->rcu_head, __zswap_pool_release);
+
+       spin_unlock(&zswap_pools_lock);
+}
+
+static void zswap_pool_put(struct zswap_pool *pool)
+{
+       kref_put(&pool->kref, __zswap_pool_empty);
+}
+
+/*********************************
+* param callbacks
+**********************************/
+
+static int __zswap_param_set(const char *val, const struct kernel_param *kp,
+                            char *type, char *compressor)
+{
+       struct zswap_pool *pool, *put_pool = NULL;
+       char str[kp->str->maxlen], *s;
+       int ret;
+
+       /*
+        * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
+        * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
+        * 32 (arbitrary).
+        */
+       strlcpy(str, val, kp->str->maxlen);
+       s = strim(str);
+
+       /* if this is load-time (pre-init) param setting,
+        * don't create a pool; that's done during init.
+        */
+       if (!zswap_init_started)
+               return param_set_copystring(s, kp);
+
+       /* no change required */
+       if (!strncmp(kp->str->string, s, kp->str->maxlen))
+               return 0;
+
+       if (!type) {
+               type = s;
+               if (!zpool_has_pool(type)) {
+                       pr_err("zpool %s not available\n", type);
+                       return -ENOENT;
+               }
+       } else if (!compressor) {
+               compressor = s;
+               if (!crypto_has_comp(compressor, 0, 0)) {
+                       pr_err("compressor %s not available\n", compressor);
+                       return -ENOENT;
+               }
+       }
+
+       spin_lock(&zswap_pools_lock);
+
+       pool = zswap_pool_find_get(type, compressor);
+       if (pool) {
+               zswap_pool_debug("using existing", pool);
+               list_del_rcu(&pool->list);
+       } else {
+               spin_unlock(&zswap_pools_lock);
+               pool = zswap_pool_create(type, compressor);
+               spin_lock(&zswap_pools_lock);
+       }
+
+       if (pool)
+               ret = param_set_copystring(s, kp);
+       else
+               ret = -EINVAL;
+
+       if (!ret) {
+               put_pool = zswap_pool_current();
+               list_add_rcu(&pool->list, &zswap_pools);
+       } else if (pool) {
+               /* add the possibly pre-existing pool to the end of the pools
+                * list; if it's new (and empty) then it'll be removed and
+                * destroyed by the put after we drop the lock
+                */
+               list_add_tail_rcu(&pool->list, &zswap_pools);
+               put_pool = pool;
+       }
+
+       spin_unlock(&zswap_pools_lock);
+
+       /* drop the ref from either the old current pool,
+        * or the new pool we failed to add
+        */
+       if (put_pool)
+               zswap_pool_put(put_pool);
+
+       return ret;
+}
+
+static int zswap_compressor_param_set(const char *val,
+                                     const struct kernel_param *kp)
+{
+       return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
+}
+
+static int zswap_zpool_param_set(const char *val,
+                                const struct kernel_param *kp)
+{
+       return __zswap_param_set(val, kp, NULL, zswap_compressor);
 }
 
 /*********************************
@@ -477,6 +835,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
        pgoff_t offset;
        struct zswap_entry *entry;
        struct page *page;
+       struct crypto_comp *tfm;
        u8 *src, *dst;
        unsigned int dlen;
        int ret;
@@ -517,13 +876,15 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
        case ZSWAP_SWAPCACHE_NEW: /* page is locked */
                /* decompress */
                dlen = PAGE_SIZE;
-               src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+               src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
                                ZPOOL_MM_RO) + sizeof(struct zswap_header);
                dst = kmap_atomic(page);
-               ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
-                               entry->length, dst, &dlen);
+               tfm = *get_cpu_ptr(entry->pool->tfm);
+               ret = crypto_comp_decompress(tfm, src, entry->length,
+                                            dst, &dlen);
+               put_cpu_ptr(entry->pool->tfm);
                kunmap_atomic(dst);
-               zpool_unmap_handle(zswap_pool, entry->handle);
+               zpool_unmap_handle(entry->pool->zpool, entry->handle);
                BUG_ON(ret);
                BUG_ON(dlen != PAGE_SIZE);
 
@@ -572,6 +933,22 @@ end:
        return ret;
 }
 
+static int zswap_shrink(void)
+{
+       struct zswap_pool *pool;
+       int ret;
+
+       pool = zswap_pool_last_get();
+       if (!pool)
+               return -ENOENT;
+
+       ret = zpool_shrink(pool->zpool, 1, NULL);
+
+       zswap_pool_put(pool);
+
+       return ret;
+}
+
 /*********************************
 * frontswap hooks
 **********************************/
@@ -581,6 +958,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 {
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry, *dupentry;
+       struct crypto_comp *tfm;
        int ret;
        unsigned int dlen = PAGE_SIZE, len;
        unsigned long handle;
@@ -596,7 +974,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
        /* reclaim space if needed */
        if (zswap_is_full()) {
                zswap_pool_limit_hit++;
-               if (zpool_shrink(zswap_pool, 1, NULL)) {
+               if (zswap_shrink()) {
                        zswap_reject_reclaim_fail++;
                        ret = -ENOMEM;
                        goto reject;
@@ -611,33 +989,42 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
                goto reject;
        }
 
+       /* if entry is successfully added, it keeps the reference */
+       entry->pool = zswap_pool_current_get();
+       if (!entry->pool) {
+               ret = -EINVAL;
+               goto freepage;
+       }
+
        /* compress */
        dst = get_cpu_var(zswap_dstmem);
+       tfm = *get_cpu_ptr(entry->pool->tfm);
        src = kmap_atomic(page);
-       ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
+       ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
        kunmap_atomic(src);
+       put_cpu_ptr(entry->pool->tfm);
        if (ret) {
                ret = -EINVAL;
-               goto freepage;
+               goto put_dstmem;
        }
 
        /* store */
        len = dlen + sizeof(struct zswap_header);
-       ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
-               &handle);
+       ret = zpool_malloc(entry->pool->zpool, len,
+                          __GFP_NORETRY | __GFP_NOWARN, &handle);
        if (ret == -ENOSPC) {
                zswap_reject_compress_poor++;
-               goto freepage;
+               goto put_dstmem;
        }
        if (ret) {
                zswap_reject_alloc_fail++;
-               goto freepage;
+               goto put_dstmem;
        }
-       zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
+       zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
        zhdr->swpentry = swp_entry(type, offset);
        buf = (u8 *)(zhdr + 1);
        memcpy(buf, dst, dlen);
-       zpool_unmap_handle(zswap_pool, handle);
+       zpool_unmap_handle(entry->pool->zpool, handle);
        put_cpu_var(zswap_dstmem);
 
        /* populate entry */
@@ -660,12 +1047,14 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 
        /* update stats */
        atomic_inc(&zswap_stored_pages);
-       zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+       zswap_update_total_size();
 
        return 0;
 
-freepage:
+put_dstmem:
        put_cpu_var(zswap_dstmem);
+       zswap_pool_put(entry->pool);
+freepage:
        zswap_entry_cache_free(entry);
 reject:
        return ret;
@@ -680,6 +1069,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 {
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry;
+       struct crypto_comp *tfm;
        u8 *src, *dst;
        unsigned int dlen;
        int ret;
@@ -696,13 +1086,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
        /* decompress */
        dlen = PAGE_SIZE;
-       src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+       src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
                        ZPOOL_MM_RO) + sizeof(struct zswap_header);
        dst = kmap_atomic(page);
-       ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
-               dst, &dlen);
+       tfm = *get_cpu_ptr(entry->pool->tfm);
+       ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
+       put_cpu_ptr(entry->pool->tfm);
        kunmap_atomic(dst);
-       zpool_unmap_handle(zswap_pool, entry->handle);
+       zpool_unmap_handle(entry->pool->zpool, entry->handle);
        BUG_ON(ret);
 
        spin_lock(&tree->lock);
@@ -755,10 +1146,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
        zswap_trees[type] = NULL;
 }
 
-static const struct zpool_ops zswap_zpool_ops = {
-       .evict = zswap_writeback_entry
-};
-
 static void zswap_frontswap_init(unsigned type)
 {
        struct zswap_tree *tree;
@@ -839,49 +1226,40 @@ static void __exit zswap_debugfs_exit(void) { }
 **********************************/
 static int __init init_zswap(void)
 {
-       gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+       struct zswap_pool *pool;
 
-       pr_info("loading zswap\n");
-
-       zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
-                                       &zswap_zpool_ops);
-       if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
-               pr_info("%s zpool not available\n", zswap_zpool_type);
-               zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
-               zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
-                                       &zswap_zpool_ops);
-       }
-       if (!zswap_pool) {
-               pr_err("%s zpool not available\n", zswap_zpool_type);
-               pr_err("zpool creation failed\n");
-               goto error;
-       }
-       pr_info("using %s pool\n", zswap_zpool_type);
+       zswap_init_started = true;
 
        if (zswap_entry_cache_create()) {
                pr_err("entry cache creation failed\n");
-               goto cachefail;
+               goto cache_fail;
        }
-       if (zswap_comp_init()) {
-               pr_err("compressor initialization failed\n");
-               goto compfail;
+
+       if (zswap_cpu_dstmem_init()) {
+               pr_err("dstmem alloc failed\n");
+               goto dstmem_fail;
        }
-       if (zswap_cpu_init()) {
-               pr_err("per-cpu initialization failed\n");
-               goto pcpufail;
+
+       pool = __zswap_pool_create_fallback();
+       if (!pool) {
+               pr_err("pool creation failed\n");
+               goto pool_fail;
        }
+       pr_info("loaded using pool %s/%s\n", pool->tfm_name,
+               zpool_get_type(pool->zpool));
+
+       list_add(&pool->list, &zswap_pools);
 
        frontswap_register_ops(&zswap_frontswap_ops);
        if (zswap_debugfs_init())
                pr_warn("debugfs initialization failed\n");
        return 0;
-pcpufail:
-       zswap_comp_exit();
-compfail:
+
+pool_fail:
+       zswap_cpu_dstmem_destroy();
+dstmem_fail:
        zswap_entry_cache_destroy();
-cachefail:
-       zpool_destroy_pool(zswap_pool);
-error:
+cache_fail:
        return -ENOMEM;
 }
 /* must be late so crypto has time to come up */
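
Editor's note: with the pool refactoring above, the compressor and zpool parameters become writable at runtime. module_param_cb() routes writes through __zswap_param_set(), which finds or creates a matching zswap_pool, makes it current under zswap_pools_lock, and retires the old one through kref/RCU. A stripped-down sketch of the same kparam_string/kernel_param_ops wiring in isolation (all names hypothetical, shown only to illustrate the pattern):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/string.h>

/* Hypothetical runtime-settable string parameter with validation,
 * mirroring the zswap compressor/zpool parameter wiring above.
 */
static char example_name[32] = "default";
static struct kparam_string example_name_kparam = {
	.string	= example_name,
	.maxlen	= sizeof(example_name),
};

static int example_name_set(const char *val, const struct kernel_param *kp)
{
	char str[32];
	char *s;

	strlcpy(str, val, sizeof(str));
	s = strim(str);			/* drop the trailing newline from sysfs */
	if (!*s)
		return -EINVAL;		/* reject empty names */
	return param_set_copystring(s, kp);
}

static struct kernel_param_ops example_name_ops = {
	.set	= example_name_set,
	.get	= param_get_string,
};
module_param_cb(name, &example_name_ops, &example_name_kparam, 0644);

MODULE_LICENSE("GPL");
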
index a51ca0e5beef4bab5d06de05f96a639672f69596..f2a1131b2f8baf06f28e286d6b9203d4d1a873f6 100755 (executable)
@@ -264,6 +264,7 @@ our $Sparse = qr{
                        __kernel|
                        __force|
                        __iomem|
+                       __pmem|
                        __must_check|
                        __init_refok|
                        __kprobes|
@@ -584,7 +585,7 @@ our $LvalOrFunc     = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*};
 our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)};
 
 our $declaration_macros = qr{(?x:
-       (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,2}\s*\(|
+       (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(|
        (?:$Storage\s+)?LIST_HEAD\s*\(|
        (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(
 )};
@@ -1953,9 +1954,9 @@ sub process {
        our $clean = 1;
        my $signoff = 0;
        my $is_patch = 0;
-
        my $in_header_lines = $file ? 0 : 1;
        my $in_commit_log = 0;          #Scanning lines before patch
+       my $commit_log_possible_stack_dump = 0;
        my $commit_log_long_line = 0;
        my $commit_log_has_diff = 0;
        my $reported_maintainer_file = 0;
@@ -2166,11 +2167,15 @@ sub process {
                if ($showfile) {
                        $prefix = "$realfile:$realline: "
                } elsif ($emacs) {
-                       $prefix = "$filename:$linenr: ";
+                       if ($file) {
+                               $prefix = "$filename:$realline: ";
+                       } else {
+                               $prefix = "$filename:$linenr: ";
+                       }
                }
 
                if ($found_file) {
-                       if ($realfile =~ m@^(drivers/net/|net/)@) {
+                       if ($realfile =~ m@^(?:drivers/net/|net/|drivers/staging/)@) {
                                $check = 1;
                        } else {
                                $check = $check_orig;
@@ -2310,16 +2315,42 @@ sub process {
 
 # Check for line lengths > 75 in commit log, warn once
                if ($in_commit_log && !$commit_log_long_line &&
-                   length($line) > 75) {
+                   length($line) > 75 &&
+                   !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
+                                       # file delta changes
+                     $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
+                                       # filename then :
+                     $line =~ /^\s*(?:Fixes:|Link:)/i ||
+                                       # A Fixes: or Link: line
+                     $commit_log_possible_stack_dump)) {
                        WARN("COMMIT_LOG_LONG_LINE",
                             "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr);
                        $commit_log_long_line = 1;
                }
 
+# Check if the commit log is in a possible stack dump
+               if ($in_commit_log && !$commit_log_possible_stack_dump &&
+                   ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
+                    $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
+                               # timestamp
+                    $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
+                               # stack dump address
+                       $commit_log_possible_stack_dump = 1;
+               }
+
+# Reset possible stack dump if a blank line is found
+               if ($in_commit_log && $commit_log_possible_stack_dump &&
+                   $line =~ /^\s*$/) {
+                       $commit_log_possible_stack_dump = 0;
+               }
+
 # Check for git id commit length and improperly formed commit descriptions
-               if ($in_commit_log && $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i) {
-                       my $init_char = $1;
-                       my $orig_commit = lc($2);
+               if ($in_commit_log &&
+                   ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
+                    ($line =~ /\b[0-9a-f]{12,40}\b/i &&
+                     $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
+                       my $init_char = "c";
+                       my $orig_commit = "";
                        my $short = 1;
                        my $long = 0;
                        my $case = 1;
@@ -2330,6 +2361,13 @@ sub process {
                        my $orig_desc = "commit description";
                        my $description = "";
 
+                       if ($line =~ /\b(c)ommit\s+([0-9a-f]{5,})\b/i) {
+                               $init_char = $1;
+                               $orig_commit = lc($2);
+                       } elsif ($line =~ /\b([0-9a-f]{12,40})\b/i) {
+                               $orig_commit = lc($1);
+                       }
+
                        $short = 0 if ($line =~ /\bcommit\s+[0-9a-f]{12,40}/i);
                        $long = 1 if ($line =~ /\bcommit\s+[0-9a-f]{41,}/i);
                        $space = 0 if ($line =~ /\bcommit [0-9a-f]/i);
@@ -2738,6 +2776,8 @@ sub process {
                        }
                }
 
+# Block comment styles
+# Networking with an initial /*
                if ($realfile =~ m@^(drivers/net/|net/)@ &&
                    $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ &&
                    $rawline =~ /^\+[ \t]*\*/ &&
@@ -2746,22 +2786,23 @@ sub process {
                             "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev);
                }
 
-               if ($realfile =~ m@^(drivers/net/|net/)@ &&
-                   $prevrawline =~ /^\+[ \t]*\/\*/ &&          #starting /*
+# Block comments use * on subsequent lines
+               if ($prevline =~ /$;[ \t]*$/ &&                 #ends in comment
+                   $prevrawline =~ /^\+.*?\/\*/ &&             #starting /*
                    $prevrawline !~ /\*\/[ \t]*$/ &&            #no trailing */
                    $rawline =~ /^\+/ &&                        #line is new
                    $rawline !~ /^\+[ \t]*\*/) {                #no leading *
-                       WARN("NETWORKING_BLOCK_COMMENT_STYLE",
-                            "networking block comments start with * on subsequent lines\n" . $hereprev);
+                       WARN("BLOCK_COMMENT_STYLE",
+                            "Block comments use * on subsequent lines\n" . $hereprev);
                }
 
-               if ($realfile =~ m@^(drivers/net/|net/)@ &&
-                   $rawline !~ m@^\+[ \t]*\*/[ \t]*$@ &&       #trailing */
+# Block comments use */ on trailing lines
+               if ($rawline !~ m@^\+[ \t]*\*/[ \t]*$@ &&       #trailing */
                    $rawline !~ m@^\+.*/\*.*\*/[ \t]*$@ &&      #inline /*...*/
                    $rawline !~ m@^\+.*\*{2,}/[ \t]*$@ &&       #trailing **/
                    $rawline =~ m@^\+[ \t]*.+\*\/[ \t]*$@) {    #non blank */
-                       WARN("NETWORKING_BLOCK_COMMENT_STYLE",
-                            "networking block comments put the trailing */ on a separate line\n" . $herecurr);
+                       WARN("BLOCK_COMMENT_STYLE",
+                            "Block comments use a trailing */ on a separate line\n" . $herecurr);
                }
 
 # check for missing blank lines after struct/union declarations
@@ -3067,15 +3108,22 @@ sub process {
 
                        substr($s, 0, length($c), '');
 
-                       # Make sure we remove the line prefixes as we have
-                       # none on the first line, and are going to readd them
-                       # where necessary.
-                       $s =~ s/\n./\n/gs;
+                       # remove inline comments
+                       $s =~ s/$;/ /g;
+                       $c =~ s/$;/ /g;
 
                        # Find out how long the conditional actually is.
                        my @newlines = ($c =~ /\n/gs);
                        my $cond_lines = 1 + $#newlines;
 
+                       # Make sure we remove the line prefixes as we have
+                       # none on the first line, and are going to readd them
+                       # where necessary.
+                       $s =~ s/\n./\n/gs;
+                       while ($s =~ /\n\s+\\\n/) {
+                               $cond_lines += $s =~ s/\n\s+\\\n/\n/g;
+                       }
+
                        # We want to check the first line inside the block
                        # starting at the end of the conditional, so remove:
                        #  1) any blank line termination
@@ -3141,8 +3189,10 @@ sub process {
 
                        #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";
 
-                       if ($check && (($sindent % 8) != 0 ||
-                           ($sindent <= $indent && $s ne ''))) {
+                       if ($check && $s ne '' &&
+                           (($sindent % 8) != 0 ||
+                            ($sindent < $indent) ||
+                            ($sindent > $indent + 8))) {
                                WARN("SUSPECT_CODE_INDENT",
                                     "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
                        }
@@ -3439,13 +3489,15 @@ sub process {
                        }
                }
 
-# # no BUG() or BUG_ON()
-#              if ($line =~ /\b(BUG|BUG_ON)\b/) {
-#                      print "Try to use WARN_ON & Recovery code rather than BUG() or BUG_ON()\n";
-#                      print "$herecurr";
-#                      $clean = 0;
-#              }
+# avoid BUG() or BUG_ON()
+               if ($line =~ /\b(?:BUG|BUG_ON)\b/) {
+                       my $msg_type = \&WARN;
+                       $msg_type = \&CHK if ($file);
+                       &{$msg_type}("AVOID_BUG",
+                                    "Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()\n" . $herecurr);
+               }
 
+# avoid LINUX_VERSION_CODE
                if ($line =~ /\bLINUX_VERSION_CODE\b/) {
                        WARN("LINUX_VERSION_CODE",
                             "LINUX_VERSION_CODE should be avoided, code should be for the version to which it is merged\n" . $herecurr);
@@ -3520,7 +3572,7 @@ sub process {
 # function brace can't be on same line, except for #defines of do while,
 # or if closed on same line
                if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and
-                   !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) {
+                   !($line=~/\#\s*define.*do\s\{/) and !($line=~/}/)) {
                        if (ERROR("OPEN_BRACE",
                                  "open brace '{' following function declarations go on the next line\n" . $herecurr) &&
                            $fix) {
@@ -4032,8 +4084,8 @@ sub process {
 ##             }
 
 #need space before brace following if, while, etc
-               if (($line =~ /\(.*\){/ && $line !~ /\($Type\){/) ||
-                   $line =~ /do{/) {
+               if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\){/) ||
+                   $line =~ /do\{/) {
                        if (ERROR("SPACING",
                                  "space required before the open brace '{'\n" . $herecurr) &&
                            $fix) {
@@ -4179,6 +4231,35 @@ sub process {
                        }
                }
 
+# comparisons with a constant or upper case identifier on the left
+#      avoid cases like "foo + BAR < baz"
+#      only fix matches surrounded by parentheses to avoid incorrect
+#      conversions like "FOO < baz() + 5" being "misfixed" to "baz() > FOO + 5"
+               if ($^V && $^V ge 5.10.0 &&
+                   $line =~ /^\+(.*)\b($Constant|[A-Z_][A-Z0-9_]*)\s*($Compare)\s*($LvalOrFunc)/) {
+                       my $lead = $1;
+                       my $const = $2;
+                       my $comp = $3;
+                       my $to = $4;
+                       my $newcomp = $comp;
+                       if ($lead !~ /$Operators\s*$/ &&
+                           $to !~ /^(?:Constant|[A-Z_][A-Z0-9_]*)$/ &&
+                           WARN("CONSTANT_COMPARISON",
+                                "Comparisons should place the constant on the right side of the test\n" . $herecurr) &&
+                           $fix) {
+                               if ($comp eq "<") {
+                                       $newcomp = ">";
+                               } elsif ($comp eq "<=") {
+                                       $newcomp = ">=";
+                               } elsif ($comp eq ">") {
+                                       $newcomp = "<";
+                               } elsif ($comp eq ">=") {
+                                       $newcomp = "<=";
+                               }
+                               $fixed[$fixlinenr] =~ s/\(\s*\Q$const\E\s*$Compare\s*\Q$to\E\s*\)/($to $newcomp $const)/;
+                       }
+               }
+
 # Return of what appears to be an errno should normally be negative
                if ($sline =~ /\breturn(?:\s*\(+\s*|\s+)(E[A-Z]+)(?:\s*\)+\s*|\s*)[;:,]/) {
                        my $name = $1;
@@ -4480,7 +4561,7 @@ sub process {
                            $dstat !~ /^for\s*$Constant$/ &&                            # for (...)
                            $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ &&   # for (...) bar()
                            $dstat !~ /^do\s*{/ &&                                      # do {...
-                           $dstat !~ /^\({/ &&                                         # ({...
+                           $dstat !~ /^\(\{/ &&                                                # ({...
                            $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/)
                        {
                                $ctx =~ s/\n*$//;
@@ -4789,16 +4870,20 @@ sub process {
                             "Consecutive strings are generally better as a single string\n" . $herecurr);
                }
 
-# check for %L{u,d,i} in strings
+# check for %L{u,d,i} and 0x%[udi] in strings
                my $string;
                while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) {
                        $string = substr($rawline, $-[1], $+[1] - $-[1]);
                        $string =~ s/%%/__/g;
-                       if ($string =~ /(?<!%)%L[udi]/) {
+                       if ($string =~ /(?<!%)%[\*\d\.\$]*L[udi]/) {
                                WARN("PRINTF_L",
                                     "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr);
                                last;
                        }
+                       if ($string =~ /0x%[\*\d\.\$\Llzth]*[udi]/) {
+                               ERROR("PRINTF_0xDECIMAL",
+                                     "Prefixing 0x with decimal output is defective\n" . $herecurr);
+                       }
                }
 
 # check for line continuations in quoted strings with odd counts of "
@@ -4816,10 +4901,34 @@ sub process {
 
 # check for needless "if (<foo>) fn(<foo>)" uses
                if ($prevline =~ /\bif\s*\(\s*($Lval)\s*\)/) {
-                       my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;';
-                       if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) {
-                               WARN('NEEDLESS_IF',
-                                    "$1(NULL) is safe and this check is probably not required\n" . $hereprev);
+                       my $tested = quotemeta($1);
+                       my $expr = '\s*\(\s*' . $tested . '\s*\)\s*;';
+                       if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?|(?:kmem_cache|mempool|dma_pool)_destroy)$expr/) {
+                               my $func = $1;
+                               if (WARN('NEEDLESS_IF',
+                                        "$func(NULL) is safe and this check is probably not required\n" . $hereprev) &&
+                                   $fix) {
+                                       my $do_fix = 1;
+                                       my $leading_tabs = "";
+                                       my $new_leading_tabs = "";
+                                       if ($lines[$linenr - 2] =~ /^\+(\t*)if\s*\(\s*$tested\s*\)\s*$/) {
+                                               $leading_tabs = $1;
+                                       } else {
+                                               $do_fix = 0;
+                                       }
+                                       if ($lines[$linenr - 1] =~ /^\+(\t+)$func\s*\(\s*$tested\s*\)\s*;\s*$/) {
+                                               $new_leading_tabs = $1;
+                                               if (length($leading_tabs) + 1 ne length($new_leading_tabs)) {
+                                                       $do_fix = 0;
+                                               }
+                                       } else {
+                                               $do_fix = 0;
+                                       }
+                                       if ($do_fix) {
+                                               fix_delete_line($fixlinenr - 1, $prevrawline);
+                                               $fixed[$fixlinenr] =~ s/^\+$new_leading_tabs/\+$leading_tabs/;
+                                       }
+                               }
                        }
                }
 
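
Editor's note: the new CONSTANT_COMPARISON test warns when a constant or upper-case identifier appears on the left-hand side of a comparison, and with --fix it mirrors the operator provided the whole expression is parenthesised. A hypothetical C fragment showing what gets flagged and what the fix produces:

/* Hypothetical example of code the new CONSTANT_COMPARISON check flags. */
#define MAX_ENTRIES 128

static int check_before(int nr_entries)
{
	if (MAX_ENTRIES < nr_entries)	/* warned: constant on the left */
		return -1;
	return 0;
}

static int check_after(int nr_entries)
{
	if (nr_entries > MAX_ENTRIES)	/* what checkpatch --fix rewrites it to */
		return -1;
	return 0;
}
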
index 3d22014130289d2904dee4fb3c831223437864d2..5bed7716f8ab61b6f13a97ec8548c9559ad42b96 100644 (file)
@@ -472,7 +472,7 @@ static int sel_mmap_policy_fault(struct vm_area_struct *vma,
        return 0;
 }
 
-static struct vm_operations_struct sel_mmap_policy_ops = {
+static const struct vm_operations_struct sel_mmap_policy_ops = {
        .fault = sel_mmap_policy_fault,
        .page_mkwrite = sel_mmap_policy_fault,
 };
index 4662a8877f6c7a06b81803d417e52120a009a15b..a25a73147f714458dd6c55fe7426649f9dd5baa2 100644 (file)
@@ -397,6 +397,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
        return young;
 }
 
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long start,
+                                       unsigned long end)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int young, idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+       /*
+	 * Even though we do not flush the TLB, this will still adversely
+	 * affect performance on pre-Haswell Intel EPT, where there is
+	 * no EPT Access Bit to clear, so we have to tear down EPT
+	 * tables instead. If we find this unacceptable, we can always
+        * add a parameter to kvm_age_hva so that it effectively doesn't
+        * do anything on clear_young.
+        *
+        * Also note that currently we never issue secondary TLB flushes
+        * from clear_young, leaving this job up to the regular system
+        * cadence. If we find this inaccurate, we might come up with a
+        * more sophisticated heuristic later.
+        */
+       young = kvm_age_hva(kvm, start, end);
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
@@ -429,6 +459,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
+       .clear_young            = kvm_mmu_notifier_clear_young,
        .test_young             = kvm_mmu_notifier_test_young,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
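
Editor's note: the new clear_young callback lets a secondary MMU clear its accessed state without the TLB flush implied by clear_flush_young, which is all idle page tracking needs. A minimal sketch of how another mmu_notifier user might wire it up (the handler body and names are illustrative, not from this patch):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Hypothetical secondary-MMU driver hooking the new clear_young notifier. */
static int example_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
			       unsigned long start, unsigned long end)
{
	/* Clear the accessed bits in the driver's own page tables for
	 * [start, end) and return non-zero if any were set.  Unlike
	 * clear_flush_young, no TLB flush is required here.
	 */
	return 0;
}

static const struct mmu_notifier_ops example_mn_ops = {
	.clear_young	= example_clear_young,
};

static struct mmu_notifier example_mn = {
	.ops = &example_mn_ops,
};

/* Registered against a target mm, e.g. from the driver's open path:
 *	mmu_notifier_register(&example_mn, current->mm);
 */
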