Merge remote-tracking branch 'lts/linux-4.4.y' into linux-linaro-lsk-v4.4
author    Alex Shi <alex.shi@linaro.org>
          Tue, 20 Sep 2016 02:17:00 +0000 (10:17 +0800)
committer Alex Shi <alex.shi@linaro.org>
          Tue, 20 Sep 2016 02:17:00 +0000 (10:17 +0800)
Conflicts:
	arch/arm64/include/asm/cpufeature.h
	arch/arm64/kernel/entry.S

Resolution: set ARM64_WORKAROUND_CAVIUM_27456 to 12 in
arch/arm64/include/asm/cpufeature.h, and add the asm/memory.h include
to arch/arm64/kernel/entry.S (sketched below).
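For quick reference, here is a minimal, abbreviated sketch of the resolved
lines in the two conflicted files, assembled from the combined diff below
(surrounding lines of each file omitted):

	/* arch/arm64/include/asm/cpufeature.h: capability numbering after the merge */
	#define ARM64_HAS_NO_HW_PREFETCH              8
	#define ARM64_HAS_UAO                         9
	#define ARM64_ALT_PAN_NOT_UAO                 10

	#define ARM64_NCAPS                           11
	#define ARM64_WORKAROUND_CAVIUM_27456         12

	/* arch/arm64/kernel/entry.S: includes around the conflict after the merge;
	 * asm/memory.h provides TASK_SIZE_64, which the entry code now uses to
	 * set USER_DS when saving/restoring the task's addr_limit. */
	#include <asm/irq.h>
	#include <asm/memory.h>
	#include <asm/thread_info.h>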

arch/arm64/Kconfig
arch/arm64/include/asm/cpufeature.h
arch/arm64/include/asm/elf.h
arch/arm64/kernel/cpu_errata.c
arch/arm64/kernel/entry.S
arch/arm64/kernel/smp.c
arch/arm64/mm/mmu.c
arch/arm64/mm/proc.S
include/linux/perf_event.h
kernel/events/uprobes.c
tools/perf/arch/x86/util/intel-pt.c

diff --combined arch/arm64/Kconfig
index 8dbe3cba855c185012aed6a86cc49acef526f363,14cdc6dea4939f9875b8332fea4fe625f5d7f278..0ddd4d9cc84c609edfbf58315cfaea7ce00f2059
@@@ -13,7 -13,6 +13,7 @@@ config ARM6
        select ARCH_WANT_OPTIONAL_GPIOLIB
        select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
        select ARCH_WANT_FRAME_POINTERS
 +      select ARCH_HAS_UBSAN_SANITIZE_ALL
        select ARM_AMBA
        select ARM_ARCH_TIMER
        select ARM_GIC
@@@ -49,8 -48,6 +49,8 @@@
        select HAVE_ALIGNED_STRUCT_PAGE if SLUB
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_BITREVERSE
 +      select HAVE_ARCH_HARDENED_USERCOPY
 +      select HAVE_ARCH_HUGE_VMAP
        select HAVE_ARCH_JUMP_LABEL
        select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
        select HAVE_ARCH_KGDB
@@@ -73,7 -70,6 +73,7 @@@
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_GENERIC_DMA_COHERENT
        select HAVE_HW_BREAKPOINT if PERF_EVENTS
 +      select HAVE_IRQ_TIME_ACCOUNTING
        select HAVE_MEMBLOCK
        select HAVE_PATA_PLATFORM
        select HAVE_PERF_EVENTS
@@@ -366,7 -362,6 +366,7 @@@ config ARM64_ERRATUM_84341
        bool "Cortex-A53: 843419: A load or store might access an incorrect address"
        depends on MODULES
        default y
 +      select ARM64_MODULE_CMODEL_LARGE
        help
          This option builds kernel modules using the large memory model in
          order to avoid the use of the ADRP instruction, which can cause
@@@ -396,6 -391,15 +396,15 @@@ config CAVIUM_ERRATUM_2237
  
          If unsure, say Y.
  
+ config CAVIUM_ERRATUM_23144
+       bool "Cavium erratum 23144: ITS SYNC hang on dual socket system"
+       depends on NUMA
+       default y
+       help
+         ITS SYNC command hang for cross node io and collections/cpu mapping.
+         If unsure, say Y.
  config CAVIUM_ERRATUM_23154
        bool "Cavium erratum 23154: Access to ICC_IAR1_EL1 is not sync'ed"
        default y
  
          If unsure, say Y.
  
+ config CAVIUM_ERRATUM_27456
+       bool "Cavium erratum 27456: Broadcast TLBI instructions may cause icache corruption"
+       default y
+       help
+         On ThunderX T88 pass 1.x through 2.1 parts, broadcast TLBI
+         instructions may cause the icache to become corrupted if it
+         contains data for a non-current ASID.  The fix is to
+         invalidate the icache when changing the mm context.
+         If unsure, say Y.
  endmenu
  
  
@@@ -511,9 -526,6 +531,9 @@@ config HOTPLUG_CP
  source kernel/Kconfig.preempt
  source kernel/Kconfig.hz
  
 +config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 +      def_bool y
 +
  config ARCH_HAS_HOLES_MEMORYMODEL
        def_bool y if SPARSEMEM
  
@@@ -537,6 -549,9 +557,6 @@@ config HW_PERF_EVENT
  config SYS_SUPPORTS_HUGETLBFS
        def_bool y
  
 -config ARCH_WANT_GENERAL_HUGETLB
 -      def_bool y
 -
  config ARCH_WANT_HUGE_PMD_SHARE
        def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
  
@@@ -709,93 -724,10 +729,93 @@@ config ARM64_LSE_ATOMIC
  
  endmenu
  
 +config ARM64_UAO
 +      bool "Enable support for User Access Override (UAO)"
 +      default y
 +      help
 +        User Access Override (UAO; part of the ARMv8.2 Extensions)
 +        causes the 'unprivileged' variant of the load/store instructions to
 +        be overriden to be privileged.
 +
 +        This option changes get_user() and friends to use the 'unprivileged'
 +        variant of the load/store instructions. This ensures that user-space
 +        really did have access to the supplied memory. When addr_limit is
 +        set to kernel memory the UAO bit will be set, allowing privileged
 +        access to kernel memory.
 +
 +        Choosing this option will cause copy_to_user() et al to use user-space
 +        memory permissions.
 +
 +        The feature is detected at runtime, the kernel will use the
 +        regular load/store instructions if the cpu does not implement the
 +        feature.
 +
 +config ARM64_MODULE_CMODEL_LARGE
 +      bool
 +
 +config ARM64_MODULE_PLTS
 +      bool
 +      select ARM64_MODULE_CMODEL_LARGE
 +      select HAVE_MOD_ARCH_SPECIFIC
 +
 +config RELOCATABLE
 +      bool
 +      help
 +        This builds the kernel as a Position Independent Executable (PIE),
 +        which retains all relocation metadata required to relocate the
 +        kernel binary at runtime to a different virtual address than the
 +        address it was linked at.
 +        Since AArch64 uses the RELA relocation format, this requires a
 +        relocation pass at runtime even if the kernel is loaded at the
 +        same address it was linked at.
 +
 +config RANDOMIZE_BASE
 +      bool "Randomize the address of the kernel image"
 +      select ARM64_MODULE_PLTS
 +      select RELOCATABLE
 +      help
 +        Randomizes the virtual address at which the kernel image is
 +        loaded, as a security feature that deters exploit attempts
 +        relying on knowledge of the location of kernel internals.
 +
 +        It is the bootloader's job to provide entropy, by passing a
 +        random u64 value in /chosen/kaslr-seed at kernel entry.
 +
 +        When booting via the UEFI stub, it will invoke the firmware's
 +        EFI_RNG_PROTOCOL implementation (if available) to supply entropy
 +        to the kernel proper. In addition, it will randomise the physical
 +        location of the kernel Image as well.
 +
 +        If unsure, say N.
 +
 +config RANDOMIZE_MODULE_REGION_FULL
 +      bool "Randomize the module region independently from the core kernel"
 +      depends on RANDOMIZE_BASE
 +      default y
 +      help
 +        Randomizes the location of the module region without considering the
 +        location of the core kernel. This way, it is impossible for modules
 +        to leak information about the location of core kernel data structures
 +        but it does imply that function calls between modules and the core
 +        kernel will need to be resolved via veneers in the module PLT.
 +
 +        When this option is not set, the module region will be randomized over
 +        a limited range that contains the [_stext, _etext] interval of the
 +        core kernel, so branch relocations are always in range.
 +
  endmenu
  
  menu "Boot options"
  
 +config ARM64_ACPI_PARKING_PROTOCOL
 +      bool "Enable support for the ARM64 ACPI parking protocol"
 +      depends on ACPI
 +      help
 +        Enable support for the ARM64 ACPI parking protocol. If disabled
 +        the kernel will not allow booting through the ARM64 ACPI parking
 +        protocol even if the corresponding data is present in the ACPI
 +        MADT table.
 +
  config CMDLINE
        string "Default kernel command string"
        default ""
index 37a53fc6b384eadb7d5b755066ec9fe67717167c,8136afc9df0d7c3ce4db25fe814c8ebd270fc30d..876fe06222047b7d177c2b30150a6fa6b6119877
  #define ARM64_HAS_LSE_ATOMICS                 5
  #define ARM64_WORKAROUND_CAVIUM_23154         6
  #define ARM64_WORKAROUND_834220                       7
 -#define ARM64_WORKAROUND_CAVIUM_27456         8
 +#define ARM64_HAS_NO_HW_PREFETCH              8
 +#define ARM64_HAS_UAO                         9
 +#define ARM64_ALT_PAN_NOT_UAO                 10
 +
 +#define ARM64_NCAPS                           11
++#define ARM64_WORKAROUND_CAVIUM_27456         12      
 -#define ARM64_NCAPS                           9
  
  #ifndef __ASSEMBLY__
  
@@@ -179,7 -177,7 +181,7 @@@ u64 read_system_reg(u32 id)
  
  static inline bool cpu_supports_mixed_endian_el0(void)
  {
 -      return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1));
 +      return id_aa64mmfr0_mixed_endian_el0(read_cpuid(SYS_ID_AA64MMFR0_EL1));
  }
  
  static inline bool system_supports_mixed_endian_el0(void)
index 83d48a599f69cdf2917c0e76a5903244efbe1275,44dd892a4bbea515692ea090823210491425d7d8..7875c886ad24226bea0617649b6e50a7944361bf
  #include <asm/ptrace.h>
  #include <asm/user.h>
  
 -typedef unsigned long elf_greg_t;
 -
 -#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t))
 -#define ELF_CORE_COPY_REGS(dest, regs)        \
 -      *(struct user_pt_regs *)&(dest) = (regs)->user_regs;
 -
 -typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 -typedef struct user_fpsimd_state elf_fpregset_t;
 -
  /*
   * AArch64 static relocation types.
   */
@@@ -77,8 -86,6 +77,8 @@@
  #define R_AARCH64_MOVW_PREL_G2_NC     292
  #define R_AARCH64_MOVW_PREL_G3                293
  
 +#define R_AARCH64_RELATIVE            1027
 +
  /*
   * These are used to set parameters in the core dumps.
   */
   */
  #define ELF_ET_DYN_BASE       (2 * TASK_SIZE_64 / 3)
  
 +#ifndef __ASSEMBLY__
 +
 +typedef unsigned long elf_greg_t;
 +
 +#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t))
 +#define ELF_CORE_COPY_REGS(dest, regs)        \
 +      *(struct user_pt_regs *)&(dest) = (regs)->user_regs;
 +
 +typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 +typedef struct user_fpsimd_state elf_fpregset_t;
 +
  /*
   * When the program starts, a1 contains a pointer to a function to be
   * registered with atexit, as per the SVR4 ABI.  A value of 0 means we have no
  
  #define SET_PERSONALITY(ex)           clear_thread_flag(TIF_32BIT);
  
+ /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
  #define ARCH_DLINFO                                                   \
  do {                                                                  \
        NEW_AUX_ENT(AT_SYSINFO_EHDR,                                    \
@@@ -190,6 -187,4 +191,6 @@@ extern int aarch32_setup_vectors_page(s
  
  #endif /* CONFIG_COMPAT */
  
 +#endif /* !__ASSEMBLY__ */
 +
  #endif
index e6bc988e8dbf0f69fc4b1a48f9a7b4a89ee713f3,a3e846a28b05f9e80bc65f8f34618c11d21cff2a..06afd04e02c0d05f1e0546230a5d446b8bb06b60
  #include <asm/cputype.h>
  #include <asm/cpufeature.h>
  
 -#define MIDR_CORTEX_A53 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 -#define MIDR_CORTEX_A57 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
 -#define MIDR_THUNDERX MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
 -
 -#define CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \
 -                      MIDR_ARCHITECTURE_MASK)
 -
  static bool __maybe_unused
  is_affected_midr_range(const struct arm64_cpu_capabilities *entry)
  {
 -      u32 midr = read_cpuid_id();
 -
 -      if ((midr & CPU_MODEL_MASK) != entry->midr_model)
 -              return false;
 -
 -      midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK;
 -
 -      return (midr >= entry->midr_range_min && midr <= entry->midr_range_max);
 +      return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model,
 +                                     entry->midr_range_min,
 +                                     entry->midr_range_max);
  }
  
  #define MIDR_RANGE(model, min, max) \
@@@ -87,6 -99,15 +87,15 @@@ const struct arm64_cpu_capabilities arm
                .capability = ARM64_WORKAROUND_CAVIUM_23154,
                MIDR_RANGE(MIDR_THUNDERX, 0x00, 0x01),
        },
+ #endif
+ #ifdef CONFIG_CAVIUM_ERRATUM_27456
+       {
+       /* Cavium ThunderX, T88 pass 1.x - 2.1 */
+               .desc = "Cavium erratum 27456",
+               .capability = ARM64_WORKAROUND_CAVIUM_27456,
+               MIDR_RANGE(MIDR_THUNDERX, 0x00,
+                          (1 << MIDR_VARIANT_SHIFT) | 1),
+       },
  #endif
        {
        }
index 1f7f5a2b61bf0de999d80e6ced16bec120f716b6,5a3753d09e20d607ab9419cec2d27e3aef50b8b0..588c8e1778d44a6cde02ef8ebb4b6db33652b809
@@@ -27,7 -27,7 +27,8 @@@
  #include <asm/cpufeature.h>
  #include <asm/errno.h>
  #include <asm/esr.h>
 +#include <asm/irq.h>
+ #include <asm/memory.h>
  #include <asm/thread_info.h>
  #include <asm/unistd.h>
  
  
        .if     \el == 0
        mrs     x21, sp_el0
 -      get_thread_info tsk                     // Ensure MDSCR_EL1.SS is clear,
 +      mov     tsk, sp
 +      and     tsk, tsk, #~(THREAD_SIZE - 1)   // Ensure MDSCR_EL1.SS is clear,
        ldr     x19, [tsk, #TI_FLAGS]           // since we can unmask debug
        disable_step_tsk x19, x20               // exceptions when scheduling.
 +
 +      mov     x29, xzr                        // fp pointed to user-space
        .else
        add     x21, sp, #S_FRAME_SIZE
-       .endif
+       get_thread_info tsk
+       /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
+       ldr     x20, [tsk, #TI_ADDR_LIMIT]
+       str     x20, [sp, #S_ORIG_ADDR_LIMIT]
+       mov     x20, #TASK_SIZE_64
+       str     x20, [tsk, #TI_ADDR_LIMIT]
+       .endif /* \el == 0 */
        mrs     x22, elr_el1
        mrs     x23, spsr_el1
        stp     lr, x21, [sp, #S_LR]
        str     x21, [sp, #S_SYSCALLNO]
        .endif
  
 +      /*
 +       * Set sp_el0 to current thread_info.
 +       */
 +      .if     \el == 0
 +      msr     sp_el0, tsk
 +      .endif
 +
        /*
         * Registers that may be useful after this macro is invoked:
         *
        .endm
  
        .macro  kernel_exit, el
+       .if     \el != 0
+       /* Restore the task's original addr_limit. */
+       ldr     x20, [sp, #S_ORIG_ADDR_LIMIT]
+       str     x20, [tsk, #TI_ADDR_LIMIT]
+       .endif
        ldp     x21, x22, [sp, #S_PC]           // load ELR, SPSR
        .if     \el == 0
        ct_user_enter
@@@ -175,44 -177,8 +188,44 @@@ alternative_endi
        .endm
  
        .macro  get_thread_info, rd
 -      mov     \rd, sp
 -      and     \rd, \rd, #~(THREAD_SIZE - 1)   // top of stack
 +      mrs     \rd, sp_el0
 +      .endm
 +
 +      .macro  irq_stack_entry
 +      mov     x19, sp                 // preserve the original sp
 +
 +      /*
 +       * Compare sp with the current thread_info, if the top
 +       * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and
 +       * should switch to the irq stack.
 +       */
 +      and     x25, x19, #~(THREAD_SIZE - 1)
 +      cmp     x25, tsk
 +      b.ne    9998f
 +
 +      this_cpu_ptr irq_stack, x25, x26
 +      mov     x26, #IRQ_STACK_START_SP
 +      add     x26, x25, x26
 +
 +      /* switch to the irq stack */
 +      mov     sp, x26
 +
 +      /*
 +       * Add a dummy stack frame, this non-standard format is fixed up
 +       * by unwind_frame()
 +       */
 +      stp     x29, x19, [sp, #-16]!
 +      mov     x29, sp
 +
 +9998:
 +      .endm
 +
 +      /*
 +       * x19 should be preserved between irq_stack_entry and
 +       * irq_stack_exit.
 +       */
 +      .macro  irq_stack_exit
 +      mov     sp, x19
        .endm
  
  /*
@@@ -230,11 -196,10 +243,11 @@@ tsk     .req    x28             // current thread_inf
   * Interrupt handling.
   */
        .macro  irq_handler
 -      adrp    x1, handle_arch_irq
 -      ldr     x1, [x1, #:lo12:handle_arch_irq]
 +      ldr_l   x1, handle_arch_irq
        mov     x0, sp
 +      irq_stack_entry
        blr     x1
 +      irq_stack_exit
        .endm
  
        .text
@@@ -406,10 -371,10 +419,10 @@@ el1_irq
        bl      trace_hardirqs_off
  #endif
  
 +      get_thread_info tsk
        irq_handler
  
  #ifdef CONFIG_PREEMPT
 -      get_thread_info tsk
        ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
        cbnz    w24, 1f                         // preempt count != 0
        ldr     x0, [tsk, #TI_FLAGS]            // get flags
@@@ -647,8 -612,6 +660,8 @@@ ENTRY(cpu_switch_to
        ldp     x29, x9, [x8], #16
        ldr     lr, [x8]
        mov     sp, x9
 +      and     x9, x9, #~(THREAD_SIZE - 1)
 +      msr     sp_el0, x9
        ret
  ENDPROC(cpu_switch_to)
  
@@@ -676,14 -639,14 +689,14 @@@ ret_fast_syscall_trace
  work_pending:
        tbnz    x1, #TIF_NEED_RESCHED, work_resched
        /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
 -      ldr     x2, [sp, #S_PSTATE]
        mov     x0, sp                          // 'regs'
 -      tst     x2, #PSR_MODE_MASK              // user mode regs?
 -      b.ne    no_work_pending                 // returning to kernel
        enable_irq                              // enable interrupts for do_notify_resume()
        bl      do_notify_resume
        b       ret_to_user
  work_resched:
 +#ifdef CONFIG_TRACE_IRQFLAGS
 +      bl      trace_hardirqs_off              // the IRQs are off here, inform the tracing code
 +#endif
        bl      schedule
  
  /*
@@@ -695,6 -658,7 +708,6 @@@ ret_to_user
        and     x2, x1, #_TIF_WORK_MASK
        cbnz    x2, work_pending
        enable_step_tsk x1, x2
 -no_work_pending:
        kernel_exit 0
  ENDPROC(ret_to_user)
  
diff --combined arch/arm64/kernel/smp.c
index 24cb4f800033bc2b9d5ad49144f915ca4506e6dc,f3c3d8fee5bab2dbeec427266c23ae90badc9e2d..a84623d91410d17daf444995c6c8ec1baceae939
@@@ -70,7 -70,6 +70,7 @@@ enum ipi_msg_type 
        IPI_CPU_STOP,
        IPI_TIMER,
        IPI_IRQ_WORK,
 +      IPI_WAKEUP
  };
  
  /*
@@@ -150,7 -149,9 +150,7 @@@ asmlinkage void secondary_start_kernel(
         * TTBR0 is only used for the identity mapping at this stage. Make it
         * point to zero page to avoid speculatively fetching new entries.
         */
 -      cpu_set_reserved_ttbr0();
 -      local_flush_tlb_all();
 -      cpu_set_default_tcr_t0sz();
 +      cpu_uninstall_idmap();
  
        preempt_disable();
        trace_hardirqs_off();
        set_cpu_online(cpu, true);
        complete(&cpu_running);
  
-       local_dbg_enable();
        local_irq_enable();
        local_async_enable();
  
@@@ -333,8 -333,8 +332,8 @@@ void __init smp_cpus_done(unsigned int 
  
  void __init smp_prepare_boot_cpu(void)
  {
-       cpuinfo_store_boot_cpu();
        set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+       cpuinfo_store_boot_cpu();
  }
  
  static u64 __init of_get_cpu_mpidr(struct device_node *dn)
@@@ -444,17 -444,6 +443,17 @@@ acpi_map_gic_cpu_interface(struct acpi_
        /* map the logical cpu id to cpu MPIDR */
        cpu_logical_map(cpu_count) = hwid;
  
 +      /*
 +       * Set-up the ACPI parking protocol cpu entries
 +       * while initializing the cpu_logical_map to
 +       * avoid parsing MADT entries multiple times for
 +       * nothing (ie a valid cpu_logical_map entry should
 +       * contain a valid parking protocol data set to
 +       * initialize the cpu if the parking protocol is
 +       * the only available enable method).
 +       */
 +      acpi_set_mailbox_entry(cpu_count, processor);
 +
        cpu_count++;
  }
  
@@@ -637,7 -626,6 +636,7 @@@ static const char *ipi_types[NR_IPI] __
        S(IPI_CPU_STOP, "CPU stop interrupts"),
        S(IPI_TIMER, "Timer broadcast interrupts"),
        S(IPI_IRQ_WORK, "IRQ work interrupts"),
 +      S(IPI_WAKEUP, "CPU wake-up interrupts"),
  };
  
  static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
@@@ -681,13 -669,6 +680,13 @@@ void arch_send_call_function_single_ipi
        smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC);
  }
  
 +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
 +void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
 +{
 +      smp_cross_call(mask, IPI_WAKEUP);
 +}
 +#endif
 +
  #ifdef CONFIG_IRQ_WORK
  void arch_irq_work_raise(void)
  {
@@@ -765,14 -746,6 +764,14 @@@ void handle_IPI(int ipinr, struct pt_re
                break;
  #endif
  
 +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
 +      case IPI_WAKEUP:
 +              WARN_ONCE(!acpi_parking_protocol_valid(cpu),
 +                        "CPU%u: Wake-up IPI outside the ACPI parking protocol\n",
 +                        cpu);
 +              break;
 +#endif
 +
        default:
                pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
                break;
diff --combined arch/arm64/mm/mmu.c
index cd4177a1781d8e86c99269091995a7f3090f37bc,653735a8c58a86248e37593648de4a62f301893c..8fc302d84e1f524aa0496fd213524a9d30602ebc
  #include <linux/slab.h>
  #include <linux/stop_machine.h>
  
 +#include <asm/barrier.h>
  #include <asm/cputype.h>
  #include <asm/fixmap.h>
 +#include <asm/kasan.h>
  #include <asm/kernel-pgtable.h>
  #include <asm/sections.h>
  #include <asm/setup.h>
  
  u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
  
 +u64 kimage_voffset __read_mostly;
 +EXPORT_SYMBOL(kimage_voffset);
 +
  /*
   * Empty_zero_page is a special page that is used for zero-initialized data
   * and COW.
   */
 -struct page *empty_zero_page;
 +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
  EXPORT_SYMBOL(empty_zero_page);
  
 +static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
 +static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
 +static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
 +
  pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                              unsigned long size, pgprot_t vma_prot)
  {
  }
  EXPORT_SYMBOL(phys_mem_access_prot);
  
 -static void __init *early_alloc(unsigned long sz)
 +static phys_addr_t __init early_pgtable_alloc(void)
  {
        phys_addr_t phys;
        void *ptr;
  
 -      phys = memblock_alloc(sz, sz);
 +      phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
        BUG_ON(!phys);
 -      ptr = __va(phys);
 -      memset(ptr, 0, sz);
 -      return ptr;
 +
 +      /*
 +       * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
 +       * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
 +       * any level of table.
 +       */
 +      ptr = pte_set_fixmap(phys);
 +
 +      memset(ptr, 0, PAGE_SIZE);
 +
 +      /*
 +       * Implicit barriers also ensure the zeroed page is visible to the page
 +       * table walker
 +       */
 +      pte_clear_fixmap();
 +
 +      return phys;
  }
  
  /*
@@@ -118,30 -95,24 +118,30 @@@ static void split_pmd(pmd_t *pmd, pte_
  static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
                                  unsigned long end, unsigned long pfn,
                                  pgprot_t prot,
 -                                void *(*alloc)(unsigned long size))
 +                                phys_addr_t (*pgtable_alloc)(void))
  {
        pte_t *pte;
  
        if (pmd_none(*pmd) || pmd_sect(*pmd)) {
 -              pte = alloc(PTRS_PER_PTE * sizeof(pte_t));
 +              phys_addr_t pte_phys;
 +              BUG_ON(!pgtable_alloc);
 +              pte_phys = pgtable_alloc();
 +              pte = pte_set_fixmap(pte_phys);
                if (pmd_sect(*pmd))
                        split_pmd(pmd, pte);
 -              __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE);
 +              __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE);
                flush_tlb_all();
 +              pte_clear_fixmap();
        }
        BUG_ON(pmd_bad(*pmd));
  
 -      pte = pte_offset_kernel(pmd, addr);
 +      pte = pte_set_fixmap_offset(pmd, addr);
        do {
                set_pte(pte, pfn_pte(pfn, prot));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
 +
 +      pte_clear_fixmap();
  }
  
  static void split_pud(pud_t *old_pud, pmd_t *pmd)
        } while (pmd++, i++, i < PTRS_PER_PMD);
  }
  
 -static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
 -                                unsigned long addr, unsigned long end,
 +#ifdef CONFIG_DEBUG_PAGEALLOC
 +static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
 +{
 +
 +      /*
 +       * If debug_page_alloc is enabled we must map the linear map
 +       * using pages. However, other mappings created by
 +       * create_mapping_noalloc must use sections in some cases. Allow
 +       * sections to be used in those cases, where no pgtable_alloc
 +       * function is provided.
 +       */
 +      return !pgtable_alloc || !debug_pagealloc_enabled();
 +}
 +#else
 +static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
 +{
 +      return true;
 +}
 +#endif
 +
 +static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
                                  phys_addr_t phys, pgprot_t prot,
 -                                void *(*alloc)(unsigned long size))
 +                                phys_addr_t (*pgtable_alloc)(void))
  {
        pmd_t *pmd;
        unsigned long next;
         * Check for initial section mappings in the pgd/pud and remove them.
         */
        if (pud_none(*pud) || pud_sect(*pud)) {
 -              pmd = alloc(PTRS_PER_PMD * sizeof(pmd_t));
 +              phys_addr_t pmd_phys;
 +              BUG_ON(!pgtable_alloc);
 +              pmd_phys = pgtable_alloc();
 +              pmd = pmd_set_fixmap(pmd_phys);
                if (pud_sect(*pud)) {
                        /*
                         * need to have the 1G of mappings continue to be
                         */
                        split_pud(pud, pmd);
                }
 -              pud_populate(mm, pud, pmd);
 +              __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE);
                flush_tlb_all();
 +              pmd_clear_fixmap();
        }
        BUG_ON(pud_bad(*pud));
  
 -      pmd = pmd_offset(pud, addr);
 +      pmd = pmd_set_fixmap_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                /* try section mapping first */
 -              if (((addr | next | phys) & ~SECTION_MASK) == 0) {
 +              if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
 +                    block_mappings_allowed(pgtable_alloc)) {
                        pmd_t old_pmd =*pmd;
 -                      set_pmd(pmd, __pmd(phys |
 -                                         pgprot_val(mk_sect_prot(prot))));
 +                      pmd_set_huge(pmd, phys, prot);
                        /*
                         * Check for previous table entries created during
                         * boot (__create_page_tables) and flush them.
                        if (!pmd_none(old_pmd)) {
                                flush_tlb_all();
                                if (pmd_table(old_pmd)) {
 -                                      phys_addr_t table = __pa(pte_offset_map(&old_pmd, 0));
 +                                      phys_addr_t table = pmd_page_paddr(old_pmd);
                                        if (!WARN_ON_ONCE(slab_is_available()))
                                                memblock_free(table, PAGE_SIZE);
                                }
                        }
                } else {
                        alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys),
 -                                     prot, alloc);
 +                                     prot, pgtable_alloc);
                }
                phys += next - addr;
        } while (pmd++, addr = next, addr != end);
 +
 +      pmd_clear_fixmap();
  }
  
  static inline bool use_1G_block(unsigned long addr, unsigned long next,
        return true;
  }
  
 -static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd,
 -                                unsigned long addr, unsigned long end,
 +static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
                                  phys_addr_t phys, pgprot_t prot,
 -                                void *(*alloc)(unsigned long size))
 +                                phys_addr_t (*pgtable_alloc)(void))
  {
        pud_t *pud;
        unsigned long next;
  
        if (pgd_none(*pgd)) {
 -              pud = alloc(PTRS_PER_PUD * sizeof(pud_t));
 -              pgd_populate(mm, pgd, pud);
 +              phys_addr_t pud_phys;
 +              BUG_ON(!pgtable_alloc);
 +              pud_phys = pgtable_alloc();
 +              __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE);
        }
        BUG_ON(pgd_bad(*pgd));
  
 -      pud = pud_offset(pgd, addr);
 +      pud = pud_set_fixmap_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
  
                /*
                 * For 4K granule only, attempt to put down a 1GB block
                 */
 -              if (use_1G_block(addr, next, phys)) {
 +              if (use_1G_block(addr, next, phys) &&
 +                  block_mappings_allowed(pgtable_alloc)) {
                        pud_t old_pud = *pud;
 -                      set_pud(pud, __pud(phys |
 -                                         pgprot_val(mk_sect_prot(prot))));
 +                      pud_set_huge(pud, phys, prot);
  
                        /*
                         * If we have an old value for a pud, it will
                        if (!pud_none(old_pud)) {
                                flush_tlb_all();
                                if (pud_table(old_pud)) {
 -                                      phys_addr_t table = __pa(pmd_offset(&old_pud, 0));
 +                                      phys_addr_t table = pud_page_paddr(old_pud);
                                        if (!WARN_ON_ONCE(slab_is_available()))
                                                memblock_free(table, PAGE_SIZE);
                                }
                        }
                } else {
 -                      alloc_init_pmd(mm, pud, addr, next, phys, prot, alloc);
 +                      alloc_init_pmd(pud, addr, next, phys, prot,
 +                                     pgtable_alloc);
                }
                phys += next - addr;
        } while (pud++, addr = next, addr != end);
 +
 +      pud_clear_fixmap();
  }
  
  /*
   * Create the page directory entries and any necessary page tables for the
   * mapping specified by 'md'.
   */
 -static void  __create_mapping(struct mm_struct *mm, pgd_t *pgd,
 -                                  phys_addr_t phys, unsigned long virt,
 +static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt,
                                    phys_addr_t size, pgprot_t prot,
 -                                  void *(*alloc)(unsigned long size))
 +                                  phys_addr_t (*pgtable_alloc)(void))
  {
        unsigned long addr, length, end, next;
  
 +      /*
 +       * If the virtual and physical address don't have the same offset
 +       * within a page, we cannot map the region as the caller expects.
 +       */
 +      if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
 +              return;
 +
 +      phys &= PAGE_MASK;
        addr = virt & PAGE_MASK;
        length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));
  
        end = addr + length;
        do {
                next = pgd_addr_end(addr, end);
 -              alloc_init_pud(mm, pgd, addr, next, phys, prot, alloc);
 +              alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc);
                phys += next - addr;
        } while (pgd++, addr = next, addr != end);
  }
  
 -static void *late_alloc(unsigned long size)
 +static phys_addr_t late_pgtable_alloc(void)
  {
 -      void *ptr;
 -
 -      BUG_ON(size > PAGE_SIZE);
 -      ptr = (void *)__get_free_page(PGALLOC_GFP);
 +      void *ptr = (void *)__get_free_page(PGALLOC_GFP);
        BUG_ON(!ptr);
 -      return ptr;
 +
 +      /* Ensure the zeroed page is visible to the page table walker */
 +      dsb(ishst);
 +      return __pa(ptr);
  }
  
 -static void __init create_mapping(phys_addr_t phys, unsigned long virt,
 +static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 +                               unsigned long virt, phys_addr_t size,
 +                               pgprot_t prot,
 +                               phys_addr_t (*alloc)(void))
 +{
 +      init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc);
 +}
 +
 +/*
 + * This function can only be used to modify existing table entries,
 + * without allocating new levels of table. Note that this permits the
 + * creation of new section or page entries.
 + */
 +static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
                                  phys_addr_t size, pgprot_t prot)
  {
        if (virt < VMALLOC_START) {
                        &phys, virt);
                return;
        }
 -      __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), phys, virt,
 -                       size, prot, early_alloc);
 +      __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
 +                           NULL);
  }
  
  void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
                               unsigned long virt, phys_addr_t size,
                               pgprot_t prot)
  {
 -      __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot,
 -                              late_alloc);
 +      __create_pgd_mapping(mm->pgd, phys, virt, size, prot,
 +                           late_pgtable_alloc);
  }
  
  static void create_mapping_late(phys_addr_t phys, unsigned long virt,
                return;
        }
  
 -      return __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK),
 -                              phys, virt, size, prot, late_alloc);
 +      __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
 +                           late_pgtable_alloc);
  }
  
 -#ifdef CONFIG_DEBUG_RODATA
 -static void __init __map_memblock(phys_addr_t start, phys_addr_t end)
 +static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end)
  {
 +      unsigned long kernel_start = __pa(_text);
 +      unsigned long kernel_end = __pa(_etext);
 +
        /*
 -       * Set up the executable regions using the existing section mappings
 -       * for now. This will get more fine grained later once all memory
 -       * is mapped
 +       * Take care not to create a writable alias for the
 +       * read-only text and rodata sections of the kernel image.
         */
 -      unsigned long kernel_x_start = round_down(__pa(_stext), SWAPPER_BLOCK_SIZE);
 -      unsigned long kernel_x_end = round_up(__pa(__init_end), SWAPPER_BLOCK_SIZE);
 -
 -      if (end < kernel_x_start) {
 -              create_mapping(start, __phys_to_virt(start),
 -                      end - start, PAGE_KERNEL);
 -      } else if (start >= kernel_x_end) {
 -              create_mapping(start, __phys_to_virt(start),
 -                      end - start, PAGE_KERNEL);
 -      } else {
 -              if (start < kernel_x_start)
 -                      create_mapping(start, __phys_to_virt(start),
 -                              kernel_x_start - start,
 -                              PAGE_KERNEL);
 -              create_mapping(kernel_x_start,
 -                              __phys_to_virt(kernel_x_start),
 -                              kernel_x_end - kernel_x_start,
 -                              PAGE_KERNEL_EXEC);
 -              if (kernel_x_end < end)
 -                      create_mapping(kernel_x_end,
 -                              __phys_to_virt(kernel_x_end),
 -                              end - kernel_x_end,
 -                              PAGE_KERNEL);
 +
 +      /* No overlap with the kernel text */
 +      if (end < kernel_start || start >= kernel_end) {
 +              __create_pgd_mapping(pgd, start, __phys_to_virt(start),
 +                                   end - start, PAGE_KERNEL,
 +                                   early_pgtable_alloc);
 +              return;
        }
  
 +      /*
 +       * This block overlaps the kernel text mapping.
 +       * Map the portion(s) which don't overlap.
 +       */
 +      if (start < kernel_start)
 +              __create_pgd_mapping(pgd, start,
 +                                   __phys_to_virt(start),
 +                                   kernel_start - start, PAGE_KERNEL,
 +                                   early_pgtable_alloc);
 +      if (kernel_end < end)
 +              __create_pgd_mapping(pgd, kernel_end,
 +                                   __phys_to_virt(kernel_end),
 +                                   end - kernel_end, PAGE_KERNEL,
 +                                   early_pgtable_alloc);
 +
 +      /*
 +       * Map the linear alias of the [_text, _etext) interval as
 +       * read-only/non-executable. This makes the contents of the
 +       * region accessible to subsystems such as hibernate, but
 +       * protects it from inadvertent modification or execution.
 +       */
 +      __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start),
 +                           kernel_end - kernel_start, PAGE_KERNEL_RO,
 +                           early_pgtable_alloc);
  }
 -#else
 -static void __init __map_memblock(phys_addr_t start, phys_addr_t end)
 -{
 -      create_mapping(start, __phys_to_virt(start), end - start,
 -                      PAGE_KERNEL_EXEC);
 -}
 -#endif
  
 -static void __init map_mem(void)
 +static void __init map_mem(pgd_t *pgd)
  {
        struct memblock_region *reg;
 -      phys_addr_t limit;
 -
 -      /*
 -       * Temporarily limit the memblock range. We need to do this as
 -       * create_mapping requires puds, pmds and ptes to be allocated from
 -       * memory addressable from the initial direct kernel mapping.
 -       *
 -       * The initial direct kernel mapping, located at swapper_pg_dir, gives
 -       * us PUD_SIZE (with SECTION maps) or PMD_SIZE (without SECTION maps,
 -       * memory starting from PHYS_OFFSET (which must be aligned to 2MB as
 -       * per Documentation/arm64/booting.txt).
 -       */
 -      limit = PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE;
 -      memblock_set_current_limit(limit);
  
        /* map all the memory banks */
        for_each_memblock(memory, reg) {
                if (start >= end)
                        break;
  
 -              if (ARM64_SWAPPER_USES_SECTION_MAPS) {
 -                      /*
 -                       * For the first memory bank align the start address and
 -                       * current memblock limit to prevent create_mapping() from
 -                       * allocating pte page tables from unmapped memory. With
 -                       * the section maps, if the first block doesn't end on section
 -                       * size boundary, create_mapping() will try to allocate a pte
 -                       * page, which may be returned from an unmapped area.
 -                       * When section maps are not used, the pte page table for the
 -                       * current limit is already present in swapper_pg_dir.
 -                       */
 -                      if (start < limit)
 -                              start = ALIGN(start, SECTION_SIZE);
 -                      if (end < limit) {
 -                              limit = end & SECTION_MASK;
 -                              memblock_set_current_limit(limit);
 -                      }
 -              }
 -              __map_memblock(start, end);
 +              __map_memblock(pgd, start, end);
        }
 -
 -      /* Limit no longer required. */
 -      memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
  }
  
 -static void __init fixup_executable(void)
 +void mark_rodata_ro(void)
  {
 -#ifdef CONFIG_DEBUG_RODATA
 -      /* now that we are actually fully mapped, make the start/end more fine grained */
 -      if (!IS_ALIGNED((unsigned long)_stext, SWAPPER_BLOCK_SIZE)) {
 -              unsigned long aligned_start = round_down(__pa(_stext),
 -                                                       SWAPPER_BLOCK_SIZE);
 +      unsigned long section_size;
  
 -              create_mapping(aligned_start, __phys_to_virt(aligned_start),
 -                              __pa(_stext) - aligned_start,
 -                              PAGE_KERNEL);
 -      }
 +      section_size = (unsigned long)__start_rodata - (unsigned long)_text;
 +      create_mapping_late(__pa(_text), (unsigned long)_text,
 +                          section_size, PAGE_KERNEL_ROX);
 +      /*
 +       * mark .rodata as read only. Use _etext rather than __end_rodata to
 +       * cover NOTES and EXCEPTION_TABLE.
 +       */
 +      section_size = (unsigned long)_etext - (unsigned long)__start_rodata;
 +      create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata,
 +                          section_size, PAGE_KERNEL_RO);
 +}
  
 -      if (!IS_ALIGNED((unsigned long)__init_end, SWAPPER_BLOCK_SIZE)) {
 -              unsigned long aligned_end = round_up(__pa(__init_end),
 -                                                        SWAPPER_BLOCK_SIZE);
 -              create_mapping(__pa(__init_end), (unsigned long)__init_end,
 -                              aligned_end - __pa(__init_end),
 -                              PAGE_KERNEL);
 -      }
 -#endif
 +void fixup_init(void)
 +{
 +      /*
 +       * Unmap the __init region but leave the VM area in place. This
 +       * prevents the region from being reused for kernel modules, which
 +       * is not supported by kallsyms.
 +       */
 +      unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
  }
  
 -#ifdef CONFIG_DEBUG_RODATA
 -void mark_rodata_ro(void)
 +static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
 +                                    pgprot_t prot, struct vm_struct *vma)
  {
 -      create_mapping_late(__pa(_stext), (unsigned long)_stext,
 -                              (unsigned long)_etext - (unsigned long)_stext,
 -                              PAGE_KERNEL_ROX);
 +      phys_addr_t pa_start = __pa(va_start);
 +      unsigned long size = va_end - va_start;
 +
 +      BUG_ON(!PAGE_ALIGNED(pa_start));
 +      BUG_ON(!PAGE_ALIGNED(size));
 +
 +      __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot,
 +                           early_pgtable_alloc);
  
 +      vma->addr       = va_start;
 +      vma->phys_addr  = pa_start;
 +      vma->size       = size;
 +      vma->flags      = VM_MAP;
 +      vma->caller     = __builtin_return_address(0);
 +
 +      vm_area_add_early(vma);
  }
 -#endif
  
 -void fixup_init(void)
 +/*
 + * Create fine-grained mappings for the kernel.
 + */
 +static void __init map_kernel(pgd_t *pgd)
  {
 -      create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin,
 -                      (unsigned long)__init_end - (unsigned long)__init_begin,
 -                      PAGE_KERNEL);
 +      static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data;
 +
 +      map_kernel_segment(pgd, _text, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text);
 +      map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata);
 +      map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC,
 +                         &vmlinux_init);
 +      map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data);
 +
 +      if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) {
 +              /*
 +               * The fixmap falls in a separate pgd to the kernel, and doesn't
 +               * live in the carveout for the swapper_pg_dir. We can simply
 +               * re-use the existing dir for the fixmap.
 +               */
 +              set_pgd(pgd_offset_raw(pgd, FIXADDR_START),
 +                      *pgd_offset_k(FIXADDR_START));
 +      } else if (CONFIG_PGTABLE_LEVELS > 3) {
 +              /*
 +               * The fixmap shares its top level pgd entry with the kernel
 +               * mapping. This can really only occur when we are running
 +               * with 16k/4 levels, so we can simply reuse the pud level
 +               * entry instead.
 +               */
 +              BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
 +              set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
 +                      __pud(__pa(bm_pmd) | PUD_TYPE_TABLE));
 +              pud_clear_fixmap();
 +      } else {
 +              BUG();
 +      }
 +
 +      kasan_copy_shadow(pgd);
  }
  
  /*
   */
  void __init paging_init(void)
  {
 -      void *zero_page;
 -
 -      map_mem();
 -      fixup_executable();
 +      phys_addr_t pgd_phys = early_pgtable_alloc();
 +      pgd_t *pgd = pgd_set_fixmap(pgd_phys);
  
 -      /* allocate the zero page. */
 -      zero_page = early_alloc(PAGE_SIZE);
 +      map_kernel(pgd);
 +      map_mem(pgd);
  
 -      bootmem_init();
 -
 -      empty_zero_page = virt_to_page(zero_page);
 +      /*
 +       * We want to reuse the original swapper_pg_dir so we don't have to
 +       * communicate the new address to non-coherent secondaries in
 +       * secondary_entry, and so cpu_switch_mm can generate the address with
 +       * adrp+add rather than a load from some global variable.
 +       *
 +       * To do this we need to go via a temporary pgd.
 +       */
 +      cpu_replace_ttbr1(__va(pgd_phys));
 +      memcpy(swapper_pg_dir, pgd, PAGE_SIZE);
 +      cpu_replace_ttbr1(swapper_pg_dir);
  
 -      /* Ensure the zero page is visible to the page table walker */
 -      dsb(ishst);
 +      pgd_clear_fixmap();
 +      memblock_free(pgd_phys, PAGE_SIZE);
  
        /*
 -       * TTBR0 is only used for the identity mapping at this stage. Make it
 -       * point to zero page to avoid speculatively fetching new entries.
 +       * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd
 +       * allocated with it.
         */
 -      cpu_set_reserved_ttbr0();
 -      local_flush_tlb_all();
 -      cpu_set_default_tcr_t0sz();
 +      memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE,
 +                    SWAPPER_DIR_SIZE - PAGE_SIZE);
 +
 +      bootmem_init();
  }
  
  /*
@@@ -650,13 -552,21 +650,13 @@@ void vmemmap_free(unsigned long start, 
  }
  #endif        /* CONFIG_SPARSEMEM_VMEMMAP */
  
 -static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
 -#if CONFIG_PGTABLE_LEVELS > 2
 -static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
 -#endif
 -#if CONFIG_PGTABLE_LEVELS > 3
 -static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
 -#endif
 -
  static inline pud_t * fixmap_pud(unsigned long addr)
  {
        pgd_t *pgd = pgd_offset_k(addr);
  
        BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd));
  
 -      return pud_offset(pgd, addr);
 +      return pud_offset_kimg(pgd, addr);
  }
  
  static inline pmd_t * fixmap_pmd(unsigned long addr)
  
        BUG_ON(pud_none(*pud) || pud_bad(*pud));
  
 -      return pmd_offset(pud, addr);
 +      return pmd_offset_kimg(pud, addr);
  }
  
  static inline pte_t * fixmap_pte(unsigned long addr)
  {
 -      pmd_t *pmd = fixmap_pmd(addr);
 -
 -      BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd));
 -
 -      return pte_offset_kernel(pmd, addr);
 +      return &bm_pte[pte_index(addr)];
  }
  
  void __init early_fixmap_init(void)
        unsigned long addr = FIXADDR_START;
  
        pgd = pgd_offset_k(addr);
 -      pgd_populate(&init_mm, pgd, bm_pud);
 -      pud = pud_offset(pgd, addr);
 +      if (CONFIG_PGTABLE_LEVELS > 3 &&
 +          !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) {
 +              /*
 +               * We only end up here if the kernel mapping and the fixmap
 +               * share the top level pgd entry, which should only happen on
 +               * 16k/4 levels configurations.
 +               */
 +              BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
 +              pud = pud_offset_kimg(pgd, addr);
 +      } else {
 +              pgd_populate(&init_mm, pgd, bm_pud);
 +              pud = fixmap_pud(addr);
 +      }
        pud_populate(&init_mm, pud, bm_pmd);
 -      pmd = pmd_offset(pud, addr);
 +      pmd = fixmap_pmd(addr);
        pmd_populate_kernel(&init_mm, pmd, bm_pte);
  
        /*
         * The boot-ioremap range spans multiple pmds, for which
 -       * we are not preparted:
 +       * we are not prepared:
         */
        BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
                     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
@@@ -739,18 -642,19 +739,18 @@@ void __set_fixmap(enum fixed_addresses 
        }
  }
  
 -void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 +void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
  {
        const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
 -      pgprot_t prot = PAGE_KERNEL_RO;
 -      int size, offset;
 +      int offset;
        void *dt_virt;
  
        /*
         * Check whether the physical FDT address is set and meets the minimum
         * alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
-        * at least 8 bytes so that we can always access the size field of the
-        * FDT header after mapping the first chunk, double check here if that
-        * is indeed the case.
+        * at least 8 bytes so that we can always access the magic and size
+        * fields of the FDT header after mapping the first chunk, double check
+        * here if that is indeed the case.
         */
        BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
        if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
        /*
         * Make sure that the FDT region can be mapped without the need to
         * allocate additional translation table pages, so that it is safe
 -       * to call create_mapping() this early.
 +       * to call create_mapping_noalloc() this early.
         *
         * On 64k pages, the FDT will be mapped using PTEs, so we need to
         * be in the same PMD as the rest of the fixmap.
        dt_virt = (void *)dt_virt_base + offset;
  
        /* map the first chunk so we can read the size from the header */
 -      create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
 -                     SWAPPER_BLOCK_SIZE, prot);
 +      create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
 +                      dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
  
-       if (fdt_check_header(dt_virt) != 0)
+       if (fdt_magic(dt_virt) != FDT_MAGIC)
                return NULL;
  
 -      size = fdt_totalsize(dt_virt);
 -      if (size > MAX_FDT_SIZE)
 +      *size = fdt_totalsize(dt_virt);
 +      if (*size > MAX_FDT_SIZE)
                return NULL;
  
 -      if (offset + size > SWAPPER_BLOCK_SIZE)
 -              create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
 -                             round_up(offset + size, SWAPPER_BLOCK_SIZE), prot);
 +      if (offset + *size > SWAPPER_BLOCK_SIZE)
 +              create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
 +                             round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
  
 -      memblock_reserve(dt_phys, size);
 +      return dt_virt;
 +}
 +
 +void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 +{
 +      void *dt_virt;
 +      int size;
 +
 +      dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);
 +      if (!dt_virt)
 +              return NULL;
  
 +      memblock_reserve(dt_phys, size);
        return dt_virt;
  }
 +
 +int __init arch_ioremap_pud_supported(void)
 +{
 +      /* only 4k granule supports level 1 block mappings */
 +      return IS_ENABLED(CONFIG_ARM64_4K_PAGES);
 +}
 +
 +int __init arch_ioremap_pmd_supported(void)
 +{
 +      return 1;
 +}
 +
 +int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot)
 +{
 +      BUG_ON(phys & ~PUD_MASK);
 +      set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
 +      return 1;
 +}
 +
 +int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot)
 +{
 +      BUG_ON(phys & ~PMD_MASK);
 +      set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
 +      return 1;
 +}
 +
 +int pud_clear_huge(pud_t *pud)
 +{
 +      if (!pud_sect(*pud))
 +              return 0;
 +      pud_clear(pud);
 +      return 1;
 +}
 +
 +int pmd_clear_huge(pmd_t *pmd)
 +{
 +      if (!pmd_sect(*pmd))
 +              return 0;
 +      pmd_clear(pmd);
 +      return 1;
 +}
diff --combined arch/arm64/mm/proc.S
index 0c19534a901e616ecc5fe508ce205dc0de8fe0f4,18201e9e8cc71c22d017a9573cf722bdab63de74..a92738e8b1eb670772e9314643eadc29d7f954ba
@@@ -25,6 -25,8 +25,8 @@@
  #include <asm/hwcap.h>
  #include <asm/pgtable-hwdef.h>
  #include <asm/pgtable.h>
+ #include <asm/cpufeature.h>
+ #include <asm/alternative.h>
  
  #include "proc-macros.S"
  
@@@ -137,36 -139,20 +139,46 @@@ ENTRY(cpu_do_switch_mm
        bfi     x0, x1, #48, #16                // set the ASID
        msr     ttbr0_el1, x0                   // set TTBR0
        isb
+ alternative_if_not ARM64_WORKAROUND_CAVIUM_27456
        ret
+       nop
+       nop
+       nop
+ alternative_else
+       ic      iallu
+       dsb     nsh
+       isb
+       ret
+ alternative_endif
  ENDPROC(cpu_do_switch_mm)
  
 -      .section ".text.init", #alloc, #execinstr
 +      .pushsection ".idmap.text", "ax"
 +/*
 + * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd)
 + *
 + * This is the low-level counterpart to cpu_replace_ttbr1, and should not be
 + * called by anything else. It can only be executed from a TTBR0 mapping.
 + */
 +ENTRY(idmap_cpu_replace_ttbr1)
 +      mrs     x2, daif
 +      msr     daifset, #0xf
 +
 +      adrp    x1, empty_zero_page
 +      msr     ttbr1_el1, x1
 +      isb
 +
 +      tlbi    vmalle1
 +      dsb     nsh
 +      isb
 +
 +      msr     ttbr1_el1, x0
 +      isb
 +
 +      msr     daif, x2
 +
 +      ret
 +ENDPROC(idmap_cpu_replace_ttbr1)
 +      .popsection
  
  /*
   *    __cpu_setup
@@@ -182,6 -168,8 +194,8 @@@ ENTRY(__cpu_setup
        msr     cpacr_el1, x0                   // Enable FP/ASIMD
        mov     x0, #1 << 12                    // Reset mdscr_el1 and disable
        msr     mdscr_el1, x0                   // access to the DCC from EL0
+       isb                                     // Unmask debug exceptions now,
+       enable_dbg                              // since this is per-cpu
        reset_pmuserenr_el0 x0                  // Disable PMU access from EL0
        /*
         * Memory region attributes for LPAE:
index ece8b9629a47bfe15099829ae91ebe4e28240f18,6cdd50f7f52d7dc1009bef3b39c30a4b86135308..a288010667dcf561f1019a6d28ade8ead8673787
@@@ -121,6 -121,7 +121,7 @@@ struct hw_perf_event 
                struct { /* intel_cqm */
                        int                     cqm_state;
                        u32                     cqm_rmid;
+                       int                     is_group_event;
                        struct list_head        cqm_events_entry;
                        struct list_head        cqm_groups_entry;
                        struct list_head        cqm_group_entry;
@@@ -378,7 -379,7 +379,7 @@@ struct pmu 
        /*
         * Set up pmu-private data structures for an AUX area
         */
 -      void *(*setup_aux)              (int cpu, void **pages,
 +      void *(*setup_aux)              (struct perf_event *event, void **pages,
                                         int nr_pages, bool overwrite);
                                        /* optional */
  
         * Filter events for PMU-specific reasons.
         */
        int (*filter_match)             (struct perf_event *event); /* optional */
 +
 +      /*
 +       * Initial, PMU driver specific configuration.
 +       */
 +      int (*get_drv_configs)          (struct perf_event *event,
 +                                       void __user *arg); /* optional */
 +      void (*free_drv_configs)        (struct perf_event *event);
 +                                      /* optional */
  };
  
  /**
@@@ -566,7 -559,6 +567,7 @@@ struct perf_event 
        struct irq_work                 pending;
  
        atomic_t                        event_limit;
 +      struct list_head                drv_configs;
  
        void (*destroy)(struct perf_event *);
        struct rcu_head                 rcu_head;
diff --combined kernel/events/uprobes.c
index 4dcc16991b6707564091f2d9a28d3ab62504a4ca,da0c09ff6112badb3fa3d4da719f78cee299c1ab..7b1b772ab1ce4f381ce085c82e11005af3f5a2c6
@@@ -171,8 -171,10 +171,10 @@@ static int __replace_page(struct vm_are
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        err = -EAGAIN;
        ptep = page_check_address(page, mm, addr, &ptl, 0);
-       if (!ptep)
+       if (!ptep) {
+               mem_cgroup_cancel_charge(kpage, memcg);
                goto unlock;
+       }
  
        get_page(kpage);
        page_add_new_anon_rmap(kpage, vma, addr);
  
        err = 0;
   unlock:
-       mem_cgroup_cancel_charge(kpage, memcg);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        unlock_page(page);
        return err;
@@@ -1692,7 -1693,8 +1693,7 @@@ static int is_trap_at_addr(struct mm_st
        int result;
  
        pagefault_disable();
 -      result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
 -                                                      sizeof(opcode));
 +      result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
        pagefault_enable();
  
        if (likely(result == 0))
index e5c1f2e21f870bf53047c311829a2c653ddbee56,c53f787675685a7d52cac525d7c79b854ed31e7e..de3965c4e4aabecb5508999eb5ca99619da8d0f9
@@@ -273,9 -273,7 +273,9 @@@ intel_pt_pmu_default_config(struct perf
        return attr;
  }
  
 -static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused)
 +static size_t
 +intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused,
 +                      struct perf_evlist *evlist __maybe_unused)
  {
        return INTEL_PT_AUXTRACE_PRIV_SIZE;
  }
@@@ -501,7 -499,7 +501,7 @@@ static int intel_pt_recording_options(s
        struct intel_pt_recording *ptr =
                        container_of(itr, struct intel_pt_recording, itr);
        struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu;
-       bool have_timing_info;
+       bool have_timing_info, need_immediate = false;
        struct perf_evsel *evsel, *intel_pt_evsel = NULL;
        const struct cpu_map *cpus = evlist->cpus;
        bool privileged = geteuid() == 0 || perf_event_paranoid() < 0;
                                ptr->have_sched_switch = 3;
                        } else {
                                opts->record_switch_events = true;
+                               need_immediate = true;
                                if (cpu_wide)
                                        ptr->have_sched_switch = 3;
                                else
                tracking_evsel->attr.freq = 0;
                tracking_evsel->attr.sample_period = 1;
  
+               if (need_immediate)
+                       tracking_evsel->immediate = true;
                /* In per-cpu case, always need the time of mmap events etc */
                if (!cpu_map__empty(cpus)) {
                        perf_evsel__set_sample_bit(tracking_evsel, TIME);