From fadee632e12e4cd6050f9da3988f787573033614 Mon Sep 17 00:00:00 2001
From: Adam Nemet
Date: Fri, 17 Jul 2015 18:14:19 +0000
Subject: [PATCH] Revert "ARM: Enable MachineScheduler and disable PostRAScheduler for swift."

This reverts commit r242500.

It broke some internal tests and Matthias asked me to revert it while he
is investigating.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242553 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/MC/MCSchedule.h       |    3 -
 lib/Target/ARM/ARMScheduleSwift.td | 1038 ++++++++++++++++++++++++++++
 lib/Target/ARM/ARMSubtarget.cpp    |   11 -
 lib/Target/ARM/ARMSubtarget.h      |    3 -
 test/CodeGen/ARM/adv-copy-opt.ll   |   14 +-
 test/CodeGen/ARM/avoid-cpsr-rmw.ll |   16 +-
 test/CodeGen/ARM/cmpxchg-idioms.ll |    6 +-
 test/CodeGen/ARM/test-sharedidx.ll |   16 +-
 test/CodeGen/ARM/vector-load.ll    |    4 +-
 test/CodeGen/ARM/vector-store.ll   |    6 +-
 10 files changed, 1069 insertions(+), 48 deletions(-)

diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index a8b20570103..c0979163105 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -206,9 +206,6 @@ struct MCSchedModel {
   /// scheduling class (itinerary class or SchedRW list).
   bool isComplete() const { return CompleteModel; }
 
-  /// Return true if machine supports out of order execution.
-  bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
-
   unsigned getNumProcResourceKinds() const {
     return NumProcResourceKinds;
   }
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 8f9cf368ced..b03d5ff44c6 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -37,12 +37,1050 @@ def SW_FDIV : FuncUnit;
 
 // FIXME: Add preload instruction when it is documented.
 // FIXME: Model non-pipelined nature of FP div / sqrt unit.
+def SwiftItineraries : ProcessorItineraries<
+  [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
+  //
+  // Move instructions, unconditional
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [3]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>,
+    InstrStage<1, [SW_LS]>],
+    [5]>,
+  //
+  // MVN instructions
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  //
+  // No operand cycles
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+
+  // CLZ, RBIT, etc.
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+
+  // BFC, BFI, UBFX, SBFX
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1]>,
+
+  //
+  // Zero and sign extension instructions
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<2, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<2, [SW_ALU0, SW_ALU1]>],
+    [1, 1, 1]>,
+  //
+  // Test instructions
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<2, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<2, [SW_ALU0, SW_ALU1]>],
+    [1, 1, 1]>,
+  //
+  // Move instructions, conditional
+  // FIXME: Correctly model the extra input dep on the destination.
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [3, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [3, 1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0], 1>,
+    InstrStage<1, [SW_ALU0], 3>,
+    InstrStage<1, [SW_ALU0]>],
+    [5, 5, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0], 1>,
+    InstrStage<1, [SW_ALU0], 1>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 3>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [5, 6, 1, 1]>,
+  //
+  // Integer divide
+  InstrItinData,
+    InstrStage<1, [SW_ALU0], 0>,
+    InstrStage<14, [SW_IDIV]>],
+    [14, 1, 1]>,
+
+  // Integer load pipeline
+  // FIXME: The timings are some rough approximations
+  //
+  // Immediate offset
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [3, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [3, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_LS], 1>,
+    InstrStage<1, [SW_LS]>],
+    [3, 4, 1]>,
+  //
+  // Register offset
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [3, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [3, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_LS], 1>,
+    InstrStage<1, [SW_LS], 3>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [3, 4, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+    InstrStage<1, [SW_LS]>],
+    [5, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+    InstrStage<1, [SW_LS]>],
+    [5, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [3, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [3, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0], 1>,
+    InstrStage<1, [SW_LS]>],
+    [3, 1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0], 1>,
+    InstrStage<1, [SW_LS]>],
+    [3, 1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+    InstrStage<1, [SW_LS], 3>,
+    InstrStage<1, [SW_LS], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [3, 4, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+    InstrStage<1, [SW_LS], 3>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [5, 3, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+    InstrStage<1, [SW_LS], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [5, 3, 1, 1]>,
+  //
+  // Load multiple, def is the 5th operand.
+  // FIXME: This assumes 3 to 4 registers.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+    InstrStage<1, [SW_LS], 3>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 3], [], -1>, // dynamic uops
+
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData,
+    InstrStage<1, [SW_LS], 3>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [4, 1]>,
+
+  // Integer store pipeline
+  ///
+  // Immediate offset
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_LS], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1]>,
+  //
+  // Register offset
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_LS], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+    InstrStage<1, [SW_LS], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+    [3, 1, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+    InstrStage<1, [SW_LS], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+    [3, 1, 1, 1]>,
+  //
+  // Store multiple
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS], 1>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS], 1>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS], 1>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS], 1>,
+    InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+    InstrStage<1, [SW_LS]>],
+    [2], [], -1>, // dynamic uops
+
+  //
+  // Preload
+  InstrItinData], [1, 1]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData]>,
+
+  // FP Special Register to Integer Register File Move
+  InstrItinData,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [1]>,
+  //
+  // Single-precision FP Unary
+  //
+  // Most floating-point moves get issued on ALU0.
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1]>,
+
+  //
+  // Single-precision FP Compare
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [1, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1]>,
+
+  //
+  // Single to Half FP Convert
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU1], 4>,
+    InstrStage<1, [SW_ALU1]>],
+    [6, 1]>,
+  //
+  // Half to Single FP Convert
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1]>,
+
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [6, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [12, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [12, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData,
+    InstrStage<1, [SW_ALU1], 0>,
+    InstrStage<15, [SW_FDIV]>],
+    [17, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData,
+    InstrStage<1, [SW_ALU1], 0>,
+    InstrStage<30, [SW_FDIV]>],
+    [32, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData,
+    InstrStage<1, [SW_ALU1], 0>,
+    InstrStage<15, [SW_FDIV]>],
+    [17, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData,
+    InstrStage<1, [SW_ALU1], 0>,
+    InstrStage<30, [SW_FDIV]>],
+    [32, 1, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_LS], 4>,
+    InstrStage<1, [SW_ALU0]>],
+    [6, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [4, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [3, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_LS], 3>,
+    InstrStage<1, [SW_LS]>],
+    [3, 4, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [4, 1]>,
+  //
+  // Double-precision FP Load
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [4, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: Assumes a single Q register.
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1, 4], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  // FIXME: Assumes a single Q register.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_LS], 4>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1, 1, 4], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [1, 1]>,
+  //
+  // FP Store Multiple
+  // FIXME: Assumes a single Q register.
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  // FIXME: Assumes a single Q register.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_LS], 4>,
+    InstrStage<1, [SW_ALU0, SW_ALU1]>],
+    [2, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  //
+  // Double-register Integer Unary
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  //
+  // Double-register Integer Subtract (4 cycle)
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  //
+  // Quad-register Integer Subtract (4 cycle)
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+
+  //
+  // Double-register Integer Count
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Quad-register Integer Count
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1, 1]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1]>,
+
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1, 1]>,
+
+  //
+  // Move
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1]>,
+  //
+  // Move Immediate
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2]>,
+  //
+  // Double-register Permute Move
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [2, 1]>,
+  //
+  // Quad-register Permute Move
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [2, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_LS], 4>,
+    InstrStage<1, [SW_ALU0]>],
+    [6, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [4, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData,
+    InstrStage<1, [SW_LS]>],
+    [3, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_LS], 3>,
+    InstrStage<1, [SW_LS]>],
+    [3, 4, 1]>,
+  //
+  // Integer to Lane Move
+  // FIXME: I think this is correct, but it is not clear from the tuning guide.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_LS], 4>,
+    InstrStage<1, [SW_ALU0]>],
+    [6, 1]>,
+
+  //
+  // Vector narrow move
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [2, 1]>,
+  //
+  // Double-register FP Unary
+  // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
+  // and they issue on a different pipeline.
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1]>,
+  //
+  // Quad-register FP Unary
+  // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
+  // and they issue on a different pipeline.
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [2, 1]>,
+  //
+  // Double-register FP Binary
+  // FIXME: We're using this itin for many instructions.
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1]>,
+  //
+  // Quad-register FP Binary
+  InstrItinData,
+    InstrStage<1, [SW_ALU0]>],
+    [4, 1, 1]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 1]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 1]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 1]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 1]>,
+  //
+  // Double-register Reciprocal Step
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 1]>,
+  //
+  // Quad-register Reciprocal Step
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 1]>,
+  //
+  // Double-register Permute
+  // FIXME: The latencies are unclear from the documentation.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [3, 4, 3, 4]>,
+  //
+  // Quad-register Permute
+  // FIXME: The latencies are unclear from the documentation.
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [3, 4, 3, 4]>,
+  //
+  // Quad-register Permute (3 cycle issue on A9)
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [3, 4, 3, 4]>,
+
+  //
+  // Double-register VEXT
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [2, 1, 1]>,
+  //
+  // VTB
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [2, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 3, 3]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [6, 1, 3, 5, 5]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 3, 5, 7, 7]>,
+  //
+  // VTBX
+  InstrItinData,
+    InstrStage<1, [SW_ALU1]>],
+    [2, 1, 1]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [4, 1, 3, 3]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [6, 1, 3, 5, 5]>,
+  InstrItinData,
+    InstrStage<1, [SW_DIS1], 0>,
+    InstrStage<1, [SW_DIS2], 0>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1], 2>,
+    InstrStage<1, [SW_ALU1]>],
+    [8, 1, 3, 5, 7, 7]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simple machine model that will
+// replace itineraries.
+
 // Swift machine model for scheduling and other instruction cost heuristics.
 def SwiftModel : SchedMachineModel {
   let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
   let MicroOpBufferSize = 45; // Based on NEON renamed registers.
   let LoadLatency = 3;
   let MispredictPenalty = 14; // A branch direction mispredict.
+
+  let Itineraries = SwiftItineraries;
 }
 
 // Swift predicates.
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 3180480986d..31b65f2bfec 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -319,19 +319,8 @@ bool ARMSubtarget::hasSinCos() const {
   return getTargetTriple().isiOS() &&
          !getTargetTriple().isOSVersionLT(7, 0);
 }
 
-bool ARMSubtarget::enableMachineScheduler() const {
-  // Enable the MachineScheduler before register allocation for out-of-order
-  // architectures where we do not use the PostRA scheduler anymore (for now
-  // restricted to swift).
-  return getSchedModel().isOutOfOrder() && isSwift();
-}
-
 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostRAScheduler() const {
-  // No need for PostRA scheduling on out of order CPUs (for now restricted to
-  // swift).
-  if (getSchedModel().isOutOfOrder() && isSwift())
-    return false;
   return (!isThumb() || hasThumb2());
 }
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 4f9bc372e4b..75425890a28 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -433,9 +433,6 @@ public:
   /// compiler runtime or math libraries.
   bool hasSinCos() const;
 
-  /// Returns true if machine scheduler should be enabled.
-  bool enableMachineScheduler() const override;
-
   /// True for some subtargets at > -O0.
   bool enablePostRAScheduler() const override;
 
diff --git a/test/CodeGen/ARM/adv-copy-opt.ll b/test/CodeGen/ARM/adv-copy-opt.ll
index 395be345720..f71bf78b62c 100644
--- a/test/CodeGen/ARM/adv-copy-opt.ll
+++ b/test/CodeGen/ARM/adv-copy-opt.ll
@@ -11,25 +11,25 @@
 ; r0 = r0 / r2
 ; r1 = r1 / r3
 ;
-; NOOPT: vmov [[A:d[0-9]+]], r0, r1
-; NOOPT-NEXT: vmov [[B:d[0-9]+]], r2, r3
+; NOOPT: vmov [[B:d[0-9]+]], r2, r3
+; NOOPT-NEXT: vmov [[A:d[0-9]+]], r0, r1
 ; Move the low part of B into a register.
 ; Unfortunately, we cannot express that the 's' register is the low
 ; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
 ; NOOPT-NEXT: vmov [[B_LOW:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: vmov [[B_HIGH:r[0-9]+]], s{{[0-9]+}}
 ; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
-; NOOPT-NEXT: vmov [[A_HIGH:r[0-9]+]], s{{[0-9]+}}
 ; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
-; NOOPT-NEXT: vmov.32 [[RES:d[0-9]+]][0], [[RES_LOW]]
+; NOOPT-NEXT: vmov [[B_HIGH:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: vmov [[A_HIGH:r[0-9]+]], s{{[0-9]+}}
 ; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
+; NOOPT-NEXT: vmov.32 [[RES:d[0-9]+]][0], [[RES_LOW]]
 ; NOOPT-NEXT: vmov.32 [[RES]][1], [[RES_HIGH]]
 ; NOOPT-NEXT: vmov r0, r1, [[RES]]
 ; NOOPT-NEXT: bx lr
 ;
 ; OPT-NOT: vmov
-; OPT: udiv r1, r1, r3
-; OPT-NEXT: udiv r0, r0, r2
+; OPT: udiv r0, r0, r2
+; OPT-NEXT: udiv r1, r1, r3
 ; OPT-NEXT: bx lr
 define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
 entry:
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 79e8e68e2f5..c3de07e03b6 100644
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
 ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
 ; dependency) when it isn't dependent on last CPSR defining instruction.
 ; rdar://8928208
@@ -7,10 +7,8 @@
 define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
 entry:
 ; CHECK-LABEL: t1:
-; CHECK-CORTEX: muls [[REG:(r[0-9]+)]], r3, r2
-; CHECK-CORTEX-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
-; CHECK-SWIFT: muls [[REG2:(r[0-9]+)]], r1, r0
-; CHECK-SWIFT-NEXT: mul [[REG:(r[0-9]+)]], r2, r3
+; CHECK: muls [[REG:(r[0-9]+)]], r3, r2
+; CHECK-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
 ; CHECK-NEXT: muls r0, [[REG]], [[REG2]]
   %0 = mul nsw i32 %a, %b
   %1 = mul nsw i32 %c, %d
@@ -23,7 +21,8 @@
 define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
 entry:
 ; CHECK-LABEL: t2:
-  br label %while.body
+  %tobool7 = icmp eq i32* %ptr2, null
+  br i1 %tobool7, label %while.end, label %while.body
 
 while.body:
 ; CHECK: while.body
@@ -56,7 +55,8 @@
 define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
 entry:
 ; CHECK-LABEL: t3:
-  br label %while.body
+  %tobool7 = icmp eq i32* %ptr2, null
+  br i1 %tobool7, label %while.end, label %while.body
 
 while.body:
 ; CHECK: while.body
diff --git a/test/CodeGen/ARM/cmpxchg-idioms.ll b/test/CodeGen/ARM/cmpxchg-idioms.ll
index 81e05acfef7..fb88575cab3 100644
--- a/test/CodeGen/ARM/cmpxchg-idioms.ll
+++ b/test/CodeGen/ARM/cmpxchg-idioms.ll
@@ -15,14 +15,14 @@ define i32 @test_return(i32* %p, i32 %oldval, i32 %newval) {
 ; CHECK: bne [[LOOP]]
 
 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: dmb ish
 ; CHECK: movs r0, #1
+; CHECK: dmb ish
 ; CHECK: bx lr
 
 ; CHECK: [[FAILED]]:
 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
-; CHECK: dmb ish
 ; CHECK: movs r0, #0
+; CHECK: dmb ish
 ; CHECK: bx lr
 
   %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
@@ -34,8 +34,8 @@
 define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
 ; CHECK-LABEL: test_return_bool:
-; CHECK: dmb ishst
 ; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
+; CHECK: dmb ishst
 
 ; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
diff --git a/test/CodeGen/ARM/test-sharedidx.ll b/test/CodeGen/ARM/test-sharedidx.ll
index 77d0f30485d..377996c4c3c 100644
--- a/test/CodeGen/ARM/test-sharedidx.ll
+++ b/test/CodeGen/ARM/test-sharedidx.ll
@@ -20,8 +20,8 @@ entry:
 
 for.body: ; preds = %entry, %for.body.3
 ; CHECK: %for.body
-; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
-; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
   %i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.09
   %0 = load i8, i8* %arrayidx, align 1
@@ -42,8 +42,8 @@ for.end: ; preds = %for.body, %for.body
 
 for.body.1: ; preds = %for.body
 ; CHECK: %for.body.1
-; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
-; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
   %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %add5
   %2 = load i8, i8* %arrayidx.1, align 1
   %conv6.1 = zext i8 %2 to i32
@@ -60,8 +60,8 @@ for.body.1: ; preds = %for.body
 
 for.body.2: ; preds = %for.body.1
 ; CHECK: %for.body.2
-; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
-; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
   %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %add5.1
   %4 = load i8, i8* %arrayidx.2, align 1
   %conv6.2 = zext i8 %4 to i32
@@ -78,8 +78,8 @@ for.body.2: ; preds = %for.body.1
 
 for.body.3: ; preds = %for.body.2
 ; CHECK: %for.body.3
-; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
-; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
+; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
+; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
   %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %add5.2
   %6 = load i8, i8* %arrayidx.3, align 1
   %conv6.3 = zext i8 %6 to i32
diff --git a/test/CodeGen/ARM/vector-load.ll b/test/CodeGen/ARM/vector-load.ll
index a638c2bdb9b..17f134f458a 100644
--- a/test/CodeGen/ARM/vector-load.ll
+++ b/test/CodeGen/ARM/vector-load.ll
@@ -238,12 +238,12 @@ define <4 x i32> @zextload_v8i8tov8i32(<4 x i8>** %ptr) {
 
 define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
 ;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
-;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
+;CHECK: ldr.w r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
 ;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
+;CHECK: str.w r[[INCREG]], [r0]
 ;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
 ;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
-;CHECK: str r[[INCREG]], [r0]
   %A = load <4 x i8>*, <4 x i8>** %ptr
   %lA = load <4 x i8>, <4 x i8>* %A, align 4
   %inc = getelementptr <4 x i8>, <4 x i8>* %A, i38 4
diff --git a/test/CodeGen/ARM/vector-store.ll b/test/CodeGen/ARM/vector-store.ll
index 161bbf1d0fd..30baa9a20dd 100644
--- a/test/CodeGen/ARM/vector-store.ll
+++ b/test/CodeGen/ARM/vector-store.ll
@@ -228,9 +228,9 @@ define void @truncstore_v4i32tov4i8(<4 x i8>** %ptr, <4 x i32> %val) {
 ;CHECK: ldr.w r9, [sp]
 ;CHECK: vmov {{d[0-9]+}}, r3, r9
 ;CHECK: vmov {{d[0-9]+}}, r1, r2
-;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
 ;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
+;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
   %A = load <4 x i8>*, <4 x i8>** %ptr
   %trunc = trunc <4 x i32> %val to <4 x i8>
@@ -243,10 +243,10 @@ define void @truncstore_v4i32tov4i8_fake_update(<4 x i8>** %ptr, <4 x i32> %val)
 ;CHECK: ldr.w r9, [sp]
 ;CHECK: vmov {{d[0-9]+}}, r3, r9
 ;CHECK: vmov {{d[0-9]+}}, r1, r2
-;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
+;CHECK: movs [[IMM16:r[0-9]+]], #16
 ;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
 ;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
-;CHECK: movs [[IMM16:r[0-9]+]], #16
+;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
 ;CHECK: str r[[PTRREG]], [r0]
   %A = load <4 x i8>*, <4 x i8>** %ptr
--
2.34.1
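
Note for reviewers (appended after the patch, not part of the applied diff): the MCSchedule.h and ARMSubtarget.cpp hunks are the heart of this revert; the itinerary data and the test churn follow from them. As a minimal C++ sketch, reconstructed directly from the deleted lines shown above (class and include context abbreviated), the behavior being removed is:

    // Removed from include/llvm/MC/MCSchedule.h:
    struct MCSchedModel {
      unsigned MicroOpBufferSize;
      // ... (other fields elided in this sketch)
      /// Return true if machine supports out of order execution.
      bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
    };

    // Removed from lib/Target/ARM/ARMSubtarget.cpp:
    bool ARMSubtarget::enableMachineScheduler() const {
      // Run the pre-RA MachineScheduler only for the out-of-order Swift core.
      return getSchedModel().isOutOfOrder() && isSwift();
    }

    bool ARMSubtarget::enablePostRAScheduler() const {
      // Skip PostRA scheduling on Swift; the MachineScheduler already ran
      // before register allocation.
      if (getSchedModel().isOutOfOrder() && isSwift())
        return false;
      return (!isThumb() || hasThumb2());
    }

Because SwiftModel sets MicroOpBufferSize = 45, isOutOfOrder() was true for Swift, so r242500 scheduled Swift with the pre-RA MachineScheduler and skipped the PostRA pass. With this revert applied, ARMSubtarget falls back to the TargetSubtargetInfo default for enableMachineScheduler() and Swift again goes through the itinerary-driven PostRA scheduler, which is why the CHECK-line orderings in the tests above flip back.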