From: Chad Rosier
Date: Fri, 16 May 2014 17:15:33 +0000 (+0000)
Subject: [ARM64] Increases the Sched Model accuracy for Cortex-A53.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=117b0385923b380387e8a8e1785673ba4a0c0829;p=oota-llvm.git

[ARM64] Increases the Sched Model accuracy for Cortex-A53.

Patch by Dave Estes
http://reviews.llvm.org/D3769

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209001 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/ARM64/ARM64InstrFormats.td b/lib/Target/ARM64/ARM64InstrFormats.td
index be2f7bf9791..a9bad10b31e 100644
--- a/lib/Target/ARM64/ARM64InstrFormats.td
+++ b/lib/Target/ARM64/ARM64InstrFormats.td
@@ -1101,7 +1101,7 @@ class BaseOneOperandData opc, RegisterClass regtype, string asm,
      SDPatternOperator node>
    : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
        [(set regtype:$Rd, (node regtype:$Rn))]>,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
@@ -1140,7 +1140,7 @@ class BaseBaseAddSubCarry pattern>
    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
   bits<5> Rd;
   bits<5> Rn;
@@ -1214,11 +1214,11 @@ class BaseDiv {
   def Wr : BaseDiv,
-           Sched<[WriteID32]> {
+           Sched<[WriteID32, ReadID, ReadID]> {
     let Inst{31} = 0;
   }
   def Xr : BaseDiv,
-           Sched<[WriteID64]> {
+           Sched<[WriteID64, ReadID, ReadID]> {
     let Inst{31} = 1;
   }
 }
@@ -1226,7 +1226,7 @@ multiclass Div {
 class BaseShift shift_type, RegisterClass regtype, string asm,
                 SDPatternOperator OpNode = null_frag>
    : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
-     Sched<[WriteIS]> {
+     Sched<[WriteIS, ReadI]> {
   let Inst{11-10} = shift_type;
 }
@@ -1278,13 +1278,13 @@ class BaseMulAccum opc, RegisterClass multype,
 multiclass MulAccum {
   def Wrrr : BaseMulAccum,
-             Sched<[WriteIM32]> {
+             Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
     let Inst{31} = 0;
   }
   def Xrrr : BaseMulAccum,
-             Sched<[WriteIM64]> {
+             Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> {
     let Inst{31} = 1;
   }
 }
@@ -1294,7 +1294,7 @@ class WideMulAccum opc, string asm,
    : BaseMulAccum,
-     Sched<[WriteIM32]> {
+     Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
   let Inst{31} = 1;
 }
@@ -1302,7 +1302,7 @@ class MulHi opc, string asm, SDNode OpNode>
    : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
        asm, "\t$Rd, $Rn, $Rm", "",
        [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
-     Sched<[WriteIM64]> {
+     Sched<[WriteIM64, ReadIM, ReadIM]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
@@ -1333,7 +1333,7 @@ class BaseCRC32 sz, bit C, RegisterClass StreamReg,
    : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
        asm, "\t$Rd, $Rn, $Rm", "",
        [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
-     Sched<[WriteISReg]> {
+     Sched<[WriteISReg, ReadI, ReadISReg]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
@@ -1420,7 +1420,7 @@ class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter,
    : I<(outs regtype:$Rd), (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
        asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI]> {
   bits<5> Rd;
   bits<16> imm;
   bits<6> shift;
@@ -1453,7 +1453,7 @@ class BaseAddSubImm,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<14> imm;
@@ -1471,7 +1471,7 @@ class BaseAddSubRegPseudo
    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
-     Sched<[WriteI]>;
+     Sched<[WriteI, ReadI, ReadI]>;
 class BaseAddSubSReg,
-     Sched<[WriteISReg]> {
+     Sched<[WriteISReg, ReadI, ReadISReg]> {
   // The operands are in order to match the 'addr' MI operands, so we
   // don't need an encoder method and by-name matching. Just use the default
   // in-order handling. Since we're using by-order, make sure the names
@@ -1508,7 +1508,7 @@ class BaseAddSubEReg,
-     Sched<[WriteIEReg]> {
+     Sched<[WriteIEReg, ReadI, ReadIEReg]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
@@ -1533,7 +1533,7 @@ class BaseAddSubEReg64,
-     Sched<[WriteIEReg]> {
+     Sched<[WriteIEReg, ReadI, ReadIEReg]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
@@ -1746,7 +1746,7 @@ class BaseBitfieldImm opc, RegisterClass regtype, Operand imm_type, string asm>
    : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
        asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
-     Sched<[WriteIS]> {
+     Sched<[WriteIS, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<6> immr;
@@ -1780,7 +1780,7 @@ class BaseBitfieldImmWith2RegArgs opc,
    : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, imm_type:$imms),
        asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
-     Sched<[WriteIS]> {
+     Sched<[WriteIS, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<6> immr;
@@ -1818,7 +1818,7 @@ class BaseLogicalImm opc, RegisterClass dregtype, list pattern>
    : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
        asm, "\t$Rd, $Rn, $imm", "", pattern>,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI]> {
   bits<5> Rd;
   bits<5> Rn;
   bits<13> imm;
@@ -1839,7 +1839,7 @@ class BaseLogicalSReg opc, bit N, RegisterClass regtype, list pattern>
    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
-     Sched<[WriteISReg]> {
+     Sched<[WriteISReg, ReadI, ReadISReg]> {
   // The operands are in order to match the 'addr' MI operands, so we
   // don't need an encoder method and by-name matching. Just use the default
   // in-order handling. Since we're using by-order, make sure the names
@@ -1897,7 +1897,7 @@ multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> {
 class BaseLogicalRegPseudo
    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
-     Sched<[WriteI]>;
+     Sched<[WriteI, ReadI, ReadI]>;

 // Split from LogicalImm as not all instructions have both.
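A pattern worth spelling out for the hunks above and below: each Sched<[...]> list now carries one SchedWrite for the defined register followed by one SchedRead per source register, in operand order. The reads are what allow an in-order subtarget such as Cortex-A53 to attach per-operand forwarding information (ReadAdvance) later in this patch. A minimal sketch of the shape, with a made-up instruction name (the Pseudo and GPR32 pieces mirror BaseAddSubRegPseudo above); it is illustrative only, not part of the patch:

// Not from the patch: a hypothetical two-source ALU pseudo.  The single def
// gets WriteI; each source register gets ReadI, in operand order, so a
// subtarget can later say how early or late those reads are actually needed.
def EXAMPLEADDrr : Pseudo<(outs GPR32:$Rd), (ins GPR32:$Rn, GPR32:$Rm), []>,
                   Sched<[WriteI, ReadI, ReadI]>;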
multiclass LogicalReg opc, bit N, string mnemonic,
@@ -1953,7 +1953,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
 class BaseCondSetFlagsImm
    : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
        asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI]> {
   let Uses = [NZCV];
   let Defs = [NZCV];
@@ -1985,7 +1985,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
 class BaseCondSetFlagsReg
    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
        asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
   let Defs = [NZCV];
@@ -2022,7 +2022,7 @@ class BaseCondSelect op2, RegisterClass regtype, string asm>
        asm, "\t$Rd, $Rn, $Rm, $cond", "",
        [(set regtype:$Rd, (ARM64csel regtype:$Rn, regtype:$Rm,
                            (i32 imm:$cond), NZCV))]>,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
   bits<5> Rd;
@@ -2055,7 +2055,7 @@ class BaseCondSelectOp op2, RegisterClass regtype, string asm,
        [(set regtype:$Rd, (ARM64csel regtype:$Rn, (frag regtype:$Rm),
                            (i32 imm:$cond), NZCV))]>,
-     Sched<[WriteI]> {
+     Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
   bits<5> Rd;
diff --git a/lib/Target/ARM64/ARM64InstrInfo.cpp b/lib/Target/ARM64/ARM64InstrInfo.cpp
index f46f2cf13be..75d906d9da0 100644
--- a/lib/Target/ARM64/ARM64InstrInfo.cpp
+++ b/lib/Target/ARM64/ARM64InstrInfo.cpp
@@ -825,6 +825,19 @@ bool ARM64InstrInfo::optimizeCompareInstr(
   return true;
 }

+/// Return true if this instruction has a non-zero immediate.
+bool ARM64InstrInfo::hasNonZeroImm(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    if (MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+  return false;
+}
+
 // Return true if this instruction simply sets its single destination register
 // to zero. This is equivalent to a register rename of the zero-register.
 bool ARM64InstrInfo::isGPRZero(const MachineInstr *MI) const {
diff --git a/lib/Target/ARM64/ARM64InstrInfo.h b/lib/Target/ARM64/ARM64InstrInfo.h
index 8f8165b02eb..a52d9ae50c7 100644
--- a/lib/Target/ARM64/ARM64InstrInfo.h
+++ b/lib/Target/ARM64/ARM64InstrInfo.h
@@ -56,6 +56,9 @@ public:
   unsigned isStoreToStackSlot(const MachineInstr *MI,
                               int &FrameIndex) const override;

+  /// \brief Is there a non-zero immediate?
+  bool hasNonZeroImm(const MachineInstr *MI) const;
+
   /// \brief Does this instruction set its full destination register to zero?
   bool isGPRZero(const MachineInstr *MI) const;
diff --git a/lib/Target/ARM64/ARM64SchedA53.td b/lib/Target/ARM64/ARM64SchedA53.td
index 178b0153dc2..e07e93d12ef 100644
--- a/lib/Target/ARM64/ARM64SchedA53.td
+++ b/lib/Target/ARM64/ARM64SchedA53.td
@@ -20,7 +20,7 @@ def CortexA53Model : SchedMachineModel {
   let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
   let IssueWidth = 2;        // 2 micro-ops are dispatched per cycle.
   let MinLatency = 1 ;       // OperandCycles are interpreted as MinLatency.
-  let LoadLatency = 2;       // Optimistic load latency assuming bypass.
+  let LoadLatency = 3;       // Optimistic load latency assuming bypass.
                              // This is overriden by OperandCycles if the
                              // Itineraries are queried instead.
   let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
@@ -32,7 +32,7 @@ def CortexA53Model : SchedMachineModel {
 //===----------------------------------------------------------------------===//
 // Define each kind of processor resource and number available.
-// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
 // Cortex-A53 is in-order.
 def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
@@ -50,16 +50,16 @@ def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt
 let SchedModel = CortexA53Model in {

-// ALU - These are reduced to 1 despite a true latency of 4 in order to easily
-//       model forwarding logic. Once forwarding is properly modelled, then
-//       they'll be corrected.
-def : WriteRes { let Latency = 1; }
-def : WriteRes { let Latency = 1; }
-def : WriteRes { let Latency = 1; }
-def : WriteRes { let Latency = 1; }
-def : WriteRes { let Latency = 1; }
-def : WriteRes { let Latency = 1; }
-def : WriteRes { let Latency = 1; }
+// ALU - Despite having a full latency of 4, most of the ALU instructions can
+//       forward a cycle earlier and then two cycles earlier in the case of a
+//       shift-only instruction. These latencies will be incorrect when the
+//       result cannot be forwarded, but modeling isn't rocket surgery.
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 2; }
+def : WriteRes { let Latency = 3; }

 // MAC
 def : WriteRes { let Latency = 4; }
@@ -73,14 +73,41 @@ def : WriteRes { let Latency = 4; }
 def : WriteRes { let Latency = 4; }
 def : WriteRes { let Latency = 4; }
 def : WriteRes { let Latency = 4; }
-def : WriteRes { let Latency = 4; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd
+//               below, choosing the median of 3 which makes the latency 6.
+//               May model this more carefully in the future. The remaining
+//               A53WriteVLD# types represent the 1-5 cycle issues explicitly.
+def : WriteRes { let Latency = 6;
+                 let ResourceCycles = [3]; }
+def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
+def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7;
+                                                  let ResourceCycles = [4]; }
+def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8;
+                                                  let ResourceCycles = [5]; }
+
+// Pre/Post Indexing - Performed as part of address generation which is already
+//                     accounted for in the WriteST* latencies below
+def : WriteRes { let Latency = 0; }

 // Store
 def : WriteRes { let Latency = 4; }
 def : WriteRes { let Latency = 4; }
 def : WriteRes { let Latency = 4; }
 def : WriteRes { let Latency = 4; }
-def : WriteRes { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes { let Latency = 5;
+                 let ResourceCycles = [2];}
+def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }

 // Branch
 def : WriteRes;
@@ -101,29 +128,143 @@ def : WriteRes { let Latency = 6; }
 def : WriteRes { let Latency = 6; }
 def : WriteRes { let Latency = 33;
                  let ResourceCycles = [29]; }
-def A53WriteFDiv : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
-                                                   let ResourceCycles = [29]; }
-def A53WriteFSqrt : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
-                                                    let ResourceCycles = [28]; }
+def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; }
+def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18;
+                                                     let ResourceCycles = [14]; }
+def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
+                                                     let ResourceCycles = [29]; }
+def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17;
+                                                      let ResourceCycles = [13]; }
+def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
+                                                      let ResourceCycles = [28]; }

 //===----------------------------------------------------------------------===//
 // Subtarget-specific SchedRead types.

-// While there is no forwarding information defined for these SchedRead types,
-// they are still used by some instruction via a SchedRW list and so these zero
-// SchedReadAdvances are required.
-
+// No forwarding for these reads.
 def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;

+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+//       operands are needed one cycle later if and only if they are to be
+//       shifted. Otherwise, they too are needed two cycles later.
+def : ReadAdvance;
+def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+                                          WriteISReg, WriteIEReg,WriteIS,
+                                          WriteID32,WriteID64,
+                                          WriteIM32,WriteIM64]>;
+def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+                                             WriteISReg, WriteIEReg,WriteIS,
+                                             WriteID32,WriteID64,
+                                             WriteIM32,WriteIM64]>;
+def A53ReadISReg : SchedReadVariant<[
+    SchedVar,
+    SchedVar]>;
+def : SchedAlias;
+
+def A53ReadIEReg : SchedReadVariant<[
+    SchedVar,
+    SchedVar]>;
+def : SchedAlias;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+//       Accumulator operands are needed two cycles later.
+def : ReadAdvance;
+def : ReadAdvance;
+
+// Div
+def : ReadAdvance;
+
 //===----------------------------------------------------------------------===//
 // Subtarget-specific InstRWs.
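The A53ReadISReg and A53ReadIEReg variants defined just above are the heart of the change, and their angle-bracketed arguments did not survive this dump. A sketch of how such a SchedReadVariant is typically wired, reconstructed for illustration rather than copied verbatim from the patch: RegShiftedPred, the predicate added to ARM64Schedule.td further down, calls the new TII->hasNonZeroImm hook to test whether the shift immediate is non-zero, selecting A53ReadShifted when it is and falling back to A53ReadNotShifted otherwise. NoSchedPred is the stock always-true predicate from the generic TargetSchedule.td; the exact arguments below are an assumption.

def A53ReadISReg : SchedReadVariant<[
    // Shift amount is a non-zero immediate: the operand is needed a cycle
    // earlier, so it only gets a one-cycle advance.
    SchedVar<RegShiftedPred, [A53ReadShifted]>,
    // Not really shifted: the operand is not needed for two more cycles.
    SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
def : SchedAlias<ReadISReg, A53ReadISReg>;

In effect, if the scheduler derives an operand's latency roughly as the producer's WriteRes latency minus the consumer's ReadAdvance, an ALU result (latency 3 above) feeding an unshifted register operand costs about 3 - 2 = 1 cycle, while the same result feeding an operand that is actually shifted costs 3 - 1 = 2 cycles.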
+//---
+// Miscellaneous
+//---
 def : InstRW<[WriteI], (instrs COPY)>;
-def : InstRW<[WriteLD], (instregex "LD[1-4]")>;
-def : InstRW<[WriteST], (instregex "ST[1-4]")>;
-def : InstRW<[A53WriteFDiv], (instregex "^FDIV")>;
-def : InstRW<[A53WriteFSqrt], (instregex ".*SQRT.*")>;
+
+//---
+// Vector Mul with Accumulate
+//---
+//def : InstRW<[WriteIM32, A53ReadIMA], (instregex "^M(ADD|SUB)W.*")>;
+//def : InstRW<[WriteIM64, A53ReadIMA], (instregex "^M(ADD|SUB)X.*")>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2dq)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD1, A53WriteVLD1], (instregex "LDN?PS.*$")>;
+def : InstRW<[A53WriteVLD2, A53WriteVLD2], (instregex "LDN?PD.*$")>;
+def : InstRW<[A53WriteVLD4, A53WriteVLD4], (instregex "LDN?PQ.*$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)(_POST)?$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "STN?P(S|D).*$")>;
+def : InstRW<[A53WriteVST2], (instregex "STN?PQ.*$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
 }
diff --git a/lib/Target/ARM64/ARM64SchedCyclone.td b/lib/Target/ARM64/ARM64SchedCyclone.td
index 8b3a7592afd..c04a7bb8baf 100644
--- a/lib/Target/ARM64/ARM64SchedCyclone.td
+++ b/lib/Target/ARM64/ARM64SchedCyclone.td
@@ -851,4 +851,15 @@ def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>;
 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;

+//---
+// Unused SchedRead types
+//---
+
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
 } // SchedModel = CycloneModel
diff --git a/lib/Target/ARM64/ARM64Schedule.td b/lib/Target/ARM64/ARM64Schedule.td
index 52f9262312f..26a484fa0ad 100644
--- a/lib/Target/ARM64/ARM64Schedule.td
+++ b/lib/Target/ARM64/ARM64Schedule.td
@@ -25,13 +25,19 @@ def WriteImm : SchedWrite; // MOVN, MOVZ
 def WriteI : SchedWrite; // ALU
 def WriteISReg : SchedWrite; // ALU of Shifted-Reg
 def WriteIEReg : SchedWrite; // ALU of Extended-Reg
+def ReadI : SchedRead; // ALU
+def ReadISReg : SchedRead; // ALU of Shifted-Reg
+def ReadIEReg : SchedRead; // ALU of Extended-Reg
 def WriteExtr : SchedWrite; // EXTR shifts a reg pair
 def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair
 def WriteIS : SchedWrite; // Shift/Scale
 def WriteID32 : SchedWrite; // 32-bit Divide
 def WriteID64 : SchedWrite; // 64-bit Divide
+def ReadID : SchedRead; // 32/64-bit Divide
 def WriteIM32 : SchedWrite; // 32-bit Multiply
 def WriteIM64 : SchedWrite; // 64-bit Multiply
+def ReadIM : SchedRead; // 32/64-bit Multiply
+def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate
 def WriteBr : SchedWrite; // Branch
 def WriteBrReg : SchedWrite; // Indirect Branch
@@ -44,6 +50,9 @@ def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
 def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
 def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.

+// Predicate for determining when a shiftable register is shifted.
+def RegShiftedPred : SchedPredicate<[{TII->hasNonZeroImm(MI)}]>;
+
 // ScaledIdxPred is true if a WriteLDIdx operand will be
 // scaled. Subtargets can use this to dynamically select resources and
 // latency for WriteLDIdx and ReadAdrBase.
diff --git a/test/CodeGen/ARM64/misched-basic-A53.ll b/test/CodeGen/ARM64/misched-basic-A53.ll
index 9f0caa35429..608e5b65b63 100644
--- a/test/CodeGen/ARM64/misched-basic-A53.ll
+++ b/test/CodeGen/ARM64/misched-basic-A53.ll
@@ -8,9 +8,7 @@
 ; CHECK: ********** MI Scheduling **********
 ; CHECK: main
 ; CHECK: *** Final schedule for BB#2 ***
-; CHECK: SU(13)
 ; CHECK: MADDWrrr
-; CHECK: SU(4)
 ; CHECK: ADDWri
 ; CHECK: ********** INTERVALS **********
 @main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
diff --git a/test/CodeGen/ARM64/misched-forwarding-A53.ll b/test/CodeGen/ARM64/misched-forwarding-A53.ll
new file mode 100644
index 00000000000..97bfb5ca9d3
--- /dev/null
+++ b/test/CodeGen/ARM64/misched-forwarding-A53.ll
@@ -0,0 +1,21 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+;
+; For Cortex-A53, shiftable operands that are not actually shifted
+; are not needed for an additional two cycles.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: shiftable
+; CHECK: *** Final schedule for BB#0 ***
+; CHECK: ADDXrr %vreg0, %vreg2
+; CHECK: ADDXrs %vreg0, %vreg2, 5
+; CHECK: ********** INTERVALS **********
define i64 @shiftable(i64 %A, i64 %B) {
  %tmp0 = sub i64 %B, 20
  %tmp1 = shl i64 %tmp0, 5;
  %tmp2 = add i64 %A, %tmp1;
  %tmp3 = add i64 %A, %tmp0
  %tmp4 = mul i64 %tmp2, %tmp3

  ret i64 %tmp4
}
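The new misched-forwarding-A53.ll test above ties the pieces together: %tmp3 adds %tmp0 directly and selects to ADDXrr, while %tmp2 adds %tmp0 shifted left by 5 and selects to ADDXrs. Under the updated Cortex-A53 model the unshifted operand of ADDXrr gets the larger two-cycle read advance, so its input is ready a cycle earlier than the shifted operand of ADDXrs, and the CHECK lines expect the scheduler to emit ADDXrr first. The ARM64SchedCyclone.td hunk earlier in the patch is the other side of the same change: a model with no forwarding data for the new ReadI/ReadISReg/ReadIEReg/ReadID/ReadIM/ReadIMA types still has to mention them, presumably as zero-cycle ReadAdvance entries (the exact arguments were lost in this dump), so that the shared Sched lists resolve for every subtarget.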