From a0792de66c8364d47b0a688c7f408efb7b10f31b Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Wed, 6 Oct 2010 06:27:31 +0000 Subject: [PATCH] - Add TargetInstrInfo::getOperandLatency() to compute operand latencies. This allow target to correctly compute latency for cases where static scheduling itineraries isn't sufficient. e.g. variable_ops instructions such as ARM::ldm. This also allows target without scheduling itineraries to compute operand latencies. e.g. X86 can return (approximated) latencies for high latency instructions such as division. - Compute operand latencies for those defined by load multiple instructions, e.g. ldm and those used by store multiple instructions, e.g. stm. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@115755 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetInstrInfo.h | 16 ++ lib/CodeGen/ScheduleDAGInstrs.cpp | 9 +- .../SelectionDAG/ScheduleDAGSDNodes.cpp | 20 +-- lib/Target/ARM/ARMBaseInstrInfo.cpp | 161 ++++++++++++++++++ lib/Target/ARM/ARMBaseInstrInfo.h | 15 ++ lib/Target/ARM/ARMInstrInfo.td | 10 +- lib/Target/ARM/ARMInstrThumb.td | 14 +- lib/Target/ARM/ARMInstrThumb2.td | 11 +- lib/Target/ARM/ARMSchedule.td | 10 +- lib/Target/ARM/ARMScheduleA8.td | 44 ++++- lib/Target/ARM/ARMScheduleA9.td | 48 +++++- lib/Target/ARM/ARMScheduleV6.td | 25 ++- lib/Target/TargetInstrInfo.cpp | 33 +++- test/CodeGen/ARM/arguments.ll | 4 +- test/CodeGen/ARM/select.ll | 2 +- 15 files changed, 354 insertions(+), 68 deletions(-) diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index 0063dfcafee..a2654c97de2 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -608,6 +608,22 @@ public: /// instruction will be decoded to on the target cpu. virtual unsigned getNumMicroOps(const MachineInstr *MI, const InstrItineraryData *ItinData) const; + + /// getOperandLatency - Compute and return the use operand latency of a given + /// itinerary class and operand index if the value is produced by an + /// instruction of the specified itinerary class and def operand index. + /// In most cases, the static scheduling itinerary was enough to determine the + /// operand latency. But it may not be possible for instructions with variable + /// number of defs / uses. + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; + + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const; }; /// TargetInstrInfoImpl - This is the default implementation of diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 3d2565dd18c..e01f0ab1ce1 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -527,10 +527,10 @@ void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use, MachineInstr *DefMI = Def->getInstr(); int DefIdx = DefMI->findRegisterDefOperandIdx(Reg); if (DefIdx != -1) { - unsigned DefClass = DefMI->getDesc().getSchedClass(); - MachineInstr *UseMI = Use->getInstr(); - unsigned UseClass = UseMI->getDesc().getSchedClass(); + const TargetInstrDesc &DefTID = DefMI->getDesc(); + unsigned DefClass = DefTID.getSchedClass(); + MachineInstr *UseMI = Use->getInstr(); // For all uses of the register, calculate the maxmimum latency int Latency = -1; for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) { @@ -541,8 +541,7 @@ void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use, if (MOReg != Reg) continue; - int UseCycle = InstrItins->getOperandLatency(DefClass, DefIdx, - UseClass, i); + int UseCycle = TII->getOperandLatency(InstrItins, DefMI, DefIdx, UseMI, i); Latency = std::max(Latency, UseCycle); // If we found a latency, then replace the existing dependence latency. diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 23ff9c5807c..0ffb4da0f36 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -450,29 +450,11 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use, if (ForceUnitLatencies()) return; - if (!InstrItins || InstrItins->isEmpty()) - return; - if (dep.getKind() != SDep::Data) return; unsigned DefIdx = Use->getOperand(OpIdx).getResNo(); - if (!Def->isMachineOpcode()) - return; - - const TargetInstrDesc &II = TII->get(Def->getMachineOpcode()); - if (DefIdx >= II.getNumDefs()) - return; - - int Latency = 0; - if (!Use->isMachineOpcode()) { - Latency = InstrItins->getOperandCycle(II.getSchedClass(), DefIdx); - } else { - unsigned DefClass = II.getSchedClass(); - unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass(); - Latency = InstrItins->getOperandLatency(DefClass, DefIdx, UseClass, OpIdx); - } - + int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx); if (Latency >= 0) dep.setLatency(Latency); } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 0ccf7b6a7c4..6def552e174 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1642,3 +1642,164 @@ ARMBaseInstrInfo::getNumMicroOps(const MachineInstr *MI, } } } + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const TargetInstrDesc &DefTID, + unsigned DefIdx, unsigned DefAlign, + const TargetInstrDesc &UseTID, + unsigned UseIdx, unsigned UseAlign) const { + unsigned DefClass = DefTID.getSchedClass(); + unsigned UseClass = UseTID.getSchedClass(); + + if (DefIdx < DefTID.getNumDefs() && UseIdx < UseTID.getNumOperands()) + return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); + + // This may be a def / use of a variable_ops instruction, the operand + // latency might be determinable dynamically. Let the target try to + // figure it out. + bool LdmBypass = false; + int DefCycle = -1; + switch (DefTID.getOpcode()) { + default: + DefCycle = ItinData->getOperandCycle(DefClass, DefIdx); + break; + case ARM::LDM_RET: + case ARM::LDM: + case ARM::LDM_UPD: + case ARM::tLDM: + case ARM::tLDM_UPD: + case ARM::tPUSH: + case ARM::t2LDM_RET: + case ARM::t2LDM: + case ARM::t2LDM_UPD: { + LdmBypass = 1; + unsigned RegNo = (DefIdx+1) - DefTID.getNumOperands() + 1; + if (Subtarget.isCortexA8()) { + // 4 registers would be issued: 1, 2, 1. + // 5 registers would be issued: 1, 2, 2. + DefCycle = RegNo / 2; + if (DefCycle < 1) + DefCycle = 1; + // Result latency is issue cycle + 2: E2. + DefCycle += 2; + } else if (Subtarget.isCortexA9()) { + DefCycle = (RegNo / 2); + // If there are odd number of registers or if it's not 64-bit aligned, + // then it takes an extra AGU (Address Generation Unit) cycle. + if ((RegNo % 2) || DefAlign < 8) + ++DefCycle; + // Result latency is AGU cycles + 2. + DefCycle += 2; + } else { + // Assume the worst. + DefCycle = RegNo + 2; + } + } + } + + if (DefCycle == -1) + // We can't seem to determine the result latency of the def, assume it's 2. + DefCycle = 2; + + int UseCycle = -1; + switch (UseTID.getOpcode()) { + default: + UseCycle = ItinData->getOperandCycle(UseClass, UseIdx); + break; + case ARM::STM: + case ARM::STM_UPD: + case ARM::tSTM_UPD: + case ARM::tPOP_RET: + case ARM::tPOP: + case ARM::t2STM: + case ARM::t2STM_UPD: { + unsigned RegNo = UseIdx - UseTID.getNumOperands() + 1; + if (Subtarget.isCortexA8()) { + // 4 registers would be issued: 1, 2, 1. + // 5 registers would be issued: 1, 2, 2. + UseCycle = RegNo / 2; + if (UseCycle < 2) + UseCycle = 2; + // Result latency is issue cycle + 2: E2. + UseCycle += 2; + } else if (Subtarget.isCortexA9()) { + UseCycle = (RegNo / 2); + // If there are odd number of registers or if it's not 64-bit aligned, + // then it takes an extra AGU (Address Generation Unit) cycle. + if ((RegNo % 2) || UseAlign < 8) + ++UseCycle; + // Result latency is AGU cycles + 2. + UseCycle += 2; + } else { + // Assume the worst. + UseCycle = RegNo + 2; + } + } + } + + if (UseCycle == -1) + // Assume it's read in the first stage. + UseCycle = 1; + + UseCycle = DefCycle - UseCycle + 1; + if (UseCycle > 0) { + if (LdmBypass) { + // It's a variable_ops instruction so we can't use DefIdx here. Just use + // first def operand. + if (ItinData->hasPipelineForwarding(DefClass, DefTID.getNumOperands()-1, + UseClass, UseIdx)) + --UseCycle; + } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx, + UseClass, UseIdx)) + --UseCycle; + } + + return UseCycle; +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || + DefMI->isRegSequence() || DefMI->isImplicitDef()) + return 1; + + const TargetInstrDesc &DefTID = DefMI->getDesc(); + if (!ItinData || ItinData->isEmpty()) + return DefTID.mayLoad() ? 3 : 1; + + const TargetInstrDesc &UseTID = UseMI->getDesc(); + unsigned DefAlign = DefMI->hasOneMemOperand() + ? (*DefMI->memoperands_begin())->getAlignment() : 0; + unsigned UseAlign = UseMI->hasOneMemOperand() + ? (*UseMI->memoperands_begin())->getAlignment() : 0; + return getOperandLatency(ItinData, DefTID, DefIdx, DefAlign, + UseTID, UseIdx, UseAlign); +} + +int +ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const { + if (!DefNode->isMachineOpcode()) + return 1; + + const TargetInstrDesc &DefTID = get(DefNode->getMachineOpcode()); + if (!ItinData || ItinData->isEmpty()) + return DefTID.mayLoad() ? 3 : 1; + + if (!UseNode->isMachineOpcode()) + return ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx); + + const TargetInstrDesc &UseTID = get(UseNode->getMachineOpcode()); + const MachineSDNode *DefMN = dyn_cast(DefNode); + unsigned DefAlign = !DefMN->memoperands_empty() + ? (*DefMN->memoperands_begin())->getAlignment() : 0; + const MachineSDNode *UseMN = dyn_cast(UseNode); + unsigned UseAlign = !UseMN->memoperands_empty() + ? (*UseMN->memoperands_begin())->getAlignment() : 0; + return getOperandLatency(ItinData, DefTID, DefIdx, DefAlign, + UseTID, UseIdx, UseAlign); +} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index c0b13b340b5..04fee641f1e 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -340,6 +340,21 @@ public: virtual unsigned getNumMicroOps(const MachineInstr *MI, const InstrItineraryData *ItinData) const; + + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; + virtual + int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const; +private: + int getOperandLatency(const InstrItineraryData *ItinData, + const TargetInstrDesc &DefTID, + unsigned DefIdx, unsigned DefAlign, + const TargetInstrDesc &UseTID, + unsigned UseIdx, unsigned UseAlign) const; }; static inline diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 6ee7e2fd3aa..755e687cb41 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -954,7 +954,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, hasExtraDefRegAllocReq = 1 in def LDM_RET : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), - IndexModeUpd, LdStMulFrm, IIC_iLoadmBr, + IndexModeUpd, LdStMulFrm, IIC_iLoad_mBr, "ldm${addr:submode}${p}\t$addr!, $dsts", "$addr.addr = $wb", []>; @@ -1463,12 +1463,12 @@ def STRHT: AI3sthpo<(outs GPR:$base_wb), let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { def LDM : AXI4ld<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), - IndexModeNone, LdStMulFrm, IIC_iLoadm, + IndexModeNone, LdStMulFrm, IIC_iLoad_m, "ldm${addr:submode}${p}\t$addr, $dsts", "", []>; def LDM_UPD : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), - IndexModeUpd, LdStMulFrm, IIC_iLoadm, + IndexModeUpd, LdStMulFrm, IIC_iLoad_mu, "ldm${addr:submode}${p}\t$addr!, $dsts", "$addr.addr = $wb", []>; } // mayLoad, neverHasSideEffects, hasExtraDefRegAllocReq @@ -1476,12 +1476,12 @@ def LDM_UPD : AXI4ld<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { def STM : AXI4st<(outs), (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), - IndexModeNone, LdStMulFrm, IIC_iStorem, + IndexModeNone, LdStMulFrm, IIC_iStore_m, "stm${addr:submode}${p}\t$addr, $srcs", "", []>; def STM_UPD : AXI4st<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), - IndexModeUpd, LdStMulFrm, IIC_iStorem, + IndexModeUpd, LdStMulFrm, IIC_iStore_mu, "stm${addr:submode}${p}\t$addr!, $srcs", "$addr.addr = $wb", []>; } // mayStore, neverHasSideEffects, hasExtraSrcRegAllocReq diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 7593dadb582..60c5c6017d1 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -280,7 +280,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, hasExtraDefRegAllocReq = 1 in def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), - IIC_iLoadmBr, + IIC_iPop_Br, "pop${p}\t$dsts", []>, T1Misc<{1,1,0,?,?,?,?}>; @@ -535,13 +535,13 @@ def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStore_i, let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { def tLDM : T1I<(outs), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), - IIC_iLoadm, + IIC_iLoad_m, "ldm${addr:submode}${p}\t$addr, $dsts", []>, T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53 def tLDM_UPD : T1It<(outs tGPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), - IIC_iLoadm, + IIC_iLoad_m, "ldm${addr:submode}${p}\t$addr!, $dsts", "$addr.addr = $wb", []>, T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53 @@ -550,18 +550,20 @@ def tLDM_UPD : T1It<(outs tGPR:$wb), let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in def tSTM_UPD : T1It<(outs tGPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), - IIC_iStorem, + IIC_iStore_mu, "stm${addr:submode}${p}\t$addr!, $srcs", "$addr.addr = $wb", []>, T1Encoding<{1,1,0,0,0,?}>; // A6.2 & A8.6.189 let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in -def tPOP : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), IIC_iLoadmBr, +def tPOP : T1I<(outs), (ins pred:$p, reglist:$dsts, variable_ops), + IIC_iPop, "pop${p}\t$dsts", []>, T1Misc<{1,1,0,?,?,?,?}>; let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in -def tPUSH : T1I<(outs), (ins pred:$p, reglist:$srcs, variable_ops), IIC_iStorem, +def tPUSH : T1I<(outs), (ins pred:$p, reglist:$srcs, variable_ops), + IIC_iStore_m, "push${p}\t$srcs", []>, T1Misc<{0,1,0,?,?,?,?}>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index ee000fd9582..87e557bf6cc 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -1243,7 +1243,7 @@ defm t2PLI : T2Ipl<1, 0, "pli">; let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { def t2LDM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), IIC_iLoadm, + reglist:$dsts, variable_ops), IIC_iLoad_m, "ldm${addr:submode}${p}${addr:wide}\t$addr, $dsts", []> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b00; @@ -1254,7 +1254,8 @@ def t2LDM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, } def t2LDM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, - reglist:$dsts, variable_ops), IIC_iLoadm, + reglist:$dsts, variable_ops), + IIC_iLoad_mu, "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts", "$addr.addr = $wb", []> { let Inst{31-27} = 0b11101; @@ -1268,7 +1269,7 @@ def t2LDM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { def t2STM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, - reglist:$srcs, variable_ops), IIC_iStorem, + reglist:$srcs, variable_ops), IIC_iStore_m, "stm${addr:submode}${p}${addr:wide}\t$addr, $srcs", []> { let Inst{31-27} = 0b11101; let Inst{26-25} = 0b00; @@ -1280,7 +1281,7 @@ def t2STM : T2XI<(outs), (ins addrmode4:$addr, pred:$p, def t2STM_UPD : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$srcs, variable_ops), - IIC_iStorem, + IIC_iStore_m, "stm${addr:submode}${p}${addr:wide}\t$addr!, $srcs", "$addr.addr = $wb", []> { let Inst{31-27} = 0b11101; @@ -2473,7 +2474,7 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1, hasExtraDefRegAllocReq = 1 in def t2LDM_RET : T2XIt<(outs GPR:$wb), (ins addrmode4:$addr, pred:$p, reglist:$dsts, variable_ops), - IIC_iLoadmBr, + IIC_iLoad_mBr, "ldm${addr:submode}${p}${addr:wide}\t$addr!, $dsts", "$addr.addr = $wb", []> { let Inst{31-27} = 0b11101; diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index ec7d29aac03..b7ce8322ba2 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -67,8 +67,11 @@ def IIC_iLoad_bh_siu : InstrItinClass; def IIC_iLoad_d_i : InstrItinClass; def IIC_iLoad_d_r : InstrItinClass; def IIC_iLoad_d_ru : InstrItinClass; -def IIC_iLoadm : InstrItinClass<0>; // micro-coded -def IIC_iLoadmBr : InstrItinClass<0>; // micro-coded +def IIC_iLoad_m : InstrItinClass<0>; // micro-coded +def IIC_iLoad_mu : InstrItinClass<0>; // micro-coded +def IIC_iLoad_mBr : InstrItinClass<0>; // micro-coded +def IIC_iPop : InstrItinClass<0>; // micro-coded +def IIC_iPop_Br : InstrItinClass<0>; // micro-coded def IIC_iLoadiALU : InstrItinClass; def IIC_iStore_i : InstrItinClass; def IIC_iStore_r : InstrItinClass; @@ -85,7 +88,8 @@ def IIC_iStore_bh_siu : InstrItinClass; def IIC_iStore_d_i : InstrItinClass; def IIC_iStore_d_r : InstrItinClass; def IIC_iStore_d_ru : InstrItinClass; -def IIC_iStorem : InstrItinClass<0>; // micro-coded +def IIC_iStore_m : InstrItinClass<0>; // micro-coded +def IIC_iStore_mu : InstrItinClass<0>; // micro-coded def IIC_Br : InstrItinClass; def IIC_fpSTAT : InstrItinClass; def IIC_fpUNA32 : InstrItinClass; diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index 915283bd259..714bf2e6353 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -172,21 +172,44 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0]>], [4, 3, 1, 1]>, // - // Load multiple - InstrItinData, + // Load multiple, def is the 5th operand. + InstrItinData, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, - InstrStage<1, [A8_LdSt0]>]>, - + InstrStage<1, [A8_LdSt0]>], [1, 1, 1, 1, 3]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData, + InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [2, 1, 1, 1, 3]>, // // Load multiple plus branch - InstrItinData, + InstrItinData, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0]>, - InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], + [1, 2, 1, 1, 3]>, + // + // Pop, def is the 3rd operand. + InstrItinData, + InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [1, 1, 3]>, + // + // Push, def is the 3th operand. + InstrItinData, + InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>], + [1, 1, 3]>, // // iLoadi + iALUr for t2LDRpci_pic. @@ -266,11 +289,18 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<1, [A8_LdSt0]>], [3, 3, 1, 1]>, // // Store multiple - InstrItinData, + InstrItinData, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0]>]>, + // + // Store multiple + update + InstrItinData, + InstrStage<2, [A8_Pipe0], 0>, + InstrStage<2, [A8_Pipe1]>, + InstrStage<1, [A8_Pipe0, A8_Pipe1]>, + InstrStage<1, [A8_LdSt0]>], [2]>, // Branch // diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index f96b50448a1..6dd27157030 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -242,18 +242,42 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_AGU]>], [5, 4, 1, 1], [A9_LdBypass]>, // - // Load multiple - InstrItinData, + // Load multiple, def is the 5th operand. + InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<2, [A9_AGU]>], - [3], [A9_LdBypass]>, - + [1, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU]>], + [2, 1, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, // // Load multiple plus branch - InstrItinData, + InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_AGU]>, - InstrStage<1, [A9_Branch]>]>, + InstrStage<1, [A9_Branch]>], + [1, 2, 1, 1, 3], + [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass]>, + // + // Pop, def is the 3rd operand. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass]>, + // + // Pop + branch, def is the 3rd operand. + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<2, [A9_AGU]>, + InstrStage<1, [A9_Branch]>], + [1, 1, 3], + [NoBypass, NoBypass, A9_LdBypass]>, // // iLoadi + iALUr for t2LDRpci_pic. @@ -329,13 +353,21 @@ def CortexA9Itineraries : ProcessorItineraries< [3, 1, 1, 1]>, // // Store multiple - InstrItinData, + InstrItinData, InstrStage<1, [A9_MUX0], 0>, InstrStage<1, [A9_AGU]>]>, + // + // Store multiple + update + InstrItinData, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_AGU]>], [2]>, + // Branch // // no delay slots, so the latency of a branch is unimportant - InstrItinData]>, + InstrItinData, + InstrStage<1, [A9_Issue1], 0>, + InstrStage<1, [A9_Branch]>]>, // VFP and NEON shares the same register file. This means that every VFP // instruction should wait for full completion of the consecutive NEON diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index b5ae9277fcf..f1dbdbc5a71 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -116,19 +116,29 @@ def ARMV6Itineraries : ProcessorItineraries< InstrItinData], [5, 2, 2, 1]>, // - // Load multiple - InstrItinData]>, - + // Load multiple, def is the 5th operand. + InstrItinData], [1, 1, 1, 1, 4]>, + // + // Load multiple + update, defs are the 1st and 5th operands. + InstrItinData], [2, 1, 1, 1, 4]>, // // Load multiple plus branch - InstrItinData, - InstrStage<1, [V6_Pipe]>]>, + InstrItinData, + InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>, // // iLoadi + iALUr for t2LDRpci_pic. InstrItinData, InstrStage<1, [V6_Pipe]>], [3, 1]>, + // + // Pop, def is the 3rd operand. + InstrItinData], [1, 1, 4]>, + // + // Pop + branch, def is the 3rd operand. + InstrItinData, + InstrStage<1, [V6_Pipe]>], [1, 2, 4]>, + // Integer store pipeline // // Immediate offset @@ -159,7 +169,10 @@ def ARMV6Itineraries : ProcessorItineraries< InstrItinData], [2, 2, 2, 1]>, // // Store multiple - InstrItinData]>, + InstrItinData]>, + // + // Store multiple + update + InstrItinData], [2]>, // Branch // diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp index e3ebae94732..7f103226daf 100644 --- a/lib/Target/TargetInstrInfo.cpp +++ b/lib/Target/TargetInstrInfo.cpp @@ -12,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetInstrItineraries.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -64,6 +65,36 @@ TargetInstrInfo::getNumMicroOps(const MachineInstr *MI, return 1; } +int +TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + if (!ItinData || ItinData->isEmpty()) + return -1; + + unsigned DefClass = DefMI->getDesc().getSchedClass(); + unsigned UseClass = UseMI->getDesc().getSchedClass(); + return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); +} + +int +TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const { + if (!ItinData || ItinData->isEmpty()) + return -1; + + if (!DefNode->isMachineOpcode()) + return -1; + + unsigned DefClass = get(DefNode->getMachineOpcode()).getSchedClass(); + if (!UseNode->isMachineOpcode()) + return ItinData->getOperandCycle(DefClass, DefIdx); + unsigned UseClass = get(UseNode->getMachineOpcode()).getSchedClass(); + return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); +} + + /// insertNoop - Insert a noop into the instruction stream at the specified /// point. void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB, diff --git a/test/CodeGen/ARM/arguments.ll b/test/CodeGen/ARM/arguments.ll index bb7853e66ef..c7fcb9755d9 100644 --- a/test/CodeGen/ARM/arguments.ll +++ b/test/CodeGen/ARM/arguments.ll @@ -13,8 +13,8 @@ define i32 @f1(i32 %a, i64 %b) { ; test that allocating the double to r2/r3 makes r1 unavailable on gnueabi. define i32 @f2() nounwind optsize { ; ELF: f2: -; ELF: mov r0, #128 -; ELF: str r0, [sp] +; ELF: mov [[REGISTER:(r[0-9]+)]], #128 +; ELF: str [[REGISTER]], [sp] ; DARWIN: f2: ; DARWIN: mov r3, #128 entry: diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll index 7413bed5c5b..06b42f4f448 100644 --- a/test/CodeGen/ARM/select.ll +++ b/test/CodeGen/ARM/select.ll @@ -79,9 +79,9 @@ define double @f7(double %a, double %b) { ; CHECK-NEON: movw [[REGISTER_1:r[0-9]+]], #1123 ; CHECK-NEON-NEXT: movs [[REGISTER_2:r[0-9]+]], #0 ; CHECK-NEON-NEXT: cmp r0, [[REGISTER_1]] -; CHECK-NEON-NEXT: adr [[REGISTER_3:r[0-9]+]], #LCPI ; CHECK-NEON-NEXT: it eq ; CHECK-NEON-NEXT: moveq [[REGISTER_2]], #4 +; CHECK-NEON-NEXT: adr [[REGISTER_3:r[0-9]+]], #LCPI ; CHECK-NEON-NEXT: ldr ; CHECK-NEON: bx -- 2.34.1