From b7e0289fb320c8440ba5eed121a8b932dbd806a2 Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Tue, 5 Jun 2012 21:11:27 +0000 Subject: [PATCH] misched: API for minimum vs. expected latency. Minimum latency determines per-cycle scheduling groups. Expected latency determines critical path and cost. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@158021 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/ScheduleDAG.h | 15 +-- include/llvm/CodeGen/ScheduleDAGInstrs.h | 10 +- include/llvm/MC/MCInstrItineraries.h | 8 +- include/llvm/Target/TargetInstrInfo.h | 40 ++++-- lib/CodeGen/MachineScheduler.cpp | 112 +++++++++++----- lib/CodeGen/ScheduleDAGInstrs.cpp | 79 +++--------- lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h | 6 - lib/CodeGen/TwoAddressInstructionPass.cpp | 2 +- lib/Target/ARM/ARMBaseInstrInfo.cpp | 19 +-- lib/Target/ARM/ARMBaseInstrInfo.h | 5 +- lib/Target/TargetInstrInfo.cpp | 121 ++++++++++++++++-- 11 files changed, 277 insertions(+), 140 deletions(-) diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index f4de6933b31..3dd3c0c4673 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -272,6 +272,9 @@ namespace llvm { unsigned Depth; // Node depth. unsigned Height; // Node height. public: + unsigned TopReadyCycle; // Cycle relative to start when node is ready. + unsigned BotReadyCycle; // Cycle relative to end when node is ready. + const TargetRegisterClass *CopyDstRC; // Is a special copy node if not null. const TargetRegisterClass *CopySrcRC; @@ -287,7 +290,7 @@ namespace llvm { isScheduleHigh(false), isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), - CopyDstRC(NULL), CopySrcRC(NULL) {} + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} /// SUnit - Construct an SUnit for post-regalloc scheduling to represent /// a MachineInstr. @@ -301,7 +304,7 @@ namespace llvm { isScheduleHigh(false), isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), - CopyDstRC(NULL), CopySrcRC(NULL) {} + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} /// SUnit - Construct a placeholder SUnit. SUnit() @@ -314,7 +317,7 @@ namespace llvm { isScheduleHigh(false), isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), - CopyDstRC(NULL), CopySrcRC(NULL) {} + TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} /// setNode - Assign the representative SDNode for this SUnit. /// This may be used during pre-regalloc scheduling. @@ -552,12 +555,6 @@ namespace llvm { /// virtual void computeLatency(SUnit *SU) = 0; - /// ComputeOperandLatency - Override dependence edge latency using - /// operand use/def information - /// - virtual void computeOperandLatency(SUnit *, SUnit *, - SDep&) const { } - /// ForceUnitLatencies - Return true if all scheduling edges should be given /// a latency value of one. The default is to return false; schedulers may /// override this as needed. diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 968cc56d5cf..874f9f1b80e 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -291,11 +291,15 @@ namespace llvm { /// virtual void computeLatency(SUnit *SU); - /// computeOperandLatency - Override dependence edge latency using + /// computeOperandLatency - Return dependence edge latency using /// operand use/def information /// - virtual void computeOperandLatency(SUnit *Def, SUnit *Use, - SDep& dep) const; + /// FindMin may be set to get the minimum vs. expected latency. Minimum + /// latency is used for scheduling groups, while expected latency is for + /// instruction cost and critical path. + virtual unsigned computeOperandLatency(SUnit *Def, SUnit *Use, + const SDep& dep, + bool FindMin = false) const; /// schedule - Order nodes according to selected style, filling /// in the Sequence member. diff --git a/include/llvm/MC/MCInstrItineraries.h b/include/llvm/MC/MCInstrItineraries.h index 199489599e4..62e19143482 100644 --- a/include/llvm/MC/MCInstrItineraries.h +++ b/include/llvm/MC/MCInstrItineraries.h @@ -214,6 +214,12 @@ public: /// class. The latency is the maximum completion time for any stage /// in the itinerary. /// + /// InstrStages override the itinerary's MinLatency property. In fact, if the + /// stage latencies, which may be zero, are less than MinLatency, + /// getStageLatency returns a value less than MinLatency. + /// + /// If no stages exist, MinLatency is used. If MinLatency is invalid (<0), + /// then it defaults to one cycle. unsigned getStageLatency(unsigned ItinClassIndx) const { // If the target doesn't provide itinerary information, use a simple // non-zero default value for all instructions. Some target's provide a @@ -222,7 +228,7 @@ public: // stage). This is different from beginStage == endStage != 0, which could // be used for zero-latency pseudo ops. if (isEmpty() || Itineraries[ItinClassIndx].FirstStage == 0) - return 1; + return (Props.MinLatency < 0) ? 1 : Props.MinLatency; // Calculate the maximum completion time for any stage. unsigned Latency = 0, StartCycle = 0; diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index 896c152f938..693166f5522 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -668,18 +668,36 @@ public: return Opcode <= TargetOpcode::COPY; } + virtual int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const = 0; + /// getOperandLatency - Compute and return the use operand latency of a given /// pair of def and use. /// In most cases, the static scheduling itinerary was enough to determine the /// operand latency. But it may not be possible for instructions with variable /// number of defs / uses. + /// + /// This is a raw interface to the itinerary that may be directly overriden by + /// a target. Use computeOperandLatency to get the best estimate of latency. virtual int getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const; - - virtual int getOperandLatency(const InstrItineraryData *ItinData, - SDNode *DefNode, unsigned DefIdx, - SDNode *UseNode, unsigned UseIdx) const = 0; + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const; + + /// computeOperandLatency - Compute and return the latency of the given data + /// dependent def and use. DefMI must be a valid def. UseMI may be NULL for + /// an unknown use. If the subtarget allows, this may or may not need to call + /// getOperandLatency(). + /// + /// FindMin may be set to get the minimum vs. expected latency. Minimum + /// latency is used for scheduling groups, while expected latency is for + /// instruction cost and critical path. + unsigned computeOperandLatency(const InstrItineraryData *ItinData, + const TargetRegisterInfo *TRI, + const MachineInstr *DefMI, + const MachineInstr *UseMI, + unsigned Reg, bool FindMin) const; /// getOutputLatency - Compute and return the output dependency latency of a /// a given pair of defs which both target the same register. This is usually @@ -693,13 +711,17 @@ public: /// getInstrLatency - Compute the instruction latency of a given instruction. /// If the instruction has higher cost when predicated, it's returned via /// PredCost. - virtual int getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost = 0) const; + virtual unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = 0) const; virtual int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const = 0; + /// Return the default expected latency for a def based on it's opcode. + unsigned defaultDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI) const; + /// isHighLatencyDef - Return true if this opcode has high latency to its /// result. virtual bool isHighLatencyDef(int opc) const { return false; } diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index c318d846171..aa591779ba6 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -21,8 +21,9 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -394,6 +395,12 @@ public: return RegionCriticalPSets; } + /// getIssueWidth - Return the max instructions per scheduling group. + /// + unsigned getIssueWidth() const { + return InstrItins ? InstrItins->Props.IssueWidth : 1; + } + protected: void initRegPressure(); void updateScheduledPressure(std::vector NewMaxPressure); @@ -787,13 +794,16 @@ class ConvergingScheduler : public MachineSchedStrategy { /// MinReadyCycle - Cycle of the soonest available instruction. unsigned MinReadyCycle; + // Remember the greatest min operand latency. + unsigned MaxMinLatency; + /// Pending queues extend the ready queues with the same ID and the /// PendingFlag set. SchedBoundary(unsigned ID, const Twine &Name): Available(ID, Name+".A"), Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"), CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0), - MinReadyCycle(UINT_MAX) {} + MinReadyCycle(UINT_MAX), MaxMinLatency(0) {} ~SchedBoundary() { delete HazardRec; } @@ -805,6 +815,8 @@ class ConvergingScheduler : public MachineSchedStrategy { void bumpCycle(); + void bumpNode(SUnit *SU, unsigned IssueWidth); + void releasePending(); void removeReady(SUnit *SU); @@ -868,25 +880,53 @@ void ConvergingScheduler::initialize(ScheduleDAGMI *dag) { } void ConvergingScheduler::releaseTopNode(SUnit *SU) { - Top.releaseNode(SU, SU->getDepth()); + if (SU->isScheduled) + return; + + for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; + unsigned Latency = + DAG->computeOperandLatency(I->getSUnit(), SU, *I, /*FindMin=*/true); +#ifndef NDEBUG + Top.MaxMinLatency = std::max(Latency, Top.MaxMinLatency); +#endif + if (SU->TopReadyCycle < PredReadyCycle + Latency) + SU->TopReadyCycle = PredReadyCycle + Latency; + } + Top.releaseNode(SU, SU->TopReadyCycle); } void ConvergingScheduler::releaseBottomNode(SUnit *SU) { - Bot.releaseNode(SU, SU->getHeight()); + if (SU->isScheduled) + return; + + assert(SU->getInstr() && "Scheduled SUnit must have instr"); + + for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); + I != E; ++I) { + unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; + unsigned Latency = + DAG->computeOperandLatency(SU, I->getSUnit(), *I, /*FindMin=*/true); +#ifndef NDEBUG + Bot.MaxMinLatency = std::max(Latency, Bot.MaxMinLatency); +#endif + if (SU->BotReadyCycle < SuccReadyCycle + Latency) + SU->BotReadyCycle = SuccReadyCycle + Latency; + } + Bot.releaseNode(SU, SU->BotReadyCycle); } void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) { - if (SU->isScheduled) - return; - if (ReadyCycle < MinReadyCycle) MinReadyCycle = ReadyCycle; // Check for interlocks first. For the purpose of other heuristics, an // instruction that cannot issue appears as if it's not in the ReadyQueue. - if (HazardRec->isEnabled() - && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) + if (ReadyCycle > CurrCycle + || (HazardRec->isEnabled() && (HazardRec->getHazardType(SU) + != ScheduleHazardRecognizer::NoHazard))) Pending.push(SU); else Available.push(SU); @@ -900,10 +940,11 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() { unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); if (!HazardRec->isEnabled()) { - // Bypass lots of virtual calls in case of long latency. + // Bypass HazardRec virtual calls. CurrCycle = NextCycle; } else { + // Bypass getHazardType calls in case of long latency. for (; CurrCycle != NextCycle; ++CurrCycle) { if (isTop()) HazardRec->AdvanceCycle(); @@ -917,6 +958,26 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() { << CurrCycle << '\n'); } +/// Move the boundary of scheduled code by one SUnit. +void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU, + unsigned IssueWidth) { + // Update the reservation table. + if (HazardRec->isEnabled()) { + if (!isTop() && SU->isCall) { + // Calls are scheduled with their preceding instructions. For bottom-up + // scheduling, clear the pipeline state before emitting. + HazardRec->Reset(); + } + HazardRec->EmitInstruction(SU); + } + // Check the instruction group size limit. + ++IssueCount; + if (IssueCount == IssueWidth) { + DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); + bumpCycle(); + } +} + /// Release pending ready nodes in to the available queue. This makes them /// visible to heuristics. void ConvergingScheduler::SchedBoundary::releasePending() { @@ -928,7 +989,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() { // so, add them to the available queue. for (unsigned i = 0, e = Pending.size(); i != e; ++i) { SUnit *SU = *(Pending.begin()+i); - unsigned ReadyCycle = isTop() ? SU->getHeight() : SU->getDepth(); + unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; if (ReadyCycle < MinReadyCycle) MinReadyCycle = ReadyCycle; @@ -965,7 +1026,8 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() { releasePending(); for (unsigned i = 0; Available.empty(); ++i) { - assert(i <= HazardRec->getMaxLookAhead() && "permanent hazard"); (void)i; + assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && + "permanent hazard"); (void)i; bumpCycle(); releasePending(); } @@ -1205,27 +1267,15 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { /// Update the scheduler's state after scheduling a node. This is the same node /// that was just returned by pickNode(). However, ScheduleDAGMI needs to update -/// it's state based on the current cycle before MachineSchedStrategy. +/// it's state based on the current cycle before MachineSchedStrategy does. void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) { - // Update the reservation table. - if (IsTopNode && Top.HazardRec->isEnabled()) { - Top.HazardRec->EmitInstruction(SU); - if (Top.HazardRec->atIssueLimit()) { - DEBUG(dbgs() << "*** Max instrs at cycle " << Top.CurrCycle << '\n'); - Top.bumpCycle(); - } + if (IsTopNode) { + SU->TopReadyCycle = Top.CurrCycle; + Top.bumpNode(SU, DAG->getIssueWidth()); } - else if (Bot.HazardRec->isEnabled()) { - if (SU->isCall) { - // Calls are scheduled with their preceding instructions. For bottom-up - // scheduling, clear the pipeline state before emitting. - Bot.HazardRec->Reset(); - } - Bot.HazardRec->EmitInstruction(SU); - if (Bot.HazardRec->atIssueLimit()) { - DEBUG(dbgs() << "*** Max instrs at cycle " << Bot.CurrCycle << '\n'); - Bot.bumpCycle(); - } + else { + SU->BotReadyCycle = Bot.CurrCycle; + Bot.bumpNode(SU, DAG->getIssueWidth()); } } diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 773a29d0c23..875e012ed12 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -271,10 +271,12 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, // Adjust the dependence latency using operand def/use // information (if any), and then allow the target to // perform its own adjustments. - const SDep& dep = SDep(SU, SDep::Data, LDataLatency, *Alias); + SDep dep(SU, SDep::Data, LDataLatency, *Alias); if (!UnitLatencies) { - computeOperandLatency(SU, UseSU, const_cast(dep)); - ST.adjustSchedDependency(SU, UseSU, const_cast(dep)); + unsigned Latency = computeOperandLatency(SU, UseSU, dep); + dep.setLatency(Latency); + + ST.adjustSchedDependency(SU, UseSU, dep); } UseSU->addPred(dep); } @@ -461,11 +463,13 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { // Create a data dependence. // // TODO: Handle "special" address latencies cleanly. - const SDep &dep = SDep(DefSU, SDep::Data, DefSU->Latency, Reg); + SDep dep(DefSU, SDep::Data, DefSU->Latency, Reg); if (!UnitLatencies) { // Adjust the dependence latency using operand def/use information, then // allow the target to perform its own adjustments. - computeOperandLatency(DefSU, SU, const_cast(dep)); + unsigned Latency = computeOperandLatency(DefSU, SU, const_cast(dep)); + dep.setLatency(Latency); + const TargetSubtargetInfo &ST = TM.getSubtarget(); ST.adjustSchedDependency(DefSU, SU, const_cast(dep)); } @@ -970,8 +974,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, } void ScheduleDAGInstrs::computeLatency(SUnit *SU) { - // Compute the latency for the node. - if (!InstrItins || InstrItins->isEmpty()) { + // Compute the latency for the node. We only provide a default for missing + // itineraries. Empty itineraries still have latency properties. + if (!InstrItins) { SU->Latency = 1; // Simplistic target-independent heuristic: assume that loads take @@ -983,63 +988,15 @@ void ScheduleDAGInstrs::computeLatency(SUnit *SU) { } } -void ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use, - SDep& dep) const { - if (!InstrItins || InstrItins->isEmpty()) - return; - +unsigned ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use, + const SDep& dep, + bool FindMin) const { // For a data dependency with a known register... if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0)) - return; + return 1; - const unsigned Reg = dep.getReg(); - - // ... find the definition of the register in the defining - // instruction - MachineInstr *DefMI = Def->getInstr(); - int DefIdx = DefMI->findRegisterDefOperandIdx(Reg); - if (DefIdx != -1) { - const MachineOperand &MO = DefMI->getOperand(DefIdx); - if (MO.isReg() && MO.isImplicit() && - DefIdx >= (int)DefMI->getDesc().getNumOperands()) { - // This is an implicit def, getOperandLatency() won't return the correct - // latency. e.g. - // %D6, %D7 = VLD1q16 %R2, 0, ..., %Q3 - // %Q1 = VMULv8i16 %Q1, %Q3, ... - // What we want is to compute latency between def of %D6/%D7 and use of - // %Q3 instead. - unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI); - if (DefMI->getOperand(Op2).isReg()) - DefIdx = Op2; - } - MachineInstr *UseMI = Use->getInstr(); - // For all uses of the register, calculate the maxmimum latency - int Latency = -1; - if (UseMI) { - for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = UseMI->getOperand(i); - if (!MO.isReg() || !MO.isUse()) - continue; - unsigned MOReg = MO.getReg(); - if (MOReg != Reg) - continue; - - int UseCycle = TII->getOperandLatency(InstrItins, DefMI, DefIdx, - UseMI, i); - Latency = std::max(Latency, UseCycle); - } - } else { - // UseMI is null, then it must be a scheduling barrier. - if (!InstrItins || InstrItins->isEmpty()) - return; - unsigned DefClass = DefMI->getDesc().getSchedClass(); - Latency = InstrItins->getOperandCycle(DefClass, DefIdx); - } - - // If we found a latency, then replace the existing dependence latency. - if (Latency >= 0) - dep.setLatency(Latency); - } + return TII->computeOperandLatency(InstrItins, TRI, Def->getInstr(), + Use->getInstr(), dep.getReg(), FindMin); } void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const { diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 75940ec33dd..5384576a0c9 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -98,12 +98,6 @@ namespace llvm { /// virtual void computeLatency(SUnit *SU); - /// computeOperandLatency - Override dependence edge latency using - /// operand use/def information - /// - virtual void computeOperandLatency(SUnit *Def, SUnit *Use, - SDep& dep) const { } - virtual void computeOperandLatency(SDNode *Def, SDNode *Use, unsigned OpIdx, SDep& dep) const; diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 5218aa1f7a8..ec2b5772308 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1046,7 +1046,7 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, return true; // Below MI unsigned DefDist = DDI->second; assert(Dist > DefDist && "Visited def already?"); - if (TII->getInstrLatency(InstrItins, DefMI) > (int)(Dist - DefDist)) + if (TII->getInstrLatency(InstrItins, DefMI) > (Dist - DefDist)) return true; } return false; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 398e0c9ec34..bcc4f9c9d20 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2567,12 +2567,13 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const { if (DefMI->isCopyLike() || DefMI->isInsertSubreg() || - DefMI->isRegSequence() || DefMI->isImplicitDef()) + DefMI->isRegSequence() || DefMI->isImplicitDef()) { return 1; - + } if (!ItinData || ItinData->isEmpty()) return DefMI->mayLoad() ? 3 : 1; @@ -2983,14 +2984,16 @@ ARMBaseInstrInfo::getOutputLatency(const InstrItineraryData *ItinData, DepMI->getNumOperands()); } -int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost) const { +unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() || MI->isImplicitDef()) return 1; - if (!ItinData || ItinData->isEmpty()) + // Be sure to call getStageLatency for an empty itinerary in case it has a + // valid MinLatency property. + if (!ItinData) return 1; if (MI->isBundle()) { diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 2fe85072a33..8217f239d19 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -249,8 +249,9 @@ private: const MCInstrDesc &UseMCID, unsigned UseIdx, unsigned UseAlign) const; - int getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, unsigned *PredCost = 0) const; + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = 0) const; int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const; diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp index 6088ba5cc35..ae38732065c 100644 --- a/lib/Target/TargetInstrInfo.cpp +++ b/lib/Target/TargetInstrInfo.cpp @@ -61,22 +61,125 @@ TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, return 1; } +/// Return the default expected latency for a def based on it's opcode. +unsigned TargetInstrInfo::defaultDefLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI) const { + if (DefMI->mayLoad()) + return ItinData->Props.LoadLatency; + if (isHighLatencyDef(DefMI->getOpcode())) + return ItinData->Props.HighLatency; + return 1; +} + +/// Both DefMI and UseMI must be valid. By default, call directly to the +/// itinerary. This may be overriden by the target. int TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - if (!ItinData || ItinData->isEmpty()) - return -1; - + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const { unsigned DefClass = DefMI->getDesc().getSchedClass(); unsigned UseClass = UseMI->getDesc().getSchedClass(); return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx); } -int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost) const { - if (!ItinData || ItinData->isEmpty()) +/// computeOperandLatency - Compute and return the latency of the given data +/// dependent def and use. DefMI must be a valid def. UseMI may be NULL for an +/// unknown use. Depending on the subtarget's itinerary properties, this may or +/// may not need to call getOperandLatency(). +/// +/// FindMin may be set to get the minimum vs. expected latency. Minimum +/// latency is used for scheduling groups, while expected latency is for +/// instruction cost and critical path. +/// +/// For most subtargets, we don't need DefIdx or UseIdx to compute min latency. +/// DefMI must be a valid definition, but UseMI may be NULL for an unknown use. +unsigned TargetInstrInfo:: +computeOperandLatency(const InstrItineraryData *ItinData, + const TargetRegisterInfo *TRI, + const MachineInstr *DefMI, const MachineInstr *UseMI, + unsigned Reg, bool FindMin) const { + + // Default to one cycle for missing itinerary. Empty itineraries still have + // a properties. We have one hard-coded exception for loads, to preserve + // existing behavior. + if (!ItinData) + return DefMI->mayLoad() ? 2 : 1; + + // Return a latency based on the itinerary properties and defining instruction + // if possible. Some common subtargets don't require per-operand latency, + // especially for minimum latencies. + if (FindMin) { + // If MinLatency is valid, call getInstrLatency. This uses Stage latency if + // it exists before defaulting to MinLatency. + if (ItinData->Props.MinLatency >= 0) + return getInstrLatency(ItinData, DefMI); + + // If MinLatency is invalid, OperandLatency is interpreted as MinLatency. + // For empty itineraries, short-cirtuit the check and default to one cycle. + if (ItinData->isEmpty()) + return 1; + } + else if(ItinData->isEmpty()) + return defaultDefLatency(ItinData, DefMI); + + // ...operand lookup required + + // Find the definition of the register in the defining instruction. + int DefIdx = DefMI->findRegisterDefOperandIdx(Reg); + if (DefIdx != -1) { + const MachineOperand &MO = DefMI->getOperand(DefIdx); + if (MO.isReg() && MO.isImplicit() && + DefIdx >= (int)DefMI->getDesc().getNumOperands()) { + // This is an implicit def, getOperandLatency() won't return the correct + // latency. e.g. + // %D6, %D7 = VLD1q16 %R2, 0, ..., %Q3 + // %Q1 = VMULv8i16 %Q1, %Q3, ... + // What we want is to compute latency between def of %D6/%D7 and use of + // %Q3 instead. + unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI); + if (DefMI->getOperand(Op2).isReg()) + DefIdx = Op2; + } + // For all uses of the register, calculate the maxmimum latency + int OperLatency = -1; + + // UseMI is null, then it must be a scheduling barrier. + if (!UseMI) { + unsigned DefClass = DefMI->getDesc().getSchedClass(); + OperLatency = ItinData->getOperandCycle(DefClass, DefIdx); + } + else { + for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = UseMI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned MOReg = MO.getReg(); + if (MOReg != Reg) + continue; + + int UseCycle = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, i); + OperLatency = std::max(OperLatency, UseCycle); + } + } + // If we found an operand latency, we're done. + if (OperLatency >= 0) + return OperLatency; + } + // No operand latency was found. + unsigned InstrLatency = getInstrLatency(ItinData, DefMI); + // Expected latency is the max of the stage latency and itinerary props. + if (!FindMin) + InstrLatency = std::max(InstrLatency, defaultDefLatency(ItinData, DefMI)); + return InstrLatency; +} + +unsigned TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + // Default to one cycle for no itinerary. However, an "empty" itinerary may + // still have a MinLatency property, which getStageLatency checks. + if (!ItinData) return 1; return ItinData->getStageLatency(MI->getDesc().getSchedClass()); -- 2.34.1