From b86a0cdb674549d8493043331cecd9cbf53b80da Mon Sep 17 00:00:00 2001 From: Andrew Trick Date: Sat, 15 Jun 2013 04:49:57 +0000 Subject: [PATCH] Machine Model: Add MicroOpBufferSize and resource BufferSize. Replace the ill-defined MinLatency and ILPWindow properties with with straightforward buffer sizes: MCSchedMode::MicroOpBufferSize MCProcResourceDesc::BufferSize These can be used to more precisely model instruction execution if desired. Disabled some misched tests temporarily. They'll be reenabled in a few commits. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@184032 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/ScheduleDAG.h | 23 +------ include/llvm/CodeGen/ScheduleDAGInstrs.h | 2 +- include/llvm/CodeGen/TargetSchedule.h | 28 ++++---- include/llvm/MC/MCInstrItineraries.h | 10 +-- include/llvm/MC/MCSchedule.h | 66 ++++++++----------- include/llvm/Target/TargetInstrInfo.h | 8 +-- include/llvm/Target/TargetSchedule.td | 8 ++- lib/CodeGen/MachineScheduler.cpp | 29 ++++---- lib/CodeGen/MachineTraceMetrics.cpp | 10 ++- lib/CodeGen/ScheduleDAGInstrs.cpp | 24 ++----- lib/CodeGen/TargetInstrInfo.cpp | 28 ++------ lib/CodeGen/TargetSchedule.cpp | 58 ++++------------ lib/Target/ARM/ARMBaseInstrInfo.cpp | 3 +- lib/Target/ARM/ARMScheduleA9.td | 5 +- .../Hexagon/HexagonMachineScheduler.cpp | 4 +- lib/Target/X86/X86SchedHaswell.td | 1 - lib/Target/X86/X86SchedSandyBridge.td | 1 - lib/Target/X86/X86Schedule.td | 5 -- lib/Target/X86/X86ScheduleAtom.td | 1 - test/CodeGen/X86/misched-balance.ll | 4 +- test/CodeGen/X86/misched-matmul.ll | 3 +- test/CodeGen/X86/misched-matrix.ll | 19 +++--- utils/TableGen/SubtargetEmitter.cpp | 17 ++--- 23 files changed, 123 insertions(+), 234 deletions(-) diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 7cff27e1724..6c62b52fa58 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -90,11 +90,6 @@ namespace llvm { /// the value of the Latency field of the predecessor, however advanced /// models may provide additional information about specific edges. unsigned Latency; - /// Record MinLatency seperately from "expected" Latency. - /// - /// FIXME: this field is not packed on LP64. Convert to 16-bit DAG edge - /// latency after introducing saturating truncation. - unsigned MinLatency; public: /// SDep - Construct a null SDep. This is only for use by container @@ -120,10 +115,9 @@ namespace llvm { Latency = 1; break; } - MinLatency = Latency; } SDep(SUnit *S, OrderKind kind) - : Dep(S, Order), Contents(), Latency(0), MinLatency(0) { + : Dep(S, Order), Contents(), Latency(0) { Contents.OrdKind = kind; } @@ -142,8 +136,7 @@ namespace llvm { } bool operator==(const SDep &Other) const { - return overlaps(Other) - && Latency == Other.Latency && MinLatency == Other.MinLatency; + return overlaps(Other) && Latency == Other.Latency; } bool operator!=(const SDep &Other) const { @@ -163,18 +156,6 @@ namespace llvm { Latency = Lat; } - /// getMinLatency - Return the minimum latency for this edge. Minimum - /// latency is used for scheduling groups, while normal (expected) latency - /// is for instruction cost and critical path. - unsigned getMinLatency() const { - return MinLatency; - } - - /// setMinLatency - Set the minimum latency for this edge. - void setMinLatency(unsigned Lat) { - MinLatency = Lat; - } - //// getSUnit - Return the SUnit to which this edge points. SUnit *getSUnit() const { return Dep.getPointer(); diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 990cac6348b..9ab1013bf16 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -158,7 +158,7 @@ namespace llvm { /// \brief Resolve and cache a resolved scheduling class for an SUnit. const MCSchedClassDesc *getSchedClass(SUnit *SU) const { - if (!SU->SchedClass) + if (!SU->SchedClass && SchedModel.hasInstrSchedModel()) SU->SchedClass = SchedModel.resolveSchedClass(SU->getInstr()); return SU->SchedClass; } diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h index 3e22252eeac..f2adcf8875c 100644 --- a/include/llvm/CodeGen/TargetSchedule.h +++ b/include/llvm/CodeGen/TargetSchedule.h @@ -84,9 +84,6 @@ public: /// \brief Maximum number of micro-ops that may be scheduled per cycle. unsigned getIssueWidth() const { return SchedModel.IssueWidth; } - /// \brief Number of cycles the OOO processor is expected to hide. - unsigned getILPWindow() const { return SchedModel.ILPWindow; } - /// \brief Return the number of issue slots required for this MI. unsigned getNumMicroOps(const MachineInstr *MI, const MCSchedClassDesc *SC = 0) const; @@ -131,18 +128,23 @@ public: return ResourceLCM; } + /// \brief Number of micro-ops that may be buffered for OOO execution. + unsigned getMicroOpBufferSize() const { return SchedModel.MicroOpBufferSize; } + + /// \brief Number of resource units that may be buffered for OOO execution. + /// \return The buffer size in resource units or -1 for unlimited. + int getResourceBufferSize(unsigned PIdx) const { + return SchedModel.getProcResource(PIdx)->BufferSize; + } + /// \brief Compute operand latency based on the available machine model. /// - /// Computes and return the latency of the given data dependent def and use + /// Compute and return the latency of the given data dependent def and use /// when the operand indices are already known. UseMI may be NULL for an /// unknown user. - /// - /// FindMin may be set to get the minimum vs. expected latency. Minimum - /// latency is used for scheduling groups, while expected latency is for - /// instruction cost and critical path. unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, - const MachineInstr *UseMI, unsigned UseOperIdx, - bool FindMin) const; + const MachineInstr *UseMI, unsigned UseOperIdx) + const; /// \brief Compute the instruction latency based on the available machine /// model. @@ -157,12 +159,6 @@ public: /// This is typically one cycle. unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *DepMI) const; - -private: - /// getDefLatency is a helper for computeOperandLatency. Return the - /// instruction's latency if operand lookup is not required. - /// Otherwise return -1. - int getDefLatency(const MachineInstr *DefMI, bool FindMin) const; }; } // namespace llvm diff --git a/include/llvm/MC/MCInstrItineraries.h b/include/llvm/MC/MCInstrItineraries.h index 65d1559ac66..c4f9e1c32ab 100644 --- a/include/llvm/MC/MCInstrItineraries.h +++ b/include/llvm/MC/MCInstrItineraries.h @@ -157,17 +157,12 @@ public: /// class. The latency is the maximum completion time for any stage /// in the itinerary. /// - /// InstrStages override the itinerary's MinLatency property. In fact, if the - /// stage latencies, which may be zero, are less than MinLatency, - /// getStageLatency returns a value less than MinLatency. - /// - /// If no stages exist, MinLatency is used. If MinLatency is invalid (<0), - /// then it defaults to one cycle. + /// If no stages exist, it defaults to one cycle. unsigned getStageLatency(unsigned ItinClassIndx) const { // If the target doesn't provide itinerary information, use a simple // non-zero default value for all instructions. if (isEmpty()) - return SchedModel->MinLatency < 0 ? 1 : SchedModel->MinLatency; + return 1; // Calculate the maximum completion time for any stage. unsigned Latency = 0, StartCycle = 0; @@ -176,7 +171,6 @@ public: Latency = std::max(Latency, StartCycle + IS->getCycles()); StartCycle += IS->getNextCycles(); } - return Latency; } diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h index defa2990354..673cdf6fb68 100644 --- a/include/llvm/MC/MCSchedule.h +++ b/include/llvm/MC/MCSchedule.h @@ -30,15 +30,18 @@ struct MCProcResourceDesc { unsigned NumUnits; // Number of resource of this kind unsigned SuperIdx; // Index of the resources kind that contains this kind. - // Buffered resources may be consumed at some indeterminate cycle after - // dispatch (e.g. for instructions that may issue out-of-order). Unbuffered - // resources always consume their resource some fixed number of cycles after - // dispatch (e.g. for instruction interlocking that may stall the pipeline). - bool IsBuffered; + // Number of resources that may be buffered. + // + // Buffered resources (BufferSize > 0 || BufferSize == -1) may be consumed at + // some indeterminate cycle after dispatch (e.g. for instructions that may + // issue out-of-order). Unbuffered resources (BufferSize == 0) always consume + // their resource some fixed number of cycles after dispatch (e.g. for + // instruction interlocking that may stall the pipeline). + int BufferSize; bool operator==(const MCProcResourceDesc &Other) const { return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx - && IsBuffered == Other.IsBuffered; + && BufferSize == Other.BufferSize; } }; @@ -134,28 +137,22 @@ public: unsigned IssueWidth; static const unsigned DefaultIssueWidth = 1; - // MinLatency is the minimum latency between a register write - // followed by a data dependent read. This determines which - // instructions may be scheduled in the same per-cycle group. This - // is distinct from *expected* latency, which determines the likely - // critical path but does not guarantee a pipeline - // hazard. MinLatency can always be overridden by the number of - // InstrStage cycles. + // MicroOpBufferSize is the number of micro-ops that the processor may buffer + // for out-of-order execution. // - // (-1) Standard in-order processor. - // Use InstrItinerary OperandCycles as MinLatency. - // If no OperandCycles exist, then use the cycle of the last InstrStage. + // "0" means operations that are not ready in this cycle are not considered + // for scheduling (they go in the pending queue). Latency is paramount. This + // may be more efficient if many instructions are pending in a schedule. // - // (0) Out-of-order processor, or in-order with bundled dependencies. - // RAW dependencies may be dispatched in the same cycle. - // Optional InstrItinerary OperandCycles provides expected latency. + // "1" means all instructions are considered for scheduling regardless of + // whether they are ready in this cycle. Latency still causes issue stalls, + // but we balance those stalls against other heuristics. // - // (>0) In-order processor with variable latencies. - // Use the greater of this value or the cycle of the last InstrStage. - // Optional InstrItinerary OperandCycles provides expected latency. - // TODO: can't yet specify both min and expected latency per operand. - int MinLatency; - static const int DefaultMinLatency = -1; + // "> 1" means the processor is out-of-order. This is a machine independent + // estimate of highly machine specific characteristics such are the register + // renaming pool and reorder buffer. + unsigned MicroOpBufferSize; + static const unsigned DefaultMicroOpBufferSize = 0; // LoadLatency is the expected latency of load instructions. // @@ -172,16 +169,6 @@ public: unsigned HighLatency; static const unsigned DefaultHighLatency = 10; - // ILPWindow is the number of cycles that the scheduler effectively ignores - // before attempting to hide latency. This should be zero for in-order cpus to - // always hide expected latency. For out-of-order cpus, it may be tweaked as - // desired to roughly approximate instruction buffers. The actual threshold is - // not very important for an OOO processor, as long as it isn't too high. A - // nonzero value helps avoid rescheduling to hide latency when its is fairly - // obviously useless and makes register pressure heuristics more effective. - unsigned ILPWindow; - static const unsigned DefaultILPWindow = 0; - // MispredictPenalty is the typical number of extra cycles the processor // takes to recover from a branch misprediction. unsigned MispredictPenalty; @@ -203,10 +190,9 @@ public: // initialized in this default ctor because some clients directly instantiate // MCSchedModel instead of using a generated itinerary. MCSchedModel(): IssueWidth(DefaultIssueWidth), - MinLatency(DefaultMinLatency), + MicroOpBufferSize(DefaultMicroOpBufferSize), LoadLatency(DefaultLoadLatency), HighLatency(DefaultHighLatency), - ILPWindow(DefaultILPWindow), MispredictPenalty(DefaultMispredictPenalty), ProcID(0), ProcResourceTable(0), SchedClassTable(0), NumProcResourceKinds(0), NumSchedClasses(0), @@ -216,12 +202,12 @@ public: } // Table-gen driven ctor. - MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned ilp, + MCSchedModel(unsigned iw, int mbs, unsigned ll, unsigned hl, unsigned mp, unsigned pi, const MCProcResourceDesc *pr, const MCSchedClassDesc *sc, unsigned npr, unsigned nsc, const InstrItinerary *ii): - IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl), - ILPWindow(ilp), MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr), + IssueWidth(iw), MicroOpBufferSize(mbs), LoadLatency(ll), HighLatency(hl), + MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc), InstrItineraries(ii) {} diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index d49ce1ce7f4..69fda8c47c2 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -817,12 +817,10 @@ public: /// computeOperandLatency - Compute and return the latency of the given data /// dependent def and use when the operand indices are already known. - /// - /// FindMin may be set to get the minimum vs. expected latency. unsigned computeOperandLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx, - bool FindMin = false) const; + const MachineInstr *UseMI, unsigned UseIdx) + const; /// getInstrLatency - Compute the instruction latency of a given instruction. /// If the instruction has higher cost when predicated, it's returned via @@ -839,7 +837,7 @@ public: const MachineInstr *DefMI) const; int computeDefOperandLatency(const InstrItineraryData *ItinData, - const MachineInstr *DefMI, bool FindMin) const; + const MachineInstr *DefMI) const; /// isHighLatencyDef - Return true if this opcode has high latency to its /// result. diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index 660d2c48b6c..0ac2eed9d56 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -72,11 +72,13 @@ def instregex; // // Target hooks allow subtargets to associate LoadLatency and // HighLatency with groups of opcodes. +// +// See MCSchedule.h for detailed comments. class SchedMachineModel { int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle. int MinLatency = -1; // Determines which instrucions are allowed in a group. // (-1) inorder (0) ooo, (1): inorder +var latencies. - int ILPWindow = -1; // Cycles of latency likely hidden by hardware buffers. + int MicroOpBufferSize = -1; // Max micro-ops that can be buffered. int LoadLatency = -1; // Cycles for loads to access the cache. int HighLatency = -1; // Approximation of cycles for "high latency" ops. int MispredictPenalty = -1; // Extra cycles for a mispredicted branch. @@ -106,7 +108,7 @@ class ProcResourceKind; // out-of-order engine that the compiler attempts to conserve. // Buffered resources may be held for multiple clock cycles, but the // scheduler does not pin them to a particular clock cycle relative to -// instruction dispatch. Setting Buffered=0 changes this to an +// instruction dispatch. Setting BufferSize=0 changes this to an // in-order resource. In this case, the scheduler counts down from the // cycle that the instruction issues in-order, forcing an interlock // with subsequent instructions that require the same resource until @@ -119,7 +121,7 @@ class ProcResourceUnits { ProcResourceKind Kind = kind; int NumUnits = num; ProcResourceKind Super = ?; - bit Buffered = 1; + int BufferSize = -1; SchedMachineModel SchedModel = ?; } diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 3840473a9aa..c87a1be7205 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -1257,8 +1257,9 @@ public: unsigned ExpectedCount; #ifndef NDEBUG - // Remember the greatest min operand latency. - unsigned MaxMinLatency; + // Remember the greatest operand latency as an upper bound on the number of + // times we should retry the pending queue because of a hazard. + unsigned MaxObservedLatency; #endif void reset() { @@ -1281,7 +1282,7 @@ public: IsResourceLimited = false; ExpectedCount = 0; #ifndef NDEBUG - MaxMinLatency = 0; + MaxObservedLatency = 0; #endif // Reserve a zero-count for invalid CritResIdx. ResourceCounts.resize(1); @@ -1466,13 +1467,15 @@ void ConvergingScheduler::releaseTopNode(SUnit *SU) { for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { + if (I->isWeak()) + continue; unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; - unsigned MinLatency = I->getMinLatency(); + unsigned Latency = I->getLatency(); #ifndef NDEBUG - Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); + Top.MaxObservedLatency = std::max(Latency, Top.MaxObservedLatency); #endif - if (SU->TopReadyCycle < PredReadyCycle + MinLatency) - SU->TopReadyCycle = PredReadyCycle + MinLatency; + if (SU->TopReadyCycle < PredReadyCycle + Latency) + SU->TopReadyCycle = PredReadyCycle + Latency; } Top.releaseNode(SU, SU->TopReadyCycle); } @@ -1488,12 +1491,12 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) { if (I->isWeak()) continue; unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; - unsigned MinLatency = I->getMinLatency(); + unsigned Latency = I->getLatency(); #ifndef NDEBUG - Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); + Bot.MaxObservedLatency = std::max(Latency, Bot.MaxObservedLatency); #endif - if (SU->BotReadyCycle < SuccReadyCycle + MinLatency) - SU->BotReadyCycle = SuccReadyCycle + MinLatency; + if (SU->BotReadyCycle < SuccReadyCycle + Latency) + SU->BotReadyCycle = SuccReadyCycle + Latency; } Bot.releaseNode(SU, SU->BotReadyCycle); } @@ -1558,7 +1561,7 @@ void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) { if (L > RemLatency) RemLatency = L; } - unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow(); + unsigned CriticalPathLimit = Rem->CriticalPath; DEBUG(dbgs() << " " << Available.getName() << " ExpectedLatency " << ExpectedLatency << " CP Limit " << CriticalPathLimit << '\n'); @@ -1751,7 +1754,7 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() { } } for (unsigned i = 0; Available.empty(); ++i) { - assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && + assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) && "permanent hazard"); (void)i; bumpCycle(); releasePending(); diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index 00f702c846c..6aa3f679628 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -853,8 +853,7 @@ computeInstrDepths(const MachineBasicBlock *MBB) { // Add latency if DefMI is a real instruction. Transients get latency 0. if (!Dep.DefMI->isTransient()) DepCycle += MTM.SchedModel - .computeOperandLatency(Dep.DefMI, Dep.DefOp, UseMI, Dep.UseOp, - /* FindMin = */ false); + .computeOperandLatency(Dep.DefMI, Dep.DefOp, UseMI, Dep.UseOp); Cycle = std::max(Cycle, DepCycle); } // Remember the instruction depth. @@ -902,8 +901,7 @@ static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height, // We may not know the UseMI of this dependency, if it came from the // live-in list. SchedModel can handle a NULL UseMI. DepHeight += SchedModel - .computeOperandLatency(MI, MO.getOperandNo(), I->MI, I->Op, - /* FindMin = */ false); + .computeOperandLatency(MI, MO.getOperandNo(), I->MI, I->Op); } Height = std::max(Height, DepHeight); // This regunit is dead above MI. @@ -941,7 +939,7 @@ static bool pushDepHeight(const DataDep &Dep, // Adjust height by Dep.DefMI latency. if (!Dep.DefMI->isTransient()) UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp, - UseMI, Dep.UseOp, false); + UseMI, Dep.UseOp); // Update Heights[DefMI] to be the maximum height seen. MIHeightMap::iterator I; @@ -1171,7 +1169,7 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const { // Add latency if DefMI is a real instruction. Transients get latency 0. if (!Dep.DefMI->isTransient()) DepCycle += TE.MTM.SchedModel - .computeOperandLatency(Dep.DefMI, Dep.DefOp, PHI, Dep.UseOp, false); + .computeOperandLatency(Dep.DefMI, Dep.DefOp, PHI, Dep.UseOp); return DepCycle; } diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index e4da6a41eea..aaf5c88a832 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -267,13 +267,10 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { SU->hasPhysRegDefs = true; Dep = SDep(SU, SDep::Data, *Alias); RegUse = UseSU->getInstr(); - Dep.setMinLatency( - SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, - RegUse, UseOp, /*FindMin=*/true)); } Dep.setLatency( - SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, - RegUse, UseOp, /*FindMin=*/false)); + SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, RegUse, + UseOp)); ST.adjustSchedDependency(SU, UseSU, Dep); UseSU->addPred(Dep); @@ -310,10 +307,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias)); else { SDep Dep(SU, Kind, /*Reg=*/*Alias); - unsigned OutLatency = - SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()); - Dep.setMinLatency(OutLatency); - Dep.setLatency(OutLatency); + Dep.setLatency( + SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); DefSU->addPred(Dep); } } @@ -389,10 +384,8 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { SUnit *DefSU = DefI->SU; if (DefSU != SU && DefSU != &ExitSU) { SDep Dep(SU, SDep::Output, Reg); - unsigned OutLatency = - SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()); - Dep.setMinLatency(OutLatency); - Dep.setLatency(OutLatency); + Dep.setLatency( + SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); DefSU->addPred(Dep); } DefI->SU = SU; @@ -427,10 +420,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { // Adjust the dependence latency using operand def/use information, then // allow the target to perform its own adjustments. int DefOp = Def->findRegisterDefOperandIdx(Reg); - dep.setLatency( - SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, false)); - dep.setMinLatency( - SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, true)); + dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx)); const TargetSubtargetInfo &ST = TM.getSubtarget(); ST.adjustSchedDependency(DefSU, SU, const_cast(dep)); diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index 20eb9187931..bb8bd426a92 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -668,27 +668,13 @@ getOperandLatency(const InstrItineraryData *ItinData, /// lookup, do so. Otherwise return -1. int TargetInstrInfo::computeDefOperandLatency( const InstrItineraryData *ItinData, - const MachineInstr *DefMI, bool FindMin) const { + const MachineInstr *DefMI) const { // Let the target hook getInstrLatency handle missing itineraries. if (!ItinData) return getInstrLatency(ItinData, DefMI); - // Return a latency based on the itinerary properties and defining instruction - // if possible. Some common subtargets don't require per-operand latency, - // especially for minimum latencies. - if (FindMin) { - // If MinLatency is valid, call getInstrLatency. This uses Stage latency if - // it exists before defaulting to MinLatency. - if (ItinData->SchedModel->MinLatency >= 0) - return getInstrLatency(ItinData, DefMI); - - // If MinLatency is invalid, OperandLatency is interpreted as MinLatency. - // For empty itineraries, short-cirtuit the check and default to one cycle. - if (ItinData->isEmpty()) - return 1; - } - else if(ItinData->isEmpty()) + if(ItinData->isEmpty()) return defaultDefLatency(ItinData->SchedModel, DefMI); // ...operand lookup required @@ -709,10 +695,9 @@ int TargetInstrInfo::computeDefOperandLatency( unsigned TargetInstrInfo:: computeOperandLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx, - bool FindMin) const { + const MachineInstr *UseMI, unsigned UseIdx) const { - int DefLatency = computeDefOperandLatency(ItinData, DefMI, FindMin); + int DefLatency = computeDefOperandLatency(ItinData, DefMI); if (DefLatency >= 0) return DefLatency; @@ -732,8 +717,7 @@ computeOperandLatency(const InstrItineraryData *ItinData, unsigned InstrLatency = getInstrLatency(ItinData, DefMI); // Expected latency is the max of the stage latency and itinerary props. - if (!FindMin) - InstrLatency = std::max(InstrLatency, - defaultDefLatency(ItinData->SchedModel, DefMI)); + InstrLatency = std::max(InstrLatency, + defaultDefLatency(ItinData->SchedModel, DefMI)); return InstrLatency; } diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp index 1bf14dbcef2..53cd11c5daa 100644 --- a/lib/CodeGen/TargetSchedule.cpp +++ b/lib/CodeGen/TargetSchedule.cpp @@ -93,33 +93,10 @@ unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI, // effectively means infinite latency. Since users of the TargetSchedule API // don't know how to handle this, we convert it to a very large latency that is // easy to distinguish when debugging the DAG but won't induce overflow. -static unsigned convertLatency(int Cycles) { +static unsigned capLatency(int Cycles) { return Cycles >= 0 ? Cycles : 1000; } -/// If we can determine the operand latency from the def only, without machine -/// model or itinerary lookup, do so. Otherwise return -1. -int TargetSchedModel::getDefLatency(const MachineInstr *DefMI, - bool FindMin) const { - - // Return a latency based on the itinerary properties and defining instruction - // if possible. Some common subtargets don't require per-operand latency, - // especially for minimum latencies. - if (FindMin) { - // If MinLatency is invalid, then use the itinerary for MinLatency. If no - // itinerary exists either, then use single cycle latency. - if (SchedModel.MinLatency < 0 && !hasInstrItineraries()) { - return 1; - } - return SchedModel.MinLatency; - } - else if (!hasInstrSchedModel() && !hasInstrItineraries()) { - return TII->defaultDefLatency(&SchedModel, DefMI); - } - // ...operand lookup required - return -1; -} - /// Return the MCSchedClassDesc for this instruction. Some SchedClasses require /// evaluation of predicates that depend on instruction operands or flags. const MCSchedClassDesc *TargetSchedModel:: @@ -177,18 +154,16 @@ static unsigned findUseIdx(const MachineInstr *MI, unsigned UseOperIdx) { // Top-level API for clients that know the operand indices. unsigned TargetSchedModel::computeOperandLatency( const MachineInstr *DefMI, unsigned DefOperIdx, - const MachineInstr *UseMI, unsigned UseOperIdx, - bool FindMin) const { + const MachineInstr *UseMI, unsigned UseOperIdx) const { - int DefLatency = getDefLatency(DefMI, FindMin); - if (DefLatency >= 0) - return DefLatency; + if (!hasInstrSchedModel() && !hasInstrItineraries()) + return TII->defaultDefLatency(&SchedModel, DefMI); if (hasInstrItineraries()) { int OperLatency = 0; if (UseMI) { - OperLatency = - TII->getOperandLatency(&InstrItins, DefMI, DefOperIdx, UseMI, UseOperIdx); + OperLatency = TII->getOperandLatency(&InstrItins, DefMI, DefOperIdx, + UseMI, UseOperIdx); } else { unsigned DefClass = DefMI->getDesc().getSchedClass(); @@ -205,13 +180,11 @@ unsigned TargetSchedModel::computeOperandLatency( // hook to allow subtargets to specialize latency. This hook is only // applicable to the InstrItins model. InstrSchedModel should model all // special cases without TII hooks. - if (!FindMin) - InstrLatency = std::max(InstrLatency, - TII->defaultDefLatency(&SchedModel, DefMI)); + InstrLatency = std::max(InstrLatency, + TII->defaultDefLatency(&SchedModel, DefMI)); return InstrLatency; } - assert(!FindMin && hasInstrSchedModel() && - "Expected a SchedModel for this cpu"); + // hasInstrSchedModel() const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI); unsigned DefIdx = findDefIdx(DefMI, DefOperIdx); if (DefIdx < SCDesc->NumWriteLatencyEntries) { @@ -219,7 +192,7 @@ unsigned TargetSchedModel::computeOperandLatency( const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc, DefIdx); unsigned WriteID = WLEntry->WriteResourceID; - unsigned Latency = convertLatency(WLEntry->Cycles); + unsigned Latency = capLatency(WLEntry->Cycles); if (!UseMI) return Latency; @@ -263,7 +236,7 @@ unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const { // Lookup the definition's write latency in SubtargetInfo. const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc, DefIdx); - Latency = std::max(Latency, convertLatency(WLEntry->Cycles)); + Latency = std::max(Latency, capLatency(WLEntry->Cycles)); } return Latency; } @@ -274,13 +247,10 @@ unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const { unsigned TargetSchedModel:: computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *DepMI) const { - // MinLatency == -1 is for in-order processors that always have unit - // MinLatency. MinLatency > 0 is for in-order processors with varying min - // latencies, but since this is not a RAW dep, we always use unit latency. - if (SchedModel.MinLatency != 0) + if (SchedModel.MicroOpBufferSize <= 1) return 1; - // MinLatency == 0 indicates an out-of-order processor that can dispatch + // MicroOpBufferSize > 1 indicates an out-of-order processor that can dispatch // WAW dependencies in the same cycle. // Treat predication as a data dependency for out-of-order cpus. In-order @@ -302,7 +272,7 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx, if (SCDesc->isValid()) { for (const MCWriteProcResEntry *PRI = STI->getWriteProcResBegin(SCDesc), *PRE = STI->getWriteProcResEnd(SCDesc); PRI != PRE; ++PRI) { - if (!SchedModel.getProcResource(PRI->ProcResourceIdx)->IsBuffered) + if (!SchedModel.getProcResource(PRI->ProcResourceIdx)->BufferSize) return 1; } } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index ad1447503fd..496bcb29c66 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -3684,8 +3684,7 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, return true; // Hoist VFP / NEON instructions with 4 or higher latency. - int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx, - /*FindMin=*/false); + int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); if (Latency < 0) Latency = getInstrLatency(ItinData, DefMI); if (Latency <= 3) diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index d06ad7d6691..ce498576923 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -1887,9 +1887,6 @@ def CortexA9Model : SchedMachineModel { let LoadLatency = 2; // Optimistic load latency assuming bypass. // This is overriden by OperandCycles if the // Itineraries are queried instead. - let ILPWindow = 10; // Don't reschedule small blocks to hide - // latency. Minimum latency requirements are already - // modeled strictly by reserving resources. let MispredictPenalty = 8; // Based on estimate of pipeline depth. let Itineraries = CortexA9Itineraries; @@ -1904,7 +1901,7 @@ def A9UnitALU : ProcResource<2>; def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; } def A9UnitAGU : ProcResource<1>; def A9UnitLS : ProcResource<1>; -def A9UnitFP : ProcResource<1> { let Buffered = 0; } +def A9UnitFP : ProcResource<1>; def A9UnitB : ProcResource<1>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 6e966ecdeb7..b73e58538e0 100644 --- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -222,7 +222,7 @@ void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) { for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle; - unsigned MinLatency = I->getMinLatency(); + unsigned MinLatency = I->getLatency(); #ifndef NDEBUG Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); #endif @@ -241,7 +241,7 @@ void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) { for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); I != E; ++I) { unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; - unsigned MinLatency = I->getMinLatency(); + unsigned MinLatency = I->getLatency(); #ifndef NDEBUG Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); #endif diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 84c9203c378..49e81a76317 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -18,7 +18,6 @@ def HaswellModel : SchedMachineModel { let IssueWidth = 4; let MinLatency = 0; // 0 = Out-of-order execution. let LoadLatency = 4; - let ILPWindow = 30; let MispredictPenalty = 16; } diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index b36b3ad947e..c5fa52173b7 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -19,7 +19,6 @@ def SandyBridgeModel : SchedMachineModel { let IssueWidth = 4; let MinLatency = 0; // 0 = Out-of-order execution. let LoadLatency = 4; - let ILPWindow = 20; let MispredictPenalty = 16; } diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 9f2c7810fa5..c32d12b1840 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -559,17 +559,12 @@ def IIC_NOP : InstrItinClass; // latencies. Since these latencies are not used for pipeline hazards, // they do not need to be exact. // -// ILPWindow=10 is an arbitrary threshold that approximates cycles of -// latency hidden by instruction buffers. The actual value is not very -// important but should be zero for inorder and nonzero for OOO processors. -// // The GenericModel contains no instruciton itineraries. def GenericModel : SchedMachineModel { let IssueWidth = 4; let MinLatency = 0; let LoadLatency = 4; let HighLatency = 10; - let ILPWindow = 10; } include "X86ScheduleAtom.td" diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index cb0960aad13..494a690248d 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -529,7 +529,6 @@ def AtomModel : SchedMachineModel { // OperandCycles may be used for expected latency. let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. let HighLatency = 30;// Expected, may be overriden by OperandCycles. - let ILPWindow = 0; // Always try to hide expected latency. let Itineraries = AtomItineraries; } diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll index 2184d9e9603..526b9d718d2 100644 --- a/test/CodeGen/X86/misched-balance.ll +++ b/test/CodeGen/X86/misched-balance.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s +; RUN-disabled: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s +; RUN: true ; ; Verify that misched resource/latency balancy heuristics are sane. diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll index 15e8a0ad6f4..7ad54bdc27a 100644 --- a/test/CodeGen/X86/misched-matmul.ll +++ b/test/CodeGen/X86/misched-matmul.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched -stats 2>&1 | FileCheck %s +; RUN-disabled: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched -stats 2>&1 | FileCheck %s +; RUN: true ; ; Verify that register pressure heuristics are working in MachineScheduler. ; diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll index 4dc95c5e932..dee54d9c279 100644 --- a/test/CodeGen/X86/misched-matrix.ll +++ b/test/CodeGen/X86/misched-matrix.ll @@ -1,12 +1,13 @@ -; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ -; RUN: -misched-topdown -verify-machineinstrs \ -; RUN: | FileCheck %s -check-prefix=TOPDOWN -; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ -; RUN: -misched=ilpmin -verify-machineinstrs \ -; RUN: | FileCheck %s -check-prefix=ILPMIN -; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ -; RUN: -misched=ilpmax -verify-machineinstrs \ -; RUN: | FileCheck %s -check-prefix=ILPMAX +; RUN-disabled: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN-disabled: -misched-topdown -verify-machineinstrs \ +; RUN-disabled: | FileCheck %s -check-prefix=TOPDOWN +; RUN-disabled: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN-disabled: -misched=ilpmin -verify-machineinstrs \ +; RUN-disabled: | FileCheck %s -check-prefix=ILPMIN +; RUN-disabled: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN-disabled: -misched=ilpmax -verify-machineinstrs \ +; RUN-disabled: | FileCheck %s -check-prefix=ILPMAX +; RUN: true ; ; Verify that the MI scheduler minimizes register pressure for a ; uniform set of bottom-up subtrees (unrolled matrix multiply). diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp index 993eead9e13..c6ce35fce09 100644 --- a/utils/TableGen/SubtargetEmitter.cpp +++ b/utils/TableGen/SubtargetEmitter.cpp @@ -634,16 +634,14 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, Record *SuperDef = 0; unsigned SuperIdx = 0; unsigned NumUnits = 0; - bool IsBuffered = true; + int BufferSize = -1; if (PRDef->isSubClassOf("ProcResGroup")) { RecVec ResUnits = PRDef->getValueAsListOfDefs("Resources"); for (RecIter RUI = ResUnits.begin(), RUE = ResUnits.end(); RUI != RUE; ++RUI) { - if (!NumUnits) - IsBuffered = (*RUI)->getValueAsBit("Buffered"); - else if(IsBuffered != (*RUI)->getValueAsBit("Buffered")) - PrintFatalError(PRDef->getLoc(), - "Mixing buffered and unbuffered resources."); + int BuffSz = (*RUI)->getValueAsInt("BufferSize"); + if (!NumUnits || (unsigned)BufferSize < (unsigned)BuffSz) + BufferSize = BuffSz; NumUnits += (*RUI)->getValueAsInt("NumUnits"); } } @@ -655,7 +653,7 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, SuperIdx = ProcModel.getProcResourceIdx(SuperDef); } NumUnits = PRDef->getValueAsInt("NumUnits"); - IsBuffered = PRDef->getValueAsBit("Buffered"); + BufferSize = PRDef->getValueAsInt("BufferSize"); } // Emit the ProcResourceDesc if (i+1 == e) @@ -664,7 +662,7 @@ void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel, if (PRDef->getName().size() < 15) OS.indent(15 - PRDef->getName().size()); OS << NumUnits << ", " << SuperIdx << ", " - << IsBuffered << "}" << Sep << " // #" << i+1; + << BufferSize << "}" << Sep << " // #" << i+1; if (SuperDef) OS << ", Super=" << SuperDef->getName(); OS << "\n"; @@ -1200,10 +1198,9 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { OS << "\n"; OS << "static const llvm::MCSchedModel " << PI->ModelName << "(\n"; EmitProcessorProp(OS, PI->ModelDef, "IssueWidth", ','); - EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ','); + EmitProcessorProp(OS, PI->ModelDef, "MicroOpBufferSize", ','); EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ','); EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ','); - EmitProcessorProp(OS, PI->ModelDef, "ILPWindow", ','); EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ','); OS << " " << PI->Index << ", // Processor ID\n"; if (PI->hasInstrSchedModel()) -- 2.34.1