From 8396e130427999c57422e52af3913eb8182847e5 Mon Sep 17 00:00:00 2001 From: Jakob Stoklund Olesen Date: Tue, 2 Apr 2013 17:49:51 +0000 Subject: [PATCH] Count processor resources individually in MachineTraceMetrics. The new instruction scheduling models provide information about the number of cycles consumed on each processor resource. This makes it possible to estimate ILP more accurately than simply counting instructions / issue width. The functions getResourceDepth() and getResourceLength() now identify the limiting processor resource, and return a cycle count based on that. This gives more precise resource information, particularly in traces that use one resource a lot more than others. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178553 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachineTraceMetrics.h | 25 ++++ lib/CodeGen/MachineTraceMetrics.cpp | 153 +++++++++++++++++++-- 2 files changed, 169 insertions(+), 9 deletions(-) diff --git a/include/llvm/CodeGen/MachineTraceMetrics.h b/include/llvm/CodeGen/MachineTraceMetrics.h index eaaa70a67d8..2775a048582 100644 --- a/include/llvm/CodeGen/MachineTraceMetrics.h +++ b/include/llvm/CodeGen/MachineTraceMetrics.h @@ -107,6 +107,13 @@ public: /// Get the fixed resource information about MBB. Compute it on demand. const FixedBlockInfo *getResources(const MachineBasicBlock*); + /// Get the scaled number of cycles used per processor resource in MBB. + /// This is an array with SchedModel.getNumProcResourceKinds() entries. + /// The getResources() function above must have been called first. + /// + /// These numbers have already been scaled by SchedModel.getResourceFactor(). + ArrayRef getProcResourceCycles(unsigned MBBNum) const; + /// A virtual register or regunit required by a basic block or its trace /// successors. struct LiveInReg { @@ -284,6 +291,8 @@ public: class Ensemble { SmallVector BlockInfo; DenseMap Cycles; + SmallVector ProcResourceDepths; + SmallVector ProcResourceHeights; friend class Trace; void computeTrace(const MachineBasicBlock*); @@ -303,6 +312,8 @@ public: const MachineLoop *getLoopFor(const MachineBasicBlock*) const; const TraceBlockInfo *getDepthResources(const MachineBasicBlock*) const; const TraceBlockInfo *getHeightResources(const MachineBasicBlock*) const; + ArrayRef getProcResourceDepths(unsigned MBBNum) const; + ArrayRef getProcResourceHeights(unsigned MBBNum) const; public: virtual ~Ensemble(); @@ -343,8 +354,22 @@ private: // One entry per basic block, indexed by block number. SmallVector BlockInfo; + // Cycles consumed on each processor resource per block. + // The number of processor resource kinds is constant for a given subtarget, + // but it is not known at compile time. The number of cycles consumed by + // block B on processor resource R is at ProcResourceCycles[B*Kinds + R] + // where Kinds = SchedModel.getNumProcResourceKinds(). + SmallVector ProcResourceCycles; + // One ensemble per strategy. Ensemble* Ensembles[TS_NumStrategies]; + + // Convert scaled resource usage to a cycle count that can be compared with + // latencies. + unsigned getCycles(unsigned Scaled) { + unsigned Factor = SchedModel.getLatencyFactor(); + return (Scaled + Factor - 1) / Factor; + } }; inline raw_ostream &operator<<(raw_ostream &OS, diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index 5bf0176eadc..c154b5c9c10 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -57,6 +58,8 @@ bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) { MF->getTarget().getSubtarget(); SchedModel.init(*ST.getSchedModel(), &ST, TII); BlockInfo.resize(MF->getNumBlockIDs()); + ProcResourceCycles.resize(MF->getNumBlockIDs() * + SchedModel.getNumProcResourceKinds()); return false; } @@ -85,9 +88,13 @@ MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) { return FBI; // Compute resource usage in the block. - // FIXME: Compute per-functional unit counts. FBI->HasCalls = false; unsigned InstrCount = 0; + + // Add up per-processor resource cycles as well. + unsigned PRKinds = SchedModel.getNumProcResourceKinds(); + SmallVector PRCycles(PRKinds); + for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); I != E; ++I) { const MachineInstr *MI = I; @@ -96,11 +103,39 @@ MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) { ++InstrCount; if (MI->isCall()) FBI->HasCalls = true; + + // Count processor resources used. + const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI); + if (!SC->isValid()) + continue; + + for (TargetSchedModel::ProcResIter + PI = SchedModel.getWriteProcResBegin(SC), + PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) { + assert(PI->ProcResourceIdx < PRKinds && "Bad processor resource kind"); + PRCycles[PI->ProcResourceIdx] += PI->Cycles; + } } FBI->InstrCount = InstrCount; + + // Scale the resource cycles so they are comparable. + unsigned PROffset = MBB->getNumber() * PRKinds; + for (unsigned K = 0; K != PRKinds; ++K) + ProcResourceCycles[PROffset + K] = + PRCycles[K] * SchedModel.getResourceFactor(K); + return FBI; } +ArrayRef +MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const { + assert(BlockInfo[MBBNum].hasResources() && + "getResources() must be called before getProcResourceCycles()"); + unsigned PRKinds = SchedModel.getNumProcResourceKinds(); + return ArrayRef(&ProcResourceCycles[MBBNum * PRKinds], PRKinds); +} + + //===----------------------------------------------------------------------===// // Ensemble utility functions //===----------------------------------------------------------------------===// @@ -108,6 +143,9 @@ MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) { MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct) : MTM(*ct) { BlockInfo.resize(MTM.BlockInfo.size()); + unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds(); + ProcResourceDepths.resize(MTM.BlockInfo.size() * PRKinds); + ProcResourceHeights.resize(MTM.BlockInfo.size() * PRKinds); } // Virtual destructor serves as an anchor. @@ -123,21 +161,32 @@ MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const { void MachineTraceMetrics::Ensemble:: computeDepthResources(const MachineBasicBlock *MBB) { TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()]; + unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds(); + unsigned PROffset = MBB->getNumber() * PRKinds; // Compute resources from trace above. The top block is simple. if (!TBI->Pred) { TBI->InstrDepth = 0; TBI->Head = MBB->getNumber(); + std::fill(ProcResourceDepths.begin() + PROffset, + ProcResourceDepths.begin() + PROffset + PRKinds, 0); return; } // Compute from the block above. A post-order traversal ensures the // predecessor is always computed first. - TraceBlockInfo *PredTBI = &BlockInfo[TBI->Pred->getNumber()]; + unsigned PredNum = TBI->Pred->getNumber(); + TraceBlockInfo *PredTBI = &BlockInfo[PredNum]; assert(PredTBI->hasValidDepth() && "Trace above has not been computed yet"); const FixedBlockInfo *PredFBI = MTM.getResources(TBI->Pred); TBI->InstrDepth = PredTBI->InstrDepth + PredFBI->InstrCount; TBI->Head = PredTBI->Head; + + // Compute per-resource depths. + ArrayRef PredPRDepths = getProcResourceDepths(PredNum); + ArrayRef PredPRCycles = MTM.getProcResourceCycles(PredNum); + for (unsigned K = 0; K != PRKinds; ++K) + ProcResourceDepths[PROffset + K] = PredPRDepths[K] + PredPRCycles[K]; } // Update resource-related information in the TraceBlockInfo for MBB. @@ -145,22 +194,33 @@ computeDepthResources(const MachineBasicBlock *MBB) { void MachineTraceMetrics::Ensemble:: computeHeightResources(const MachineBasicBlock *MBB) { TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()]; + unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds(); + unsigned PROffset = MBB->getNumber() * PRKinds; // Compute resources for the current block. TBI->InstrHeight = MTM.getResources(MBB)->InstrCount; + ArrayRef PRCycles = MTM.getProcResourceCycles(MBB->getNumber()); // The trace tail is done. if (!TBI->Succ) { TBI->Tail = MBB->getNumber(); + std::copy(PRCycles.begin(), PRCycles.end(), + ProcResourceHeights.begin() + PROffset); return; } // Compute from the block below. A post-order traversal ensures the // predecessor is always computed first. - TraceBlockInfo *SuccTBI = &BlockInfo[TBI->Succ->getNumber()]; + unsigned SuccNum = TBI->Succ->getNumber(); + TraceBlockInfo *SuccTBI = &BlockInfo[SuccNum]; assert(SuccTBI->hasValidHeight() && "Trace below has not been computed yet"); TBI->InstrHeight += SuccTBI->InstrHeight; TBI->Tail = SuccTBI->Tail; + + // Compute per-resource heights. + ArrayRef SuccPRHeights = getProcResourceHeights(SuccNum); + for (unsigned K = 0; K != PRKinds; ++K) + ProcResourceHeights[PROffset + K] = SuccPRHeights[K] + PRCycles[K]; } // Check if depth resources for MBB are valid and return the TBI. @@ -181,6 +241,31 @@ getHeightResources(const MachineBasicBlock *MBB) const { return TBI->hasValidHeight() ? TBI : 0; } +/// Get an array of processor resource depths for MBB. Indexed by processor +/// resource kind, this array contains the scaled processor resources consumed +/// by all blocks preceding MBB in its trace. It does not include instructions +/// in MBB. +/// +/// Compare TraceBlockInfo::InstrDepth. +ArrayRef +MachineTraceMetrics::Ensemble:: +getProcResourceDepths(unsigned MBBNum) const { + unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds(); + return ArrayRef(&ProcResourceDepths[MBBNum * PRKinds], PRKinds); +} + +/// Get an array of processor resource heights for MBB. Indexed by processor +/// resource kind, this array contains the scaled processor resources consumed +/// by this block and all blocks following it in its trace. +/// +/// Compare TraceBlockInfo::InstrHeight. +ArrayRef +MachineTraceMetrics::Ensemble:: +getProcResourceHeights(unsigned MBBNum) const { + unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds(); + return ArrayRef(&ProcResourceHeights[MBBNum * PRKinds], PRKinds); +} + //===----------------------------------------------------------------------===// // Trace Selection Strategies //===----------------------------------------------------------------------===// @@ -713,11 +798,24 @@ computeInstrDepths(const MachineBasicBlock *MBB) { SmallVector Deps; while (!Stack.empty()) { MBB = Stack.pop_back_val(); - DEBUG(dbgs() << "Depths for BB#" << MBB->getNumber() << ":\n"); + DEBUG(dbgs() << "\nDepths for BB#" << MBB->getNumber() << ":\n"); TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; TBI.HasValidInstrDepths = true; TBI.CriticalPath = 0; + // Print out resource depths here as well. + DEBUG({ + dbgs() << format("%7u Instructions\n", TBI.InstrDepth); + ArrayRef PRDepths = getProcResourceDepths(MBB->getNumber()); + for (unsigned K = 0; K != PRDepths.size(); ++K) + if (PRDepths[K]) { + unsigned Factor = MTM.SchedModel.getResourceFactor(K); + dbgs() << format("%6uc @ ", MTM.getCycles(PRDepths[K])) + << MTM.SchedModel.getProcResource(K)->Name << " (" + << PRDepths[K]/Factor << " ops x" << Factor << ")\n"; + } + }); + // Also compute the critical path length through MBB when possible. if (TBI.HasValidInstrHeights) TBI.CriticalPath = computeCrossBlockCriticalPath(TBI); @@ -928,6 +1026,18 @@ computeInstrHeights(const MachineBasicBlock *MBB) { TBI.HasValidInstrHeights = true; TBI.CriticalPath = 0; + DEBUG({ + dbgs() << format("%7u Instructions\n", TBI.InstrHeight); + ArrayRef PRHeights = getProcResourceHeights(MBB->getNumber()); + for (unsigned K = 0; K != PRHeights.size(); ++K) + if (PRHeights[K]) { + unsigned Factor = MTM.SchedModel.getResourceFactor(K); + dbgs() << format("%6uc @ ", MTM.getCycles(PRHeights[K])) + << MTM.SchedModel.getProcResource(K)->Name << " (" + << PRHeights[K]/Factor << " ops x" << Factor << ")\n"; + } + }); + // Get dependencies from PHIs in the trace successor. const MachineBasicBlock *Succ = TBI.Succ; // If MBB is the last block in the trace, and it has a back-edge to the @@ -1058,27 +1168,52 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const { } unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { - // For now, we compute the resource depth from instruction count / issue - // width. Eventually, we should compute resource depth per functional unit - // and return the max. + // Find the limiting processor resource. + // Numbers have been pre-scaled to be comparable. + unsigned PRMax = 0; + ArrayRef PRDepths = TE.getProcResourceDepths(getBlockNum()); + if (Bottom) { + ArrayRef PRCycles = TE.MTM.getProcResourceCycles(getBlockNum()); + for (unsigned K = 0; K != PRDepths.size(); ++K) + PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]); + } else { + for (unsigned K = 0; K != PRDepths.size(); ++K) + PRMax = std::max(PRMax, PRDepths[K]); + } + // Convert to cycle count. + PRMax = TE.MTM.getCycles(PRMax); + unsigned Instrs = TBI.InstrDepth; if (Bottom) Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount; if (unsigned IW = TE.MTM.SchedModel.getIssueWidth()) Instrs /= IW; // Assume issue width 1 without a schedule model. - return Instrs; + return std::max(Instrs, PRMax); } unsigned MachineTraceMetrics::Trace:: getResourceLength(ArrayRef Extrablocks) const { + // Add up resources above and below the center block. + ArrayRef PRDepths = TE.getProcResourceDepths(getBlockNum()); + ArrayRef PRHeights = TE.getProcResourceHeights(getBlockNum()); + unsigned PRMax = 0; + for (unsigned K = 0; K != PRDepths.size(); ++K) { + unsigned PRCycles = PRDepths[K] + PRHeights[K]; + for (unsigned I = 0; I != Extrablocks.size(); ++I) + PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K]; + PRMax = std::max(PRMax, PRCycles); + } + // Convert to cycle count. + PRMax = TE.MTM.getCycles(PRMax); + unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight; for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i) Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount; if (unsigned IW = TE.MTM.SchedModel.getIssueWidth()) Instrs /= IW; // Assume issue width 1 without a schedule model. - return Instrs; + return std::max(Instrs, PRMax); } void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const { -- 2.34.1