From: Gerolf Hoflehner Date: Sun, 3 Aug 2014 21:35:39 +0000 (+0000) Subject: MachineCombiner Pass for selecting faster instruction X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=b0b708854ec3cd8047b4659d2ef881bc7d1b582c;p=oota-llvm.git MachineCombiner Pass for selecting faster instruction sequence - target independent framework When the DAGcombiner selects instruction sequences it could increase the critical path or resource len. For example, on arm64 there are multiply-accumulate instructions (madd, msub). If e.g. the equivalent multiply-add sequence is not on the crictial path it makes sense to select it instead of the combined, single accumulate instruction (madd/msub). The reason is that the conversion from add+mul to the madd could lengthen the critical path by the latency of the multiply. But the DAGCombiner would always combine and select the madd/msub instruction. This patch uses machine trace metrics to estimate critical path length and resource length of an original instruction sequence vs a combined instruction sequence and picks the faster code based on its estimates. This patch only commits the target independent framework that evaluates and selects code sequences. The machine instruction combiner is turned off for all targets and expected to evolve over time by gradually handling DAGCombiner pattern in the target specific code. This framework lays the groundwork for fixing rdar://16319955 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214666 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/CodeGen/MachineCombinerPattern.h b/include/llvm/CodeGen/MachineCombinerPattern.h new file mode 100644 index 00000000000..176af14dc31 --- /dev/null +++ b/include/llvm/CodeGen/MachineCombinerPattern.h @@ -0,0 +1,29 @@ +//===-- llvm/CodeGen/MachineCombinerPattern.h - Instruction pattern supported by +// combiner ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines instruction pattern supported by combiner +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINECOMBINERPATTERN_H +#define LLVM_CODEGEN_MACHINECOMBINERPATTERN_H + +namespace llvm { + +/// Enumeration of instruction pattern supported by machine combiner +/// +/// +namespace MachineCombinerPattern { +// Forward declaration +enum MC_PATTERN : int; +} // end namespace MachineCombinerPattern +} // end namespace llvm + +#endif diff --git a/include/llvm/CodeGen/MachineTraceMetrics.h b/include/llvm/CodeGen/MachineTraceMetrics.h index 323b694f393..06ea9c51603 100644 --- a/include/llvm/CodeGen/MachineTraceMetrics.h +++ b/include/llvm/CodeGen/MachineTraceMetrics.h @@ -264,8 +264,9 @@ public: /// classes are included. For the caller to account for extra machine /// instructions, it must first resolve each instruction's scheduling class. unsigned getResourceLength( - ArrayRef Extrablocks = None, - ArrayRef ExtraInstrs = None) const; + ArrayRef Extrablocks = None, + ArrayRef ExtraInstrs = None, + ArrayRef RemoveInstrs = None) const; /// Return the length of the (data dependency) critical path through the /// trace. @@ -286,6 +287,12 @@ public: /// Return the Depth of a PHI instruction in a trace center block successor. /// The PHI does not have to be part of the trace. unsigned getPHIDepth(const MachineInstr *PHI) const; + + /// A dependence is useful if the basic block of the defining instruction + /// is part of the trace of the user instruction. It is assumed that DefMI + /// dominates UseMI (see also isUsefulDominator). + bool isDepInTrace(const MachineInstr *DefMI, + const MachineInstr *UseMI) const; }; /// A trace ensemble is a collection of traces selected using the same diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 87f55e8572f..0869e3e1857 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -489,6 +489,10 @@ namespace llvm { /// inserting cmov instructions. extern char &EarlyIfConverterID; + /// This pass performs instruction combining using trace metrics to estimate + /// critical-path and resource depth. + extern char &MachineCombinerID; + /// StackSlotColoring - This pass performs stack coloring and merging. /// It merges disjoint allocas to reduce the stack size. extern char &StackColoringID; diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h index 690b70fad89..51f79ae4e74 100644 --- a/include/llvm/CodeGen/TargetSchedule.h +++ b/include/llvm/CodeGen/TargetSchedule.h @@ -167,6 +167,7 @@ public: /// if converter after moving it to TargetSchedModel). unsigned computeInstrLatency(const MachineInstr *MI, bool UseDefaultDefLatency = true) const; + unsigned computeInstrLatency(unsigned Opcode) const; /// \brief Output dependency latency of a pair of defs of the same register. /// diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 20074f0a5d5..01cc081bfa4 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -278,6 +278,7 @@ void initializeSLPVectorizerPass(PassRegistry&); void initializeBBVectorizePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeStackMapLivenessPass(PassRegistry&); +void initializeMachineCombinerPass(PassRegistry &); void initializeLoadCombinePass(PassRegistry&); } diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index a589d0e5d8b..36e28a0799b 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -15,9 +15,12 @@ #define LLVM_TARGET_TARGETINSTRINFO_H #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" namespace llvm { @@ -572,6 +575,42 @@ public: const SmallVectorImpl &Ops, MachineInstr* LoadMI) const; + /// hasPattern - return true when there is potentially a faster code sequence + /// for an instruction chain ending in \p Root. All potential pattern are + /// returned in the \p Pattern vector. Pattern should be sorted in priority + /// order since the pattern evaluator stops checking as soon as it finds a + /// faster sequence. + /// \param Root - Instruction that could be combined with one of its operands + /// \param Pattern - Vector of possible combination pattern + + virtual bool hasPattern( + MachineInstr &Root, + SmallVectorImpl &Pattern) const { + return false; + } + + /// genAlternativeCodeSequence - when hasPattern() finds a pattern this + /// function generates the instructions that could replace the original code + /// sequence. The client has to decide whether the actual replacementment is + /// beneficial or not. + /// \param Root - Instruction that could be combined with one of its operands + /// \param P - Combination pattern for Root + /// \param InsInstr - Vector of new instructions that implement P + /// \param DelInstr - Old instructions, including Root, that could be replaced + /// by InsInstr + /// \param InstrIdxForVirtReg - map of virtual register to instruction in + /// InsInstr that defines it + virtual void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const { + return; + } + + /// useMachineCombiner - return true when a target supports MachineCombiner + virtual bool useMachineCombiner(void) const { return false; } + protected: /// foldMemoryOperandImpl - Target-dependent implementation for /// foldMemoryOperand. Target-independent code in foldMemoryOperand will diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index b71b30cc1cc..2a247c12e64 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_library(LLVMCodeGen MachineBranchProbabilityInfo.cpp MachineCSE.cpp MachineCodeEmitter.cpp + MachineCombiner.cpp MachineCopyPropagation.cpp MachineDominators.cpp MachineDominanceFrontier.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index b3beac3932b..12b0411065a 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -41,6 +41,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineBlockPlacementPass(Registry); initializeMachineBlockPlacementStatsPass(Registry); initializeMachineCopyPropagationPass(Registry); + initializeMachineCombinerPass(Registry); initializeMachineCSEPass(Registry); initializeMachineDominatorTreePass(Registry); initializeMachinePostDominatorTreePass(Registry); diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp new file mode 100644 index 00000000000..591c4caf66e --- /dev/null +++ b/lib/CodeGen/MachineCombiner.cpp @@ -0,0 +1,434 @@ +//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The machine combiner pass uses machine trace metrics to ensure the combined +// instructions does not lengthen the critical path or the resource depth. +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "machine-combiner" + +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +STATISTIC(NumInstCombined, "Number of machineinst combined"); + +namespace { +class MachineCombiner : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const MCSchedModel *SchedModel; + MachineRegisterInfo *MRI; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + + TargetSchedModel TSchedModel; + + /// OptSize - True if optimizing for code size. + bool OptSize; + +public: + static char ID; + MachineCombiner() : MachineFunctionPass(ID) { + initializeMachineCombinerPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { return "Machine InstCombiner"; } + +private: + bool doSubstitute(unsigned NewSize, unsigned OldSize); + bool combineInstructions(MachineBasicBlock *); + MachineInstr *getOperandDef(const MachineOperand &MO); + unsigned getDepth(SmallVectorImpl &InsInstrs, + DenseMap &InstrIdxForVirtReg, + MachineTraceMetrics::Trace BlockTrace); + unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot, + MachineTraceMetrics::Trace BlockTrace); + bool + preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl &InsInstrs, + DenseMap &InstrIdxForVirtReg); + bool preservesResourceLen(MachineBasicBlock *MBB, + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs); + void instr2instrSC(SmallVectorImpl &Instrs, + SmallVectorImpl &InstrsSC); +}; +} + +char MachineCombiner::ID = 0; +char &llvm::MachineCombinerID = MachineCombiner::ID; + +INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", + "Machine InstCombiner", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", + false, false) + +void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) { + MachineInstr *DefInstr = nullptr; + // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + DefInstr = MRI->getUniqueVRegDef(MO.getReg()); + // PHI's have no depth etc. + if (DefInstr && DefInstr->isPHI()) + DefInstr = nullptr; + return DefInstr; +} + +/// getDepth - Computes depth of instructions in vector \InsInstr. +/// +/// \param InsInstrs is a vector of machine instructions +/// \param InstrIdxForVirtReg is a dense map of virtual register to index +/// of defining machine instruction in \p InsInstrs +/// \param BlockTrace is a trace of machine instructions +/// +/// \returns Depth of last instruction in \InsInstrs ("NewRoot") +unsigned +MachineCombiner::getDepth(SmallVectorImpl &InsInstrs, + DenseMap &InstrIdxForVirtReg, + MachineTraceMetrics::Trace BlockTrace) { + + SmallVector InstrDepth; + assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + + // Foreach instruction in in the new sequence compute the depth based on the + // operands. Use the trace information when possible. For new operands which + // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth + for (auto *InstrPtr : InsInstrs) { // for each Use + unsigned IDepth = 0; + DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";); + for (unsigned i = 0, e = InstrPtr->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = InstrPtr->getOperand(i); + // Check for virtual register operand. + if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) + continue; + if (!MO.isUse()) + continue; + unsigned DepthOp = 0; + unsigned LatencyOp = 0; + DenseMap::iterator II = + InstrIdxForVirtReg.find(MO.getReg()); + if (II != InstrIdxForVirtReg.end()) { + // Operand is new virtual register not in trace + assert(II->second >= 0 && II->second < InstrDepth.size() && + "Bad Index"); + MachineInstr *DefInstr = InsInstrs[II->second]; + assert(DefInstr && + "There must be a definition for a new virtual register"); + DepthOp = InstrDepth[II->second]; + LatencyOp = TSchedModel.computeOperandLatency( + DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), + InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); + } else { + MachineInstr *DefInstr = getOperandDef(MO); + if (DefInstr) { + DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth; + LatencyOp = TSchedModel.computeOperandLatency( + DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), + InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); + } + } + IDepth = std::max(IDepth, DepthOp + LatencyOp); + } + InstrDepth.push_back(IDepth); + } + unsigned NewRootIdx = InsInstrs.size() - 1; + return InstrDepth[NewRootIdx]; +} + +/// getLatency - Computes instruction latency as max of latency of defined +/// operands +/// +/// \param Root is a machine instruction that could be replaced by NewRoot. +/// It is used to compute a more accurate latency information for NewRoot in +/// case there is a dependent instruction in the same trace (\p BlockTrace) +/// \param NewRoot is the instruction for which the latency is computed +/// \param BlockTrace is a trace of machine instructions +/// +/// \returns Latency of \p NewRoot +unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, + MachineTraceMetrics::Trace BlockTrace) { + + assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + + // Check each definition in NewRoot and compute the latency + unsigned NewRootLatency = 0; + + for (unsigned i = 0, e = NewRoot->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = NewRoot->getOperand(i); + // Check for virtual register operand. + if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) + continue; + if (!MO.isDef()) + continue; + // Get the first instruction that uses MO + MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg()); + RI++; + MachineInstr *UseMO = RI->getParent(); + unsigned LatencyOp = 0; + if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) { + LatencyOp = TSchedModel.computeOperandLatency( + NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO, + UseMO->findRegisterUseOperandIdx(MO.getReg())); + } else { + LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode()); + } + NewRootLatency = std::max(NewRootLatency, LatencyOp); + } + return NewRootLatency; +} + +/// preservesCriticalPathlen - True when the new instruction sequence does not +/// lengthen the critical path. The DAGCombine code sequence ends in MI +/// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A +/// necessary condition for the new sequence to replace the old sequence is that +/// is cannot lengthen the critical path. This is decided by the formula +/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)). +/// The slack is the number of cycles Root can be delayed before the critical +/// patch becomes longer. +bool MachineCombiner::preservesCriticalPathLen( + MachineBasicBlock *MBB, MachineInstr *Root, + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl &InsInstrs, + DenseMap &InstrIdxForVirtReg) { + + assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + // NewRoot is the last instruction in the \p InsInstrs vector + // Get depth and latency of NewRoot + unsigned NewRootIdx = InsInstrs.size() - 1; + MachineInstr *NewRoot = InsInstrs[NewRootIdx]; + unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); + unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); + + // Get depth, latency and slack of Root + unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; + unsigned RootLatency = TSchedModel.computeInstrLatency(Root); + unsigned RootSlack = BlockTrace.getInstrSlack(Root); + + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + dbgs() << " NewRootDepth: " << NewRootDepth + << " NewRootLatency: " << NewRootLatency << "\n"; + dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency + << " RootSlack: " << RootSlack << "\n"; + dbgs() << " NewRootDepth + NewRootLatency " + << NewRootDepth + NewRootLatency << "\n"; + dbgs() << " RootDepth + RootLatency + RootSlack " + << RootDepth + RootLatency + RootSlack << "\n";); + + /// True when the new sequence does not lenghten the critical path. + return ((NewRootDepth + NewRootLatency) <= + (RootDepth + RootLatency + RootSlack)); +} + +/// helper routine to convert instructions into SC +void MachineCombiner::instr2instrSC( + SmallVectorImpl &Instrs, + SmallVectorImpl &InstrsSC) { + for (auto *InstrPtr : Instrs) { + unsigned Opc = InstrPtr->getOpcode(); + unsigned Idx = TII->get(Opc).getSchedClass(); + const MCSchedClassDesc *SC = SchedModel->getSchedClassDesc(Idx); + InstrsSC.push_back(SC); + } +} +/// preservesResourceLen - True when the new instructions do not increase +/// resource length +bool MachineCombiner::preservesResourceLen( + MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs) { + + // Compute current resource length + + ArrayRef MBBarr(MBB); + unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr); + + // Deal with SC rather than Instructions. + SmallVector InsInstrsSC; + SmallVector DelInstrsSC; + + instr2instrSC(InsInstrs, InsInstrsSC); + instr2instrSC(DelInstrs, DelInstrsSC); + + ArrayRef MSCInsArr = makeArrayRef(InsInstrsSC); + ArrayRef MSCDelArr = makeArrayRef(DelInstrsSC); + + // Compute new resource length + unsigned ResLenAfterCombine = + BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr); + + DEBUG(dbgs() << "RESOURCE DATA: \n"; + dbgs() << " resource len before: " << ResLenBeforeCombine + << " after: " << ResLenAfterCombine << "\n";); + + return ResLenAfterCombine <= ResLenBeforeCombine; +} + +/// \returns true when new instruction sequence should be generated +/// independent if it lenghtens critical path or not +bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { + if (OptSize && (NewSize < OldSize)) + return true; + if (!TSchedModel.hasInstrSchedModel()) + return true; + return false; +} + +/// combineInstructions - substitute a slow code sequence with a faster one by +/// evaluating instruction combining pattern. +/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction +/// combining based on machine trace metrics. Only combine a sequence of +/// instructions when this neither lengthens the critical path nor increases +/// resource pressure. When optimizing for codesize always combine when the new +/// sequence is shorter. +bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { + bool Changed = false; + DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); + + auto BlockIter = MBB->begin(); + + while (BlockIter != MBB->end()) { + auto &MI = *BlockIter++; + + DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";); + SmallVector Pattern; + // The motivating example is: + // + // MUL Other MUL_op1 MUL_op2 Other + // \ / \ | / + // ADD/SUB => MADD/MSUB + // (=Root) (=NewRoot) + + // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is + // usually beneficial for code size it unfortunately can hurt performance + // when the ADD is on the critical path, but the MUL is not. With the + // substitution the MUL becomes part of the critical path (in form of the + // MADD) and can lengthen it on architectures where the MADD latency is + // longer than the ADD latency. + // + // For each instruction we check if it can be the root of a combiner + // pattern. Then for each pattern the new code sequence in form of MI is + // generated and evaluated. When the efficiency criteria (don't lengthen + // critical path, don't use more resources) is met the new sequence gets + // hooked up into the basic block before the old sequence is removed. + // + // The algorithm does not try to evaluate all patterns and pick the best. + // This is only an artificial restriction though. In practice there is + // mostly one pattern and hasPattern() can order patterns based on an + // internal cost heuristic. + + if (TII->hasPattern(MI, Pattern)) { + for (auto P : Pattern) { + SmallVector InsInstrs; + SmallVector DelInstrs; + DenseMap InstrIdxForVirtReg; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + Traces->verifyAnalysis(); + TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, + InstrIdxForVirtReg); + // Found pattern, but did not generate alternative sequence. + // This can happen e.g. when an immediate could not be materialized + // in a single instruction. + if (!InsInstrs.size()) + continue; + // Substitute when we optimize for codesize and the new sequence has + // fewer instructions OR + // the new sequence neither lenghten the critical path nor increases + // resource pressure. + if (doSubstitute(InsInstrs.size(), DelInstrs.size()) || + (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, + InstrIdxForVirtReg) && + preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { + for (auto *InstrPtr : InsInstrs) + MBB->insert((MachineBasicBlock::iterator) & MI, + (MachineInstr *)InstrPtr); + for (auto *InstrPtr : DelInstrs) + InstrPtr->eraseFromParent(); + + Changed = true; + ++NumInstCombined; + + Traces->invalidate(MBB); + Traces->verifyAnalysis(); + // Eagerly stop after the first pattern fired + break; + } else { + // Cleanup instructions of the alternative code sequence. There is no + // use for them. + for (auto *InstrPtr : InsInstrs) { + MachineFunction *MF = MBB->getParent(); + MF->DeleteMachineInstr((MachineInstr *)InstrPtr); + } + } + InstrIdxForVirtReg.clear(); + } + } + } + + return Changed; +} + +bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + const TargetSubtargetInfo &STI = + MF.getTarget().getSubtarget(); + SchedModel = STI.getSchedModel(); + TSchedModel.init(*SchedModel, &STI, TII); + MRI = &MF.getRegInfo(); + Traces = &getAnalysis(); + MinInstr = 0; + + OptSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n'); + if (!TII->useMachineCombiner()) { + DEBUG(dbgs() << " Skipping pass: Target does not support machine combiner\n"); + return false; + } + + bool Changed = false; + + // Try to combine instructions. + for (auto &MBB : MF) + Changed |= combineInstructions(&MBB); + + return Changed; +} diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index 1bbf0ade6a5..93528e05d3a 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -1169,6 +1169,7 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const { return DepCycle; } +/// When bottom is set include instructions in current block in estimate. unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { // Find the limiting processor resource. // Numbers have been pre-scaled to be comparable. @@ -1185,7 +1186,9 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { // Convert to cycle count. PRMax = TE.MTM.getCycles(PRMax); + /// All instructions before current block unsigned Instrs = TBI.InstrDepth; + // plus instructions in current block if (Bottom) Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount; if (unsigned IW = TE.MTM.SchedModel.getIssueWidth()) @@ -1194,44 +1197,72 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { return std::max(Instrs, PRMax); } - -unsigned MachineTraceMetrics::Trace:: -getResourceLength(ArrayRef Extrablocks, - ArrayRef ExtraInstrs) const { +unsigned MachineTraceMetrics::Trace::getResourceLength( + ArrayRef Extrablocks, + ArrayRef ExtraInstrs, + ArrayRef RemoveInstrs) const { // Add up resources above and below the center block. ArrayRef PRDepths = TE.getProcResourceDepths(getBlockNum()); ArrayRef PRHeights = TE.getProcResourceHeights(getBlockNum()); unsigned PRMax = 0; - for (unsigned K = 0; K != PRDepths.size(); ++K) { - unsigned PRCycles = PRDepths[K] + PRHeights[K]; - for (unsigned I = 0; I != Extrablocks.size(); ++I) - PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K]; - for (unsigned I = 0; I != ExtraInstrs.size(); ++I) { - const MCSchedClassDesc* SC = ExtraInstrs[I]; + + // Capture computing cycles from extra instructions + auto extraCycles = [this](ArrayRef Instrs, + unsigned ResourceIdx) + ->unsigned { + unsigned Cycles = 0; + for (unsigned I = 0; I != Instrs.size(); ++I) { + const MCSchedClassDesc *SC = Instrs[I]; if (!SC->isValid()) continue; for (TargetSchedModel::ProcResIter - PI = TE.MTM.SchedModel.getWriteProcResBegin(SC), - PE = TE.MTM.SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) { - if (PI->ProcResourceIdx != K) + PI = TE.MTM.SchedModel.getWriteProcResBegin(SC), + PE = TE.MTM.SchedModel.getWriteProcResEnd(SC); + PI != PE; ++PI) { + if (PI->ProcResourceIdx != ResourceIdx) continue; - PRCycles += (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(K)); + Cycles += + (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(ResourceIdx)); } } + return Cycles; + }; + + for (unsigned K = 0; K != PRDepths.size(); ++K) { + unsigned PRCycles = PRDepths[K] + PRHeights[K]; + for (unsigned I = 0; I != Extrablocks.size(); ++I) + PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K]; + PRCycles += extraCycles(ExtraInstrs, K); + PRCycles -= extraCycles(RemoveInstrs, K); PRMax = std::max(PRMax, PRCycles); } // Convert to cycle count. PRMax = TE.MTM.getCycles(PRMax); + // Instrs: #instructions in current trace outside current block. unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight; + // Add instruction count from the extra blocks. for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i) Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount; + Instrs += ExtraInstrs.size(); + Instrs -= RemoveInstrs.size(); if (unsigned IW = TE.MTM.SchedModel.getIssueWidth()) Instrs /= IW; // Assume issue width 1 without a schedule model. return std::max(Instrs, PRMax); } +bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr *DefMI, + const MachineInstr *UseMI) const { + if (DefMI->getParent() == UseMI->getParent()) + return true; + + const TraceBlockInfo &DepTBI = TE.BlockInfo[DefMI->getParent()->getNumber()]; + const TraceBlockInfo &TBI = TE.BlockInfo[UseMI->getParent()->getNumber()]; + + return DepTBI.isUsefulDominator(TBI); +} + void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const { OS << getName() << " ensemble:\n"; for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) { diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp index b0f2ca68884..f42946f35ef 100644 --- a/lib/CodeGen/TargetSchedule.cpp +++ b/lib/CodeGen/TargetSchedule.cpp @@ -225,6 +225,28 @@ unsigned TargetSchedModel::computeOperandLatency( return DefMI->isTransient() ? 0 : TII->defaultDefLatency(&SchedModel, DefMI); } +unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const { + assert(hasInstrSchedModel() && "Only call this function with a SchedModel"); + + unsigned SCIdx = TII->get(Opcode).getSchedClass(); + const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SCIdx); + unsigned Latency = 0; + + if (SCDesc->isValid() && !SCDesc->isVariant()) { + for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries; + DefIdx != DefEnd; ++DefIdx) { + // Lookup the definition's write latency in SubtargetInfo. + const MCWriteLatencyEntry *WLEntry = + STI->getWriteLatencyEntry(SCDesc, DefIdx); + Latency = std::max(Latency, capLatency(WLEntry->Cycles)); + } + return Latency; + } + + assert(Latency && "No MI sched latency"); + return 0; +} + unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI, bool UseDefaultDefLatency) const {