From: Dan Gohman Date: Tue, 16 Dec 2008 03:35:01 +0000 (+0000) Subject: Add initial support for back-scheduling address computations, X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=8749b61178228ba1fb2668034d79da1b247173d7;p=oota-llvm.git Add initial support for back-scheduling address computations, especially in the case of addresses computed from loop induction variables. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@61075 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index ee640e24a1e..b0164cee556 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -242,6 +242,7 @@ namespace llvm { bool isPending : 1; // True once pending. bool isAvailable : 1; // True once available. bool isScheduled : 1; // True once scheduled. + bool isScheduleHigh : 1; // True if preferable to schedule high. private: bool isDepthCurrent : 1; // True if Depth is current. bool isHeightCurrent : 1; // True if Height is current. @@ -258,7 +259,7 @@ namespace llvm { Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), isPending(false), isAvailable(false), isScheduled(false), - isDepthCurrent(false), isHeightCurrent(false), + isScheduleHigh(false), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), CopyDstRC(NULL), CopySrcRC(NULL) {} @@ -269,7 +270,7 @@ namespace llvm { Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), isPending(false), isAvailable(false), isScheduled(false), - isDepthCurrent(false), isHeightCurrent(false), + isScheduleHigh(false), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), CopyDstRC(NULL), CopySrcRC(NULL) {} diff --git a/include/llvm/Target/TargetSubtarget.h b/include/llvm/Target/TargetSubtarget.h index fde8f44669a..eca45eb0d74 100644 --- a/include/llvm/Target/TargetSubtarget.h +++ b/include/llvm/Target/TargetSubtarget.h @@ -29,6 +29,12 @@ protected: // Can only create subclasses... TargetSubtarget(); public: virtual ~TargetSubtarget(); + + /// getSpecialAddressLatency - For targets where it is beneficial to + /// backschedule instructions that compute addresses, return a value + /// indicating the number of scheduling cycles of backscheduling that + /// should be attempted. + virtual unsigned getSpecialAddressLatency() const { return 0; } }; } // End llvm namespace diff --git a/lib/CodeGen/LatencyPriorityQueue.cpp b/lib/CodeGen/LatencyPriorityQueue.cpp index 7ad9ac9b51c..2e7b89c494f 100644 --- a/lib/CodeGen/LatencyPriorityQueue.cpp +++ b/lib/CodeGen/LatencyPriorityQueue.cpp @@ -19,6 +19,14 @@ using namespace llvm; bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const { + // The isScheduleHigh flag allows nodes with wraparound dependencies that + // cannot easily be modeled as edges with latencies to be scheduled as + // soon as possible in a top-down schedule. + if (LHS->isScheduleHigh && !RHS->isScheduleHigh) + return false; + if (!LHS->isScheduleHigh && RHS->isScheduleHigh) + return true; + unsigned LHSNum = LHS->NodeNum; unsigned RHSNum = RHS->NodeNum; diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index cfa639e940b..c3209006b97 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "sched-instrs" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/PseudoSourceValue.h" @@ -29,6 +30,59 @@ #include using namespace llvm; +namespace { + class VISIBILITY_HIDDEN LoopDependencies { + const MachineLoopInfo &MLI; + const MachineDominatorTree &MDT; + + public: + typedef std::map > + LoopDeps; + LoopDeps Deps; + + LoopDependencies(const MachineLoopInfo &mli, + const MachineDominatorTree &mdt) : + MLI(mli), MDT(mdt) {} + + void VisitLoop(const MachineLoop *Loop) { + Deps.clear(); + MachineBasicBlock *Header = Loop->getHeader(); + SmallSet LoopLiveIns; + for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(), + LE = Header->livein_end(); LI != LE; ++LI) + LoopLiveIns.insert(*LI); + + VisitRegion(MDT.getNode(Header), Loop, LoopLiveIns); + } + + private: + void VisitRegion(const MachineDomTreeNode *Node, + const MachineLoop *Loop, + const SmallSet &LoopLiveIns) { + MachineBasicBlock *MBB = Node->getBlock(); + if (!Loop->contains(MBB)) return; + + unsigned Count = 0; + for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I, ++Count) { + const MachineInstr *MI = I; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned MOReg = MO.getReg(); + if (LoopLiveIns.count(MOReg)) + Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count))); + } + } + + const std::vector &Children = Node->getChildren(); + for (unsigned I = 0, E = Children.size(); I != E; ++I) + VisitRegion(Children[I], Loop, LoopLiveIns); + } + }; +} + ScheduleDAGInstrs::ScheduleDAGInstrs(MachineBasicBlock *bb, const TargetMachine &tm, const MachineLoopInfo &mli, @@ -65,9 +119,27 @@ void ScheduleDAGInstrs::BuildSchedUnits() { // all the work of the block is done before the terminator. SUnit *Terminator = 0; + LoopDependencies LoopRegs(MLI, MDT); + + // Track which regs are live into a loop, to help guide back-edge-aware + // scheduling. + SmallSet LoopLiveInRegs; + if (MachineLoop *ML = MLI.getLoopFor(BB)) + if (BB == ML->getLoopLatch()) { + MachineBasicBlock *Header = ML->getHeader(); + for (MachineBasicBlock::livein_iterator I = Header->livein_begin(), + E = Header->livein_end(); I != E; ++I) + LoopLiveInRegs.insert(*I); + LoopRegs.VisitLoop(ML); + } + // Check to see if the scheduler cares about latencies. bool UnitLatencies = ForceUnitLatencies(); + // Ask the target if address-backscheduling is desirable, and if so how much. + unsigned SpecialAddressLatency = + TM.getSubtarget().getSpecialAddressLatency(); + for (MachineBasicBlock::iterator MII = BB->end(), MIE = BB->begin(); MII != MIE; --MII) { MachineInstr *MI = prior(MII); @@ -118,7 +190,21 @@ void ScheduleDAGInstrs::BuildSchedUnits() { for (unsigned i = 0, e = UseList.size(); i != e; ++i) { SUnit *UseSU = UseList[i]; if (UseSU != SU) { - UseSU->addPred(SDep(SU, SDep::Data, DataLatency, Reg)); + unsigned LDataLatency = DataLatency; + // Optionally add in a special extra latency for nodes that + // feed addresses. + // TODO: Do this for register aliases too. + if (SpecialAddressLatency != 0 && !UnitLatencies) { + MachineInstr *UseMI = UseSU->getInstr(); + const TargetInstrDesc &UseTID = UseMI->getDesc(); + int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg); + assert(RegUseIndex >= 0 && "UseMI doesn's use register!"); + if ((UseTID.mayLoad() || UseTID.mayStore()) && + (unsigned)RegUseIndex < UseTID.getNumOperands() && + UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass()) + LDataLatency += SpecialAddressLatency; + } + UseSU->addPred(SDep(SU, SDep::Data, LDataLatency, Reg)); } } for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { @@ -130,6 +216,51 @@ void ScheduleDAGInstrs::BuildSchedUnits() { } } + // If a def is going to wrap back around to the top of the loop, + // backschedule it. + // TODO: Blocks in loops without terminators can benefit too. + if (!UnitLatencies && Terminator && DefList.empty()) { + LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg); + if (I != LoopRegs.Deps.end()) { + const MachineOperand *UseMO = I->second.first; + unsigned Count = I->second.second; + const MachineInstr *UseMI = UseMO->getParent(); + unsigned UseMOIdx = UseMO - &UseMI->getOperand(0); + const TargetInstrDesc &UseTID = UseMI->getDesc(); + // TODO: If we knew the total depth of the region here, we could + // handle the case where the whole loop is inside the region but + // is large enough that the isScheduleHigh trick isn't needed. + if (UseMOIdx < UseTID.getNumOperands()) { + // Currently, we only support scheduling regions consisting of + // single basic blocks. Check to see if the instruction is in + // the same region by checking to see if it has the same parent. + if (UseMI->getParent() != MI->getParent()) { + unsigned Latency = SU->Latency; + if (UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) + Latency += SpecialAddressLatency; + // This is a wild guess as to the portion of the latency which + // will be overlapped by work done outside the current + // scheduling region. + Latency -= std::min(Latency, Count); + // Add the artifical edge. + Terminator->addPred(SDep(SU, SDep::Order, Latency, + /*Reg=*/0, /*isNormalMemory=*/false, + /*isMustAlias=*/false, + /*isArtificial=*/true)); + } else if (SpecialAddressLatency > 0 && + UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) { + // The entire loop body is within the current scheduling region + // and the latency of this operation is assumed to be greater + // than the latency of the loop. + // TODO: Recursively mark data-edge predecessors as + // isScheduleHigh too. + SU->isScheduleHigh = true; + } + } + LoopRegs.Deps.erase(I); + } + } + UseList.clear(); if (!MO.isDead()) DefList.clear(); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 2924d33a3b7..3c8f489c372 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -93,6 +93,17 @@ const char *X86Subtarget::getBZeroEntry() const { return 0; } +/// getSpecialAddressLatency - For targets where it is beneficial to +/// backschedule instructions that compute addresses, return a value +/// indicating the number of scheduling cycles of backscheduling that +/// should be attempted. +unsigned X86Subtarget::getSpecialAddressLatency() const { + // For x86 out-of-order targets, back-schedule address computations so + // that loads and stores aren't blocked. + // This value was chosen arbitrarily. + return 200; +} + /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the /// specified arguments. If we can't run cpuid on the host, return true. bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 5eb89d605d1..f405ac798bb 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -191,6 +191,12 @@ public: /// memset with zero passed as the second argument. Otherwise it /// returns null. const char *getBZeroEntry() const; + + /// getSpecialAddressLatency - For targets where it is beneficial to + /// backschedule instructions that compute addresses, return a value + /// indicating the number of scheduling cycles of backscheduling that + /// should be attempted. + unsigned getSpecialAddressLatency() const; }; namespace X86 {