Fixed the comment. No functionality change.

[oota-llvm.git] / lib / CodeGen / ScheduleDAGInstrs.cpp
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp

index 06d8ed9b25d0a89330dffd1c3b7f9cc7e0ce055b..8e18b3d17fda28e37b3a04a825690cdf6e5ffbd5 100644 (file)
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -1,4 +1,4 @@
-//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===//
+//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
  //
  //                     The LLVM Compiler Infrastructure
  //
@@ -7,40 +7,174 @@
  //
  //===----------------------------------------------------------------------===//
  //
-// This implements the ScheduleDAG class, which is a base class used by
-// scheduling implementation classes.
+// This implements the ScheduleDAGInstrs class, which implements re-scheduling
+// of MachineInstrs.
  //
  //===----------------------------------------------------------------------===//
  
  #define DEBUG_TYPE "sched-instrs"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "ScheduleDAGInstrs.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetInstrInfo.h"
  #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtarget.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallSet.h"
  using namespace llvm;
  
-ScheduleDAGInstrs::ScheduleDAGInstrs(MachineBasicBlock *bb,
-                                     const TargetMachine &tm)
-  : ScheduleDAG(0, bb, tm) {}
+ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
+                                     const MachineLoopInfo &mli,
+                                     const MachineDominatorTree &mdt)
+  : ScheduleDAG(mf), MLI(mli), MDT(mdt), LoopRegs(MLI, MDT) {}
  
-void ScheduleDAGInstrs::BuildSchedUnits() {
-  SUnits.clear();
+/// Run - perform scheduling.
+///
+void ScheduleDAGInstrs::Run(MachineBasicBlock *bb,
+                            MachineBasicBlock::iterator begin,
+                            MachineBasicBlock::iterator end,
+                            unsigned endcount) {
+  BB = bb;
+  Begin = begin;
+  InsertPosIndex = endcount;
+
+  ScheduleDAG::Run(bb, end);
+}
+
+/// getOpcode - If this is an Instruction or a ConstantExpr, return the
+/// opcode value. Otherwise return UserOp1.
+static unsigned getOpcode(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    return I->getOpcode();
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    return CE->getOpcode();
+  // Use UserOp1 to mean there's no opcode.
+  return Instruction::UserOp1;
+}
+
+/// getUnderlyingObjectFromInt - This is the function that does the work of
+/// looking through basic ptrtoint+arithmetic+inttoptr sequences.
+static const Value *getUnderlyingObjectFromInt(const Value *V) {
+  do {
+    if (const User *U = dyn_cast<User>(V)) {
+      // If we find a ptrtoint, we can transfer control back to the
+      // regular getUnderlyingObject.
+      if (getOpcode(U) == Instruction::PtrToInt)
+        return U->getOperand(0);
+      // If we find an add of a constant or a multiplied value, it's
+      // likely that the other operand will lead us to the base
+      // object. We don't have to worry about the case where the
+      // object address is somehow being computed by the multiply,
+      // because our callers only care when the result is an
+      // identifiable object.
+      if (getOpcode(U) != Instruction::Add ||
+          (!isa<ConstantInt>(U->getOperand(1)) &&
+           getOpcode(U->getOperand(1)) != Instruction::Mul))
+        return V;
+      V = U->getOperand(0);
+    } else {
+      return V;
+    }
+    assert(isa<IntegerType>(V->getType()) && "Unexpected operand type!");
+  } while (1);
+}
+
+/// getUnderlyingObject - This is a wrapper around Value::getUnderlyingObject
+/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences.
+static const Value *getUnderlyingObject(const Value *V) {
+  // First just call Value::getUnderlyingObject to let it do what it does.
+  do {
+    V = V->getUnderlyingObject();
+    // If it found an inttoptr, use special code to continue climbing.
+    if (getOpcode(V) != Instruction::IntToPtr)
+      break;
+    const Value *O = getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
+    // If that succeeded in finding a pointer, continue the search.
+    if (!isa<PointerType>(O->getType()))
+      break;
+    V = O;
+  } while (1);
+  return V;
+}
+
+/// getUnderlyingObjectForInstr - If this machine instr has memory reference
+/// information and it can be tracked to a normal reference to a known
+/// object, return the Value for that object. Otherwise return null.
+static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI) {
+  if (!MI->hasOneMemOperand() ||
+      !MI->memoperands_begin()->getValue() ||
+      MI->memoperands_begin()->isVolatile())
+    return 0;
+
+  const Value *V = MI->memoperands_begin()->getValue();
+  if (!V)
+    return 0;
+
+  V = getUnderlyingObject(V);
+  if (!isa<PseudoSourceValue>(V) && !isIdentifiedObject(V))
+    return 0;
+
+  return V;
+}
+
+void ScheduleDAGInstrs::StartBlock(MachineBasicBlock *BB) {
+  if (MachineLoop *ML = MLI.getLoopFor(BB))
+    if (BB == ML->getLoopLatch()) {
+      MachineBasicBlock *Header = ML->getHeader();
+      for (MachineBasicBlock::livein_iterator I = Header->livein_begin(),
+           E = Header->livein_end(); I != E; ++I)
+        LoopLiveInRegs.insert(*I);
+      LoopRegs.VisitLoop(ML);
+    }
+}
+
+void ScheduleDAGInstrs::BuildSchedGraph() {
+  // We'll be allocating one SUnit for each instruction, plus one for
+  // the region exit node.
    SUnits.reserve(BB->size());
  
-  std::vector<SUnit *> PendingLoads;
-  SUnit *Terminator = 0;
+  // We build scheduling units by walking a block's instruction list from bottom
+  // to top.
+
+  // Remember where a generic side-effecting instruction is as we proceed. If
+  // ChainMMO is null, this is assumed to have arbitrary side-effects. If
+  // ChainMMO is non-null, then Chain makes only a single memory reference.
    SUnit *Chain = 0;
-  SUnit *Defs[TargetRegisterInfo::FirstVirtualRegister] = {};
-  std::vector<SUnit *> Uses[TargetRegisterInfo::FirstVirtualRegister] = {};
-  int Cost = 1; // FIXME
+  MachineMemOperand *ChainMMO = 0;
+
+  // Memory references to specific known memory locations are tracked so that
+  // they can be given more precise dependencies.
+  std::map<const Value *, SUnit *> MemDefs;
+  std::map<const Value *, std::vector<SUnit *> > MemUses;
+
+  // Check to see if the scheduler cares about latencies.
+  bool UnitLatencies = ForceUnitLatencies();
  
-  for (MachineBasicBlock::iterator MII = BB->end(), MIE = BB->begin();
+  // Ask the target if address-backscheduling is desirable, and if so how much.
+  unsigned SpecialAddressLatency =
+    TM.getSubtarget<TargetSubtarget>().getSpecialAddressLatency();
+
+  // Walk the list of instructions, from bottom moving up.
+  for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin;
         MII != MIE; --MII) {
      MachineInstr *MI = prior(MII);
+    const TargetInstrDesc &TID = MI->getDesc();
+    assert(!TID.isTerminator() && !MI->isLabel() &&
+           "Cannot schedule terminators or labels!");
+    // Create the SUnit for this MI.
      SUnit *SU = NewSUnit(MI);
  
+    // Assign the Latency field of SU using target-provided information.
+    if (UnitLatencies)
+      SU->Latency = 1;
+    else
+      ComputeLatency(SU);
+
+    // Add register-based dependencies (data, anti, and output).
      for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
        const MachineOperand &MO = MI->getOperand(j);
        if (!MO.isReg()) continue;
@@ -49,57 +183,227 @@ void ScheduleDAGInstrs::BuildSchedUnits() {
  
        assert(TRI->isPhysicalRegister(Reg) && "Virtual register encountered!");
        std::vector<SUnit *> &UseList = Uses[Reg];
-      SUnit *&Def = Defs[Reg];
-      // Optionally add output and anti dependences.
-      if (Def && Def != SU)
-        Def->addPred(SU, /*isCtrl=*/true, /*isSpecial=*/false,
-                     /*PhyReg=*/Reg, Cost);
+      std::vector<SUnit *> &DefList = Defs[Reg];
+      // Optionally add output and anti dependencies.
+      // TODO: Using a latency of 1 here assumes there's no cost for
+      //       reusing registers.
+      SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
+      for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
+        SUnit *DefSU = DefList[i];
+        if (DefSU != SU &&
+            (Kind != SDep::Output || !MO.isDead() ||
+             !DefSU->getInstr()->registerDefIsDead(Reg)))
+          DefSU->addPred(SDep(SU, Kind, /*Latency=*/1, /*Reg=*/Reg));
+      }
        for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
-        SUnit *&Def = Defs[*Alias];
-        if (Def && Def != SU)
-          Def->addPred(SU, /*isCtrl=*/true, /*isSpecial=*/false,
-                       /*PhyReg=*/*Alias, Cost);
+        std::vector<SUnit *> &DefList = Defs[*Alias];
+        for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
+          SUnit *DefSU = DefList[i];
+          if (DefSU != SU &&
+              (Kind != SDep::Output || !MO.isDead() ||
+               !DefSU->getInstr()->registerDefIsDead(Reg)))
+            DefSU->addPred(SDep(SU, Kind, /*Latency=*/1, /*Reg=*/ *Alias));
+        }
        }
  
        if (MO.isDef()) {
          // Add any data dependencies.
-        for (unsigned i = 0, e = UseList.size(); i != e; ++i)
-          if (UseList[i] != SU)
-            UseList[i]->addPred(SU, /*isCtrl=*/false, /*isSpecial=*/false,
-                                /*PhysReg=*/Reg, Cost);
+        unsigned DataLatency = SU->Latency;
+        for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
+          SUnit *UseSU = UseList[i];
+          if (UseSU != SU) {
+            unsigned LDataLatency = DataLatency;
+            // Optionally add in a special extra latency for nodes that
+            // feed addresses.
+            // TODO: Do this for register aliases too.
+            if (SpecialAddressLatency != 0 && !UnitLatencies) {
+              MachineInstr *UseMI = UseSU->getInstr();
+              const TargetInstrDesc &UseTID = UseMI->getDesc();
+              int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg);
+              assert(RegUseIndex >= 0 && "UseMI doesn's use register!");
+              if ((UseTID.mayLoad() || UseTID.mayStore()) &&
+                  (unsigned)RegUseIndex < UseTID.getNumOperands() &&
+                  UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass())
+                LDataLatency += SpecialAddressLatency;
+            }
+            UseSU->addPred(SDep(SU, SDep::Data, LDataLatency, Reg));
+          }
+        }
          for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
            std::vector<SUnit *> &UseList = Uses[*Alias];
-          for (unsigned i = 0, e = UseList.size(); i != e; ++i)
-            if (UseList[i] != SU)
-              UseList[i]->addPred(SU, /*isCtrl=*/false, /*isSpecial=*/false,
-                                  /*PhysReg=*/*Alias, Cost);
+          for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
+            SUnit *UseSU = UseList[i];
+            if (UseSU != SU)
+              UseSU->addPred(SDep(SU, SDep::Data, DataLatency, *Alias));
+          }
+        }
+
+        // If a def is going to wrap back around to the top of the loop,
+        // backschedule it.
+        if (!UnitLatencies && DefList.empty()) {
+          LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg);
+          if (I != LoopRegs.Deps.end()) {
+            const MachineOperand *UseMO = I->second.first;
+            unsigned Count = I->second.second;
+            const MachineInstr *UseMI = UseMO->getParent();
+            unsigned UseMOIdx = UseMO - &UseMI->getOperand(0);
+            const TargetInstrDesc &UseTID = UseMI->getDesc();
+            // TODO: If we knew the total depth of the region here, we could
+            // handle the case where the whole loop is inside the region but
+            // is large enough that the isScheduleHigh trick isn't needed.
+            if (UseMOIdx < UseTID.getNumOperands()) {
+              // Currently, we only support scheduling regions consisting of
+              // single basic blocks. Check to see if the instruction is in
+              // the same region by checking to see if it has the same parent.
+              if (UseMI->getParent() != MI->getParent()) {
+                unsigned Latency = SU->Latency;
+                if (UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass())
+                  Latency += SpecialAddressLatency;
+                // This is a wild guess as to the portion of the latency which
+                // will be overlapped by work done outside the current
+                // scheduling region.
+                Latency -= std::min(Latency, Count);
+                // Add the artificial edge.
+                ExitSU.addPred(SDep(SU, SDep::Order, Latency,
+                                    /*Reg=*/0, /*isNormalMemory=*/false,
+                                    /*isMustAlias=*/false,
+                                    /*isArtificial=*/true));
+              } else if (SpecialAddressLatency > 0 &&
+                         UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) {
+                // The entire loop body is within the current scheduling region
+                // and the latency of this operation is assumed to be greater
+                // than the latency of the loop.
+                // TODO: Recursively mark data-edge predecessors as
+                //       isScheduleHigh too.
+                SU->isScheduleHigh = true;
+              }
+            }
+            LoopRegs.Deps.erase(I);
+          }
          }
  
          UseList.clear();
-        Def = SU;
+        if (!MO.isDead())
+          DefList.clear();
+        DefList.push_back(SU);
        } else {
          UseList.push_back(SU);
        }
      }
-    bool False = false;
-    bool True = true;
-    if (!MI->isSafeToMove(TII, False)) {
+
+    // Add chain dependencies.
+    // Note that isStoreToStackSlot and isLoadFromStackSlot are not usable
+    // after stack slots are lowered to actual addresses.
+    // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
+    // produce more precise dependence information.
+    if (TID.isCall() || TID.hasUnmodeledSideEffects()) {
+    new_chain:
+      // This is the conservative case. Add dependencies on all memory
+      // references.
        if (Chain)
-        Chain->addPred(SU, /*isCtrl=*/false, /*isSpecial=*/false);
+        Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
+      Chain = SU;
        for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-        PendingLoads[k]->addPred(SU, /*isCtrl=*/false, /*isSpecial=*/false);
+        PendingLoads[k]->addPred(SDep(SU, SDep::Order, SU->Latency));
        PendingLoads.clear();
-      Chain = SU;
-    } else if (!MI->isSafeToMove(TII, True)) {
-      if (Chain)
-        Chain->addPred(SU, /*isCtrl=*/false, /*isSpecial=*/false);
-      PendingLoads.push_back(SU);
+      for (std::map<const Value *, SUnit *>::iterator I = MemDefs.begin(),
+           E = MemDefs.end(); I != E; ++I) {
+        I->second->addPred(SDep(SU, SDep::Order, SU->Latency));
+        I->second = SU;
+      }
+      for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
+           MemUses.begin(), E = MemUses.end(); I != E; ++I) {
+        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
+          I->second[i]->addPred(SDep(SU, SDep::Order, SU->Latency));
+        I->second.clear();
+      }
+      // See if it is known to just have a single memory reference.
+      MachineInstr *ChainMI = Chain->getInstr();
+      const TargetInstrDesc &ChainTID = ChainMI->getDesc();
+      if (!ChainTID.isCall() &&
+          !ChainTID.hasUnmodeledSideEffects() &&
+          ChainMI->hasOneMemOperand() &&
+          !ChainMI->memoperands_begin()->isVolatile() &&
+          ChainMI->memoperands_begin()->getValue())
+        // We know that the Chain accesses one specific memory location.
+        ChainMMO = &*ChainMI->memoperands_begin();
+      else
+        // Unknown memory accesses. Assume the worst.
+        ChainMMO = 0;
+    } else if (TID.mayStore()) {
+      if (const Value *V = getUnderlyingObjectForInstr(MI)) {
+        // A store to a specific underlying object. Add precise dependencies.
+        // Handle the def in MemDefs, if there is one.
+        std::map<const Value *, SUnit *>::iterator I = MemDefs.find(V);
+        if (I != MemDefs.end()) {
+          I->second->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
+                                  /*isNormalMemory=*/true));
+          I->second = SU;
+        } else {
+          MemDefs[V] = SU;
+        }
+        // Handle the uses in MemUses, if there are any.
+        std::map<const Value *, std::vector<SUnit *> >::iterator J =
+          MemUses.find(V);
+        if (J != MemUses.end()) {
+          for (unsigned i = 0, e = J->second.size(); i != e; ++i)
+            J->second[i]->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
+                                       /*isNormalMemory=*/true));
+          J->second.clear();
+        }
+        // Add dependencies from all the PendingLoads, since without
+        // memoperands we must assume they alias anything.
+        for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
+          PendingLoads[k]->addPred(SDep(SU, SDep::Order, SU->Latency));
+        // Add a general dependence too, if needed.
+        if (Chain)
+          Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
+      } else
+        // Treat all other stores conservatively.
+        goto new_chain;
+    } else if (TID.mayLoad()) {
+      if (TII->isInvariantLoad(MI)) {
+        // Invariant load, no chain dependencies needed!
+      } else if (const Value *V = getUnderlyingObjectForInstr(MI)) {
+        // A load from a specific underlying object. Add precise dependencies.
+        std::map<const Value *, SUnit *>::iterator I = MemDefs.find(V);
+        if (I != MemDefs.end())
+          I->second->addPred(SDep(SU, SDep::Order, SU->Latency, /*Reg=*/0,
+                                  /*isNormalMemory=*/true));
+        MemUses[V].push_back(SU);
+
+        // Add a general dependence too, if needed.
+        if (Chain && (!ChainMMO ||
+                      (ChainMMO->isStore() || ChainMMO->isVolatile())))
+          Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
+      } else if (MI->hasVolatileMemoryRef()) {
+        // Treat volatile loads conservatively. Note that this includes
+        // cases where memoperand information is unavailable.
+        goto new_chain;
+      } else {
+        // A normal load. Depend on the general chain, as well as on
+        // all stores. In the absence of MachineMemOperand information,
+        // we can't even assume that the load doesn't alias well-behaved
+        // memory locations.
+        if (Chain)
+          Chain->addPred(SDep(SU, SDep::Order, SU->Latency));
+        for (std::map<const Value *, SUnit *>::iterator I = MemDefs.begin(),
+             E = MemDefs.end(); I != E; ++I)
+          I->second->addPred(SDep(SU, SDep::Order, SU->Latency));
+        PendingLoads.push_back(SU);
+      }
      }
-    if (Terminator && SU->Succs.empty())
-      Terminator->addPred(SU, /*isCtrl=*/false, /*isSpecial=*/false);
-    if (MI->getDesc().isTerminator() || MI->isLabel())
-      Terminator = SU;
    }
+
+  for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+    Defs[i].clear();
+    Uses[i].clear();
+  }
+  PendingLoads.clear();
+}
+
+void ScheduleDAGInstrs::FinishBlock() {
+  // Nothing to do.
  }
  
  void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
@@ -109,6 +413,12 @@ void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
    // all nodes flagged together into this SUnit.
    SU->Latency =
      InstrItins.getLatency(SU->getInstr()->getDesc().getSchedClass());
+
+  // Simplistic target-independent heuristic: assume that loads take
+  // extra time.
+  if (InstrItins.isEmpty())
+    if (SU->getInstr()->getDesc().mayLoad())
+      SU->Latency += 2;
  }
  
  void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
@@ -118,7 +428,12 @@ void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
  std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
    std::string s;
    raw_string_ostream oss(s);
-  SU->getInstr()->print(oss);
+  if (SU == &EntrySU)
+    oss << "<entry>";
+  else if (SU == &ExitSU)
+    oss << "<exit>";
+  else
+    SU->getInstr()->print(oss);
    return oss.str();
  }
  
@@ -126,9 +441,13 @@ std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
  MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() {
    // For MachineInstr-based scheduling, we're rescheduling the instructions in
    // the block, so start by removing them from the block.
-  while (!BB->empty())
-    BB->remove(BB->begin());
+  while (Begin != InsertPos) {
+    MachineBasicBlock::iterator I = Begin;
+    ++Begin;
+    BB->remove(I);
+  }
  
+  // Then re-insert them according to the given schedule.
    for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
      SUnit *SU = Sequence[i];
      if (!SU) {
@@ -137,8 +456,13 @@ MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() {
        continue;
      }
  
-    BB->push_back(SU->getInstr());
+    BB->insert(InsertPos, SU->getInstr());
    }
  
+  // Update the Begin iterator, as the first instruction in the block
+  // may have been scheduled later.
+  if (!Sequence.empty())
+    Begin = Sequence[0]->getInstr();
+
    return BB;
  }