Reenable tail duplication of bb with just an unconditional jump, but

[oota-llvm.git] / lib / CodeGen / SelectionDAG / ScheduleDAGRRList.cpp
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp

index ac2f3d5c8510871b4d786574ffe446ff64680956..a827187e357e524d9d2d154cd5a47ffdf70f3709 100644 (file)
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -71,6 +71,7 @@ static cl::opt<bool> DisableSchedCycles(
    cl::desc("Disable cycle-level precision during preRA scheduling"));
  
  // Temporary sched=list-ilp flags until the heuristics are robust.
+// Some options are also available under sched=list-hybrid.
  static cl::opt<bool> DisableSchedRegPressure(
    "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
    cl::desc("Disable regpressure priority in sched=list-ilp"));
@@ -80,6 +81,9 @@ static cl::opt<bool> DisableSchedLiveUses(
  static cl::opt<bool> DisableSchedVRegCycle(
    "disable-sched-vrcycle", cl::Hidden, cl::init(false),
    cl::desc("Disable virtual register cycle interference checks"));
+static cl::opt<bool> DisableSchedPhysRegJoin(
+  "disable-sched-physreg-join", cl::Hidden, cl::init(false),
+  cl::desc("Disable physreg def-use affinity"));
  static cl::opt<bool> DisableSchedStalls(
    "disable-sched-stalls", cl::Hidden, cl::init(true),
    cl::desc("Disable no-stall priority in sched=list-ilp"));
@@ -272,6 +276,43 @@ private:
  };
  }  // end anonymous namespace
  
+/// GetCostForDef - Looks up the register class and cost for a given definition.
+/// Results are returned through the RegClass (class ID) and Cost out-params.
+/// Typed values use TLI's representative register class and its cost; for
+/// untyped values (MVT::untyped) the node's machine opcode decides the class.
+static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
+                          const TargetLowering *TLI,
+                          const TargetInstrInfo *TII,
+                          const TargetRegisterInfo *TRI,
+                          unsigned &RegClass, unsigned &Cost) {
+  EVT VT = RegDefPos.GetValue();
+
+  // Special handling for untyped values.  These values can only come from
+  // the expansion of custom DAG-to-DAG patterns.
+  if (VT == MVT::untyped) {
+    const SDNode *Node = RegDefPos.GetNode();
+    unsigned Opcode = Node->getMachineOpcode();
+
+    if (Opcode == TargetOpcode::REG_SEQUENCE) {
+      unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+      const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
+      RegClass = RC->getID();
+      Cost = 1;
+      return;
+    }
+    // Bind the descriptor by const reference instead of copying it.
+    unsigned Idx = RegDefPos.GetIdx();
+    const TargetInstrDesc &Desc = TII->get(Opcode);
+    const TargetRegisterClass *RC = Desc.getRegClass(Idx, TRI);
+    RegClass = RC->getID();
+    // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a
+    // better way to determine it.
+    Cost = 1;
+  } else {
+    RegClass = TLI->getRepRegClassFor(VT)->getID();
+    Cost = TLI->getRepRegClassCostFor(VT);
+  }
+}
  
  /// Schedule - Schedule the DAG using list scheduling.
  void ScheduleDAGRRList::Schedule() {
@@ -1004,14 +1045,15 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
    for (const unsigned *AliasI = TRI->getOverlaps(Reg); *AliasI; ++AliasI) {
  
      // Check if Ref is live.
-    if (!LiveRegDefs[Reg]) continue;
+    if (!LiveRegDefs[*AliasI]) continue;
  
      // Allow multiple uses of the same def.
-    if (LiveRegDefs[Reg] == SU) continue;
+    if (LiveRegDefs[*AliasI] == SU) continue;
  
      // Add Reg to the set of interfering live regs.
-    if (RegAdded.insert(Reg))
-      LRegs.push_back(Reg);
+    if (RegAdded.insert(*AliasI)) {
+      LRegs.push_back(*AliasI);
+    }
    }
  }
  
@@ -1364,6 +1406,21 @@ struct queue_sort : public std::binary_function<SUnit*, SUnit*, bool> {
    bool isReady(SUnit* SU, unsigned CurCycle) const { return true; }
  };
  
+#ifndef NDEBUG
+template<class SF>
+struct reverse_sort : public queue_sort {
+  SF &SortFunc; // Wrapped comparator, held by reference.
+  reverse_sort(SF &sf) : SortFunc(sf) {}
+  reverse_sort(const reverse_sort &RHS) : SortFunc(RHS.SortFunc) {}
+  /// Returns SortFunc's answer for the two operands in swapped order.
+  bool operator()(SUnit* left, SUnit* right) const {
+    // Reverse left/right rather than simply returning !SortFunc(left, right)
+    // to expose different paths in the wrapped comparison logic.
+    return SortFunc(right, left);
+  }
+};
+#endif // NDEBUG
+
  /// bu_ls_rr_sort - Priority function for bottom up register pressure
  // reduction scheduler.
  struct bu_ls_rr_sort : public queue_sort {
@@ -1564,20 +1621,33 @@ protected:
  };
  
  template<class SF>
-class RegReductionPriorityQueue : public RegReductionPQBase {
-  static SUnit *popFromQueue(std::vector<SUnit*> &Q, SF &Picker) {
-    std::vector<SUnit *>::iterator Best = Q.begin();
-    for (std::vector<SUnit *>::iterator I = llvm::next(Q.begin()),
-           E = Q.end(); I != E; ++I)
-      if (Picker(*Best, *I))
-        Best = I;
-    SUnit *V = *Best;
-    if (Best != prior(Q.end()))
-      std::swap(*Best, Q.back());
-    Q.pop_back();
-    return V;
+static SUnit *popFromQueueImpl(std::vector<SUnit*> &Q, SF &Picker) {
+  std::vector<SUnit *>::iterator Best = Q.begin();
+  for (std::vector<SUnit *>::iterator I = llvm::next(Q.begin()),
+         E = Q.end(); I != E; ++I)
+    if (Picker(*Best, *I))
+      Best = I;
+  SUnit *V = *Best;
+  if (Best != prior(Q.end()))
+    std::swap(*Best, Q.back());
+  Q.pop_back();
+  return V;
+}
+
+template<class SF>
+SUnit *popFromQueue(std::vector<SUnit*> &Q, SF &Picker, ScheduleDAG *DAG) {
+#ifndef NDEBUG
+  if (DAG->StressSched) {
+    reverse_sort<SF> RPicker(Picker);
+    return popFromQueueImpl(Q, RPicker);
    }
+#endif
+  (void)DAG;
+  return popFromQueueImpl(Q, Picker);
+}
  
+template<class SF>
+class RegReductionPriorityQueue : public RegReductionPQBase {
    SF Picker;
  
  public:
@@ -1598,7 +1668,7 @@ public:
    SUnit *pop() {
      if (Queue.empty()) return NULL;
  
-    SUnit *V = popFromQueue(Queue, Picker);
+    SUnit *V = popFromQueue(Queue, Picker, scheduleDAG);
      V->NodeQueueId = 0;
      return V;
    }
@@ -1608,7 +1678,7 @@ public:
      std::vector<SUnit*> DumpQueue = Queue;
      SF DumpPicker = Picker;
      while (!DumpQueue.empty()) {
-      SUnit *SU = popFromQueue(DumpQueue, DumpPicker);
+      SUnit *SU = popFromQueue(DumpQueue, DumpPicker, scheduleDAG);
        if (isBottomUp())
          dbgs() << "Height " << SU->getHeight() << ": ";
        else
@@ -1638,6 +1708,20 @@ ILPBURRPriorityQueue;
  //           Static Node Priority for Register Pressure Reduction
  //===----------------------------------------------------------------------===//
  
+// Give special nodes a fixed relative order that bypasses the scheduling
+// heuristics. The decision is driven solely by SUnit::isScheduleLow, which
+// currently pushes TokenFactor nodes down and may cover other pseudo-ops.
+//
+// Returns -1 to schedule right above left, 1 to schedule left above right,
+// and 0 when the two nodes carry the same flag (no bias).
+static int checkSpecialNodes(const SUnit *left, const SUnit *right) {
+  const bool LeftLow = left->isScheduleLow;
+  const bool RightLow = right->isScheduleLow;
+  if (LeftLow == RightLow)
+    return 0;
+  return LeftLow < RightLow ? 1 : -1;
+}
+
  /// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
  /// Smaller number is the higher priority.
  static unsigned
@@ -1714,7 +1798,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
      // If SU does not have a register def, schedule it close to its uses
      // because it does not lengthen any live ranges.
      return 0;
+#if 1
    return SethiUllmanNumbers[SU->NodeNum];
+#else
+  unsigned Priority = SethiUllmanNumbers[SU->NodeNum];
+  if (SU->isCallOp) {
+    // FIXME: This assumes all of the defs are used as call operands.
+    int NP = (int)Priority - SU->getNode()->getNumValues();
+    return (NP > 0) ? NP : 0;
+  }
+  return Priority;
+#endif
  }
  
  //===----------------------------------------------------------------------===//
@@ -1749,9 +1843,9 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
      }
      for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
           RegDefPos.IsValid(); RegDefPos.Advance()) {
-      EVT VT = RegDefPos.GetValue();
-      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
-      unsigned Cost = TLI->getRepRegClassCostFor(VT);
+      unsigned RCId, Cost;
+      GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+
        if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
          return true;
      }
@@ -1862,9 +1956,10 @@ void RegReductionPQBase::ScheduledNode(SUnit *SU) {
           RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
        if (SkipRegDefs)
          continue;
-      EVT VT = RegDefPos.GetValue();
-      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
-      RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+
+      unsigned RCId, Cost;
+      GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+      RegPressure[RCId] += Cost;
        break;
      }
    }
@@ -1877,16 +1972,16 @@ void RegReductionPQBase::ScheduledNode(SUnit *SU) {
         RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
      if (SkipRegDefs > 0)
        continue;
-    EVT VT = RegDefPos.GetValue();
-    unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
-    if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT)) {
+    unsigned RCId, Cost;
+    GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+    if (RegPressure[RCId] < Cost) {
        // Register pressure tracking is imprecise. This can happen. But we try
        // hard not to let it happen because it likely results in poor scheduling.
        DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") has too many regdefs\n");
        RegPressure[RCId] = 0;
      }
      else {
-      RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
+      RegPressure[RCId] -= Cost;
      }
    }
    dumpRegPressure();
@@ -2198,24 +2293,55 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
  }
  
  static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
+  // Schedule physical register definitions close to their use. This is
+  // motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
+  // long as shortening physreg live ranges is generally good, we can defer
+  // creating a subtarget hook.
+  if (!DisableSchedPhysRegJoin) {
+    bool LHasPhysReg = left->hasPhysRegDefs;
+    bool RHasPhysReg = right->hasPhysRegDefs;
+    if (LHasPhysReg != RHasPhysReg) {
+      DEBUG(++FactorCount[FactRegUses]);
+      #ifndef NDEBUG
+      const char *PhysRegMsg[] = {" has no physreg", " defines a physreg"};
+      #endif
+      DEBUG(dbgs() << "  SU (" << left->NodeNum << ") "
+            << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
+            << PhysRegMsg[RHasPhysReg] << "\n");
+      return LHasPhysReg < RHasPhysReg;
+    }
+  }
+
+  // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
    unsigned LPriority = SPQ->getNodePriority(left);
    unsigned RPriority = SPQ->getNodePriority(right);
+
+  // Be really careful about hoisting call operands above previous calls.
+  // Only allow it if it would reduce register pressure.
+  if (left->isCall && right->isCallOp) {
+    unsigned RNumVals = right->getNode()->getNumValues();
+    RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
+  }
+  if (right->isCall && left->isCallOp) {
+    unsigned LNumVals = left->getNode()->getNumValues();
+    LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
+  }
+
    if (LPriority != RPriority) {
      DEBUG(++FactorCount[FactStatic]);
      return LPriority > RPriority;
    }
-  else if(LPriority == 0) {
-    // Schedule zero-latency TokenFactor below any other special
-    // nodes. The alternative may be to avoid artificially boosting the
-    // TokenFactor's height when it is scheduled, but we currently rely on an
-    // instruction's final height to equal the cycle in which it is scheduled,
-    // so heights are monotonically increasing.
-    unsigned LOpc = left->getNode() ? left->getNode()->getOpcode() : 0;
-    unsigned ROpc = right->getNode() ? right->getNode()->getOpcode() : 0;
-    if (LOpc == ISD::TokenFactor)
-      return false;
-    if (ROpc == ISD::TokenFactor)
-      return true;
+
+  // If one or both of the nodes are calls and their Sethi-Ullman numbers are
+  // the same, keep source order.
+  if (left->isCall || right->isCall) {
+    unsigned LOrder = SPQ->getNodeOrdering(left);
+    unsigned ROrder = SPQ->getNodeOrdering(right);
+
+    // Prefer an ordering where the lower the non-zero order number, the higher
+    // the preference.
+    if ((LOrder || ROrder) && LOrder != ROrder)
+      return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
    }
  
    // Try schedule def + use closer when Sethi-Ullman numbers are the same.
@@ -2250,7 +2376,14 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
      return LScratch > RScratch;
    }
  
-  if (!DisableSchedCycles) {
+  // Comparing latency against a call makes little sense unless the node
+  // is register pressure-neutral.
+  if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0))
+    return (left->NodeQueueId > right->NodeQueueId);
+
+  // Do not compare latencies when one or both of the nodes are calls.
+  if (!DisableSchedCycles &&
+      !(left->isCall || right->isCall)) {
      int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ);
      if (result != 0)
        return result > 0;
@@ -2275,11 +2408,17 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
  
  // Bottom up
  bool bu_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
    return BURRSort(left, right, SPQ);
  }
  
  // Source order, otherwise bottom up.
  bool src_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
    unsigned LOrder = SPQ->getNodeOrdering(left);
    unsigned ROrder = SPQ->getNodeOrdering(right);
  
@@ -2311,6 +2450,9 @@ bool hybrid_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
  
  // Return true if right should be scheduled with higher priority than left.
  bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
    if (left->isCall || right->isCall)
      // No way to compute latency of calls.
      return BURRSort(left, right, SPQ);
@@ -2376,6 +2518,9 @@ static bool canEnableCoalescing(SUnit *SU) {
  // list-ilp is currently an experimental scheduler that allows various
  // heuristics to be enabled prior to the normal register reduction logic.
  bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
    if (left->isCall || right->isCall)
      // No way to compute latency of calls.
      return BURRSort(left, right, SPQ);
@@ -2734,6 +2879,9 @@ static unsigned LimitedSumOfUnscheduledPredsOfSuccs(const SUnit *SU,
  
  // Top down
  bool td_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res < 0;
+
    unsigned LPriority = SPQ->getNodePriority(left);
    unsigned RPriority = SPQ->getNodePriority(right);
    bool LIsTarget = left->getNode() && left->getNode()->isMachineOpcode();