Reapply r146481 with a fix to create the Builder value in the correct place and

[oota-llvm.git] / lib / CodeGen / MachineBlockPlacement.cpp
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp

index 412d0b1fab627e0780e8c4c6ff172d8fbc424d85..638d89577022054d2360f9f2d430273265e4e897 100644 (file)
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -36,10 +36,7 @@
  #include "llvm/CodeGen/Passes.h"
  #include "llvm/Support/Allocator.h"
  #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
  #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SCCIterator.h"
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
@@ -55,22 +52,6 @@ STATISTIC(CondBranchTakenFreq,
  STATISTIC(UncondBranchTakenFreq,
            "Potential frequency of taking unconditional branches");
  
-namespace {
-/// \brief A structure for storing a weighted edge.
-///
-/// This stores an edge and its weight, computed as the product of the
-/// frequency that the starting block is entered with the probability of
-/// a particular exit block.
-struct WeightedEdge {
-  BlockFrequency EdgeFrequency;
-  MachineBasicBlock *From, *To;
-
-  bool operator<(const WeightedEdge &RHS) const {
-    return EdgeFrequency < RHS.EdgeFrequency;
-  }
-};
-}
-
  namespace {
  class BlockChain;
  /// \brief Type for our function-wide basic block -> block chain mapping.
@@ -214,13 +195,17 @@ class MachineBlockPlacement : public MachineFunctionPass {
    MachineBasicBlock *selectBestCandidateBlock(
        BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList,
        const BlockFilterSet *BlockFilter);
-  MachineBasicBlock *getFirstUnplacedBlock(const BlockChain &PlacedChain,
-                                           ArrayRef<MachineBasicBlock *> Blocks,
-                                           unsigned &PrevUnplacedBlockIdx);
+  MachineBasicBlock *getFirstUnplacedBlock(
+      MachineFunction &F,
+      const BlockChain &PlacedChain,
+      MachineFunction::iterator &PrevUnplacedBlockIt,
+      const BlockFilterSet *BlockFilter);
    void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
-                  ArrayRef<MachineBasicBlock *> Blocks,
                    SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
                    const BlockFilterSet *BlockFilter = 0);
+  MachineBasicBlock *findBestLoopTop(MachineFunction &F,
+                                     MachineLoop &L,
+                                     const BlockFilterSet &LoopBlockSet);
    void buildLoopChains(MachineFunction &F, MachineLoop &L);
    void buildCFGChains(MachineFunction &F);
    void AlignLoops(MachineFunction &F);
@@ -314,7 +299,7 @@ void MachineBlockPlacement::markChainSuccessors(
        // This is a cross-chain edge that is within the loop, so decrement the
        // loop predecessor count of the destination chain.
        if (SuccChain.LoopPredecessors > 0 && --SuccChain.LoopPredecessors == 0)
-        BlockWorkList.push_back(*SI);
+        BlockWorkList.push_back(*SuccChain.begin());
      }
    }
  }
@@ -354,15 +339,45 @@ MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor(
        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Already merged!\n");
        continue;
      }
+    if (*SI != *SuccChain.begin()) {
+      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Mid chain!\n");
+      continue;
+    }
  
      uint32_t SuccWeight = MBPI->getEdgeWeight(BB, *SI);
      BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
  
      // Only consider successors which are either "hot", or wouldn't violate
      // any CFG constraints.
-    if (SuccChain.LoopPredecessors != 0 && SuccProb < HotProb) {
-      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> CFG conflict\n");
-      continue;
+    if (SuccChain.LoopPredecessors != 0) {
+      if (SuccProb < HotProb) {
+        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> CFG conflict\n");
+        continue;
+      }
+
+      // Make sure that a hot successor doesn't have a globally more important
+      // predecessor.
+      BlockFrequency CandidateEdgeFreq
+        = MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl();
+      bool BadCFGConflict = false;
+      for (MachineBasicBlock::pred_iterator PI = (*SI)->pred_begin(),
+                                            PE = (*SI)->pred_end();
+           PI != PE; ++PI) {
+        if (*PI == *SI || (BlockFilter && !BlockFilter->count(*PI)) ||
+            BlockToChain[*PI] == &Chain)
+          continue;
+        BlockFrequency PredEdgeFreq
+          = MBFI->getBlockFreq(*PI) * MBPI->getEdgeProbability(*PI, *SI);
+        if (PredEdgeFreq >= CandidateEdgeFreq) {
+          BadCFGConflict = true;
+          break;
+        }
+      }
+      if (BadCFGConflict) {
+        DEBUG(dbgs() << "    " << getBlockName(*SI)
+                               << " -> non-cold CFG conflict\n");
+        continue;
+      }
      }
  
      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
@@ -444,18 +459,23 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
  ///
  /// This routine is called when we are unable to use the CFG to walk through
  /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
-/// We walk through the sequence of blocks, starting from the
-/// LastUnplacedBlockIdx. We update this index to avoid re-scanning the entire
-/// sequence on repeated calls to this routine.
+/// We walk through the function's blocks in order, starting from the
+/// LastUnplacedBlockIt. We update this iterator on each call to avoid
+/// re-scanning the entire sequence on repeated calls to this routine.
  MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
-    const BlockChain &PlacedChain,
-    ArrayRef<MachineBasicBlock *> Blocks,
-    unsigned &PrevUnplacedBlockIdx) {
-  for (unsigned i = PrevUnplacedBlockIdx, e = Blocks.size(); i != e; ++i) {
-    MachineBasicBlock *BB = Blocks[i];
-    if (BlockToChain[BB] != &PlacedChain) {
-      PrevUnplacedBlockIdx = i;
-      return BB;
+    MachineFunction &F, const BlockChain &PlacedChain,
+    MachineFunction::iterator &PrevUnplacedBlockIt,
+    const BlockFilterSet *BlockFilter) {
+  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E;
+       ++I) {
+    if (BlockFilter && !BlockFilter->count(I))
+      continue;
+    if (BlockToChain[I] != &PlacedChain) {
+      PrevUnplacedBlockIt = I;
+      // Now select the head of the chain to which the unplaced block belongs
+      // as the block to place. This will force the entire chain to be placed,
+      // and satisfies the requirements of merging chains.
+      return *BlockToChain[I]->begin();
      }
    }
    return 0;
@@ -464,14 +484,12 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
  void MachineBlockPlacement::buildChain(
      MachineBasicBlock *BB,
      BlockChain &Chain,
-    ArrayRef<MachineBasicBlock *> Blocks,
      SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
      const BlockFilterSet *BlockFilter) {
    assert(BB);
    assert(BlockToChain[BB] == &Chain);
-  assert(*Chain.begin() == BB);
-  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
-  unsigned PrevUnplacedBlockIdx = 0;
+  MachineFunction &F = *BB->getParent();
+  MachineFunction::iterator PrevUnplacedBlockIt = F.begin();
  
    MachineBasicBlock *LoopHeaderBB = BB;
    markChainSuccessors(Chain, LoopHeaderBB, BlockWorkList, BlockFilter);
@@ -482,26 +500,9 @@ void MachineBlockPlacement::buildChain(
      assert(*llvm::prior(Chain.end()) == BB);
      MachineBasicBlock *BestSucc = 0;
  
-    // Check for unreasonable branches, and forcibly merge the existing layout
-    // successor for them. We can handle cases that AnalyzeBranch can't: jump
-    // tables etc are fine. The case we want to handle specially is when there
-    // is potential fallthrough, but the branch cannot be analyzed. This
-    // includes blocks without terminators as well as other cases.
-    Cond.clear();
-    MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
-    if (TII->AnalyzeBranch(*BB, TBB, FBB, Cond) && BB->canFallThrough()) {
-      MachineFunction::iterator I(BB), NextI(llvm::next(I));
-      // Ensure that the layout successor is a viable block, as we know that
-      // fallthrough is a possibility. Note that this may not be a valid block
-      // in the loop, but we allow that to cope with degenerate situations.
-      assert(NextI != BB->getParent()->end());
-      BestSucc = NextI;
-    }
-
-    // Otherwise, look for the best viable successor if there is one to place
-    // immediately after this block.
-    if (!BestSucc)
-      BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+    // Look for the best viable successor if there is one to place immediately
+    // after this block.
+    BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
  
      // If an immediate successor isn't available, look for the best viable
      // block among those we've identified as not violating the loop's CFG at
@@ -510,7 +511,8 @@ void MachineBlockPlacement::buildChain(
        BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter);
  
      if (!BestSucc) {
-      BestSucc = getFirstUnplacedBlock(Chain, Blocks, PrevUnplacedBlockIdx);
+      BestSucc = getFirstUnplacedBlock(F, Chain, PrevUnplacedBlockIt,
+                                       BlockFilter);
        if (!BestSucc)
          break;
  
@@ -528,12 +530,134 @@ void MachineBlockPlacement::buildChain(
      markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, BlockFilter);
      Chain.merge(BestSucc, &SuccChain);
      BB = *llvm::prior(Chain.end());
-  };
+  }
  
    DEBUG(dbgs() << "Finished forming chain for header block "
                 << getBlockNum(*Chain.begin()) << "\n");
  }
  
+/// \brief Find the best loop top block for layout.
+///
+/// This routine implements the logic to analyze the loop looking for the best
+/// block to layout at the top of the loop. Typically this is done to maximize
+/// fallthrough opportunities.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
+                                       MachineLoop &L,
+                                       const BlockFilterSet &LoopBlockSet) {
+  BlockFrequency BestExitEdgeFreq;
+  MachineBasicBlock *ExitingBB = 0;
+  MachineBasicBlock *LoopingBB = 0;
+  // If there are exits to outer loops, loop rotation can severely limit
+  // fallthrough opportunites unless it selects such an exit. Keep a set of
+  // blocks where rotating to exit with that block will reach an outer loop.
+  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
+
+  DEBUG(dbgs() << "Finding best loop exit for: "
+               << getBlockName(L.getHeader()) << "\n");
+  for (MachineLoop::block_iterator I = L.block_begin(),
+                                   E = L.block_end();
+       I != E; ++I) {
+    BlockChain &Chain = *BlockToChain[*I];
+    // Ensure that this block is at the end of a chain; otherwise it could be
+    // mid-way through an inner loop or a successor of an analyzable branch.
+    if (*I != *llvm::prior(Chain.end()))
+      continue;
+
+    // Now walk the successors. We need to establish whether this has a viable
+    // exiting successor and whether it has a viable non-exiting successor.
+    // We store the old exiting state and restore it if a viable looping
+    // successor isn't found.
+    MachineBasicBlock *OldExitingBB = ExitingBB;
+    BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
+    // We also compute and store the best looping successor for use in layout.
+    MachineBasicBlock *BestLoopSucc = 0;
+    // FIXME: Due to the performance of the probability and weight routines in
+    // the MBPI analysis, we use the internal weights. This is only valid
+    // because it is purely a ranking function, we don't care about anything
+    // but the relative values.
+    uint32_t BestLoopSuccWeight = 0;
+    // FIXME: We also manually compute the probabilities to avoid quadratic
+    // behavior.
+    uint32_t WeightScale = 0;
+    uint32_t SumWeight = MBPI->getSumForBlock(*I, WeightScale);
+    for (MachineBasicBlock::succ_iterator SI = (*I)->succ_begin(),
+                                          SE = (*I)->succ_end();
+         SI != SE; ++SI) {
+      if ((*SI)->isLandingPad())
+        continue;
+      if (*SI == *I)
+        continue;
+      BlockChain &SuccChain = *BlockToChain[*SI];
+      // Don't split chains, either this chain or the successor's chain.
+      if (&Chain == &SuccChain || *SI != *SuccChain.begin()) {
+        DEBUG(dbgs() << "    " << (LoopBlockSet.count(*SI) ? "looping: "
+                                                           : "exiting: ")
+                     << getBlockName(*I) << " -> "
+                     << getBlockName(*SI) << " (chain conflict)\n");
+        continue;
+      }
+
+      uint32_t SuccWeight = MBPI->getEdgeWeight(*I, *SI);
+      if (LoopBlockSet.count(*SI)) {
+        DEBUG(dbgs() << "    looping: " << getBlockName(*I) << " -> "
+                     << getBlockName(*SI) << " (" << SuccWeight << ")\n");
+        if (BestLoopSucc && BestLoopSuccWeight >= SuccWeight)
+          continue;
+
+        BestLoopSucc = *SI;
+        BestLoopSuccWeight = SuccWeight;
+        continue;
+      }
+
+      BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
+      BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(*I) * SuccProb;
+      DEBUG(dbgs() << "    exiting: " << getBlockName(*I) << " -> "
+                   << getBlockName(*SI) << " (" << ExitEdgeFreq << ")\n");
+      // Note that we slightly bias this toward an existing layout successor to
+      // retain incoming order in the absence of better information.
+      // FIXME: Should we bias this more strongly? It's pretty weak.
+      if (!ExitingBB || ExitEdgeFreq > BestExitEdgeFreq ||
+          ((*I)->isLayoutSuccessor(*SI) &&
+           !(ExitEdgeFreq < BestExitEdgeFreq))) {
+        BestExitEdgeFreq = ExitEdgeFreq;
+        ExitingBB = *I;
+      }
+
+      if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI))
+        if (ExitLoop->contains(&L))
+          BlocksExitingToOuterLoop.insert(*I);
+    }
+
+    // Restore the old exiting state, no viable looping successor was found.
+    if (!BestLoopSucc) {
+      ExitingBB = OldExitingBB;
+      BestExitEdgeFreq = OldBestExitEdgeFreq;
+      continue;
+    }
+
+    // If this was best exiting block thus far, also record the looping block.
+    if (ExitingBB == *I)
+      LoopingBB = BestLoopSucc;
+  }
+  // Without a candidate exitting block or with only a single block in the
+  // loop, just use the loop header to layout the loop.
+  if (!ExitingBB || L.getNumBlocks() == 1)
+    return L.getHeader();
+
+  // Also, if we have exit blocks which lead to outer loops but didn't select
+  // one of them as the exiting block we are rotating toward, disable loop
+  // rotation altogether.
+  if (!BlocksExitingToOuterLoop.empty() &&
+      !BlocksExitingToOuterLoop.count(ExitingBB))
+    return L.getHeader();
+
+  assert(LoopingBB && "All successors of a loop block are exit blocks!");
+  DEBUG(dbgs() << "  Best exiting block: " << getBlockName(ExitingBB) << "\n");
+  DEBUG(dbgs() << "  Best top block: " << getBlockName(LoopingBB) << "\n");
+  return LoopingBB;
+}
+
  /// \brief Forms basic block chains from the natural loop structures.
  ///
  /// These chains are designed to preserve the existing *structure* of the code
@@ -549,17 +673,21 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
  
    SmallVector<MachineBasicBlock *, 16> BlockWorkList;
    BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
-  BlockChain &LoopChain = *BlockToChain[L.getHeader()];
+
+  MachineBasicBlock *LayoutTop = findBestLoopTop(F, L, LoopBlockSet);
+  BlockChain &LoopChain = *BlockToChain[LayoutTop];
  
    // FIXME: This is a really lame way of walking the chains in the loop: we
    // walk the blocks, and use a set to prevent visiting a particular chain
    // twice.
    SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+  assert(LoopChain.LoopPredecessors == 0);
+  UpdatedPreds.insert(&LoopChain);
    for (MachineLoop::block_iterator BI = L.block_begin(),
                                     BE = L.block_end();
         BI != BE; ++BI) {
      BlockChain &Chain = *BlockToChain[*BI];
-    if (!UpdatedPreds.insert(&Chain) || BI == L.block_begin())
+    if (!UpdatedPreds.insert(&Chain))
        continue;
  
      assert(Chain.LoopPredecessors == 0);
@@ -576,11 +704,10 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
      }
  
      if (Chain.LoopPredecessors == 0)
-      BlockWorkList.push_back(*BI);
+      BlockWorkList.push_back(*Chain.begin());
    }
  
-  buildChain(*L.block_begin(), LoopChain, L.getBlocks(), BlockWorkList,
-             &LoopBlockSet);
+  buildChain(LayoutTop, LoopChain, BlockWorkList, &LoopBlockSet);
  
    DEBUG({
      // Crash at the end so we get all of the debugging output first.
@@ -596,8 +723,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
        if (!LoopBlockSet.erase(*BCI)) {
          // We don't mark the loop as bad here because there are real situations
          // where this can occur. For example, with an unanalyzable fallthrough
-        // from a loop block to a non-loop block.
-        // FIXME: Such constructs shouldn't exist. Track them down and fix them.
+        // from a loop block to a non-loop block or vice versa.
          dbgs() << "Loop chain contains a block not contained by the loop!\n"
                 << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
                 << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
@@ -621,26 +747,43 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
  void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
    // Ensure that every BB in the function has an associated chain to simplify
    // the assumptions of the remaining algorithm.
-  for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
-    BlockToChain[&*FI] =
-      new (ChainAllocator.Allocate()) BlockChain(BlockToChain, &*FI);
+  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+  for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    MachineBasicBlock *BB = FI;
+    BlockChain *Chain
+      = new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
+    // Also, merge any blocks which we cannot reason about and must preserve
+    // the exact fallthrough behavior for.
+    for (;;) {
+      Cond.clear();
+      MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
+      if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
+        break;
+
+      MachineFunction::iterator NextFI(llvm::next(FI));
+      MachineBasicBlock *NextBB = NextFI;
+      // Ensure that the layout successor is a viable block, as we know that
+      // fallthrough is a possibility.
+      assert(NextFI != FE && "Can't fallthrough past the last block.");
+      DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
+                   << getBlockName(BB) << " -> " << getBlockName(NextBB)
+                   << "\n");
+      Chain->merge(NextBB, 0);
+      FI = NextFI;
+      BB = NextBB;
+    }
+  }
  
    // Build any loop-based chains.
    for (MachineLoopInfo::iterator LI = MLI->begin(), LE = MLI->end(); LI != LE;
         ++LI)
      buildLoopChains(F, **LI);
  
-  // We need a vector of blocks so that buildChain can handle unnatural CFG
-  // constructs by searching for unplaced blocks and just concatenating them.
-  SmallVector<MachineBasicBlock *, 16> Blocks;
-  Blocks.reserve(F.size());
-
    SmallVector<MachineBasicBlock *, 16> BlockWorkList;
  
    SmallPtrSet<BlockChain *, 4> UpdatedPreds;
    for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
      MachineBasicBlock *BB = &*FI;
-    Blocks.push_back(BB);
      BlockChain &Chain = *BlockToChain[BB];
      if (!UpdatedPreds.insert(&Chain))
        continue;
@@ -659,11 +802,11 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
      }
  
      if (Chain.LoopPredecessors == 0)
-      BlockWorkList.push_back(BB);
+      BlockWorkList.push_back(*Chain.begin());
    }
  
    BlockChain &FunctionChain = *BlockToChain[&F.front()];
-  buildChain(&F.front(), FunctionChain, Blocks, BlockWorkList);
+  buildChain(&F.front(), FunctionChain, BlockWorkList);
  
    typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType;
    DEBUG({
@@ -695,7 +838,6 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
  
    // Splice the blocks into place.
    MachineFunction::iterator InsertPos = F.begin();
-  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
    for (BlockChain::iterator BI = FunctionChain.begin(),
                              BE = FunctionChain.end();
         BI != BE; ++BI) {
@@ -766,6 +908,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
    AlignLoops(F);
  
    BlockToChain.clear();
+  ChainAllocator.DestroyAll();
  
    // We always return true as we have no way to track whether the final order
    // differs from the original order.