Reapply r146481 with a fix to create the Builder value in the correct place and
[oota-llvm.git] / lib / CodeGen / MachineBlockPlacement.cpp
index 412d0b1fab627e0780e8c4c6ff172d8fbc424d85..638d89577022054d2360f9f2d430273265e4e897 100644 (file)
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -55,22 +52,6 @@ STATISTIC(CondBranchTakenFreq,
 STATISTIC(UncondBranchTakenFreq,
           "Potential frequency of taking unconditional branches");
 
-namespace {
-/// \brief A structure for storing a weighted edge.
-///
-/// This stores an edge and its weight, computed as the product of the
-/// frequency that the starting block is entered with the probability of
-/// a particular exit block.
-struct WeightedEdge {
-  BlockFrequency EdgeFrequency;
-  MachineBasicBlock *From, *To;
-
-  bool operator<(const WeightedEdge &RHS) const {
-    return EdgeFrequency < RHS.EdgeFrequency;
-  }
-};
-}
-
 namespace {
 class BlockChain;
 /// \brief Type for our function-wide basic block -> block chain mapping.
@@ -214,13 +195,17 @@ class MachineBlockPlacement : public MachineFunctionPass {
   MachineBasicBlock *selectBestCandidateBlock(
       BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList,
       const BlockFilterSet *BlockFilter);
-  MachineBasicBlock *getFirstUnplacedBlock(const BlockChain &PlacedChain,
-                                           ArrayRef<MachineBasicBlock *> Blocks,
-                                           unsigned &PrevUnplacedBlockIdx);
+  MachineBasicBlock *getFirstUnplacedBlock(
+      MachineFunction &F,
+      const BlockChain &PlacedChain,
+      MachineFunction::iterator &PrevUnplacedBlockIt,
+      const BlockFilterSet *BlockFilter);
   void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
-                  ArrayRef<MachineBasicBlock *> Blocks,
                   SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
                   const BlockFilterSet *BlockFilter = 0);
+  MachineBasicBlock *findBestLoopTop(MachineFunction &F,
+                                     MachineLoop &L,
+                                     const BlockFilterSet &LoopBlockSet);
   void buildLoopChains(MachineFunction &F, MachineLoop &L);
   void buildCFGChains(MachineFunction &F);
   void AlignLoops(MachineFunction &F);
@@ -314,7 +299,7 @@ void MachineBlockPlacement::markChainSuccessors(
       // This is a cross-chain edge that is within the loop, so decrement the
       // loop predecessor count of the destination chain.
       if (SuccChain.LoopPredecessors > 0 && --SuccChain.LoopPredecessors == 0)
-        BlockWorkList.push_back(*SI);
+        BlockWorkList.push_back(*SuccChain.begin());
     }
   }
 }
@@ -354,15 +339,45 @@ MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor(
       DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Already merged!\n");
       continue;
     }
+    if (*SI != *SuccChain.begin()) {
+      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Mid chain!\n");
+      continue;
+    }
 
     uint32_t SuccWeight = MBPI->getEdgeWeight(BB, *SI);
     BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
 
     // Only consider successors which are either "hot", or wouldn't violate
     // any CFG constraints.
-    if (SuccChain.LoopPredecessors != 0 && SuccProb < HotProb) {
-      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> CFG conflict\n");
-      continue;
+    if (SuccChain.LoopPredecessors != 0) {
+      if (SuccProb < HotProb) {
+        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> CFG conflict\n");
+        continue;
+      }
+
+      // Make sure that a hot successor doesn't have a globally more important
+      // predecessor.
+      BlockFrequency CandidateEdgeFreq
+        = MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl();
+      bool BadCFGConflict = false;
+      for (MachineBasicBlock::pred_iterator PI = (*SI)->pred_begin(),
+                                            PE = (*SI)->pred_end();
+           PI != PE; ++PI) {
+        if (*PI == *SI || (BlockFilter && !BlockFilter->count(*PI)) ||
+            BlockToChain[*PI] == &Chain)
+          continue;
+        BlockFrequency PredEdgeFreq
+          = MBFI->getBlockFreq(*PI) * MBPI->getEdgeProbability(*PI, *SI);
+        if (PredEdgeFreq >= CandidateEdgeFreq) {
+          BadCFGConflict = true;
+          break;
+        }
+      }
+      if (BadCFGConflict) {
+        DEBUG(dbgs() << "    " << getBlockName(*SI)
+                               << " -> non-cold CFG conflict\n");
+        continue;
+      }
     }
 
     DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
@@ -444,18 +459,23 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
 ///
 /// This routine is called when we are unable to use the CFG to walk through
 /// all of the basic blocks and form a chain due to unnatural loops in the CFG.
-/// We walk through the sequence of blocks, starting from the
-/// LastUnplacedBlockIdx. We update this index to avoid re-scanning the entire
-/// sequence on repeated calls to this routine.
+/// We walk through the function's blocks in order, starting from the
+/// LastUnplacedBlockIt. We update this iterator on each call to avoid
+/// re-scanning the entire sequence on repeated calls to this routine.
 MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
-    const BlockChain &PlacedChain,
-    ArrayRef<MachineBasicBlock *> Blocks,
-    unsigned &PrevUnplacedBlockIdx) {
-  for (unsigned i = PrevUnplacedBlockIdx, e = Blocks.size(); i != e; ++i) {
-    MachineBasicBlock *BB = Blocks[i];
-    if (BlockToChain[BB] != &PlacedChain) {
-      PrevUnplacedBlockIdx = i;
-      return BB;
+    MachineFunction &F, const BlockChain &PlacedChain,
+    MachineFunction::iterator &PrevUnplacedBlockIt,
+    const BlockFilterSet *BlockFilter) {
+  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E;
+       ++I) {
+    if (BlockFilter && !BlockFilter->count(I))
+      continue;
+    if (BlockToChain[I] != &PlacedChain) {
+      PrevUnplacedBlockIt = I;
+      // Now select the head of the chain to which the unplaced block belongs
+      // as the block to place. This will force the entire chain to be placed,
+      // and satisfies the requirements of merging chains.
+      return *BlockToChain[I]->begin();
     }
   }
   return 0;
@@ -464,14 +484,12 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
 void MachineBlockPlacement::buildChain(
     MachineBasicBlock *BB,
     BlockChain &Chain,
-    ArrayRef<MachineBasicBlock *> Blocks,
     SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
     const BlockFilterSet *BlockFilter) {
   assert(BB);
   assert(BlockToChain[BB] == &Chain);
-  assert(*Chain.begin() == BB);
-  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
-  unsigned PrevUnplacedBlockIdx = 0;
+  MachineFunction &F = *BB->getParent();
+  MachineFunction::iterator PrevUnplacedBlockIt = F.begin();
 
   MachineBasicBlock *LoopHeaderBB = BB;
   markChainSuccessors(Chain, LoopHeaderBB, BlockWorkList, BlockFilter);
@@ -482,26 +500,9 @@ void MachineBlockPlacement::buildChain(
     assert(*llvm::prior(Chain.end()) == BB);
     MachineBasicBlock *BestSucc = 0;
 
-    // Check for unreasonable branches, and forcibly merge the existing layout
-    // successor for them. We can handle cases that AnalyzeBranch can't: jump
-    // tables etc are fine. The case we want to handle specially is when there
-    // is potential fallthrough, but the branch cannot be analyzed. This
-    // includes blocks without terminators as well as other cases.
-    Cond.clear();
-    MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
-    if (TII->AnalyzeBranch(*BB, TBB, FBB, Cond) && BB->canFallThrough()) {
-      MachineFunction::iterator I(BB), NextI(llvm::next(I));
-      // Ensure that the layout successor is a viable block, as we know that
-      // fallthrough is a possibility. Note that this may not be a valid block
-      // in the loop, but we allow that to cope with degenerate situations.
-      assert(NextI != BB->getParent()->end());
-      BestSucc = NextI;
-    }
-
-    // Otherwise, look for the best viable successor if there is one to place
-    // immediately after this block.
-    if (!BestSucc)
-      BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+    // Look for the best viable successor if there is one to place immediately
+    // after this block.
+    BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
 
     // If an immediate successor isn't available, look for the best viable
     // block among those we've identified as not violating the loop's CFG at
@@ -510,7 +511,8 @@ void MachineBlockPlacement::buildChain(
       BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter);
 
     if (!BestSucc) {
-      BestSucc = getFirstUnplacedBlock(Chain, Blocks, PrevUnplacedBlockIdx);
+      BestSucc = getFirstUnplacedBlock(F, Chain, PrevUnplacedBlockIt,
+                                       BlockFilter);
       if (!BestSucc)
         break;
 
@@ -528,12 +530,134 @@ void MachineBlockPlacement::buildChain(
     markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, BlockFilter);
     Chain.merge(BestSucc, &SuccChain);
     BB = *llvm::prior(Chain.end());
-  };
+  }
 
   DEBUG(dbgs() << "Finished forming chain for header block "
                << getBlockNum(*Chain.begin()) << "\n");
 }
 
+/// \brief Find the best loop top block for layout.
+///
+/// This routine implements the logic to analyze the loop looking for the best
+/// block to layout at the top of the loop. Typically this is done to maximize
+/// fallthrough opportunities.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
+                                       MachineLoop &L,
+                                       const BlockFilterSet &LoopBlockSet) {
+  BlockFrequency BestExitEdgeFreq;
+  MachineBasicBlock *ExitingBB = 0;
+  MachineBasicBlock *LoopingBB = 0;
+  // If there are exits to outer loops, loop rotation can severely limit
+  // fallthrough opportunites unless it selects such an exit. Keep a set of
+  // blocks where rotating to exit with that block will reach an outer loop.
+  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
+
+  DEBUG(dbgs() << "Finding best loop exit for: "
+               << getBlockName(L.getHeader()) << "\n");
+  for (MachineLoop::block_iterator I = L.block_begin(),
+                                   E = L.block_end();
+       I != E; ++I) {
+    BlockChain &Chain = *BlockToChain[*I];
+    // Ensure that this block is at the end of a chain; otherwise it could be
+    // mid-way through an inner loop or a successor of an analyzable branch.
+    if (*I != *llvm::prior(Chain.end()))
+      continue;
+
+    // Now walk the successors. We need to establish whether this has a viable
+    // exiting successor and whether it has a viable non-exiting successor.
+    // We store the old exiting state and restore it if a viable looping
+    // successor isn't found.
+    MachineBasicBlock *OldExitingBB = ExitingBB;
+    BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
+    // We also compute and store the best looping successor for use in layout.
+    MachineBasicBlock *BestLoopSucc = 0;
+    // FIXME: Due to the performance of the probability and weight routines in
+    // the MBPI analysis, we use the internal weights. This is only valid
+    // because it is purely a ranking function, we don't care about anything
+    // but the relative values.
+    uint32_t BestLoopSuccWeight = 0;
+    // FIXME: We also manually compute the probabilities to avoid quadratic
+    // behavior.
+    uint32_t WeightScale = 0;
+    uint32_t SumWeight = MBPI->getSumForBlock(*I, WeightScale);
+    for (MachineBasicBlock::succ_iterator SI = (*I)->succ_begin(),
+                                          SE = (*I)->succ_end();
+         SI != SE; ++SI) {
+      if ((*SI)->isLandingPad())
+        continue;
+      if (*SI == *I)
+        continue;
+      BlockChain &SuccChain = *BlockToChain[*SI];
+      // Don't split chains, either this chain or the successor's chain.
+      if (&Chain == &SuccChain || *SI != *SuccChain.begin()) {
+        DEBUG(dbgs() << "    " << (LoopBlockSet.count(*SI) ? "looping: "
+                                                           : "exiting: ")
+                     << getBlockName(*I) << " -> "
+                     << getBlockName(*SI) << " (chain conflict)\n");
+        continue;
+      }
+
+      uint32_t SuccWeight = MBPI->getEdgeWeight(*I, *SI);
+      if (LoopBlockSet.count(*SI)) {
+        DEBUG(dbgs() << "    looping: " << getBlockName(*I) << " -> "
+                     << getBlockName(*SI) << " (" << SuccWeight << ")\n");
+        if (BestLoopSucc && BestLoopSuccWeight >= SuccWeight)
+          continue;
+
+        BestLoopSucc = *SI;
+        BestLoopSuccWeight = SuccWeight;
+        continue;
+      }
+
+      BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
+      BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(*I) * SuccProb;
+      DEBUG(dbgs() << "    exiting: " << getBlockName(*I) << " -> "
+                   << getBlockName(*SI) << " (" << ExitEdgeFreq << ")\n");
+      // Note that we slightly bias this toward an existing layout successor to
+      // retain incoming order in the absence of better information.
+      // FIXME: Should we bias this more strongly? It's pretty weak.
+      if (!ExitingBB || ExitEdgeFreq > BestExitEdgeFreq ||
+          ((*I)->isLayoutSuccessor(*SI) &&
+           !(ExitEdgeFreq < BestExitEdgeFreq))) {
+        BestExitEdgeFreq = ExitEdgeFreq;
+        ExitingBB = *I;
+      }
+
+      if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI))
+        if (ExitLoop->contains(&L))
+          BlocksExitingToOuterLoop.insert(*I);
+    }
+
+    // Restore the old exiting state, no viable looping successor was found.
+    if (!BestLoopSucc) {
+      ExitingBB = OldExitingBB;
+      BestExitEdgeFreq = OldBestExitEdgeFreq;
+      continue;
+    }
+
+    // If this was best exiting block thus far, also record the looping block.
+    if (ExitingBB == *I)
+      LoopingBB = BestLoopSucc;
+  }
+  // Without a candidate exitting block or with only a single block in the
+  // loop, just use the loop header to layout the loop.
+  if (!ExitingBB || L.getNumBlocks() == 1)
+    return L.getHeader();
+
+  // Also, if we have exit blocks which lead to outer loops but didn't select
+  // one of them as the exiting block we are rotating toward, disable loop
+  // rotation altogether.
+  if (!BlocksExitingToOuterLoop.empty() &&
+      !BlocksExitingToOuterLoop.count(ExitingBB))
+    return L.getHeader();
+
+  assert(LoopingBB && "All successors of a loop block are exit blocks!");
+  DEBUG(dbgs() << "  Best exiting block: " << getBlockName(ExitingBB) << "\n");
+  DEBUG(dbgs() << "  Best top block: " << getBlockName(LoopingBB) << "\n");
+  return LoopingBB;
+}
+
 /// \brief Forms basic block chains from the natural loop structures.
 ///
 /// These chains are designed to preserve the existing *structure* of the code
@@ -549,17 +673,21 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
 
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
   BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
-  BlockChain &LoopChain = *BlockToChain[L.getHeader()];
+
+  MachineBasicBlock *LayoutTop = findBestLoopTop(F, L, LoopBlockSet);
+  BlockChain &LoopChain = *BlockToChain[LayoutTop];
 
   // FIXME: This is a really lame way of walking the chains in the loop: we
   // walk the blocks, and use a set to prevent visiting a particular chain
   // twice.
   SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+  assert(LoopChain.LoopPredecessors == 0);
+  UpdatedPreds.insert(&LoopChain);
   for (MachineLoop::block_iterator BI = L.block_begin(),
                                    BE = L.block_end();
        BI != BE; ++BI) {
     BlockChain &Chain = *BlockToChain[*BI];
-    if (!UpdatedPreds.insert(&Chain) || BI == L.block_begin())
+    if (!UpdatedPreds.insert(&Chain))
       continue;
 
     assert(Chain.LoopPredecessors == 0);
@@ -576,11 +704,10 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
     }
 
     if (Chain.LoopPredecessors == 0)
-      BlockWorkList.push_back(*BI);
+      BlockWorkList.push_back(*Chain.begin());
   }
 
-  buildChain(*L.block_begin(), LoopChain, L.getBlocks(), BlockWorkList,
-             &LoopBlockSet);
+  buildChain(LayoutTop, LoopChain, BlockWorkList, &LoopBlockSet);
 
   DEBUG({
     // Crash at the end so we get all of the debugging output first.
@@ -596,8 +723,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
       if (!LoopBlockSet.erase(*BCI)) {
         // We don't mark the loop as bad here because there are real situations
         // where this can occur. For example, with an unanalyzable fallthrough
-        // from a loop block to a non-loop block.
-        // FIXME: Such constructs shouldn't exist. Track them down and fix them.
+        // from a loop block to a non-loop block or vice versa.
         dbgs() << "Loop chain contains a block not contained by the loop!\n"
                << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
                << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
@@ -621,26 +747,43 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
 void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
   // Ensure that every BB in the function has an associated chain to simplify
   // the assumptions of the remaining algorithm.
-  for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
-    BlockToChain[&*FI] =
-      new (ChainAllocator.Allocate()) BlockChain(BlockToChain, &*FI);
+  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+  for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    MachineBasicBlock *BB = FI;
+    BlockChain *Chain
+      = new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
+    // Also, merge any blocks which we cannot reason about and must preserve
+    // the exact fallthrough behavior for.
+    for (;;) {
+      Cond.clear();
+      MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
+      if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
+        break;
+
+      MachineFunction::iterator NextFI(llvm::next(FI));
+      MachineBasicBlock *NextBB = NextFI;
+      // Ensure that the layout successor is a viable block, as we know that
+      // fallthrough is a possibility.
+      assert(NextFI != FE && "Can't fallthrough past the last block.");
+      DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
+                   << getBlockName(BB) << " -> " << getBlockName(NextBB)
+                   << "\n");
+      Chain->merge(NextBB, 0);
+      FI = NextFI;
+      BB = NextBB;
+    }
+  }
 
   // Build any loop-based chains.
   for (MachineLoopInfo::iterator LI = MLI->begin(), LE = MLI->end(); LI != LE;
        ++LI)
     buildLoopChains(F, **LI);
 
-  // We need a vector of blocks so that buildChain can handle unnatural CFG
-  // constructs by searching for unplaced blocks and just concatenating them.
-  SmallVector<MachineBasicBlock *, 16> Blocks;
-  Blocks.reserve(F.size());
-
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
 
   SmallPtrSet<BlockChain *, 4> UpdatedPreds;
   for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
     MachineBasicBlock *BB = &*FI;
-    Blocks.push_back(BB);
     BlockChain &Chain = *BlockToChain[BB];
     if (!UpdatedPreds.insert(&Chain))
       continue;
@@ -659,11 +802,11 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
     }
 
     if (Chain.LoopPredecessors == 0)
-      BlockWorkList.push_back(BB);
+      BlockWorkList.push_back(*Chain.begin());
   }
 
   BlockChain &FunctionChain = *BlockToChain[&F.front()];
-  buildChain(&F.front(), FunctionChain, Blocks, BlockWorkList);
+  buildChain(&F.front(), FunctionChain, BlockWorkList);
 
   typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType;
   DEBUG({
@@ -695,7 +838,6 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
 
   // Splice the blocks into place.
   MachineFunction::iterator InsertPos = F.begin();
-  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
   for (BlockChain::iterator BI = FunctionChain.begin(),
                             BE = FunctionChain.end();
        BI != BE; ++BI) {
@@ -766,6 +908,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
   AlignLoops(F);
 
   BlockToChain.clear();
+  ChainAllocator.DestroyAll();
 
   // We always return true as we have no way to track whether the final order
   // differs from the original order.