[FastISel][AArch64] Add target-dependent instruction selection for Add/Sub.

[oota-llvm.git] / lib / Target / R600 / AMDILCFGStructurizer.cpp
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp

index fac56f074689f59d02a9138695577ab1fa263924..ee6e8ecfb29d7c5feefee75013ebee6c11eee6b1 100644 (file)
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -8,19 +8,14 @@
  /// \file
  //==-----------------------------------------------------------------------===//
  
-#define DEBUG_TYPE "structcfg"
-
  #include "AMDGPU.h"
  #include "AMDGPUInstrInfo.h"
  #include "R600InstrInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DepthFirstIterator.h"
  #include "llvm/ADT/SCCIterator.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/DominatorInternals.h"
-#include "llvm/Analysis/Dominators.h"
  #include "llvm/CodeGen/MachineDominators.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -30,11 +25,16 @@
  #include "llvm/CodeGen/MachineLoopInfo.h"
  #include "llvm/CodeGen/MachinePostDominators.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
  #include "llvm/Target/TargetInstrInfo.h"
  #include "llvm/Target/TargetMachine.h"
  
  using namespace llvm;
  
+#define DEBUG_TYPE "structcfg"
+
  #define DEFAULT_VEC_SLOTS 8
  
  // TODO: move-begin.
@@ -54,6 +54,10 @@ STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
  STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
  STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
  
+namespace llvm {
+  void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
+}
+
  //===----------------------------------------------------------------------===//
  //
  // Miscellaneous utility for CFGStructurizer.
@@ -131,16 +135,16 @@ public:
  
    static char ID;
  
-  AMDGPUCFGStructurizer(TargetMachine &tm) :
-      MachineFunctionPass(ID), TM(tm),
-      TII(static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
-      TRI(&TII->getRegisterInfo()) { }
+  AMDGPUCFGStructurizer() :
+      MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
+    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
+  }
  
-   const char *getPassName() const {
-    return "AMD IL Control Flow Graph structurizer Pass";
+   const char *getPassName() const override {
+    return "AMDGPU Control Flow Graph structurizer Pass";
    }
  
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addPreserved<MachineFunctionAnalysis>();
      AU.addRequired<MachineFunctionAnalysis>();
      AU.addRequired<MachineDominatorTree>();
@@ -156,14 +160,16 @@ public:
    /// sure all loops have an exit block
    bool prepare();
  
-  bool runOnMachineFunction(MachineFunction &MF) {
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+    TRI = &TII->getRegisterInfo();
      DEBUG(MF.dump(););
      OrderedBlks.clear();
      FuncRep = &MF;
      MLI = &getAnalysis<MachineLoopInfo>();
      DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
      MDT = &getAnalysis<MachineDominatorTree>();
-    DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
+    DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
      PDT = &getAnalysis<MachinePostDominatorTree>();
      DEBUG(PDT->print(dbgs()););
      prepare();
@@ -173,7 +179,6 @@ public:
    }
  
  protected:
-  TargetMachine &TM;
    MachineDominatorTree *MDT;
    MachinePostDominatorTree *PDT;
    MachineLoopInfo *MLI;
@@ -220,7 +225,7 @@ protected:
    /// Compute the reversed DFS post order of Blocks
    void orderBlocks(MachineFunction *MF);
  
-  // Function originaly from CFGStructTraits
+  // Function originally from CFGStructTraits
    void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
        DebugLoc DL = DebugLoc());
    MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
@@ -251,7 +256,6 @@ protected:
    MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
    static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
    static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
-  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *MBB);
    static bool isReturnBlock(MachineBasicBlock *MBB);
    static void cloneSuccessorList(MachineBasicBlock *DstMBB,
        MachineBasicBlock *SrcMBB) ;
@@ -331,10 +335,10 @@ protected:
        MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
    void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
    void retireBlock(MachineBasicBlock *MBB);
-  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL);
+  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
  
    MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
-  /// This is work around solution for findNearestCommonDominator not avaiable
+  /// This is work around solution for findNearestCommonDominator not available
    /// to post dom a proper fix should go to Dominators.h.
    MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
        MachineBasicBlock *MBB2);
@@ -358,7 +362,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
      const {
    LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
    if (It == LLInfoMap.end())
-    return NULL;
+    return nullptr;
    return (*It).second;
  }
  
@@ -629,7 +633,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
    MachineInstr *MI = &*It;
    if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
      return MI;
-  return NULL;
+  return nullptr;
  }
  
  MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
@@ -645,7 +649,7 @@ MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
          break;
      }
    }
-  return NULL;
+  return nullptr;
  }
  
  MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
@@ -655,7 +659,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
      if (instr->getOpcode() == AMDGPU::RETURN)
        return instr;
    }
-  return NULL;
+  return nullptr;
  }
  
  MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
@@ -665,17 +669,7 @@ MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
      if (MI->getOpcode() == AMDGPU::CONTINUE)
        return MI;
    }
-  return NULL;
-}
-
-MachineInstr *AMDGPUCFGStructurizer::getLoopBreakInstr(MachineBasicBlock *MBB) {
-  for (MachineBasicBlock::iterator It = MBB->begin(); (It != MBB->end());
-      ++It) {
-    MachineInstr *MI = &(*It);
-    if (MI->getOpcode() == AMDGPU::PREDICATED_BREAK)
-      return MI;
-  }
-  return NULL;
+  return nullptr;
  }
  
  bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
@@ -797,7 +791,7 @@ bool AMDGPUCFGStructurizer::prepare() {
  bool AMDGPUCFGStructurizer::run() {
  
    //Assume reducible CFG...
-  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n";FuncRep->viewCFG(););
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
  
  #ifdef STRESSTEST
    //Use the worse block ordering to test the algorithm.
@@ -826,7 +820,7 @@ bool AMDGPUCFGStructurizer::run() {
  
      SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
          It;
-    MachineBasicBlock *SccBeginMBB = NULL;
+    MachineBasicBlock *SccBeginMBB = nullptr;
      int SccNumBlk = 0;  // The number of active blocks, init to a
                          // maximum possible number.
      int SccNumIter;     // Number of iteration in this SCC.
@@ -869,8 +863,7 @@ bool AMDGPUCFGStructurizer::run() {
            ContNextScc = false;
            DEBUG(
              dbgs() << "repeat processing SCC" << getSCCNum(MBB)
-                   << "sccNumIter = " << SccNumIter << "\n";
-            FuncRep->viewCFG();
+                   << "sccNumIter = " << SccNumIter << '\n';
            );
          } else {
            // Finish the current scc.
@@ -882,7 +875,7 @@ bool AMDGPUCFGStructurizer::run() {
        }
  
        if (ContNextScc)
-        SccBeginMBB = NULL;
+        SccBeginMBB = nullptr;
      } //while, "one iteration" over the function.
  
      MachineBasicBlock *EntryMBB =
@@ -926,12 +919,10 @@ bool AMDGPUCFGStructurizer::run() {
    BlockInfoMap.clear();
    LLInfoMap.clear();
  
-  DEBUG(
-    FuncRep->viewCFG();
-  );
-
-  if (!Finish)
-    llvm_unreachable("IRREDUCIBL_CF");
+  if (!Finish) {
+    DEBUG(FuncRep->viewCFG());
+    llvm_unreachable("IRREDUCIBLE_CFG");
+  }
  
    return true;
  }
@@ -941,9 +932,9 @@ bool AMDGPUCFGStructurizer::run() {
  void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
    int SccNum = 0;
    MachineBasicBlock *MBB;
-  for (scc_iterator<MachineFunction *> It = scc_begin(MF), E = scc_end(MF);
-      It != E; ++It, ++SccNum) {
-    std::vector<MachineBasicBlock *> &SccNext = *It;
+  for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
+       ++It, ++SccNum) {
+    const std::vector<MachineBasicBlock *> &SccNext = *It;
      for (std::vector<MachineBasicBlock *>::const_iterator
           blockIter = SccNext.begin(), blockEnd = SccNext.end();
           blockIter != blockEnd; ++blockIter) {
@@ -1016,13 +1007,14 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
      return 0;
  
    assert(isCondBranch(BranchMI));
+  int NumMatch = 0;
  
    MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
-  serialPatternMatch(TrueMBB);
-  ifPatternMatch(TrueMBB);
+  NumMatch += serialPatternMatch(TrueMBB);
+  NumMatch += ifPatternMatch(TrueMBB);
    MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
-  serialPatternMatch(FalseMBB);
-  ifPatternMatch(FalseMBB);
+  NumMatch += serialPatternMatch(FalseMBB);
+  NumMatch += ifPatternMatch(FalseMBB);
    MachineBasicBlock *LandBlk;
    int Cloned = 0;
  
@@ -1035,7 +1027,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
    } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
      // Triangle pattern, false is empty
      LandBlk = FalseMBB;
-    FalseMBB = NULL;
+    FalseMBB = nullptr;
    } else if (FalseMBB->succ_size() == 1
               && *FalseMBB->succ_begin() == TrueMBB) {
      // Triangle pattern, true is empty
@@ -1043,7 +1035,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
      std::swap(TrueMBB, FalseMBB);
      reversePredicateSetter(MBB->end());
      LandBlk = FalseMBB;
-    FalseMBB = NULL;
+    FalseMBB = nullptr;
    } else if (FalseMBB->succ_size() == 1
               && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
      LandBlk = *FalseMBB->succ_begin();
@@ -1051,7 +1043,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
      && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
      LandBlk = *TrueMBB->succ_begin();
    } else {
-    return handleJumpintoIf(MBB, TrueMBB, FalseMBB);
+    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
    }
  
    // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
@@ -1079,18 +1071,16 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
  
    numClonedBlock += Cloned;
  
-  return 1 + Cloned;
+  return 1 + Cloned + NumMatch;
  }
  
  int AMDGPUCFGStructurizer::loopendPatternMatch() {
    std::vector<MachineLoop *> NestedLoops;
-  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end();
-      It != E; ++It) {
-    df_iterator<MachineLoop *> LpIt = df_begin(*It),
-        LpE = df_end(*It);
-    for (; LpIt != LpE; ++LpIt)
-      NestedLoops.push_back(*LpIt);
-  }
+  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E;
+       ++It)
+    for (MachineLoop *ML : depth_first(*It))
+      NestedLoops.push_back(ML);
+
    if (NestedLoops.size() == 0)
      return 0;
  
@@ -1244,7 +1234,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
  
        numClonedBlock += Num;
        Num += serialPatternMatch(*HeadMBB->succ_begin());
-      Num += serialPatternMatch(*(++HeadMBB->succ_begin()));
+      Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
        Num += ifPatternMatch(HeadMBB);
        assert(Num > 0);
  
@@ -1253,7 +1243,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
      DEBUG(
        dbgs() << " not working\n";
      );
-    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL;
+    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
    } // walk down the postDomTree
  
    return Num;
@@ -1346,32 +1336,77 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
    // add initReg = initVal to headBlk
  
    const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  unsigned InitReg =
-    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
-  if (!MigrateTrue || !MigrateFalse)
-    llvm_unreachable("Extra register needed to handle CFG");
+  if (!MigrateTrue || !MigrateFalse) {
+    // XXX: We have an opportunity here to optimize the "branch into if" case.
+    // Branch into if looks like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \           |
+    // diamond_false        diamond_true
+    //             \      /
+    //               done
+    //
+    // The diamond_head block begins the "if" and the diamond_true block
+    // is the block being "branched into".
+    //
+    // If MigrateTrue is true, then TrueBB is the block being "branched into"
+    // and if MigrateFalse is true, then FalseBB is the block being
+    // "branched into"
+    // 
+    // Here is the pseudo code for how I think the optimization should work:
+    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+    // 3. Move the branch instruction from diamond_head into its own basic
+    //    block (new_block).
+    // 4. Add an unconditional branch from diamond_head to new_block
+    // 5. Replace the branch instruction in branch_from with an unconditional
+    //    branch to new_block.  If branch_from has multiple predecessors, then
+    //    we need to replace the True/False block in the branch
+    //    instruction instead of replacing the instruction itself.
+    // 6. Change the condition of the branch instruction in new_block from
+    //    COND to (COND || GPR0)
+    //
+    // In order to insert these MOV instructions, we will need to use the
+    // RegisterScavenger.  Usually liveness stops being tracked during
+    // the late machine optimization passes, however if we implement
+    // bool TargetRegisterInfo::requiresRegisterScavenging(
+    //                                                const MachineFunction &MF)
+    // and have it return true, liveness will be tracked correctly 
+    // by generic optimization passes.  We will also need to make sure that
+    // all of our target-specific passes that run after regalloc and before
+    // the CFGStructurizer track liveness and we will need to modify this pass
+    // to correctly track liveness.
+    //
+    // After the above changes, the new CFG should look like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //                       \     /
+    //                      new_block
+    //                      /      |
+    //         diamond_false        diamond_true
+    //                      \      /
+    //                        done
+    //
+    // Without this optimization, we are forced to duplicate the diamond_true
+    // block and we will end up with a CFG like this:
+    //
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \                   |
+    // diamond_false        diamond_true      diamond_true (duplicate)
+    //             \      /                   |
+    //               done --------------------|
+    //
+    // Duplicating diamond_true can be very costly especially if it has a
+    // lot of instructions.
+    return 0;
+  }
  
    int NumNewBlk = 0;
  
-  if (!LandBlk) {
-    LandBlk = HeadMBB->getParent()->CreateMachineBasicBlock();
-    HeadMBB->getParent()->push_back(LandBlk);  //insert to function
-
-    if (TrueMBB) {
-      TrueMBB->addSuccessor(LandBlk);
-    } else {
-      HeadMBB->addSuccessor(LandBlk);
-    }
-
-    if (FalseMBB) {
-      FalseMBB->addSuccessor(LandBlk);
-    } else {
-      HeadMBB->addSuccessor(LandBlk);
-    }
-
-    NumNewBlk ++;
-  }
-
    bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
  
    //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
@@ -1386,6 +1421,10 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
          CmpResReg, DebugLoc());
    }
  
+  // XXX: We are running this after RA, so creating virtual registers will
+  // cause an assertion failure in the PostRA scheduling pass.
+  unsigned InitReg =
+    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
        DebugLoc());
  
@@ -1529,26 +1568,8 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
    DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
                 << " land = BB" << LandMBB->getNumber() << "\n";);
  
-  /* we last inserterd the DebugLoc in the
-   * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current
-   * dstBlk.
-   * search for the DebugLoc in the that statement.
-   * if not found, we have to insert the empty/default DebugLoc */
-  MachineInstr *LoopBreakInstr = getLoopBreakInstr(DstBlk);
-  DebugLoc DLBreak = (LoopBreakInstr) ? LoopBreakInstr->getDebugLoc() :
-      DebugLoc();
-
-  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DLBreak);
-
-  /* we last inserterd the DebugLoc in the continue statement in the current
-   * dstBlk.
-   * search for the DebugLoc in the continue statement.
-   * if not found, we have to insert the empty/default DebugLoc */
-  MachineInstr *ContinueInstr = getContinueInstr(DstBlk);
-  DebugLoc DLContinue = (ContinueInstr) ? ContinueInstr->getDebugLoc() :
-      DebugLoc();
-
-  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DLContinue);
+  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
+  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
    DstBlk->addSuccessor(LandMBB);
    DstBlk->removeSuccessor(DstBlk);
  }
@@ -1565,7 +1586,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
    MachineBasicBlock::iterator I = BranchMI;
    if (TrueBranch != LandMBB)
      reversePredicateSetter(I);
-  insertCondBranchBefore(I, AMDGPU::PREDICATED_BREAK, DL);
+  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
+  insertInstrBefore(I, AMDGPU::BREAK);
+  insertInstrBefore(I, AMDGPU::ENDIF);
    //now branchInst can be erase safely
    BranchMI->eraseFromParent();
    //now take care of successors, retire blocks
@@ -1699,11 +1722,11 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
    const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
  
    if (!LoopHeader || !LoopLatch)
-    return NULL;
+    return nullptr;
    MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
    // Is LoopRep an infinite loop ?
    if (!BranchMI || !isUncondBranch(BranchMI))
-    return NULL;
+    return nullptr;
  
    MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
    FuncRep->push_back(DummyExitBlk);  //insert to function
@@ -1740,7 +1763,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
    if (MBB->succ_size() != 2)
      return;
    MachineBasicBlock *MBB1 = *MBB->succ_begin();
-  MachineBasicBlock *MBB2 = *(++MBB->succ_begin());
+  MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
    if (MBB1 != MBB2)
      return;
  
@@ -1836,7 +1859,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
      return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
  
    if (!Node1 || !Node2)
-    return NULL;
+    return nullptr;
  
    Node1 = Node1->getIDom();
    while (Node1) {
@@ -1845,7 +1868,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
      Node1 = Node1->getIDom();
    }
  
-  return NULL;
+  return nullptr;
  }
  
  MachineBasicBlock *
@@ -1876,6 +1899,14 @@ char AMDGPUCFGStructurizer::ID = 0;
  } // end anonymous namespace
  
  
-FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm) {
-  return new AMDGPUCFGStructurizer(tm);
+INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
+  return new AMDGPUCFGStructurizer();
  }