void initializeLoopDeletionPass(PassRegistry&);
void initializeLoopExtractorPass(PassRegistry&);
void initializeLoopInfoWrapperPassPass(PassRegistry&);
+void initializeLoopInterchangePass(PassRegistry &);
void initializeLoopInstSimplifyPass(PassRegistry&);
void initializeLoopRotatePass(PassRegistry&);
void initializeLoopSimplifyPass(PassRegistry&);
(void) llvm::createLICMPass();
(void) llvm::createLazyValueInfoPass();
(void) llvm::createLoopExtractorPass();
+ (void)llvm::createLoopInterchangePass();
(void) llvm::createLoopSimplifyPass();
(void) llvm::createLoopStrengthReducePass();
(void) llvm::createLoopRerollPass();
//
Pass *createLICMPass();
+//===----------------------------------------------------------------------===//
+//
+// LoopInterchange - This pass interchanges loops to provide a more
+// cache-friendly memory access patterns.
+//
+Pass *createLoopInterchangePass();
+
//===----------------------------------------------------------------------===//
//
// LoopStrengthReduce - This pass is strength reduces GEP instructions that use
EnableMLSM("mlsm", cl::init(true), cl::Hidden,
cl::desc("Enable motion of merged load and store"));
+static cl::opt<bool> EnableLoopInterchange(
+ "enable-loopinterchange", cl::init(false), cl::Hidden,
+ cl::desc("Enable the new, experimental LoopInterchange Pass"));
+
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
MPM.add(createLoopDeletionPass()); // Delete dead loops
+ if (EnableLoopInterchange)
+ MPM.add(createLoopInterchangePass()); // Interchange loops
if (!DisableUnrollLoops)
MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops
// More loops are countable; try to optimize them.
PM.add(createIndVarSimplifyPass());
PM.add(createLoopDeletionPass());
+ if (EnableLoopInterchange)
+ PM.add(createLoopInterchangePass());
+
PM.add(createLoopVectorizePass(true, LoopVectorize));
// More scalar chains could be vectorized due to more alias information
LoopDeletion.cpp
LoopIdiomRecognize.cpp
LoopInstSimplify.cpp
+ LoopInterchange.cpp
LoopRerollPass.cpp
LoopRotation.cpp
LoopStrengthReduce.cpp
--- /dev/null
+//===- LoopInterchange.cpp - Loop interchange pass------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This Pass handles loop interchange transform.
+// This pass interchanges loops to provide a more cache-friendly memory access
+// patterns.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+namespace {
+
+typedef SmallVector<Loop *, 8> LoopVector;
+
+// TODO: Check if we can use a sparse matrix here.
+typedef std::vector<std::vector<char>> CharMatrix;
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+struct LoopInterchange;
+
+#ifdef DUMP_DEP_MATRICIES
+void printDepMatrix(CharMatrix &DepMatrix) {
+ for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) {
+ std::vector<char> Vec = *I;
+ for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II)
+ DEBUG(dbgs() << *II << " ");
+ DEBUG(dbgs() << "\n");
+ }
+}
+#endif
+
+bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L,
+ DependenceAnalysis *DA) {
+ typedef SmallVector<Value *, 16> ValueVector;
+ ValueVector MemInstr;
+
+ if (Level > MaxLoopNestDepth) {
+ DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
+ return false;
+ }
+
+ // For each block.
+ for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
+ BB != BE; ++BB) {
+ // Scan the BB and collect legal loads and stores.
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
+ ++I) {
+ Instruction *Ins = dyn_cast<Instruction>(I);
+ if (!Ins)
+ return false;
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ if (!St && !Ld)
+ continue;
+ if (Ld && !Ld->isSimple())
+ return false;
+ if (St && !St->isSimple())
+ return false;
+ MemInstr.push_back(I);
+ }
+ }
+
+ DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
+
+ ValueVector::iterator I, IE, J, JE;
+
+ for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+ for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+ std::vector<char> Dep;
+ Instruction *Src = dyn_cast<Instruction>(*I);
+ Instruction *Des = dyn_cast<Instruction>(*J);
+ if (Src == Des)
+ continue;
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Des))
+ continue;
+ if (auto D = DA->depends(Src, Des, true)) {
+ DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des
+ << "\n");
+ if (D->isFlow()) {
+ // TODO: Handle Flow dependence.Check if it is sufficient to populate
+ // the Dependence Matrix with the direction reversed.
+ DEBUG(dbgs() << "Flow dependence not handled");
+ return false;
+ }
+ if (D->isAnti()) {
+ DEBUG(dbgs() << "Found Anti dependence \n");
+ unsigned Levels = D->getLevels();
+ char Direction;
+ for (unsigned II = 1; II <= Levels; ++II) {
+ const SCEV *Distance = D->getDistance(II);
+ const SCEVConstant *SCEVConst =
+ dyn_cast_or_null<SCEVConstant>(Distance);
+ if (SCEVConst) {
+ const ConstantInt *CI = SCEVConst->getValue();
+ if (CI->isNegative())
+ Direction = '<';
+ else if (CI->isZero())
+ Direction = '=';
+ else
+ Direction = '>';
+ Dep.push_back(Direction);
+ } else if (D->isScalar(II)) {
+ Direction = 'S';
+ Dep.push_back(Direction);
+ } else {
+ unsigned Dir = D->getDirection(II);
+ if (Dir == Dependence::DVEntry::LT ||
+ Dir == Dependence::DVEntry::LE)
+ Direction = '<';
+ else if (Dir == Dependence::DVEntry::GT ||
+ Dir == Dependence::DVEntry::GE)
+ Direction = '>';
+ else if (Dir == Dependence::DVEntry::EQ)
+ Direction = '=';
+ else
+ Direction = '*';
+ Dep.push_back(Direction);
+ }
+ }
+ while (Dep.size() != Level) {
+ Dep.push_back('I');
+ }
+
+ DepMatrix.push_back(Dep);
+ if (DepMatrix.size() > MaxMemInstrCount) {
+ DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ // We don't have a DepMatrix to check legality return false
+ if (DepMatrix.size() == 0)
+ return false;
+ return true;
+}
+
+// A loop is moved from index 'from' to an index 'to'. Update the Dependence
+// matrix by exchanging the two columns.
+void interChangeDepedencies(CharMatrix &DepMatrix, unsigned FromIndx,
+ unsigned ToIndx) {
+ unsigned numRows = DepMatrix.size();
+ for (unsigned i = 0; i < numRows; ++i) {
+ char TmpVal = DepMatrix[i][ToIndx];
+ DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
+ DepMatrix[i][FromIndx] = TmpVal;
+ }
+}
+
+// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
+// '>'
+bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i <= Column; ++i) {
+ if (DepMatrix[Row][i] == '<')
+ return false;
+ if (DepMatrix[Row][i] == '>')
+ return true;
+ }
+ // All dependencies were '=','S' or 'I'
+ return false;
+}
+
+// Checks if no dependence exist in the dependency matrix in Row before Column.
+bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i < Column; ++i) {
+ if (DepMatrix[Row][i] != '=' || DepMatrix[Row][i] != 'S' ||
+ DepMatrix[Row][i] != 'I')
+ return false;
+ }
+ return true;
+}
+
+bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+ unsigned OuterLoopId, char InnerDep, char OuterDep) {
+
+ if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+ return false;
+
+ if (InnerDep == OuterDep)
+ return true;
+
+ // It is legal to interchange if and only if after interchange no row has a
+ // '>' direction as the leftmost non-'='.
+
+ if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
+ return true;
+
+ if (InnerDep == '<')
+ return true;
+
+ if (InnerDep == '>') {
+ // If OuterLoopId represents outermost loop then interchanging will make the
+ // 1st dependency as '>'
+ if (OuterLoopId == 0)
+ return false;
+
+ // If all dependencies before OuterloopId are '=','S'or 'I'. Then
+ // interchanging will result in this row having an outermost non '='
+ // dependency of '>'
+ if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
+ return true;
+ }
+
+ return false;
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorm] A permutation of the loops in a perfect nest is legal if and only if
+// the direction matrix, after the same permutation is applied to its columns,
+// has no ">" direction as the leftmost non-"=" direction in any row.
+bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId,
+ unsigned OuterLoopId) {
+
+ unsigned NumRows = DepMatrix.size();
+ // For each row check if it is valid to interchange.
+ for (unsigned Row = 0; Row < NumRows; ++Row) {
+ char InnerDep = DepMatrix[Row][InnerLoopId];
+ char OuterDep = DepMatrix[Row][OuterLoopId];
+ if (InnerDep == '*' || OuterDep == '*')
+ return false;
+ else if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep,
+ OuterDep))
+ return false;
+ }
+ return true;
+}
+
+static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
+
+ DEBUG(dbgs() << "Calling populateWorklist called\n");
+ LoopVector LoopList;
+ Loop *CurrentLoop = &L;
+ std::vector<Loop *> vec = CurrentLoop->getSubLoopsVector();
+ while (vec.size() != 0) {
+ // The current loop has multiple subloops in it hence it is not tightly
+ // nested.
+ // Discard all loops above it added into Worklist.
+ if (vec.size() != 1) {
+ LoopList.clear();
+ return;
+ }
+ LoopList.push_back(CurrentLoop);
+ CurrentLoop = *(vec.begin());
+ vec = CurrentLoop->getSubLoopsVector();
+ }
+ LoopList.push_back(CurrentLoop);
+ V.push_back(LoopList);
+}
+
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+ PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
+ if (InnerIndexVar)
+ return InnerIndexVar;
+ if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
+ return nullptr;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PhiVar = cast<PHINode>(I);
+ Type *PhiTy = PhiVar->getType();
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy())
+ return nullptr;
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
+ if (!AddRec || !AddRec->isAffine())
+ continue;
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+ if (!C)
+ continue;
+ // Found the induction variable.
+ // FIXME: Handle loops with more than one induction variable. Note that,
+ // currently, legality makes sure we have only one induction variable.
+ return PhiVar;
+ }
+ return nullptr;
+}
+
+/// LoopInterchangeLegality checks if it is legal to interchange the loop.
+class LoopInterchangeLegality {
+public:
+ LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInterchange *Pass)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass) {}
+
+ /// Check if the loops can be interchanged.
+ bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+ /// Check if the loop structure is understood. We do not handle triangular
+ /// loops for now.
+ bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+
+ bool currentLimitations();
+
+private:
+ bool tightlyNested(Loop *Outer, Loop *Inner);
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ LoopInterchange *CurrentPass;
+};
+
+/// LoopInterchangeProfitability checks if it is profitable to interchange the
+/// loop.
+class LoopInterchangeProfitability {
+public:
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+
+ /// Check if the loop interchange is profitable
+ bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+private:
+ int getInstrOrderCost();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+};
+
+/// LoopInterchangeTransform interchanges the loop
+class LoopInterchangeTransform {
+public:
+ LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT,
+ LoopInterchange *Pass, BasicBlock *LoopNestExit)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ LoopExit(LoopNestExit) {
+ initialize();
+ }
+
+ /// Interchange OuterLoop and InnerLoop.
+ bool transform();
+ void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+ void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
+ void initialize();
+
+private:
+ void splitInnerLoopLatch(Instruction *);
+ void splitOuterLoopLatch();
+ void splitInnerLoopHeader();
+ bool adjustLoopLinks();
+ void adjustLoopPreheaders();
+ void adjustOuterLoopPreheader();
+ void adjustInnerLoopPreheader();
+ bool adjustLoopBranches();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ BasicBlock *LoopExit;
+};
+
+// Main LoopInterchange Pass
+struct LoopInterchange : public FunctionPass {
+ static char ID;
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DependenceAnalysis *DA;
+ DominatorTree *DT;
+ LoopInterchange()
+ : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) {
+ initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DependenceAnalysis>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ }
+
+ bool runOnFunction(Function &F) override {
+ SE = &getAnalysis<ScalarEvolution>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DA = &getAnalysis<DependenceAnalysis>();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ // Build up a worklist of loop pairs to analyze.
+ SmallVector<LoopVector, 8> Worklist;
+
+ for (Loop *L : *LI)
+ populateWorklist(*L, Worklist);
+
+ DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+ bool Changed = true;
+ while (!Worklist.empty()) {
+ LoopVector LoopList = Worklist.pop_back_val();
+ Changed = processLoopList(LoopList);
+ }
+ return Changed;
+ }
+
+ bool isComputableLoopNest(LoopVector LoopList) {
+ for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) {
+ Loop *L = *I;
+ const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+ if (ExitCountOuter == SE->getCouldNotCompute()) {
+ DEBUG(dbgs() << "Couldn't compute Backedge count\n");
+ return false;
+ }
+ if (L->getNumBackEdges() != 1) {
+ DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ return false;
+ }
+ if (!L->getExitingBlock()) {
+ DEBUG(dbgs() << "Loop Doesn't have unique exit block\n");
+ return false;
+ }
+ }
+ return true;
+ }
+
+ unsigned selectLoopForInterchange(LoopVector LoopList) {
+ // TODO: Add a better heuristic to select the loop to be interchanged based
+ // on the dependece matrix. Currently we select the innermost loop.
+ return LoopList.size() - 1;
+ }
+
+ bool processLoopList(LoopVector LoopList) {
+ bool Changed = false;
+ bool containsLCSSAPHI = false;
+ CharMatrix DependencyMatrix;
+ if (LoopList.size() < 2) {
+ DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ return false;
+ }
+ if (!isComputableLoopNest(LoopList)) {
+ DEBUG(dbgs() << "Not vaild loop candidate for interchange\n");
+ return false;
+ }
+ Loop *OuterMostLoop = *(LoopList.begin());
+
+ DEBUG(dbgs() << "Processing LoopList of size = " << LoopList.size()
+ << "\n");
+
+ if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(),
+ OuterMostLoop, DA)) {
+ DEBUG(dbgs() << "Populating Dependency matrix failed\n");
+ return false;
+ }
+#ifdef DUMP_DEP_MATRICIES
+ DEBUG(dbgs() << "Dependence before inter change \n");
+ printDepMatrix(DependencyMatrix);
+#endif
+
+ BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
+ BranchInst *OuterMostLoopLatchBI =
+ dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
+ if (!OuterMostLoopLatchBI)
+ return false;
+
+ // Since we currently do not handle LCSSA PHI's any failure in loop
+ // condition will now branch to LoopNestExit.
+ // TODO: This should be removed once we handle LCSSA PHI nodes.
+
+ // Get the Outermost loop exit.
+ BasicBlock *LoopNestExit;
+ if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
+ LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
+ else
+ LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
+
+ for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) {
+ Loop *L = *I;
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
+ if (Latch && Latch != Header && isa<PHINode>(Latch->begin())) {
+ containsLCSSAPHI = true;
+ break;
+ }
+ }
+
+ // TODO: Handle lcssa PHI's. Currently LCSSA PHI's are not handled. Handle
+ // the same by splitting the loop latch and adjusting loop links
+ // accordingly.
+ if (containsLCSSAPHI)
+ return false;
+
+ unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+ // Move the selected loop outwards to the best posible position.
+ for (unsigned i = SelecLoopId; i > 0; i--) {
+ bool Interchanged =
+ processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+ if (!Interchanged)
+ return Changed;
+ // Loops interchanged reflect the same in LoopList
+ Loop *OldOuterLoop = LoopList[i - 1];
+ LoopList[i - 1] = LoopList[i];
+ LoopList[i] = OldOuterLoop;
+
+ // Update the DependencyMatrix
+ interChangeDepedencies(DependencyMatrix, i, i - 1);
+
+#ifdef DUMP_DEP_MATRICIES
+ DEBUG(dbgs() << "Dependence after inter change \n");
+ printDepMatrix(DependencyMatrix);
+#endif
+ Changed |= Interchanged;
+ }
+ return Changed;
+ }
+
+ bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
+ unsigned OuterLoopId, BasicBlock *LoopNestExit,
+ std::vector<std::vector<char>> &DependencyMatrix) {
+
+ DEBUG(dbgs() << "Processing Innder Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
+ Loop *InnerLoop = LoopList[InnerLoopId];
+ Loop *OuterLoop = LoopList[OuterLoopId];
+
+ LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this);
+ if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+ return false;
+ }
+ DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+ if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ DEBUG(dbgs() << "Interchanging Loops not profitable\n");
+ return false;
+ }
+
+ LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this,
+ LoopNestExit);
+ LIT.transform();
+ DEBUG(dbgs() << "Loops interchanged\n");
+ return true;
+ }
+};
+
+} // end of namespace
+
+static bool containsUnsafeInstructions(BasicBlock *BB) {
+ for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ return true;
+ }
+ return false;
+}
+
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ DEBUG(dbgs() << "Checking if Loops are Tightly Nested\n");
+
+ // A perfectly nested loop will not have any branch in between the outer and
+ // inner block i.e. outer header will branch to either inner preheader and
+ // outerloop latch.
+ BranchInst *outerLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ if (!outerLoopHeaderBI)
+ return false;
+ unsigned num = outerLoopHeaderBI->getNumSuccessors();
+ for (unsigned i = 0; i < num; i++) {
+ if (outerLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
+ outerLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
+ return false;
+ }
+
+ DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n");
+ // We do not have any basic block in between now make sure the outer header
+ // and outer loop latch doesnt contain any unsafe instructions.
+ if (containsUnsafeInstructions(OuterLoopHeader) ||
+ containsUnsafeInstructions(OuterLoopLatch))
+ return false;
+
+ DEBUG(dbgs() << "Loops are perfectly nested \n");
+ // We have a perfect loop nest.
+ return true;
+}
+
+static unsigned getPHICount(BasicBlock *BB) {
+ unsigned PhiCount = 0;
+ for (auto I = BB->begin(); isa<PHINode>(I); ++I)
+ PhiCount++;
+ return PhiCount;
+}
+
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+ PHINode *InnerInduction) {
+
+ unsigned Num = InnerInduction->getNumOperands();
+ BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
+ for (unsigned i = 0; i < Num; ++i) {
+ Value *Val = InnerInduction->getOperand(i);
+ if (isa<Constant>(Val))
+ continue;
+ Instruction *I = dyn_cast<Instruction>(Val);
+ if (!I)
+ return false;
+ // TODO: Handle triangular loops.
+ // e.g. for(int i=0;i<N;i++)
+ // for(int j=i;j<N;j++)
+ unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+ if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+ InnerLoopPreheader &&
+ !OuterLoop->isLoopInvariant(I)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// This function indicates the current limitations in the transform as a result
+// of which we do not proceed.
+bool LoopInterchangeLegality::currentLimitations() {
+
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ PHINode *InnerInductionVar;
+ PHINode *OuterInductionVar;
+
+ // We currently handle only 1 induction variable inside the loop. We also do
+ // not handle reductions as of now.
+ if (getPHICount(InnerLoopHeader) > 1)
+ return true;
+
+ if (getPHICount(OuterLoopHeader) > 1)
+ return true;
+
+ InnerInductionVar = getInductionVariable(InnerLoop, SE);
+ OuterInductionVar = getInductionVariable(OuterLoop, SE);
+
+ if (!OuterInductionVar || !InnerInductionVar) {
+ DEBUG(dbgs() << "Induction variable not found\n");
+ return true;
+ }
+
+ // TODO: Triangular loops are not handled for now.
+ if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ return true;
+ }
+
+ // TODO: Loops with LCSSA PHI's are currently not handled.
+ if (isa<PHINode>(OuterLoopLatch->begin())) {
+ DEBUG(dbgs() << "Found and LCSSA PHI in outer loop latch\n");
+ return true;
+ }
+ if (InnerLoopLatch != InnerLoopHeader &&
+ isa<PHINode>(InnerLoopLatch->begin())) {
+ DEBUG(dbgs() << "Found and LCSSA PHI in inner loop latch\n");
+ return true;
+ }
+
+ // TODO: Current limitation: Since we split the inner loop latch at the point
+ // were induction variable is incremented (induction.next); We cannot have
+ // more than 1 user of induction.next since it would result in broken code
+ // after split.
+ // e.g.
+ // for(i=0;i<N;i++) {
+ // for(j = 0;j<M;j++) {
+ // A[j+1][i+2] = A[j][i]+k;
+ // }
+ // }
+ bool FoundInduction = false;
+ Instruction *InnerIndexVarInc = nullptr;
+ if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+ else
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+ if (!InnerIndexVarInc)
+ return true;
+
+ // Since we split the inner loop latch on this induction variable. Make sure
+ // we do not have any instruction between the induction variable and branch
+ // instruction.
+
+ for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend();
+ I != E && !FoundInduction; ++I) {
+ if (isa<BranchInst>(*I) || isa<CmpInst>(*I) || isa<TruncInst>(*I))
+ continue;
+ const Instruction &Ins = *I;
+ // We found an instruction. If this is not induction variable then it is not
+ // safe to split this loop latch.
+ if (!Ins.isIdenticalTo(InnerIndexVarInc))
+ return true;
+ else
+ FoundInduction = true;
+ }
+ // The loop latch ended and we didnt find the induction variable return as
+ // current limitation.
+ if (!FoundInduction)
+ return true;
+
+ return false;
+}
+
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+
+ if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+ DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << "and OuterLoopId = " << OuterLoopId
+ << "due to dependence\n");
+ return false;
+ }
+
+ // Create unique Preheaders if we already do not have one.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+ // Create a unique outer preheader -
+ // 1) If OuterLoop preheader is not present.
+ // 2) If OuterLoop Preheader is same as OuterLoop Header
+ // 3) If OuterLoop Preheader is same as Header of the previous loop.
+ // 4) If OuterLoop Preheader is Entry node.
+ if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
+ isa<PHINode>(OuterLoopPreHeader->begin()) ||
+ !OuterLoopPreHeader->getUniquePredecessor()) {
+ OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass);
+ }
+
+ if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
+ InnerLoopPreHeader == OuterLoop->getHeader()) {
+ InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass);
+ }
+
+ // Check if the loops are tightly nested.
+ if (!tightlyNested(OuterLoop, InnerLoop)) {
+ DEBUG(dbgs() << "Loops not tightly nested\n");
+ return false;
+ }
+
+ // TODO: The loops could not be interchanged due to current limitations in the
+ // transform module.
+ if (currentLimitations()) {
+ DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ return false;
+ }
+
+ return true;
+}
+
+int LoopInterchangeProfitability::getInstrOrderCost() {
+ unsigned GoodOrder, BadOrder;
+ BadOrder = GoodOrder = 0;
+ for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
+ BI != BE; ++BI) {
+ for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) {
+ const Instruction &Ins = *I;
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
+ unsigned NumOp = GEP->getNumOperands();
+ bool FoundInnerInduction = false;
+ bool FoundOuterInduction = false;
+ for (unsigned i = 0; i < NumOp; ++i) {
+ const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+ if (!AR)
+ continue;
+
+ // If we find the inner induction after an outer induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[i][j] = A[i-1][j-1]+k;
+ // then it is a good order.
+ if (AR->getLoop() == InnerLoop) {
+ // We found an InnerLoop induction after OuterLoop induction. It is
+ // a good order.
+ FoundInnerInduction = true;
+ if (FoundOuterInduction) {
+ GoodOrder++;
+ break;
+ }
+ }
+ // If we find the outer induction after an inner induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[j][i] = A[j-1][i-1]+k;
+ // then it is a bad order.
+ if (AR->getLoop() == OuterLoop) {
+ // We found an OuterLoop induction after InnerLoop induction. It is
+ // a bad order.
+ FoundOuterInduction = true;
+ if (FoundInnerInduction) {
+ BadOrder++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ return GoodOrder - BadOrder;
+}
+
+bool isProfitabileForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ // TODO: Improve this heuristic to catch more cases.
+ // If the inner loop is loop independent or doesn't carry any dependency it is
+ // profitable to move this to outer position.
+ unsigned Row = DepMatrix.size();
+ for (unsigned i = 0; i < Row; ++i) {
+ if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I')
+ return false;
+ // TODO: We need to improve this heuristic.
+ if (DepMatrix[i][OuterLoopId] != '=')
+ return false;
+ }
+ // If outer loop has dependence and inner loop is loop independent then it is
+ // profitable to interchange to enable parallelism.
+ return true;
+}
+
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+
+ // TODO: Add Better Profitibility checks.
+ // e.g
+ // 1) Construct dependency matrix and move the one with no loop carried dep
+ // inside to enable vectorization.
+
+ // This is rough cost estimation algorithm. It counts the good and bad order
+ // of induction variables in the instruction and allows reordering if number
+ // of bad orders is more than good.
+ int Cost = 0;
+ Cost += getInstrOrderCost();
+ DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ if (Cost < 0)
+ return true;
+
+ // It is not profitable as per current cache profitibility model. But check if
+ // we can move this loop outside to improve parallelism.
+ bool ImprovesPar =
+ isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+ return ImprovesPar;
+}
+
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+ Loop *InnerLoop) {
+ for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end();; ++I) {
+ assert(I != E && "Couldn't find loop");
+ if (*I == InnerLoop) {
+ OuterLoop->removeChildLoop(I);
+ return;
+ }
+ }
+}
+void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
+ Loop *OuterLoop) {
+ Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ if (OuterLoopParent) {
+ // Remove the loop from its parent loop.
+ removeChildLoop(OuterLoopParent, OuterLoop);
+ removeChildLoop(OuterLoop, InnerLoop);
+ OuterLoopParent->addChildLoop(InnerLoop);
+ } else {
+ removeChildLoop(OuterLoop, InnerLoop);
+ LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+ }
+
+ for (Loop::iterator I = InnerLoop->begin(), E = InnerLoop->end(); I != E; ++I)
+ OuterLoop->addChildLoop(InnerLoop->removeChildLoop(I));
+
+ InnerLoop->addChildLoop(OuterLoop);
+}
+
+bool LoopInterchangeTransform::transform() {
+
+ DEBUG(dbgs() << "transform\n");
+ bool Transformed = false;
+ Instruction *InnerIndexVar;
+
+ if (InnerLoop->getSubLoops().size() == 0) {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ DEBUG(dbgs() << "Calling Split Inner Loop\n");
+ PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+ if (!InductionPHI) {
+ DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ return false;
+ }
+
+ if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
+ else
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
+
+ //
+ // Split at the place were the induction variable is
+ // incremented/decremented.
+ // TODO: This splitting logic may not work always. Fix this.
+ splitInnerLoopLatch(InnerIndexVar);
+ DEBUG(dbgs() << "splitInnerLoopLatch Done\n");
+
+ // Splits the inner loops phi nodes out into a seperate basic block.
+ splitInnerLoopHeader();
+ DEBUG(dbgs() << "splitInnerLoopHeader Done\n");
+ }
+
+ Transformed |= adjustLoopLinks();
+ if (!Transformed) {
+ DEBUG(dbgs() << "adjustLoopLinks Failed\n");
+ return false;
+ }
+
+ restructureLoops(InnerLoop, OuterLoop);
+ return true;
+}
+
+void LoopInterchangeTransform::initialize() {}
+
+void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *inc) {
+
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock::iterator I = InnerLoopLatch->begin();
+ BasicBlock::iterator E = InnerLoopLatch->end();
+ for (; I != E; ++I) {
+ if (inc == I)
+ break;
+ }
+
+ BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
+ InnerLoopLatch = SplitBlock(InnerLoopLatchPred, I, DT, LI);
+}
+
+void LoopInterchangeTransform::splitOuterLoopLatch() {
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLatchLcssaPhiBlock = OuterLoopLatch;
+ OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock,
+ OuterLoopLatch->getFirstNonPHI(), DT, LI);
+}
+
+void LoopInterchangeTransform::splitInnerLoopHeader() {
+
+ // Split the inner loop header out.
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+
+ DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
+ "InnerLoopHeader \n");
+}
+
+void LoopInterchangeTransform::adjustOuterLoopPreheader() {
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ SmallVector<Instruction *, 8> Inst;
+ for (auto I = OuterLoopPreHeader->begin(), E = OuterLoopPreHeader->end();
+ I != E; ++I) {
+ if (isa<BranchInst>(*I))
+ break;
+ Inst.push_back(I);
+ }
+
+ BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader();
+ for (auto I = Inst.begin(), E = Inst.end(); I != E; ++I) {
+ Instruction *Ins = cast<Instruction>(*I);
+ Ins->moveBefore(InnerPreHeader->getTerminator());
+ }
+}
+
+void LoopInterchangeTransform::adjustInnerLoopPreheader() {
+
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ SmallVector<Instruction *, 8> Inst;
+ for (auto I = InnerLoopPreHeader->begin(), E = InnerLoopPreHeader->end();
+ I != E; ++I) {
+ if (isa<BranchInst>(*I))
+ break;
+ Inst.push_back(I);
+ }
+ BasicBlock *OuterHeader = OuterLoop->getHeader();
+ for (auto I = Inst.begin(), E = Inst.end(); I != E; ++I) {
+ Instruction *Ins = cast<Instruction>(*I);
+ Ins->moveBefore(OuterHeader->getTerminator());
+ }
+}
+
+bool LoopInterchangeTransform::adjustLoopBranches() {
+
+ DEBUG(dbgs() << "adjustLoopBranches called\n");
+ // Adjust the loop preheader
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchPredecessor =
+ InnerLoopLatch->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchSuccessor;
+ BasicBlock *OuterLoopLatchSuccessor;
+
+ BranchInst *OuterLoopLatchBI =
+ dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+ BranchInst *InnerLoopLatchBI =
+ dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ BranchInst *InnerLoopHeaderBI =
+ dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+ if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+ !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+ !InnerLoopHeaderBI)
+ return false;
+
+ BranchInst *InnerLoopLatchPredecessorBI =
+ dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+ BranchInst *OuterLoopPredecessorBI =
+ dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+ if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+ return false;
+ BasicBlock *InnerLoopHeaderSucessor = InnerLoopHeader->getUniqueSuccessor();
+ if (!InnerLoopHeaderSucessor)
+ return false;
+
+ // Adjust Loop Preheader and headers
+
+ unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
+ OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
+ }
+
+ NumSucc = OuterLoopHeaderBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
+ OuterLoopHeaderBI->setSuccessor(i, LoopExit);
+ else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
+ OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSucessor);
+ }
+
+ BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
+ InnerLoopHeaderBI->eraseFromParent();
+
+ // -------------Adjust loop latches-----------
+ if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+ else
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+ NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
+ InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
+ }
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+ else
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+ if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
+ InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
+ else
+ InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
+ OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
+ } else {
+ OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ }
+
+ return true;
+}
+void LoopInterchangeTransform::adjustLoopPreheaders() {
+
+ // We have interchanged the preheaders so we need to interchange the data in
+ // the preheader as well.
+ // This is because the content of inner preheader was previously executed
+ // inside the outer loop.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BranchInst *InnerTermBI =
+ cast<BranchInst>(InnerLoopPreHeader->getTerminator());
+
+ SmallVector<Value *, 16> OuterPreheaderInstr;
+ SmallVector<Value *, 16> InnerPreheaderInstr;
+
+ for (auto I = OuterLoopPreHeader->begin(); !isa<BranchInst>(I); ++I)
+ OuterPreheaderInstr.push_back(I);
+
+ for (auto I = InnerLoopPreHeader->begin(); !isa<BranchInst>(I); ++I)
+ InnerPreheaderInstr.push_back(I);
+
+ BasicBlock *HeaderSplit =
+ SplitBlock(OuterLoopHeader, OuterLoopHeader->getTerminator(), DT, LI);
+ Instruction *InsPoint = HeaderSplit->getFirstNonPHI();
+ // These instructions should now be executed inside the loop.
+ // Move instruction into a new block after outer header.
+ for (auto I = InnerPreheaderInstr.begin(), E = InnerPreheaderInstr.end();
+ I != E; ++I) {
+ Instruction *Ins = cast<Instruction>(*I);
+ Ins->moveBefore(InsPoint);
+ }
+ // These instructions were not executed previously in the loop so move them to
+ // the older inner loop preheader.
+ for (auto I = OuterPreheaderInstr.begin(), E = OuterPreheaderInstr.end();
+ I != E; ++I) {
+ Instruction *Ins = cast<Instruction>(*I);
+ Ins->moveBefore(InnerTermBI);
+ }
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+
+ // Adjust all branches in the inner and outer loop.
+ bool Changed = adjustLoopBranches();
+ if (Changed)
+ adjustLoopPreheaders();
+ return Changed;
+}
+
+char LoopInterchange::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+
+INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+
+Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
initializeLoopDeletionPass(Registry);
initializeLoopAccessAnalysisPass(Registry);
initializeLoopInstSimplifyPass(Registry);
+ initializeLoopInterchangePass(Registry);
initializeLoopRotatePass(Registry);
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
--- /dev/null
+; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s
+;; These are test that fail to interchange due to current limitation. This will go off once we extend the loop interchange pass.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i32]] zeroinitializer
+@B = common global [100 x [100 x [100 x i32]]] zeroinitializer
+
+;;--------------------------------------Test case 01------------------------------------
+;; [FIXME] This loop though valid is currently not interchanged due to the limitation that we cannot split the inner loop latch due to multiple use of inner induction
+;; variable.(used to increment the loop counter and to access A[j+1][i+1]
+;; for(int i=0;i<N-1;i++)
+;; for(int j=1;j<N-1;j++)
+;; A[j+1][i+1] = A[j+1][i+1] + k;
+
+define void @interchange_01(i32 %k, i32 %N) {
+ entry:
+ %sub = add nsw i32 %N, -1
+ %cmp26 = icmp sgt i32 %N, 1
+ br i1 %cmp26, label %for.cond1.preheader.lr.ph, label %for.end17
+
+ for.cond1.preheader.lr.ph:
+ %cmp324 = icmp sgt i32 %sub, 1
+ %0 = add i32 %N, -2
+ %1 = sext i32 %sub to i64
+ br label %for.cond1.preheader
+
+ for.cond.loopexit:
+ %cmp = icmp slt i64 %indvars.iv.next29, %1
+ br i1 %cmp, label %for.cond1.preheader, label %for.end17
+
+ for.cond1.preheader:
+ %indvars.iv28 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next29, %for.cond.loopexit ]
+ %indvars.iv.next29 = add nuw nsw i64 %indvars.iv28, 1
+ br i1 %cmp324, label %for.body4, label %for.cond.loopexit
+
+ for.body4:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 1, %for.cond1.preheader ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next, i64 %indvars.iv.next29
+ %2 = load i32, i32* %arrayidx7
+ %add8 = add nsw i32 %2, %k
+ store i32 %add8, i32* %arrayidx7
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.cond.loopexit, label %for.body4
+
+ for.end17:
+ ret void
+}
+;; Inner loop not split so it is not interchanged.
+; CHECK-LABEL: @interchange_01
+; CHECK: for.body4:
+; CHECK-NEXT: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 1, %for.body4.preheader ]
+; CHECK-NEXT: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next, i64 %indvars.iv.next29
+
--- /dev/null
+; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s
+;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i32]] zeroinitializer
+@B = common global [100 x i32] zeroinitializer
+@C = common global [100 x [100 x i32]] zeroinitializer
+@D = common global [100 x [100 x [100 x i32]]] zeroinitializer
+
+declare void @foo(...)
+
+;;--------------------------------------Test case 01------------------------------------
+;; for(int i=0;i<N;i++)
+;; for(int j=1;j<N;j++)
+;; A[j][i] = A[j][i]+k;
+
+define void @interchange_01(i32 %k, i32 %N) {
+entry:
+ %cmp21 = icmp sgt i32 %N, 0
+ br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end12
+
+for.cond1.preheader.lr.ph:
+ %cmp219 = icmp sgt i32 %N, 1
+ %0 = add i32 %N, -1
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %indvars.iv23 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next24, %for.inc10 ]
+ br i1 %cmp219, label %for.body3, label %for.inc10
+
+for.body3:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+ %1 = load i32, i32* %arrayidx5
+ %add = add nsw i32 %1, %k
+ store i32 %add, i32* %arrayidx5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.inc10, label %for.body3
+
+for.inc10:
+ %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+ %lftr.wideiv25 = trunc i64 %indvars.iv23 to i32
+ %exitcond26 = icmp eq i32 %lftr.wideiv25, %0
+ br i1 %exitcond26, label %for.end12, label %for.cond1.preheader
+
+for.end12:
+ ret void
+}
+
+; CHECK-LABEL: @interchange_01
+; CHECK: entry:
+; CHECK: %cmp21 = icmp sgt i32 %N, 0
+; CHECK: br i1 %cmp21, label %for.body3.preheader, label %for.end12
+; CHECK: for.cond1.preheader.lr.ph:
+; CHECK: br label %for.cond1.preheader
+; CHECK: for.cond1.preheader:
+; CHECK: %indvars.iv23 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next24, %for.inc10 ]
+; CHECK: br i1 %cmp219, label %for.body3.split1, label %for.end12.loopexit
+; CHECK: for.body3.preheader:
+; CHECK: %cmp219 = icmp sgt i32 %N, 1
+; CHECK: %0 = add i32 %N, -1
+; CHECK: br label %for.body3
+; CHECK: for.body3:
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 1, %for.body3.preheader ]
+; CHECK: br label %for.cond1.preheader.lr.ph
+; CHECK: for.body3.split1:
+; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+; CHECK: %1 = load i32, i32* %arrayidx5
+; CHECK: %add = add nsw i32 %1, %k
+; CHECK: store i32 %add, i32* %arrayidx5
+; CHECK: br label %for.inc10.loopexit
+; CHECK: for.body3.split:
+; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %lftr.wideiv = trunc i64 %indvars.iv to i32
+; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, %0
+; CHECK: br i1 %exitcond, label %for.end12.loopexit, label %for.body3
+; CHECK: for.inc10.loopexit:
+; CHECK: br label %for.inc10
+; CHECK: for.inc10:
+; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+; CHECK: %lftr.wideiv25 = trunc i64 %indvars.iv23 to i32
+; CHECK: %exitcond26 = icmp eq i32 %lftr.wideiv25, %0
+; CHECK: br i1 %exitcond26, label %for.body3.split, label %for.cond1.preheader
+; CHECK: for.end12.loopexit:
+; CHECK: br label %for.end12
+; CHECK: for.end12:
+; CHECK: ret void
+
+;;--------------------------------------Test case 02-------------------------------------
+
+;; for(int i=0;i<100;i++)
+;; for(int j=100;j>=0;j--)
+;; A[j][i] = A[j][i]+k;
+
+define void @interchange_02(i32 %k) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ]
+ br label %for.body3
+
+for.body3:
+ %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19
+ %0 = load i32, i32* %arrayidx5
+ %add = add nsw i32 %0, %k
+ store i32 %add, i32* %arrayidx5
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ %cmp2 = icmp sgt i64 %indvars.iv, 0
+ br i1 %cmp2, label %for.body3, label %for.inc10
+
+for.inc10:
+ %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+ %exitcond = icmp eq i64 %indvars.iv.next20, 100
+ br i1 %exitcond, label %for.end11, label %for.cond1.preheader
+
+for.end11:
+ ret void
+}
+
+; CHECK-LABEL: @interchange_02
+; CHECK: entry:
+; CHECK: br label %for.body3.preheader
+; CHECK: for.cond1.preheader.preheader:
+; CHECK: br label %for.cond1.preheader
+; CHECK: for.cond1.preheader:
+; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ]
+; CHECK: br label %for.body3.split1
+; CHECK: for.body3.preheader:
+; CHECK: br label %for.body3
+; CHECK: for.body3:
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ]
+; CHECK: br label %for.cond1.preheader.preheader
+; CHECK: for.body3.split1: ; preds = %for.cond1.preheader
+; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19
+; CHECK: %0 = load i32, i32* %arrayidx5
+; CHECK: %add = add nsw i32 %0, %k
+; CHECK: store i32 %add, i32* %arrayidx5
+; CHECK: br label %for.inc10
+; CHECK: for.body3.split:
+; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0
+; CHECK: br i1 %cmp2, label %for.body3, label %for.end11
+; CHECK: for.inc10:
+; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100
+; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader
+; CHECK: for.end11:
+; CHECK: ret void
+
+;;--------------------------------------Test case 03-------------------------------------
+;; Loops should not be interchanged in this case as it is not profitable.
+;; for(int i=0;i<100;i++)
+;; for(int j=0;j<100;j++)
+;; A[i][j] = A[i][j]+k;
+
+define void @interchange_03(i32 %k) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ]
+ br label %for.body3
+
+for.body3:
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx5
+ %add = add nsw i32 %0, %k
+ store i32 %add, i32* %arrayidx5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.inc10, label %for.body3
+
+for.inc10:
+ %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+ %exitcond23 = icmp eq i64 %indvars.iv.next22, 100
+ br i1 %exitcond23, label %for.end12, label %for.cond1.preheader
+
+for.end12:
+ ret void
+}
+
+; CHECK-LABEL: @interchange_03
+; CHECK: entry:
+; CHECK: br label %for.cond1.preheader.preheader
+; CHECK: for.cond1.preheader.preheader: ; preds = %entry
+; CHECK: br label %for.cond1.preheader
+; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10
+; CHECK: %indvars.iv21 = phi i64 [ %indvars.iv.next22, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ]
+; CHECK: br label %for.body3.preheader
+; CHECK: for.body3.preheader: ; preds = %for.cond1.preheader
+; CHECK: br label %for.body3
+; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
+; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv
+; CHECK: %0 = load i32, i32* %arrayidx5
+; CHECK: %add = add nsw i32 %0, %k
+; CHECK: store i32 %add, i32* %arrayidx5
+; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 100
+; CHECK: br i1 %exitcond, label %for.inc10, label %for.body3
+; CHECK: for.inc10: ; preds = %for.body3
+; CHECK: %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+; CHECK: %exitcond23 = icmp eq i64 %indvars.iv.next22, 100
+; CHECK: br i1 %exitcond23, label %for.end12, label %for.cond1.preheader
+; CHECK: for.end12: ; preds = %for.inc10
+; CHECK: ret void
+
+
+;;--------------------------------------Test case 04-------------------------------------
+;; Loops should not be interchanged in this case as it is not legal due to dependency.
+;; for(int j=0;j<99;j++)
+;; for(int i=0;i<99;i++)
+;; A[j][i+1] = A[j+1][i]+k;
+
+define void @interchange_04(i32 %k){
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ]
+ %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+ br label %for.body3
+
+for.body3:
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx5
+ %add6 = add nsw i32 %0, %k
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next
+ store i32 %add6, i32* %arrayidx11
+ %exitcond = icmp eq i64 %indvars.iv.next, 99
+ br i1 %exitcond, label %for.inc12, label %for.body3
+
+for.inc12:
+ %exitcond25 = icmp eq i64 %indvars.iv.next24, 99
+ br i1 %exitcond25, label %for.end14, label %for.cond1.preheader
+
+for.end14:
+ ret void
+}
+
+; CHECK-LABEL: @interchange_04
+; CHECK: entry:
+; CHECK: br label %for.cond1.preheader
+; CHECK: for.cond1.preheader: ; preds = %for.inc12, %entry
+; CHECK: %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ]
+; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+; CHECK: br label %for.body3
+; CHECK: for.body3: ; preds = %for.body3, %for.cond1.preheader
+; CHECK: %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv
+; CHECK: %0 = load i32, i32* %arrayidx5
+; CHECK: %add6 = add nsw i32 %0, %k
+; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next
+; CHECK: store i32 %add6, i32* %arrayidx11
+; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 99
+; CHECK: br i1 %exitcond, label %for.inc12, label %for.body3
+; CHECK: for.inc12: ; preds = %for.body3
+; CHECK: %exitcond25 = icmp eq i64 %indvars.iv.next24, 99
+; CHECK: br i1 %exitcond25, label %for.end14, label %for.cond1.preheader
+; CHECK: for.end14: ; preds = %for.inc12
+; CHECK: ret void
+
+
+
+;;--------------------------------------Test case 05-------------------------------------
+;; Loops not tightly nested are not interchanged
+;; for(int j=0;j<N;j++) {
+;; B[j] = j+k;
+;; for(int i=0;i<N;i++)
+;; A[j][i] = A[j][i]+B[j];
+;; }
+
+define void @interchange_05(i32 %k, i32 %N){
+entry:
+ %cmp30 = icmp sgt i32 %N, 0
+ br i1 %cmp30, label %for.body.lr.ph, label %for.end17
+
+for.body.lr.ph:
+ %0 = add i32 %N, -1
+ %1 = zext i32 %k to i64
+ br label %for.body
+
+for.body:
+ %indvars.iv32 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next33, %for.inc15 ]
+ %2 = add nsw i64 %indvars.iv32, %1
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @B, i64 0, i64 %indvars.iv32
+ %3 = trunc i64 %2 to i32
+ store i32 %3, i32* %arrayidx
+ br label %for.body3
+
+for.body3:
+ %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv32, i64 %indvars.iv
+ %4 = load i32, i32* %arrayidx7
+ %add10 = add nsw i32 %3, %4
+ store i32 %add10, i32* %arrayidx7
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.inc15, label %for.body3
+
+for.inc15:
+ %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1
+ %lftr.wideiv35 = trunc i64 %indvars.iv32 to i32
+ %exitcond36 = icmp eq i32 %lftr.wideiv35, %0
+ br i1 %exitcond36, label %for.end17, label %for.body
+
+for.end17:
+ ret void
+}
+
+; CHECK-LABEL: @interchange_05
+; CHECK: entry:
+; CHECK: %cmp30 = icmp sgt i32 %N, 0
+; CHECK: br i1 %cmp30, label %for.body.lr.ph, label %for.end17
+; CHECK: for.body.lr.ph:
+; CHECK: %0 = add i32 %N, -1
+; CHECK: %1 = zext i32 %k to i64
+; CHECK: br label %for.body
+; CHECK: for.body:
+; CHECK: %indvars.iv32 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next33, %for.inc15 ]
+; CHECK: %2 = add nsw i64 %indvars.iv32, %1
+; CHECK: %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @B, i64 0, i64 %indvars.iv32
+; CHECK: %3 = trunc i64 %2 to i32
+; CHECK: store i32 %3, i32* %arrayidx
+; CHECK: br label %for.body3.preheader
+; CHECK: for.body3.preheader:
+; CHECK: br label %for.body3
+; CHECK: for.body3:
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
+; CHECK: %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv32, i64 %indvars.iv
+; CHECK: %4 = load i32, i32* %arrayidx7
+; CHECK: %add10 = add nsw i32 %3, %4
+; CHECK: store i32 %add10, i32* %arrayidx7
+; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %lftr.wideiv = trunc i64 %indvars.iv to i32
+; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, %0
+; CHECK: br i1 %exitcond, label %for.inc15, label %for.body3
+; CHECK: for.inc15:
+; CHECK: %indvars.iv.next33 = add nuw nsw i64 %indvars.iv32, 1
+; CHECK: %lftr.wideiv35 = trunc i64 %indvars.iv32 to i32
+; CHECK: %exitcond36 = icmp eq i32 %lftr.wideiv35, %0
+; CHECK: br i1 %exitcond36, label %for.end17.loopexit, label %for.body
+; CHECK: for.end17.loopexit:
+; CHECK: br label %for.end17
+; CHECK: for.end17:
+; CHECK: ret void
+
+
+;;--------------------------------------Test case 06-------------------------------------
+;; Loops not tightly nested are not interchanged
+;; for(int j=0;j<N;j++) {
+;; foo();
+;; for(int i=2;i<N;i++)
+;; A[j][i] = A[j][i]+k;
+;; }
+
+define void @interchange_06(i32 %k, i32 %N) {
+entry:
+ %cmp22 = icmp sgt i32 %N, 0
+ br i1 %cmp22, label %for.body.lr.ph, label %for.end12
+
+for.body.lr.ph:
+ %0 = add i32 %N, -1
+ br label %for.body
+
+for.body:
+ %indvars.iv24 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next25, %for.inc10 ]
+ tail call void (...)* @foo()
+ br label %for.body3
+
+for.body3:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 2, %for.body ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv24, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx5
+ %add = add nsw i32 %1, %k
+ store i32 %add, i32* %arrayidx5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.inc10, label %for.body3
+
+for.inc10:
+ %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
+ %lftr.wideiv26 = trunc i64 %indvars.iv24 to i32
+ %exitcond27 = icmp eq i32 %lftr.wideiv26, %0
+ br i1 %exitcond27, label %for.end12, label %for.body
+
+for.end12:
+ ret void
+}
+;; Here we are checking if the inner phi is not split then we have not interchanged.
+; CHECK-LABEL: @interchange_06
+; CHECK: phi i64 [ %indvars.iv.next, %for.body3 ], [ 2, %for.body3.preheader ]
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: %1 = load
+
+;;--------------------------------------Test case 07-------------------------------------
+;; FIXME:
+;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported.
+;; for(gi=1;gi<N;gi++)
+;; for(gj=1;gj<M;gj++)
+;; A[gj][gi] = A[gj - 1][gi] + C[gj][gi];
+
+@gi = common global i32 0
+@gj = common global i32 0
+
+define void @interchange_07(i32 %N, i32 %M){
+entry:
+ store i32 1, i32* @gi
+ %cmp21 = icmp sgt i32 %N, 1
+ br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end16
+
+for.cond1.preheader.lr.ph:
+ %cmp218 = icmp sgt i32 %M, 1
+ %gi.promoted = load i32, i32* @gi
+ %0 = add i32 %M, -1
+ %1 = sext i32 %gi.promoted to i64
+ %2 = sext i32 %N to i64
+ %3 = add i32 %gi.promoted, 1
+ %4 = icmp slt i32 %3, %N
+ %smax = select i1 %4, i32 %N, i32 %3
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %indvars.iv25 = phi i64 [ %1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next26, %for.inc14 ]
+ br i1 %cmp218, label %for.body3, label %for.inc14
+
+for.body3:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ]
+ %5 = add nsw i64 %indvars.iv, -1
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25
+ %6 = load i32, i32* %arrayidx5
+ %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25
+ %7 = load i32, i32* %arrayidx9
+ %add = add nsw i32 %7, %6
+ %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv25
+ store i32 %add, i32* %arrayidx13
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.inc14, label %for.body3
+
+for.inc14:
+ %inc.lcssa23 = phi i32 [ 1, %for.cond1.preheader ], [ %M, %for.body3 ]
+ %indvars.iv.next26 = add nsw i64 %indvars.iv25, 1
+ %cmp = icmp slt i64 %indvars.iv.next26, %2
+ br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end16_crit_edge
+
+for.cond.for.end16_crit_edge:
+ store i32 %inc.lcssa23, i32* @gj
+ store i32 %smax, i32* @gi
+ br label %for.end16
+
+for.end16:
+ ret void
+}
+
+; CHECK-LABEL: @interchange_07
+; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.body3.preheader ]
+; CHECK: %5 = add nsw i64 %indvars.iv, -1
+; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25
+; CHECK: %6 = load i32, i32* %arrayidx5
+; CHECK: %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25
+
+;;------------------------------------------------Test case 08-------------------------------
+;; Test for interchange in loop nest greater than 2.
+;; for(int i=0;i<100;i++)
+;; for(int j=0;j<100;j++)
+;; for(int k=0;k<100;k++)
+;; D[i][k][j] = D[i][k][j]+t;
+
+define void @interchange_08(i32 %t){
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc15, %entry
+ %i.028 = phi i32 [ 0, %entry ], [ %inc16, %for.inc15 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc12, %for.cond1.preheader
+ %j.027 = phi i32 [ 0, %for.cond1.preheader ], [ %inc13, %for.inc12 ]
+ br label %for.body6
+
+for.body6: ; preds = %for.body6, %for.cond4.preheader
+ %k.026 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
+ %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027
+ %0 = load i32, i32* %arrayidx8
+ %add = add nsw i32 %0, %t
+ store i32 %add, i32* %arrayidx8
+ %inc = add nuw nsw i32 %k.026, 1
+ %exitcond = icmp eq i32 %inc, 100
+ br i1 %exitcond, label %for.inc12, label %for.body6
+
+for.inc12: ; preds = %for.body6
+ %inc13 = add nuw nsw i32 %j.027, 1
+ %exitcond29 = icmp eq i32 %inc13, 100
+ br i1 %exitcond29, label %for.inc15, label %for.cond4.preheader
+
+for.inc15: ; preds = %for.inc12
+ %inc16 = add nuw nsw i32 %i.028, 1
+ %exitcond30 = icmp eq i32 %inc16, 100
+ br i1 %exitcond30, label %for.end17, label %for.cond1.preheader
+
+for.end17: ; preds = %for.inc15
+ ret void
+}
+; CHECK-LABEL: @interchange_08
+; CHECK: entry:
+; CHECK: br label %for.cond1.preheader.preheader
+; CHECK: for.cond1.preheader.preheader: ; preds = %entry
+; CHECK: br label %for.cond1.preheader
+; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc15
+; CHECK: %i.028 = phi i32 [ %inc16, %for.inc15 ], [ 0, %for.cond1.preheader.preheader ]
+; CHECK: br label %for.body6.preheader
+; CHECK: for.cond4.preheader.preheader: ; preds = %for.body6
+; CHECK: br label %for.cond4.preheader
+; CHECK: for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc12
+; CHECK: %j.027 = phi i32 [ %inc13, %for.inc12 ], [ 0, %for.cond4.preheader.preheader ]
+; CHECK: br label %for.body6.split1
+; CHECK: for.body6.preheader: ; preds = %for.cond1.preheader
+; CHECK: br label %for.body6
+; CHECK: for.body6: ; preds = %for.body6.preheader, %for.body6.split
+; CHECK: %k.026 = phi i32 [ %inc, %for.body6.split ], [ 0, %for.body6.preheader ]
+; CHECK: br label %for.cond4.preheader.preheader
+; CHECK: for.body6.split1: ; preds = %for.cond4.preheader
+; CHECK: %arrayidx8 = getelementptr inbounds [100 x [100 x [100 x i32]]], [100 x [100 x [100 x i32]]]* @D, i32 0, i32 %i.028, i32 %k.026, i32 %j.027
+; CHECK: %0 = load i32, i32* %arrayidx8
+; CHECK: %add = add nsw i32 %0, %t
+; CHECK: store i32 %add, i32* %arrayidx8
+; CHECK: br label %for.inc12
+; CHECK: for.body6.split: ; preds = %for.inc12
+; CHECK: %inc = add nuw nsw i32 %k.026, 1
+; CHECK: %exitcond = icmp eq i32 %inc, 100
+; CHECK: br i1 %exitcond, label %for.inc15, label %for.body6
+; CHECK: for.inc12: ; preds = %for.body6.split1
+; CHECK: %inc13 = add nuw nsw i32 %j.027, 1
+; CHECK: %exitcond29 = icmp eq i32 %inc13, 100
+; CHECK: br i1 %exitcond29, label %for.body6.split, label %for.cond4.preheader
+; CHECK: for.inc15: ; preds = %for.body6.split
+; CHECK: %inc16 = add nuw nsw i32 %i.028, 1
+; CHECK: %exitcond30 = icmp eq i32 %inc16, 100
+; CHECK: br i1 %exitcond30, label %for.end17, label %for.cond1.preheader
+; CHECK: for.end17: ; preds = %for.inc15
+; CHECK: ret void
+
--- /dev/null
+; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s
+;; We test profitability model in these test cases.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i32]] zeroinitializer
+@B = common global [100 x [100 x i32]] zeroinitializer
+
+;;---------------------------------------Test case 01---------------------------------
+;; Loops interchange will result in code vectorization and hence profitable. Check for interchange.
+;; for(int i=1;i<N;i++)
+;; for(int j=1;j<N;j++)
+;; A[j][i] = A[j - 1][i] + B[j][i];
+
+define void @interchange_01(i32 %N) {
+entry:
+ %cmp27 = icmp sgt i32 %N, 1
+ br i1 %cmp27, label %for.cond1.preheader.lr.ph, label %for.end16
+
+for.cond1.preheader.lr.ph:
+ %0 = add i32 %N, -1
+ br label %for.body3.preheader
+
+for.body3.preheader:
+ %indvars.iv30 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next31, %for.inc14 ]
+ br label %for.body3
+
+for.body3:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.body3.preheader ]
+ %1 = add nsw i64 %indvars.iv, -1
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %1, i64 %indvars.iv30
+ %2 = load i32, i32* %arrayidx5
+ %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv30
+ %3 = load i32, i32* %arrayidx9
+ %add = add nsw i32 %3, %2
+ %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv30
+ store i32 %add, i32* %arrayidx13
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.inc14, label %for.body3
+
+for.inc14:
+ %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 1
+ %lftr.wideiv32 = trunc i64 %indvars.iv30 to i32
+ %exitcond33 = icmp eq i32 %lftr.wideiv32, %0
+ br i1 %exitcond33, label %for.end16, label %for.body3.preheader
+
+for.end16:
+ ret void
+}
+;; Here we are checking partial .ll to check if loop are interchanged.
+; CHECK-LABEL: @interchange_01
+; CHECK: for.body3.preheader: ; preds = %for.inc14, %for.cond1.preheader.lr.ph
+; CHECK: %indvars.iv30 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next31, %for.inc14 ]
+; CHECK: br label %for.body3.split2
+
+; CHECK: for.body3.preheader1: ; preds = %entry
+; CHECK: br label %for.body3
+
+; CHECK: for.body3: ; preds = %for.body3.preheader1, %for.body3.split
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 1, %for.body3.preheader1 ]
+; CHECK: br label %for.cond1.preheader.lr.ph
+
+; CHECK: for.body3.split2: ; preds = %for.body3.preheader
+; CHECK: %1 = add nsw i64 %indvars.iv, -1
+; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %1, i64 %indvars.iv30
+; CHECK: %2 = load i32, i32* %arrayidx5
+; CHECK: %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv30
+; CHECK: %3 = load i32, i32* %arrayidx9
+; CHECK: %add = add nsw i32 %3, %2
+; CHECK: %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv30
+; CHECK: store i32 %add, i32* %arrayidx13
+; CHECK: br label %for.inc14
+
+
+;; ---------------------------------------Test case 02---------------------------------
+;; Check loop interchange profitability model.
+;; This tests profitability model when operands of getelementpointer and not exactly the induction variable but some
+;; arithmetic operation on them.
+;; for(int i=1;i<N;i++)
+;; for(int j=1;j<N;j++)
+;; A[j-1][i-1] = A[j - 1][i-1] + B[j-1][i-1];
+
+define void @interchange_02(i32 %N) {
+entry:
+ %cmp32 = icmp sgt i32 %N, 1
+ br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end21
+
+for.cond1.preheader.lr.ph:
+ %0 = add i32 %N, -1
+ br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+ %indvars.iv35 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next36, %for.inc19 ]
+ %1 = add nsw i64 %indvars.iv35, -1
+ br label %for.body3
+
+for.body3:
+ %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+ %2 = add nsw i64 %indvars.iv, -1
+ %arrayidx6 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %2, i64 %1
+ %3 = load i32, i32* %arrayidx6
+ %arrayidx12 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %2, i64 %1
+ %4 = load i32, i32* %arrayidx12
+ %add = add nsw i32 %4, %3
+ store i32 %add, i32* %arrayidx6
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.inc19, label %for.body3
+
+for.inc19:
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %lftr.wideiv38 = trunc i64 %indvars.iv35 to i32
+ %exitcond39 = icmp eq i32 %lftr.wideiv38, %0
+ br i1 %exitcond39, label %for.end21, label %for.body3.lr.ph
+
+for.end21:
+ ret void
+}
+; CHECK-LABEL: @interchange_02
+; CHECK: for.body3.lr.ph: ; preds = %for.inc19, %for.cond1.preheader.lr.ph
+; CHECK: %indvars.iv35 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next36, %for.inc19 ]
+; CHECK: %0 = add nsw i64 %indvars.iv35, -1
+; CHECK: br label %for.body3.split1
+
+; CHECK: for.body3.preheader: ; preds = %entry
+; CHECK: %1 = add i32 %N, -1
+; CHECK: br label %for.body3
+
+; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3.split
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 1, %for.body3.preheader ]
+; CHECK: br label %for.cond1.preheader.lr.ph
+
+; CHECK: for.body3.split1: ; preds = %for.body3.lr.ph
+; CHECK: %2 = add nsw i64 %indvars.iv, -1
+; CHECK: %arrayidx6 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %2, i64 %0
+; CHECK: %3 = load i32, i32* %arrayidx6
+; CHECK: %arrayidx12 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %2, i64 %0
+; CHECK: %4 = load i32, i32* %arrayidx12
+; CHECK: %add = add nsw i32 %4, %3
+; CHECK: store i32 %add, i32* %arrayidx6
+; CHECK: br label %for.inc19
+
+
+;;---------------------------------------Test case 03---------------------------------
+;; Loops interchange is not profitable.
+;; for(int i=1;i<N;i++)
+;; for(int j=1;j<N;j++)
+;; A[i-1][j-1] = A[i - 1][j-1] + B[i][j];
+
+define void @interchange_03(i32 %N){
+entry:
+ %cmp31 = icmp sgt i32 %N, 1
+ br i1 %cmp31, label %for.cond1.preheader.lr.ph, label %for.end19
+
+for.cond1.preheader.lr.ph:
+ %0 = add i32 %N, -1
+ br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+ %indvars.iv34 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next35, %for.inc17 ]
+ %1 = add nsw i64 %indvars.iv34, -1
+ br label %for.body3
+
+for.body3:
+ %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+ %2 = add nsw i64 %indvars.iv, -1
+ %arrayidx6 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %1, i64 %2
+ %3 = load i32, i32* %arrayidx6
+ %arrayidx10 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv34, i64 %indvars.iv
+ %4 = load i32, i32* %arrayidx10
+ %add = add nsw i32 %4, %3
+ store i32 %add, i32* %arrayidx6
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.inc17, label %for.body3
+
+for.inc17:
+ %indvars.iv.next35 = add nuw nsw i64 %indvars.iv34, 1
+ %lftr.wideiv37 = trunc i64 %indvars.iv34 to i32
+ %exitcond38 = icmp eq i32 %lftr.wideiv37, %0
+ br i1 %exitcond38, label %for.end19, label %for.body3.lr.ph
+
+for.end19:
+ ret void
+}
+
+; CHECK-LABEL: @interchange_03
+; CHECK: for.body3.lr.ph:
+; CHECK: %indvars.iv34 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next35, %for.inc17 ]
+; CHECK: %1 = add nsw i64 %indvars.iv34, -1
+; CHECK: br label %for.body3.preheader
+; CHECK: for.body3.preheader:
+; CHECK: br label %for.body3
+; CHECK: for.body3:
+; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.body3.preheader ]
+; CHECK: %2 = add nsw i64 %indvars.iv, -1
+; CHECK: %arrayidx6 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %1, i64 %2
+; CHECK: %3 = load i32, i32* %arrayidx6
+; CHECK: %arrayidx10 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %indvars.iv34, i64 %indvars.iv
+; CHECK: %4 = load i32, i32* %arrayidx10