From: Arnold Schwaighofer
Date: Tue, 28 Jan 2014 01:01:53 +0000 (+0000)
Subject: LoopVectorize: Support conditional stores by scalarizing
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=a47aa4b4efb2b6c5944c2edb9a4346c0f3ffcaf1;p=oota-llvm.git

LoopVectorize: Support conditional stores by scalarizing

The vectorizer takes a loop like this and widens all instructions
except for the store. The stores are scalarized/unrolled and hidden
behind an "if" block.

  for (i = 0; i < 128; ++i) {
    if (a[i] < 10)
      a[i] += val;
  }

  for (i = 0; i < 128; i += 2) {
    v = a[i:i+1];
    v0 = (extract v, 0) + val;
    v1 = (extract v, 1) + val;
    if ((extract v, 0) < 10)
      a[i] = v0;
    if ((extract v, 1) < 10)
      a[i+1] = v1;
  }

The vectorizer relies on subsequent optimizations to sink the
scalarized instructions into the conditional blocks where they are
anticipated.

The flag "vectorize-num-stores-pred" controls how many stores in a loop
may be predicated this way. It defaults to zero, so vectorization of
conditional stores is disabled by default for now.

This patch also changes the unroll heuristic when the flag
"enable-loadstore-runtime-unroll" is set (off by default): it unrolls
small loops until the load/store ports are saturated. This heuristic
uses TTI's getMaxUnrollFactor as an estimate of the number of
load/store ports.

I also added a second flag, -enable-cond-stores-vec, which enables
vectorization of conditional stores. There is no cost model for
vectorization of conditional stores in place yet, however, so enabling
it will not generate good code at the moment.

rdar://15892953

Results for x86-64 -O3 -mavx +/- -mllvm -enable-loadstore-runtime-unroll
-vectorize-num-stores-pred=1 (before the BFI change):

 Performance Regressions:
   Benchmarks/Ptrdist/yacr2/yacr2   7.35% (maze3() is identical but 10% slower)
   Applications/siod/siod           2.18%

 Performance Improvements:
   mesa                            -4.42%
   libquantum                      -4.15%

With a patch that slightly changes the register heuristics (by
subtracting the induction variable on both sides of the register
pressure equation, as the induction variable is probably not really
unrolled):

 Performance Regressions:
   Benchmarks/Ptrdist/yacr2/yacr2   7.73%
   Applications/siod/siod           1.97%

 Performance Improvements:
   libquantum                     -13.05% (we now also unroll quantum_toffoli)
   mesa                            -4.27%

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200270 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5d1f85f86df..78674952847 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,20 @@ static cl::opt<unsigned> SmallLoopCost(
     "small-loop-cost", cl::init(20), cl::Hidden,
     cl::desc("The cost of a loop that is considered 'small' by the unroller."));
 
+// Runtime unroll loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
+    "enable-loadstore-runtime-unroll", cl::init(false), cl::Hidden,
+    cl::desc("Enable runtime unrolling until load/store ports are saturated"));
+
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+    "vectorize-num-stores-pred", cl::init(0), cl::Hidden,
+    cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
+    cl::desc("Enable if predication of stores during vectorization."));
+
 namespace {
 
 // Forward declarations.
@@ -275,8 +289,11 @@ protected:
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars.
-  virtual void scalarizeInstruction(Instruction *Instr);
+  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+  /// scalarized instruction behind an if block predicated on the control
+  /// dependence of the instruction.
+  virtual void scalarizeInstruction(Instruction *Instr,
+                                    bool IfPredicateStore = false);
 
   /// Vectorize Load and Store instructions.
   virtual void vectorizeMemoryInstruction(Instruction *Instr);
 
@@ -379,7 +396,7 @@ protected:
   ///The ExitBlock of the scalar loop.
   BasicBlock *LoopExitBlock;
   ///The vector loop body.
-  BasicBlock *LoopVectorBody;
+  SmallVector<BasicBlock *, 4> LoopVectorBody;
   ///The scalar loop body.
   BasicBlock *LoopScalarBody;
   /// A list of all bypass blocks. The first block is the entry of the loop.
@@ -406,7 +423,7 @@ public:
       InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
 
 private:
-  virtual void scalarizeInstruction(Instruction *Instr);
+  virtual void scalarizeInstruction(Instruction *Instr, bool IfPredicateStore = false);
   virtual void vectorizeMemoryInstruction(Instruction *Instr);
   virtual Value *getBroadcastInstrs(Value *V);
   virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
@@ -456,10 +473,14 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
 /// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
+  unsigned NumLoads;
+  unsigned NumStores;
+  unsigned NumPredStores;
+
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
                             DominatorTree *DT, TargetLibraryInfo *TLI)
-      : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
-        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
+      : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
+        DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
         MaxSafeDepDistBytes(-1U) {}
 
   /// This enum represents the kinds of reductions that we support.
@@ -1206,7 +1227,9 @@ void LoopVectorizationLegality::RuntimePointerCheck::insert(
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   // We need to place the broadcast of invariant variables outside the loop.
   Instruction *Instr = dyn_cast<Instruction>(V);
-  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+  bool NewInstr =
+      (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
+                          Instr->getParent()) != LoopVectorBody.end());
   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
 
   // Place the code for broadcasting invariant variables in the new preheader.
@@ -1411,6 +1434,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
 
+  if (SI && Legal->blockNeedsPredication(SI->getParent()))
+    return scalarizeInstruction(Instr, true);
+
   if (ScalarAllocatedSize != VectorElementSize)
     return scalarizeInstruction(Instr);
 
@@ -1529,7 +1555,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   }
 }
 
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
@@ -1574,10 +1600,37 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
+  Instruction *InsertPt = Builder.GetInsertPoint();
+  BasicBlock *IfBlock = Builder.GetInsertBlock();
+  BasicBlock *CondBlock = 0;
+
+  VectorParts Cond;
+  Loop *VectorLp = 0;
+  if (IfPredicateStore) {
+    assert(Instr->getParent()->getSinglePredecessor() &&
+           "Only support single predecessor blocks");
+    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+                          Instr->getParent());
+    VectorLp = LI->getLoopFor(IfBlock);
+    assert(VectorLp && "Must have a loop for this block");
+  }
+
   // For each vector unroll 'part':
   for (unsigned Part = 0; Part < UF; ++Part) {
     // For each scalar that we create:
     for (unsigned Width = 0; Width < VF; ++Width) {
+
+      // Start if-block.
+      Value *Cmp = 0;
+      if (IfPredicateStore) {
+        Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
+        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
+        CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+        VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+        // Update Builder with newly created basic block.
+        Builder.SetInsertPoint(InsertPt);
+      }
+
       Instruction *Cloned = Instr->clone();
       if (!IsVoidRetTy)
         Cloned->setName(Instr->getName() + ".cloned");
@@ -1598,6 +1651,16 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       if (!IsVoidRetTy)
         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                        Builder.getInt32(Width));
+      // End if-block.
+      if (IfPredicateStore) {
+        BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+        VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+        Builder.SetInsertPoint(InsertPt);
+        Instruction *OldBr = IfBlock->getTerminator();
+        BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+        OldBr->eraseFromParent();
+        IfBlock = NewIfBlock;
+      }
     }
   }
 }
@@ -2101,7 +2164,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
   LoopScalarPreHeader = ScalarPH;
   LoopMiddleBlock = MiddleBlock;
   LoopExitBlock = ExitBlock;
-  LoopVectorBody = VecBody;
+  LoopVectorBody.push_back(VecBody);
   LoopScalarBody = OldBasicBlock;
 
   LoopVectorizeHints Hints(Lp, true);
@@ -2369,25 +2432,42 @@ struct CSEDenseMapInfo {
 };
 }
 
+/// \brief Check whether this block is a predicated block.
+/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
+/// = ...;  " blocks. We start with one vectorized basic block. For every
+/// conditional block we split this vectorized block. Therefore, every second
+/// block will be a predicated one.
+static bool isPredicatedBlock(unsigned BlockNum) {
+  return BlockNum % 2;
+}
+
 ///\brief Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
+static void cse(SmallVector<BasicBlock *, 4> &BBs) {
   // Perform simple cse.
   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
-    Instruction *In = I++;
+  for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
+    BasicBlock *BB = BBs[i];
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+      Instruction *In = I++;
 
-    if (!CSEDenseMapInfo::canHandle(In))
-      continue;
+      if (!CSEDenseMapInfo::canHandle(In))
+        continue;
 
-    // Check if we can replace this instruction with any of the
-    // visited instructions.
+      if (Instruction *V = CSEMap.lookup(In)) {
+        In->replaceAllUsesWith(V);
+        In->eraseFromParent();
+        continue;
+      }
+      // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
+      // ...;" blocks for predicated stores. Every second block is a predicated
+      // block.
+      if (isPredicatedBlock(i))
+        continue;
 
-    CSEMap[In] = In;
+      CSEMap[In] = In;
+    }
   }
 }
 
@@ -2503,7 +2583,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
       // first unroll part.
       Value *StartVal = (part == 0) ? VectorStart : Identity;
       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
-      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
+                                                  LoopVectorBody.back());
     }
 
     // Before each round, move the insertion point right between
@@ -2522,7 +2603,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
       Value *StartVal = (part == 0) ? VectorStart : Identity;
       for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
-      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+      NewPhi->addIncoming(RdxExitVal[part],
+                          LoopVectorBody.back());
       RdxParts.push_back(NewPhi);
     }
 
@@ -2695,7 +2777,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
       Type *VecTy = (VF == 1) ? PN->getType() :
                     VectorType::get(PN->getType(), VF);
       Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
-                                    LoopVectorBody->getFirstInsertionPt());
+                                    LoopVectorBody.back()->getFirstInsertionPt());
     }
     PV->push_back(P);
     return;
@@ -3044,7 +3126,19 @@ void InnerLoopVectorizer::updateAnalysis() {
   for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
     DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
-  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+
+  // Due to if predication of stores we might create a sequence of "if(pred)
+  // a[i] = ...;  " blocks.
+  for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
+    if (i == 0)
+      DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
+    else if (isPredicatedBlock(i)) {
+      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
+    } else {
+      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
+    }
+  }
+
   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
   DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
@@ -4292,6 +4386,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
           return false;
         }
+        NumLoads++;
         Loads.push_back(Ld);
         DepChecker.addAccess(Ld);
         continue;
@@ -4305,6 +4400,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
         DEBUG(dbgs() << "LV: Found a non-simple store.\n");
         return false;
       }
+      NumStores++;
       Stores.push_back(St);
       DepChecker.addAccess(St);
     }
@@ -4816,7 +4912,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
     }
 
     // We don't predicate stores at the moment.
-    if (it->mayWriteToMemory() || it->mayThrow())
+    if (it->mayWriteToMemory()) {
+      StoreInst *SI = dyn_cast<StoreInst>(it);
+      // We only support predication of stores in basic blocks with one
+      // predecessor.
+      if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
+          !SafePtrs.count(SI->getPointerOperand()) ||
+          !SI->getParent()->getSinglePredecessor())
+        return false;
+    }
+    if (it->mayThrow())
       return false;
 
     // Check that we don't have a constant expression that can trap as operand.
@@ -4851,6 +4956,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
     return Factor;
   }
 
+  if (!EnableCondStoresVectorization && Legal->NumPredStores) {
+    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+    return Factor;
+  }
+
   // Find the trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
@@ -5066,6 +5176,17 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
     return UF;
   }
 
+  if (EnableLoadStoreRuntimeUnroll &&
+      !Legal->getRuntimePointerCheck()->Need &&
+      LoopCost < SmallLoopCost) {
+    // Unroll until store/load ports (estimated by max unroll factor) are
+    // saturated.
+    unsigned UnrollStores = UF / (Legal->NumStores ? Legal->NumStores : 1);
+    unsigned UnrollLoads = UF / (Legal->NumLoads ? Legal->NumLoads : 1);
+    UF = std::max(std::min(UnrollStores, UnrollLoads), 1u);
+    return UF;
+  }
+
   // We want to unroll tiny loops in order to reduce the loop overhead.
   // We assume that the cost overhead is 1 and we use the cost model
   // to estimate the cost of the loop and unroll until the cost of the
@@ -5515,7 +5636,8 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
 }
 
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
+                                             bool IfPredicateStore) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
@@ -5560,10 +5682,39 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
+  Instruction *InsertPt = Builder.GetInsertPoint();
+  BasicBlock *IfBlock = Builder.GetInsertBlock();
+  BasicBlock *CondBlock = 0;
+
+  VectorParts Cond;
+  Loop *VectorLp = 0;
+  if (IfPredicateStore) {
+    assert(Instr->getParent()->getSinglePredecessor() &&
+           "Only support single predecessor blocks");
+    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+                          Instr->getParent());
+    VectorLp = LI->getLoopFor(IfBlock);
+    assert(VectorLp && "Must have a loop for this block");
+  }
+
   // For each vector unroll 'part':
   for (unsigned Part = 0; Part < UF; ++Part) {
     // For each scalar that we create:
 
+    // Start an "if (pred) a[i] = ..." block.
+    Value *Cmp = 0;
+    if (IfPredicateStore) {
+      if (Cond[Part]->getType()->isVectorTy())
+        Cond[Part] =
+            Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
+      Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
+                               ConstantInt::get(Cond[Part]->getType(), 1));
+      CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+      VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+      // Update Builder with newly created basic block.
+      Builder.SetInsertPoint(InsertPt);
+    }
+
     Instruction *Cloned = Instr->clone();
     if (!IsVoidRetTy)
       Cloned->setName(Instr->getName() + ".cloned");
@@ -5580,11 +5731,25 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
     // so that future users will be able to use it.
     if (!IsVoidRetTy)
      VecResults[Part] = Cloned;
+
+    // End if-block.
+    if (IfPredicateStore) {
+      BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+      VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+      Builder.SetInsertPoint(InsertPt);
+      Instruction *OldBr = IfBlock->getTerminator();
+      BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+      OldBr->eraseFromParent();
+      IfBlock = NewIfBlock;
+    }
   }
 }
 
 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
-  return scalarizeInstruction(Instr);
+  StoreInst *SI = dyn_cast<StoreInst>(Instr);
+  bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+
+  return scalarizeInstruction(Instr, IfPredicateStore);
 }
 
 Value *InnerLoopUnroller::reverseVector(Value *Vec) {
diff --git a/test/Transforms/LoopVectorize/if-pred-stores.ll b/test/Transforms/LoopVectorize/if-pred-stores.ll
new file mode 100644
index 00000000000..909a1cb17d1
--- /dev/null
+++ b/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -0,0 +1,86 @@
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-unroll=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-unroll=1 -loop-vectorize -enable-cond-stores-vec < %s | FileCheck %s --check-prefix=VEC
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Test predication of stores.
+define i32 @test(i32* nocapture %f) #0 {
+entry:
+  br label %for.body
+
+; VEC-LABEL: test
+; VEC:   %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
+; VEC:   %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
+; VEC:   %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
+; VEC:   %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0
+; VEC:   %[[v12:.+]] = icmp eq i1 %[[v11]], true
+; VEC:   br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
+;
+; VEC: [[cond]]:
+; VEC:   %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC:   %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
+; VEC:   store i32 %[[v13]], i32* %[[v14]], align 4
+; VEC:   br label %[[else:.+]]
+;
+; VEC: [[else]]:
+; VEC:   %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1
+; VEC:   %[[v16:.+]] = icmp eq i1 %[[v15]], true
+; VEC:   br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
;
+; VEC: [[cond2]]:
+; VEC:   %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC:   %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
+; VEC:   store i32 %[[v17]], i32* %[[v18]], align 4
+; VEC:   br label %[[else2:.+]]
+;
+; VEC: [[else2]]:

+; UNROLL-LABEL: test
+; UNROLL: vector.body:
+; UNROLL:   %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0
+; UNROLL:   %[[IND1:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 1
+; UNROLL:   %[[v0:[a-zA-Z0-9]+]] = getelementptr inbounds i32* %f, i64 %[[IND]]
+; UNROLL:   %[[v1:[a-zA-Z0-9]+]] = getelementptr inbounds i32* %f, i64 %[[IND1]]
+; UNROLL:   %[[v2:[a-zA-Z0-9]+]] = load i32* %[[v0]], align 4
+; UNROLL:   %[[v3:[a-zA-Z0-9]+]] = load i32* %[[v1]], align 4
+; UNROLL:   %[[v4:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v2]], 100
+; UNROLL:   %[[v5:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v3]], 100
+; UNROLL:   %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20
+; UNROLL:   %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20
+; UNROLL:   %[[v8:[a-zA-Z0-9]+]] = icmp eq i1 %[[v4]], true
+; UNROLL:   br i1 %[[v8]], label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]]
+;
+; UNROLL: [[cond]]:
+; UNROLL:   store i32 %[[v6]], i32* %[[v0]], align 4
+; UNROLL:   br label %[[else]]
+;
+; UNROLL: [[else]]:
+; UNROLL:   %[[v9:[a-zA-Z0-9]+]] = icmp eq i1 %[[v5]], true
+; UNROLL:   br i1 %[[v9]], label %[[cond2:[a-zA-Z0-9.]+]], label %[[else2:[a-zA-Z0-9.]+]]
+;
+; UNROLL: [[cond2]]:
+; UNROLL:   store i32 %[[v7]], i32* %[[v1]], align 4
+; UNROLL:   br label %[[else2]]
+;
+; UNROLL: [[else2]]:
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i32* %f, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 100
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %add = add nsw i32 %0, 20
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 128
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 0
+}
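
For reference, the load/store-saturation heuristic added to selectUnrollFactor
above can be restated as a small standalone function. This is an illustrative
sketch, not LLVM code: the function name unrollForLoadStoreThroughput and its
parameters are invented here, and in the patch the starting unroll factor has
already been computed from register pressure, with TTI's getMaxUnrollFactor
standing in for the number of load/store ports.

  #include <algorithm>

  // Sketch of the heuristic guarded by -enable-loadstore-runtime-unroll:
  // divide the port-limited unroll budget by the number of memory
  // operations per iteration and take the tighter of the two bounds,
  // unrolling at least once.
  static unsigned unrollForLoadStoreThroughput(unsigned MaxUF,    // stand-in for load/store ports
                                               unsigned NumLoads, // loads per iteration
                                               unsigned NumStores) {
    unsigned UnrollStores = MaxUF / (NumStores ? NumStores : 1);
    unsigned UnrollLoads  = MaxUF / (NumLoads ? NumLoads : 1);
    return std::max(std::min(UnrollStores, UnrollLoads), 1u);
  }

With MaxUF = 8 and one load plus one store per iteration this unrolls 8x; with
one load and four stores it unrolls 2x, i.e. just enough copies to keep the
store ports busy without oversubscribing them.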