From 5c6e9ae14e012bfe1fdf0e19957922d3c4d85670 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 21 Oct 2012 15:03:07 +0000 Subject: [PATCH] LoopIdiom: Replace custom dependence analysis with LoopDependenceAnalysis. Requires a lot less code and complexity on loop-idiom's side and the more precise analysis can catch more cases, like the one I included as a test case. This also fixes the edge-case miscompilation from PR9481. I'm not entirely sure that all cases are handled that the old checks handled but LDA will certainly become smarter in the future. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166390 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 100 +++++------------- .../Transforms/LoopIdiom/multi-dimensional.ll | 49 +++++++++ test/Transforms/LoopIdiom/sideeffect.ll | 53 ++++++++++ 3 files changed, 128 insertions(+), 74 deletions(-) create mode 100644 test/Transforms/LoopIdiom/multi-dimensional.ll create mode 100644 test/Transforms/LoopIdiom/sideeffect.ll diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a44e798f121..5c398a8b693 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -48,6 +48,7 @@ #include "llvm/Module.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopDependenceAnalysis.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -106,6 +107,8 @@ namespace { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addPreserved(); AU.addRequired(); AU.addRequired(); @@ -122,6 +125,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(LoopDependenceAnalysis) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) @@ -163,15 +167,6 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, } while (!NowDeadInsts.empty()); } -/// deleteIfDeadInstruction - If the specified value is a dead instruction, -/// delete it and any recursively used instructions. -static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, - const TargetLibraryInfo *TLI) { - if (Instruction *I = dyn_cast(V)) - if (isInstructionTriviallyDead(I, TLI)) - deleteDeadInstruction(I, SE, TLI); -} - bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { CurLoop = L; @@ -368,35 +363,16 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { MSI, Ev, BECount); } - -/// mayLoopAccessLocation - Return true if the specified loop might access the -/// specified pointer location, which is a loop-strided access. The 'Access' -/// argument specifies what the verboten forms of access are (read or write). -static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, - Loop *L, const SCEV *BECount, - unsigned StoreSize, AliasAnalysis &AA, - Instruction *IgnoredStore) { - // Get the location that may be stored across the loop. Since the access is - // strided positively through memory, we say that the modified location starts - // at the pointer and has infinite size. - uint64_t AccessSize = AliasAnalysis::UnknownSize; - - // If the loop iterates a fixed number of times, we can refine the access size - // to be exactly the size of the memset, which is (BECount+1)*StoreSize - if (const SCEVConstant *BECst = dyn_cast(BECount)) - AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; - - // TODO: For this to be really effective, we have to dive into the pointer - // operand in the store. Store to &A[i] of 100 will always return may alias - // with store of &A[100], we need to StoreLoc to be "A" with size of 100, - // which will then no-alias a store to &A[100]. - AliasAnalysis::Location StoreLoc(Ptr, AccessSize); - +/// hasDependence - Uses the LoopDependenceAnalysis to determine whether 'Inst' +/// depends on any other value in the Loop 'L'. +static bool hasDependence(Instruction *Inst, Loop *L, + LoopDependenceAnalysis &LDA) { for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && - (AA.getModRefInfo(I, StoreLoc) & Access)) + if (&*I != Inst && I->mayReadOrWriteMemory() && + (I->mayWriteToMemory() || Inst->mayWriteToMemory()) && + LDA.depends(Inst, I)) return true; return false; @@ -474,6 +450,11 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, return false; } + // Make sure the store has no dependencies (i.e. other loads and stores) in + // the loop. + if (hasDependence(TheStore, CurLoop, getAnalysis())) + return false; + // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. @@ -482,25 +463,13 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, SCEVExpander Expander(*SE, "loop-idiom"); // Okay, we have a strided store "p[i]" of a splattable value. We can turn - // this into a memset in the loop preheader now if we want. However, this - // would be unsafe to do if there is anything else in the loop that may read - // or write to the aliased location. Check for any overlap by generating the - // base pointer and checking the region. + // this into a memset in the loop preheader now if we want. unsigned AddrSpace = cast(DestPtr->getType())->getAddressSpace(); Value *BasePtr = Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), Preheader->getTerminator()); - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, - StoreSize, getAnalysis(), TheStore)){ - Expander.clear(); - // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(BasePtr, *SE, TLI); - return false; - } - // Okay, everything looks good, insert the memset. // The # stored bytes is (BECount+1)*Size. Expand the trip count out to @@ -563,6 +532,14 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, LoadInst *LI = cast(SI->getValueOperand()); + // Make sure the load and the store have no dependencies (i.e. other loads and + // stores) in the loop. + // FIXME: If we want to form a memmove SI and LI can be dependent but the + // distance must be positive. LDA doesn't provide that info currently. + LoopDependenceAnalysis &LDA = getAnalysis(); + if (hasDependence(SI, CurLoop, LDA) || hasDependence(LI, CurLoop, LDA)) + return false; + // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. @@ -571,41 +548,16 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, SCEVExpander Expander(*SE, "loop-idiom"); // Okay, we have a strided store "p[i]" of a loaded value. We can turn - // this into a memcpy in the loop preheader now if we want. However, this - // would be unsafe to do if there is anything else in the loop that may read - // or write the memory region we're storing to. This includes the load that - // feeds the stores. Check for an alias by generating the base address and - // checking everything. + // this into a memcpy in the loop preheader now if we want. Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(), Builder.getInt8PtrTy(SI->getPointerAddressSpace()), Preheader->getTerminator()); - - if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, StoreSize, - getAnalysis(), SI)) { - Expander.clear(); - // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); - return false; - } - - // For a memcpy, we have to make sure that the input array is not being - // mutated by the loop. Value *LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(), Builder.getInt8PtrTy(LI->getPointerAddressSpace()), Preheader->getTerminator()); - if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount, - StoreSize, getAnalysis(), SI)) { - Expander.clear(); - // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(LoadBasePtr, *SE, TLI); - deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); - return false; - } - // Okay, everything is safe, we can transform this! diff --git a/test/Transforms/LoopIdiom/multi-dimensional.ll b/test/Transforms/LoopIdiom/multi-dimensional.ll new file mode 100644 index 00000000000..991f2688d76 --- /dev/null +++ b/test/Transforms/LoopIdiom/multi-dimensional.ll @@ -0,0 +1,49 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-apple-darwin10.0.0" + +%struct.ham = type { [2 x [2 x [2 x [16 x [8 x i32]]]]], i32, %struct.zot } +%struct.zot = type { i32, i16, i16, [2 x [1152 x i32]] } + +define void @test1(%struct.ham* nocapture %arg) nounwind { +bb: + br label %bb1 + +bb1: ; preds = %bb11, %bb + %tmp = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ] + br label %bb2 + +bb2: ; preds = %bb2, %bb1 + %tmp3 = phi i64 [ 0, %bb1 ], [ %tmp8, %bb2 ] + %tmp4 = getelementptr inbounds %struct.ham* %arg, i64 0, i32 0, i64 0, i64 1, i64 1, i64 %tmp, i64 %tmp3 + store i32 0, i32* %tmp4, align 4 + %tmp5 = getelementptr inbounds %struct.ham* %arg, i64 0, i32 0, i64 0, i64 1, i64 0, i64 %tmp, i64 %tmp3 + store i32 0, i32* %tmp5, align 4 + %tmp6 = getelementptr inbounds %struct.ham* %arg, i64 0, i32 0, i64 0, i64 0, i64 1, i64 %tmp, i64 %tmp3 + store i32 0, i32* %tmp6, align 4 + %tmp7 = getelementptr inbounds %struct.ham* %arg, i64 0, i32 0, i64 0, i64 0, i64 0, i64 %tmp, i64 %tmp3 + store i32 0, i32* %tmp7, align 4 + %tmp8 = add i64 %tmp3, 1 + %tmp9 = trunc i64 %tmp8 to i32 + %tmp10 = icmp eq i32 %tmp9, 8 + br i1 %tmp10, label %bb11, label %bb2 + +bb11: ; preds = %bb2 + %tmp12 = add i64 %tmp, 1 + %tmp13 = trunc i64 %tmp12 to i32 + %tmp14 = icmp eq i32 %tmp13, 16 + br i1 %tmp14, label %bb15, label %bb1 + +bb15: ; preds = %bb11 + ret void + +; CHECK: @test1 +; CHECK: bb1: +; CHECK-NOT: store +; CHECK: call void @llvm.memset.p0i8.i64 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64 +; CHECK-NOT: store +; CHECK: br +} diff --git a/test/Transforms/LoopIdiom/sideeffect.ll b/test/Transforms/LoopIdiom/sideeffect.ll new file mode 100644 index 00000000000..460e5233f95 --- /dev/null +++ b/test/Transforms/LoopIdiom/sideeffect.ll @@ -0,0 +1,53 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-apple-darwin10.0.0" + +; PR9481 +define i32 @test1() nounwind uwtable ssp { +entry: + %a = alloca [10 x i8], align 1 + br label %for.body + +for.cond1.preheader: ; preds = %for.body + %arrayidx5.phi.trans.insert = getelementptr inbounds [10 x i8]* %a, i64 0, i64 0 + %.pre = load i8* %arrayidx5.phi.trans.insert, align 1 + br label %for.body3 + +for.body: ; preds = %for.body, %entry + %indvars.iv29 = phi i64 [ 0, %entry ], [ %indvars.iv.next30, %for.body ] + call void (...)* @bar() nounwind + %arrayidx = getelementptr inbounds [10 x i8]* %a, i64 0, i64 %indvars.iv29 + store i8 23, i8* %arrayidx, align 1 + %indvars.iv.next30 = add i64 %indvars.iv29, 1 + %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32 + %exitcond32 = icmp eq i32 %lftr.wideiv31, 1000000 + br i1 %exitcond32, label %for.cond1.preheader, label %for.body + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %0 = phi i8 [ %.pre, %for.cond1.preheader ], [ %add, %for.body3 ] + %indvars.iv = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + call void (...)* @bar() nounwind + %arrayidx7 = getelementptr inbounds [10 x i8]* %a, i64 0, i64 %indvars.iv + %1 = load i8* %arrayidx7, align 1 + %add = add i8 %1, %0 + store i8 %add, i8* %arrayidx7, align 1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1000000 + br i1 %exitcond, label %for.end12, label %for.body3 + +for.end12: ; preds = %for.body3 + %arrayidx13 = getelementptr inbounds [10 x i8]* %a, i64 0, i64 2 + %2 = load i8* %arrayidx13, align 1 + %conv14 = sext i8 %2 to i32 + %arrayidx15 = getelementptr inbounds [10 x i8]* %a, i64 0, i64 6 + %3 = load i8* %arrayidx15, align 1 + %conv16 = sext i8 %3 to i32 + %add17 = add nsw i32 %conv16, %conv14 + ret i32 %add17 + +; CHECK: @test1 +; CHECK-NOT: @llvm.memset +} + +declare void @bar(...) -- 2.34.1