From 2f46bb8178e30e3b845859a44b57c048db06ef84 Mon Sep 17 00:00:00 2001 From: Dale Johannesen Date: Wed, 14 Jan 2009 02:35:31 +0000 Subject: [PATCH] Fix the time regression I introduced in 464.h264ref with my earlier patch to this file. The issue there was that all uses of an IV inside a loop are actually references to Base[IV*2], and there was one use outside that was the same but LSR didn't see the base or the scaling because it didn't recurse into uses outside the loop; thus, it used base+IV*scale mode inside the loop instead of pulling base out of the loop. This was extra bad because register pressure later forced both base and IV into memory. Doing that recursion, at least enough to figure out addressing modes, is a good idea in general; the change in AddUsersIfInteresting does this. However, there were side effects.... It is also possible for recursing outside the loop to introduce another IV where there was only 1 before (if the refs inside are not scaled and the ref outside is). I don't think this is a common case, but it's in the testsuite. It is right to be very aggressive about getting rid of such introduced IVs (CheckForIVReuse and the handling of nonzero RewriteFactor in StrengthReduceStridedIVUsers). In the testcase in question the new IV produced this way has both a nonconstant stride and a nonzero base, neither of which was handled before. And when inserting new code that feeds into a PHI, it's right to put such code at the original location rather than in the PHI's immediate predecessor(s) when the original location is outside the loop (a case that couldn't happen before) (RewriteInstructionToUseNewBase); better to avoid making multiple copies of it in this case. Also, the mechanism for keeping SCEV's corresponding to GEP's no longer works, as the GEP might change after its SCEV is remembered, invalidating the SCEV, and we might get a bad SCEV value when looking up the GEP again for a later loop. This also couldn't happen before, as we weren't recursing into GEP's outside the loop. Also, when we build an expression that involves a (possibly non-affine) IV from a different loop as well as an IV from the one we're interested in (containsAddRecFromDifferentLoop), don't recurse into that. We can't do much with it and will get in trouble if we try to create new non-affine IVs or something. More testcases are coming. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@62212 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/LoopInfo.h | 4 +- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 231 ++++++++++++++---- ...9-01-13-nonconstant-stride-outside-loop.ll | 39 +++ 3 files changed, 228 insertions(+), 46 deletions(-) create mode 100644 test/Transforms/LoopStrengthReduce/2009-01-13-nonconstant-stride-outside-loop.ll diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 5ba8a7a0042..7aa325a9d1d 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -731,8 +731,8 @@ public: // Internals - static bool isNotAlreadyContainedIn(LoopBase *SubLoop, - LoopBase *ParentLoop) { + static bool isNotAlreadyContainedIn(const LoopBase *SubLoop, + const LoopBase *ParentLoop) { if (SubLoop == 0) return true; if (SubLoop == ParentLoop) return false; return isNotAlreadyContainedIn(SubLoop->getParentLoop(), ParentLoop); diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index c8f2ce29249..425add0a8be 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -130,6 +130,12 @@ namespace { /// dependent on random ordering of pointers in the process. SmallVector StrideOrder; + /// GEPlist - A list of the GEP's that have been remembered in the SCEV + /// data structures. SCEV does not know to update these when the operands + /// of the GEP are changed, which means we cannot leave them live across + /// loops. + SmallVector GEPlist; + /// CastedValues - As we need to cast values to uintptr_t, this keeps track /// of the casted version of each value. This is accessed by /// getCastedVersionOf. @@ -191,7 +197,7 @@ private: bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, const SCEVHandle *&CondStride); bool RequiresTypeConversion(const Type *Ty, const Type *NewTy); - int64_t CheckForIVReuse(bool, bool, bool, const SCEVHandle&, + SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&, IVExpr&, const Type*, const std::vector& UsersToProcess); bool ValidStride(bool, int64_t, @@ -340,13 +346,58 @@ SCEVHandle LoopStrengthReduce::GetExpressionSCEV(Instruction *Exp) { } SE->setSCEV(GEP, GEPVal); + GEPlist.push_back(GEP); return GEPVal; } +/// containsAddRecFromDifferentLoop - Determine whether expression S involves a +/// subexpression that is an AddRec from a loop other than L. An outer loop +/// of L is OK, but not an inner loop nor a disjoint loop. +static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) { + // This is very common, put it first. + if (isa(S)) + return false; + if (SCEVCommutativeExpr *AE = dyn_cast(S)) { + for (unsigned int i=0; i< AE->getNumOperands(); i++) + if (containsAddRecFromDifferentLoop(AE->getOperand(i), L)) + return true; + return false; + } + if (SCEVAddRecExpr *AE = dyn_cast(S)) { + if (const Loop *newLoop = AE->getLoop()) { + if (newLoop == L) + return false; + // if newLoop is an outer loop of L, this is OK. + if (!LoopInfoBase::isNotAlreadyContainedIn(L, newLoop)) + return false; + } + return true; + } + if (SCEVUDivExpr *DE = dyn_cast(S)) + return containsAddRecFromDifferentLoop(DE->getLHS(), L) || + containsAddRecFromDifferentLoop(DE->getRHS(), L); +#if 0 + // SCEVSDivExpr has been backed out temporarily, but will be back; we'll + // need this when it is. + if (SCEVSDivExpr *DE = dyn_cast(S)) + return containsAddRecFromDifferentLoop(DE->getLHS(), L) || + containsAddRecFromDifferentLoop(DE->getRHS(), L); +#endif + if (SCEVTruncateExpr *TE = dyn_cast(S)) + return containsAddRecFromDifferentLoop(TE->getOperand(), L); + if (SCEVZeroExtendExpr *ZE = dyn_cast(S)) + return containsAddRecFromDifferentLoop(ZE->getOperand(), L); + if (SCEVSignExtendExpr *SE = dyn_cast(S)) + return containsAddRecFromDifferentLoop(SE->getOperand(), L); + return false; +} + /// getSCEVStartAndStride - Compute the start and stride of this expression, /// returning false if the expression is not a start/stride pair, or true if it /// is. The stride must be a loop invariant expression, but the start may be -/// a mix of loop invariant and loop variant expressions. +/// a mix of loop invariant and loop variant expressions. The start cannot, +/// however, contain an AddRec from a different loop, unless that loop is an +/// outer loop of the current loop. static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, SCEVHandle &Start, SCEVHandle &Stride, ScalarEvolution *SE) { @@ -378,6 +429,12 @@ static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, // FIXME: Generalize to non-affine IV's. if (!AddRec->isAffine()) return false; + // If Start contains an SCEVAddRecExpr from a different loop, other than an + // outer loop of the current loop, reject it. SCEV has no concept of operating + // on one loop at a time so don't confuse it with such expressions. + if (containsAddRecFromDifferentLoop(Start, L)) + return false; + Start = SE->getAddExpr(Start, AddRec->getOperand(0)); if (!isa(AddRec->getOperand(1))) @@ -508,14 +565,22 @@ bool LoopStrengthReduce::AddUsersIfInteresting(Instruction *I, Loop *L, if (isa(User) && Processed.count(User)) continue; - // If this is an instruction defined in a nested loop, or outside this loop, - // don't recurse into it. + // Descend recursively, but not into PHI nodes outside the current loop. + // It's important to see the entire expression outside the loop to get + // choices that depend on addressing mode use right, although we won't + // consider references ouside the loop in all cases. + // If User is already in Processed, we don't want to recurse into it again, + // but do want to record a second reference in the same instruction. bool AddUserToIVUsers = false; if (LI->getLoopFor(User->getParent()) != L) { - DOUT << "FOUND USER in other loop: " << *User - << " OF SCEV: " << *ISE << "\n"; - AddUserToIVUsers = true; - } else if (!AddUsersIfInteresting(User, L, Processed)) { + if (isa(User) || Processed.count(User) || + !AddUsersIfInteresting(User, L, Processed)) { + DOUT << "FOUND USER in other loop: " << *User + << " OF SCEV: " << *ISE << "\n"; + AddUserToIVUsers = true; + } + } else if (Processed.count(User) || + !AddUsersIfInteresting(User, L, Processed)) { DOUT << "FOUND USER: " << *User << " OF SCEV: " << *ISE << "\n"; AddUserToIVUsers = true; @@ -704,34 +769,45 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase, PHINode *PN = cast(Inst); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { if (PN->getIncomingValue(i) == OperandValToReplace) { - // If this is a critical edge, split the edge so that we do not insert the - // code on all predecessor/successor paths. We do this unless this is the - // canonical backedge for this loop, as this can make some inserted code - // be in an illegal position. - BasicBlock *PHIPred = PN->getIncomingBlock(i); - if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 && - (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) { - - // First step, split the critical edge. - SplitCriticalEdge(PHIPred, PN->getParent(), P, false); - - // Next step: move the basic block. In particular, if the PHI node - // is outside of the loop, and PredTI is in the loop, we want to - // move the block to be immediately before the PHI block, not - // immediately after PredTI. - if (L->contains(PHIPred) && !L->contains(PN->getParent())) { - BasicBlock *NewBB = PN->getIncomingBlock(i); - NewBB->moveBefore(PN->getParent()); + // If the original expression is outside the loop, put the replacement + // code in the same place as the original expression, + // which need not be an immediate predecessor of this PHI. This way we + // need only one copy of it even if it is referenced multiple times in + // the PHI. We don't do this when the original expression is inside the + // loop because multiple copies sometimes do useful sinking of code in that + // case(?). + Instruction *OldLoc = dyn_cast(OperandValToReplace); + if (L->contains(OldLoc->getParent())) { + // If this is a critical edge, split the edge so that we do not insert the + // code on all predecessor/successor paths. We do this unless this is the + // canonical backedge for this loop, as this can make some inserted code + // be in an illegal position. + BasicBlock *PHIPred = PN->getIncomingBlock(i); + if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 && + (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) { + + // First step, split the critical edge. + SplitCriticalEdge(PHIPred, PN->getParent(), P, false); + + // Next step: move the basic block. In particular, if the PHI node + // is outside of the loop, and PredTI is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after PredTI. + if (L->contains(PHIPred) && !L->contains(PN->getParent())) { + BasicBlock *NewBB = PN->getIncomingBlock(i); + NewBB->moveBefore(PN->getParent()); + } + + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); } - - // Splitting the edge can reduce the number of PHI entries we have. - e = PN->getNumIncomingValues(); } - Value *&Code = InsertedCode[PN->getIncomingBlock(i)]; if (!Code) { // Insert the code into the end of the predecessor block. - Instruction *InsertPt = PN->getIncomingBlock(i)->getTerminator(); + Instruction *InsertPt = (L->contains(OldLoc->getParent())) ? + PN->getIncomingBlock(i)->getTerminator() : + OldLoc->getParent()->getTerminator(); Code = InsertCodeForBaseAtPosition(NewBase, Rewriter, InsertPt, L); // Adjust the type back to match the PHI. Note that we can't use @@ -1168,7 +1244,12 @@ bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1, /// mode scale component and optional base reg. This allows the users of /// this stride to be rewritten as prev iv * factor. It returns 0 if no /// reuse is possible. Factors can be negative on same targets, e.g. ARM. -int64_t LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, +/// +/// If all uses are outside the loop, we don't require that all multiplies +/// be folded into the addressing mode, nor even that the factor be constant; +/// a multiply (executed once) outside the loop is better than another IV +/// within. Well, usually. +SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, bool AllUsesAreAddresses, bool AllUsesAreOutsideLoop, const SCEVHandle &Stride, @@ -1180,7 +1261,7 @@ int64_t LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, ++NewStride) { std::map::iterator SI = IVsByStride.find(StrideOrder[NewStride]); - if (SI == IVsByStride.end()) + if (SI == IVsByStride.end() || !isa(SI->first)) continue; int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); if (SI->first != Stride && @@ -1202,11 +1283,53 @@ int64_t LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, if (II->Base->isZero() && !RequiresTypeConversion(II->Base->getType(), Ty)) { IV = *II; - return Scale; + return SE->getIntegerSCEV(Scale, Stride->getType()); } } + } else if (AllUsesAreOutsideLoop) { + // Accept nonconstant strides here; it is really really right to substitute + // an existing IV if we can. + for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e; + ++NewStride) { + std::map::iterator SI = + IVsByStride.find(StrideOrder[NewStride]); + if (SI == IVsByStride.end() || !isa(SI->first)) + continue; + int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); + if (SI->first != Stride && SSInt != 1) + continue; + for (std::vector::iterator II = SI->second.IVs.begin(), + IE = SI->second.IVs.end(); II != IE; ++II) + // Accept nonzero base here. + // Only reuse previous IV if it would not require a type conversion. + if (!RequiresTypeConversion(II->Base->getType(), Ty)) { + IV = *II; + return Stride; + } + } + // Special case, old IV is -1*x and this one is x. Can treat this one as + // -1*old. + for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e; + ++NewStride) { + std::map::iterator SI = + IVsByStride.find(StrideOrder[NewStride]); + if (SI == IVsByStride.end()) + continue; + if (SCEVMulExpr *ME = dyn_cast(SI->first)) + if (SCEVConstant *SC = dyn_cast(ME->getOperand(0))) + if (Stride == ME->getOperand(1) && + SC->getValue()->getSExtValue() == -1LL) + for (std::vector::iterator II = SI->second.IVs.begin(), + IE = SI->second.IVs.end(); II != IE; ++II) + // Accept nonzero base here. + // Only reuse previous IV if it would not require type conversion. + if (!RequiresTypeConversion(II->Base->getType(), Ty)) { + IV = *II; + return SE->getIntegerSCEV(-1LL, Stride->getType()); + } + } } - return 0; + return SE->getIntegerSCEV(0, Stride->getType()); } /// PartitionByIsUseOfPostIncrementedValue - Simple boolean predicate that @@ -1357,12 +1480,13 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, IVExpr ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty), SE->getIntegerSCEV(0, Type::Int32Ty), 0, 0); - int64_t RewriteFactor = 0; - RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses, + SCEVHandle RewriteFactor = + CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses, AllUsesAreOutsideLoop, Stride, ReuseIV, CommonExprs->getType(), UsersToProcess); - if (RewriteFactor != 0) { + if (!isa(RewriteFactor) || + !cast(RewriteFactor)->isZero()) { DOUT << "BASED ON IV of STRIDE " << *ReuseIV.Stride << " and BASE " << *ReuseIV.Base << " :\n"; NewPHI = ReuseIV.PHI; @@ -1390,7 +1514,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, Value *CommonBaseV = PreheaderRewriter.expandCodeFor(CommonExprs, PreInsertPt); - if (RewriteFactor == 0) { + if (isa(RewriteFactor) && + cast(RewriteFactor)->isZero()) { // Create a new Phi for this base, and stick it in the loop header. NewPHI = PHINode::Create(ReplacedTy, "iv.", PhiInsertBefore); ++NumInserted; @@ -1537,18 +1662,33 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, // If we are reusing the iv, then it must be multiplied by a constant // factor take advantage of addressing mode scale component. - if (RewriteFactor != 0) { - RewriteExpr = SE->getMulExpr(SE->getIntegerSCEV(RewriteFactor, - RewriteExpr->getType()), + if (!isa(RewriteFactor) || + !cast(RewriteFactor)->isZero()) { + // If we're reusing an IV with a nonzero base (currently this happens + // only when all reuses are outside the loop) subtract that base here. + // The base has been used to initialize the PHI node but we don't want + // it here. + if (!ReuseIV.Base->isZero()) + RewriteExpr = SE->getMinusSCEV(RewriteExpr, ReuseIV.Base); + + // Multiply old variable, with base removed, by new scale factor. + RewriteExpr = SE->getMulExpr(RewriteFactor, RewriteExpr); // The common base is emitted in the loop preheader. But since we // are reusing an IV, it has not been used to initialize the PHI node. // Add it to the expression used to rewrite the uses. + // When this use is outside the loop, we earlier subtracted the + // common base, and are adding it back here. Use the same expression + // as before, rather than CommonBaseV, so DAGCombiner will zap it. if (!isa(CommonBaseV) || - !cast(CommonBaseV)->isZero()) - RewriteExpr = SE->getAddExpr(RewriteExpr, + !cast(CommonBaseV)->isZero()) { + if (L->contains(User.Inst->getParent())) + RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(CommonBaseV)); + else + RewriteExpr = SE->getAddExpr(RewriteExpr, CommonExprs); + } } // Now that we know what we need to do, insert code before User for the @@ -2174,6 +2314,9 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { IVUsesByStride.clear(); IVsByStride.clear(); StrideOrder.clear(); + for (unsigned i=0; ideleteValueFromRecords(GEPlist[i]); + GEPlist.clear(); // Clean up after ourselves if (!DeadInsts.empty()) { diff --git a/test/Transforms/LoopStrengthReduce/2009-01-13-nonconstant-stride-outside-loop.ll b/test/Transforms/LoopStrengthReduce/2009-01-13-nonconstant-stride-outside-loop.ll new file mode 100644 index 00000000000..a7072858c8a --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/2009-01-13-nonconstant-stride-outside-loop.ll @@ -0,0 +1,39 @@ +; RUN: llvm-as < %s | opt -loop-reduce | llvm-dis | grep phi | count 1 +; RUN: llvm-as < %s | opt -loop-reduce | llvm-dis | grep mul | count 1 +; ModuleID = '' +; Make sure examining a fuller expression outside the loop doesn't cause us to create a second +; IV of stride %3. +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i386-apple-darwin9.5" + %struct.anon = type { %struct.obj*, %struct.obj* } + %struct.obj = type { i16, i16, { %struct.anon } } +@heap_size = external global i32 ; [#uses=1] +@"\01LC85" = external constant [39 x i8] ; <[39 x i8]*> [#uses=1] + +declare i32 @sprintf(i8*, i8*, ...) nounwind + +define %struct.obj* @gc_status(%struct.obj* %args) nounwind { +entry: + br label %bb1.i + +bb.i2: ; preds = %bb2.i3 + %indvar.next24 = add i32 %m.0.i, 1 ; [#uses=1] + br label %bb1.i + +bb1.i: ; preds = %bb.i2, %entry + %m.0.i = phi i32 [ 0, %entry ], [ %indvar.next24, %bb.i2 ] ; [#uses=4] + %0 = icmp slt i32 %m.0.i, 0 ; [#uses=1] + br i1 %0, label %bb2.i3, label %nactive_heaps.exit + +bb2.i3: ; preds = %bb1.i + %1 = load %struct.obj** null, align 4 ; <%struct.obj*> [#uses=1] + %2 = icmp eq %struct.obj* %1, null ; [#uses=1] + br i1 %2, label %nactive_heaps.exit, label %bb.i2 + +nactive_heaps.exit: ; preds = %bb2.i3, %bb1.i + %3 = load i32* @heap_size, align 4 ; [#uses=1] + %4 = mul i32 %3, %m.0.i ; [#uses=1] + %5 = sub i32 %4, 0 ; [#uses=1] + %6 = tail call i32 (i8*, i8*, ...)* @sprintf(i8* null, i8* getelementptr ([39 x i8]* @"\01LC85", i32 0, i32 0), i32 %m.0.i, i32 0, i32 %5, i32 0) nounwind ; [#uses=0] + ret %struct.obj* null +} -- 2.34.1