X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FScalar%2FLoopStrengthReduce.cpp;h=d8f6cc18a1e942f5ccdca745a96542568b605860;hb=13ad5aaaff8a446758b402fd5e9aea22f5bc5682;hp=a6795a72ef9b17524e97c5b40810dfce36c0e94e;hpb=c7749b747e634f310b7f0e075f7329dcb88f58cc;p=oota-llvm.git diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index a6795a72ef9..d8f6cc18a1e 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -7,8 +7,15 @@ // //===----------------------------------------------------------------------===// // +// This transformation analyzes and transforms the induction variables (and +// computations derived from them) into forms suitable for efficient execution +// on the target. +// // This pass performs a strength reduction on array references inside loops that -// have as one or more of their components the loop induction variable. +// have as one or more of their components the loop induction variable, it +// rewrites expressions to take advantage of scaled-index addressing modes +// available on the target, and it performs a variety of other optimizations +// related to loop induction variables. // //===----------------------------------------------------------------------===// @@ -20,18 +27,19 @@ #include "llvm/Type.h" #include "llvm/DerivedTypes.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include using namespace llvm; @@ -42,6 +50,7 @@ STATISTIC(NumVariable, "Number of PHIs with variable strides"); STATISTIC(NumEliminated, "Number of strides eliminated"); STATISTIC(NumShadow, "Number of Shadow IVs optimized"); STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses"); +STATISTIC(NumLoopCond, "Number of loop terminating conds optimized"); static cl::opt EnableFullLSRMode("enable-full-lsr", cl::init(false), @@ -51,84 +60,46 @@ namespace { struct BasedUser; - /// IVStrideUse - Keep track of one use of a strided induction variable, where - /// the stride is stored externally. The Offset member keeps track of the - /// offset from the IV, User is the actual user of the operand, and - /// 'OperandValToReplace' is the operand of the User that is the use. - struct VISIBILITY_HIDDEN IVStrideUse { - SCEVHandle Offset; - Instruction *User; - Value *OperandValToReplace; - - // isUseOfPostIncrementedValue - True if this should use the - // post-incremented version of this IV, not the preincremented version. - // This can only be set in special cases, such as the terminating setcc - // instruction for a loop or uses dominated by the loop. - bool isUseOfPostIncrementedValue; - - IVStrideUse(const SCEVHandle &Offs, Instruction *U, Value *O) - : Offset(Offs), User(U), OperandValToReplace(O), - isUseOfPostIncrementedValue(false) {} - }; - - /// IVUsersOfOneStride - This structure keeps track of all instructions that - /// have an operand that is based on the trip count multiplied by some stride. - /// The stride for all of these users is common and kept external to this - /// structure. - struct VISIBILITY_HIDDEN IVUsersOfOneStride { - /// Users - Keep track of all of the users of this stride as well as the - /// initial value and the operand that uses the IV. - std::vector Users; - - void addUser(const SCEVHandle &Offset,Instruction *User, Value *Operand) { - Users.push_back(IVStrideUse(Offset, User, Operand)); - } - }; - /// IVInfo - This structure keeps track of one IV expression inserted during /// StrengthReduceStridedIVUsers. It contains the stride, the common base, as /// well as the PHI node and increment value created for rewrite. - struct VISIBILITY_HIDDEN IVExpr { - SCEVHandle Stride; - SCEVHandle Base; + struct IVExpr { + const SCEV *Stride; + const SCEV *Base; PHINode *PHI; - IVExpr(const SCEVHandle &stride, const SCEVHandle &base, PHINode *phi) + IVExpr(const SCEV *const stride, const SCEV *const base, PHINode *phi) : Stride(stride), Base(base), PHI(phi) {} }; /// IVsOfOneStride - This structure keeps track of all IV expression inserted /// during StrengthReduceStridedIVUsers for a particular stride of the IV. - struct VISIBILITY_HIDDEN IVsOfOneStride { + struct IVsOfOneStride { std::vector IVs; - void addIV(const SCEVHandle &Stride, const SCEVHandle &Base, PHINode *PHI) { + void addIV(const SCEV *const Stride, const SCEV *const Base, PHINode *PHI) { IVs.push_back(IVExpr(Stride, Base, PHI)); } }; - class VISIBILITY_HIDDEN LoopStrengthReduce : public LoopPass { + class LoopStrengthReduce : public LoopPass { + IVUsers *IU; LoopInfo *LI; DominatorTree *DT; ScalarEvolution *SE; bool Changed; - /// IVUsesByStride - Keep track of all uses of induction variables that we - /// are interested in. The key of the map is the stride of the access. - std::map IVUsesByStride; - /// IVsByStride - Keep track of all IVs that have been inserted for a /// particular stride. - std::map IVsByStride; + std::map IVsByStride; - /// StrideOrder - An ordering of the keys in IVUsesByStride that is stable: - /// We use this to iterate over the IVUsesByStride collection without being - /// dependent on random ordering of pointers in the process. - SmallVector StrideOrder; + /// StrideNoReuse - Keep track of all the strides whose ivs cannot be + /// reused (nor should they be rewritten to reuse other strides). + SmallSet StrideNoReuse; /// DeadInsts - Keep track of instructions we may have made dead, so that /// we can remove them after we are done working. - SmallVector DeadInsts; + SmallVector DeadInsts; /// TLI - Keep a pointer of a TargetLowering to consult for determining /// transformation profitability. @@ -155,34 +126,39 @@ namespace { AU.addRequired(); AU.addRequired(); AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } -private: - bool AddUsersIfInteresting(Instruction *I, Loop *L, - SmallPtrSet &Processed); + private: ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond, IVStrideUse* &CondUse, - const SCEVHandle* &CondStride); + const SCEV *const * &CondStride); + void OptimizeIndvars(Loop *L); + void OptimizeLoopCountIV(Loop *L); + void OptimizeLoopTermCond(Loop *L); /// OptimizeShadowIV - If IV is used in a int-to-float cast /// inside the loop then try to eliminate the cast opeation. void OptimizeShadowIV(Loop *L); - /// OptimizeSMax - Rewrite the loop's terminating condition - /// if it uses an smax computation. - ICmpInst *OptimizeSMax(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse); + /// OptimizeMax - Rewrite the loop's terminating condition + /// if it uses a max computation. + ICmpInst *OptimizeMax(Loop *L, ICmpInst *Cond, + IVStrideUse* &CondUse); bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, - const SCEVHandle *&CondStride); + const SCEV *const * &CondStride); bool RequiresTypeConversion(const Type *Ty, const Type *NewTy); - SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&, + const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *const&, IVExpr&, const Type*, const std::vector& UsersToProcess); - bool ValidStride(bool, int64_t, + bool ValidScale(bool, int64_t, + const std::vector& UsersToProcess); + bool ValidOffset(bool, int64_t, int64_t, const std::vector& UsersToProcess); - SCEVHandle CollectIVUsers(const SCEVHandle &Stride, + const SCEV *CollectIVUsers(const SCEV *const &Stride, IVUsersOfOneStride &Uses, Loop *L, bool &AllUsesAreAddresses, @@ -192,11 +168,11 @@ private: const std::vector &UsersToProcess, const Loop *L, bool AllUsesAreAddresses, - SCEVHandle Stride); + const SCEV *Stride); void PrepareToStrengthReduceFully( std::vector &UsersToProcess, - SCEVHandle Stride, - SCEVHandle CommonExprs, + const SCEV *Stride, + const SCEV *CommonExprs, const Loop *L, SCEVExpander &PreheaderRewriter); void PrepareToStrengthReduceFromSmallerStride( @@ -206,12 +182,13 @@ private: Instruction *PreInsertPt); void PrepareToStrengthReduceWithNewPhi( std::vector &UsersToProcess, - SCEVHandle Stride, - SCEVHandle CommonExprs, + const SCEV *Stride, + const SCEV *CommonExprs, Value *CommonBaseV, + Instruction *IVIncInsertPt, const Loop *L, SCEVExpander &PreheaderRewriter); - void StrengthReduceStridedIVUsers(const SCEVHandle &Stride, + void StrengthReduceStridedIVUsers(const SCEV *const &Stride, IVUsersOfOneStride &Uses, Loop *L); void DeleteTriviallyDeadInstructions(); @@ -232,28 +209,13 @@ Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { void LoopStrengthReduce::DeleteTriviallyDeadInstructions() { if (DeadInsts.empty()) return; - // Sort the deadinsts list so that we can trivially eliminate duplicates as we - // go. The code below never adds a non-dead instruction to the worklist, but - // callers may not be so careful. - array_pod_sort(DeadInsts.begin(), DeadInsts.end()); - - // Drop duplicate instructions and those with uses. - for (unsigned i = 0, e = DeadInsts.size()-1; i < e; ++i) { - Instruction *I = DeadInsts[i]; - if (!I->use_empty()) DeadInsts[i] = 0; - while (i != e && DeadInsts[i+1] == I) - DeadInsts[++i] = 0; - } - while (!DeadInsts.empty()) { - Instruction *I = DeadInsts.back(); + Instruction *I = dyn_cast_or_null(DeadInsts.back()); DeadInsts.pop_back(); if (I == 0 || !isInstructionTriviallyDead(I)) continue; - SE->deleteValueFromRecords(I); - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { if (Instruction *U = dyn_cast(*OI)) { *OI = 0; @@ -270,7 +232,7 @@ void LoopStrengthReduce::DeleteTriviallyDeadInstructions() { /// containsAddRecFromDifferentLoop - Determine whether expression S involves a /// subexpression that is an AddRec from a loop other than L. An outer loop /// of L is OK, but not an inner loop nor a disjoint loop. -static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) { +static bool containsAddRecFromDifferentLoop(const SCEV *S, Loop *L) { // This is very common, put it first. if (isa(S)) return false; @@ -285,7 +247,7 @@ static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) { if (newLoop == L) return false; // if newLoop is an outer loop of L, this is OK. - if (!LoopInfoBase::isNotAlreadyContainedIn(L, newLoop)) + if (!LoopInfo::isNotAlreadyContainedIn(L, newLoop)) return false; } return true; @@ -305,124 +267,6 @@ static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) { return false; } -/// getSCEVStartAndStride - Compute the start and stride of this expression, -/// returning false if the expression is not a start/stride pair, or true if it -/// is. The stride must be a loop invariant expression, but the start may be -/// a mix of loop invariant and loop variant expressions. The start cannot, -/// however, contain an AddRec from a different loop, unless that loop is an -/// outer loop of the current loop. -static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, - SCEVHandle &Start, SCEVHandle &Stride, - ScalarEvolution *SE, DominatorTree *DT) { - SCEVHandle TheAddRec = Start; // Initialize to zero. - - // If the outer level is an AddExpr, the operands are all start values except - // for a nested AddRecExpr. - if (const SCEVAddExpr *AE = dyn_cast(SH)) { - for (unsigned i = 0, e = AE->getNumOperands(); i != e; ++i) - if (SCEVAddRecExpr *AddRec = - dyn_cast(AE->getOperand(i))) { - if (AddRec->getLoop() == L) - TheAddRec = SE->getAddExpr(AddRec, TheAddRec); - else - return false; // Nested IV of some sort? - } else { - Start = SE->getAddExpr(Start, AE->getOperand(i)); - } - - } else if (isa(SH)) { - TheAddRec = SH; - } else { - return false; // not analyzable. - } - - const SCEVAddRecExpr *AddRec = dyn_cast(TheAddRec); - if (!AddRec || AddRec->getLoop() != L) return false; - - // FIXME: Generalize to non-affine IV's. - if (!AddRec->isAffine()) return false; - - // If Start contains an SCEVAddRecExpr from a different loop, other than an - // outer loop of the current loop, reject it. SCEV has no concept of - // operating on more than one loop at a time so don't confuse it with such - // expressions. - if (containsAddRecFromDifferentLoop(AddRec->getOperand(0), L)) - return false; - - Start = SE->getAddExpr(Start, AddRec->getOperand(0)); - - if (!isa(AddRec->getOperand(1))) { - // If stride is an instruction, make sure it dominates the loop preheader. - // Otherwise we could end up with a use before def situation. - BasicBlock *Preheader = L->getLoopPreheader(); - if (!AddRec->getOperand(1)->dominates(Preheader, DT)) - return false; - - DOUT << "[" << L->getHeader()->getName() - << "] Variable stride: " << *AddRec << "\n"; - } - - Stride = AddRec->getOperand(1); - return true; -} - -/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression -/// and now we need to decide whether the user should use the preinc or post-inc -/// value. If this user should use the post-inc version of the IV, return true. -/// -/// Choosing wrong here can break dominance properties (if we choose to use the -/// post-inc value when we cannot) or it can end up adding extra live-ranges to -/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we -/// should use the post-inc value). -static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV, - Loop *L, DominatorTree *DT, Pass *P, - SmallVectorImpl &DeadInsts){ - // If the user is in the loop, use the preinc value. - if (L->contains(User->getParent())) return false; - - BasicBlock *LatchBlock = L->getLoopLatch(); - - // Ok, the user is outside of the loop. If it is dominated by the latch - // block, use the post-inc value. - if (DT->dominates(LatchBlock, User->getParent())) - return true; - - // There is one case we have to be careful of: PHI nodes. These little guys - // can live in blocks that do not dominate the latch block, but (since their - // uses occur in the predecessor block, not the block the PHI lives in) should - // still use the post-inc value. Check for this case now. - PHINode *PN = dyn_cast(User); - if (!PN) return false; // not a phi, not dominated by latch block. - - // Look at all of the uses of IV by the PHI node. If any use corresponds to - // a block that is not dominated by the latch block, give up and use the - // preincremented value. - unsigned NumUses = 0; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (PN->getIncomingValue(i) == IV) { - ++NumUses; - if (!DT->dominates(LatchBlock, PN->getIncomingBlock(i))) - return false; - } - - // Okay, all uses of IV by PN are in predecessor blocks that really are - // dominated by the latch block. Split the critical edges and use the - // post-incremented value. - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (PN->getIncomingValue(i) == IV) { - SplitCriticalEdge(PN->getIncomingBlock(i), PN->getParent(), P, false); - // Splitting the critical edge can reduce the number of entries in this - // PHI. - e = PN->getNumIncomingValues(); - if (--NumUses == 0) break; - } - - // PHI node might have become a constant value after SplitCriticalEdge. - DeadInsts.push_back(User); - - return true; -} - /// isAddressUse - Returns true if the specified instruction is using the /// specified value as an address. static bool isAddressUse(Instruction *Inst, Value *OperandVal) { @@ -453,9 +297,9 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { /// getAccessType - Return the type of the memory being accessed. static const Type *getAccessType(const Instruction *Inst) { - const Type *UseTy = Inst->getType(); + const Type *AccessTy = Inst->getType(); if (const StoreInst *SI = dyn_cast(Inst)) - UseTy = SI->getOperand(0)->getType(); + AccessTy = SI->getOperand(0)->getType(); else if (const IntrinsicInst *II = dyn_cast(Inst)) { // Addressing modes can also be folded into prefetches and a variety // of intrinsics. @@ -465,95 +309,11 @@ static const Type *getAccessType(const Instruction *Inst) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - UseTy = II->getOperand(1)->getType(); + AccessTy = II->getOperand(1)->getType(); break; } } - return UseTy; -} - -/// AddUsersIfInteresting - Inspect the specified instruction. If it is a -/// reducible SCEV, recursively add its users to the IVUsesByStride set and -/// return true. Otherwise, return false. -bool LoopStrengthReduce::AddUsersIfInteresting(Instruction *I, Loop *L, - SmallPtrSet &Processed) { - if (!SE->isSCEVable(I->getType())) - return false; // Void and FP expressions cannot be reduced. - - // LSR is not APInt clean, do not touch integers bigger than 64-bits. - if (SE->getTypeSizeInBits(I->getType()) > 64) - return false; - - if (!Processed.insert(I)) - return true; // Instruction already handled. - - // Get the symbolic expression for this instruction. - SCEVHandle ISE = SE->getSCEV(I); - if (isa(ISE)) return false; - - // Get the start and stride for this expression. - SCEVHandle Start = SE->getIntegerSCEV(0, ISE->getType()); - SCEVHandle Stride = Start; - if (!getSCEVStartAndStride(ISE, L, Start, Stride, SE, DT)) - return false; // Non-reducible symbolic expression, bail out. - - std::vector IUsers; - // Collect all I uses now because IVUseShouldUsePostIncValue may - // invalidate use_iterator. - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; ++UI) - IUsers.push_back(cast(*UI)); - - for (unsigned iused_index = 0, iused_size = IUsers.size(); - iused_index != iused_size; ++iused_index) { - - Instruction *User = IUsers[iused_index]; - - // Do not infinitely recurse on PHI nodes. - if (isa(User) && Processed.count(User)) - continue; - - // Descend recursively, but not into PHI nodes outside the current loop. - // It's important to see the entire expression outside the loop to get - // choices that depend on addressing mode use right, although we won't - // consider references ouside the loop in all cases. - // If User is already in Processed, we don't want to recurse into it again, - // but do want to record a second reference in the same instruction. - bool AddUserToIVUsers = false; - if (LI->getLoopFor(User->getParent()) != L) { - if (isa(User) || Processed.count(User) || - !AddUsersIfInteresting(User, L, Processed)) { - DOUT << "FOUND USER in other loop: " << *User - << " OF SCEV: " << *ISE << "\n"; - AddUserToIVUsers = true; - } - } else if (Processed.count(User) || - !AddUsersIfInteresting(User, L, Processed)) { - DOUT << "FOUND USER: " << *User - << " OF SCEV: " << *ISE << "\n"; - AddUserToIVUsers = true; - } - - if (AddUserToIVUsers) { - IVUsersOfOneStride &StrideUses = IVUsesByStride[Stride]; - if (StrideUses.Users.empty()) // First occurrence of this stride? - StrideOrder.push_back(Stride); - - // Okay, we found a user that we cannot reduce. Analyze the instruction - // and decide what to do with it. If we are a use inside of the loop, use - // the value before incrementation, otherwise use it after incrementation. - if (IVUseShouldUsePostIncValue(User, I, L, DT, this, DeadInsts)) { - // The value used will be incremented by the stride more than we are - // expecting, so subtract this off. - SCEVHandle NewStart = SE->getMinusSCEV(Start, Stride); - StrideUses.addUser(NewStart, User, I); - StrideUses.Users.back().isUseOfPostIncrementedValue = true; - DOUT << " USING POSTINC SCEV, START=" << *NewStart<< "\n"; - } else { - StrideUses.addUser(Start, User, I); - } - } - } - return true; + return AccessTy; } namespace { @@ -567,7 +327,7 @@ namespace { /// this use. As the use is processed, information gets moved from this /// field to the Imm field (below). BasedUser values are sorted by this /// field. - SCEVHandle Base; + const SCEV *Base; /// Inst - The instruction using the induction variable. Instruction *Inst; @@ -580,7 +340,7 @@ namespace { /// before Inst, because it will be folded into the imm field of the /// instruction. This is also sometimes used for loop-variant values that /// must be added inside the loop. - SCEVHandle Imm; + const SCEV *Imm; /// Phi - The induction variable that performs the striding that /// should be used for this user. @@ -594,42 +354,44 @@ namespace { bool isUseOfPostIncrementedValue; BasedUser(IVStrideUse &IVSU, ScalarEvolution *se) - : SE(se), Base(IVSU.Offset), Inst(IVSU.User), - OperandValToReplace(IVSU.OperandValToReplace), + : SE(se), Base(IVSU.getOffset()), Inst(IVSU.getUser()), + OperandValToReplace(IVSU.getOperandValToReplace()), Imm(SE->getIntegerSCEV(0, Base->getType())), - isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue) {} + isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {} // Once we rewrite the code to insert the new IVs we want, update the // operands of Inst to use the new expression 'NewBase', with 'Imm' added // to it. - void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase, + void RewriteInstructionToUseNewBase(const SCEV *const &NewBase, Instruction *InsertPt, SCEVExpander &Rewriter, Loop *L, Pass *P, - SmallVectorImpl &DeadInsts); + LoopInfo &LI, + SmallVectorImpl &DeadInsts); - Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, + Value *InsertCodeForBaseAtPosition(const SCEV *const &NewBase, const Type *Ty, SCEVExpander &Rewriter, - Instruction *IP, Loop *L); + Instruction *IP, Loop *L, + LoopInfo &LI); void dump() const; }; } void BasedUser::dump() const { - cerr << " Base=" << *Base; - cerr << " Imm=" << *Imm; - cerr << " Inst: " << *Inst; + errs() << " Base=" << *Base; + errs() << " Imm=" << *Imm; + errs() << " Inst: " << *Inst; } -Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, +Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase, const Type *Ty, SCEVExpander &Rewriter, - Instruction *IP, Loop *L) { + Instruction *IP, Loop *L, + LoopInfo &LI) { // Figure out where we *really* want to insert this code. In particular, if // the user is inside of a loop that is nested inside of L, we really don't // want to insert this expression before the user, we'd rather pull it out as // many loops as possible. - LoopInfo &LI = Rewriter.getLoopInfo(); Instruction *BaseInsertPt = IP; // Figure out the most-nested loop that IP is in. @@ -643,19 +405,13 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, InsertLoop = InsertLoop->getParentLoop(); } - Value *Base = Rewriter.expandCodeFor(NewBase, Ty, BaseInsertPt); + Value *Base = Rewriter.expandCodeFor(NewBase, 0, BaseInsertPt); - // If there is no immediate value, skip the next part. - if (Imm->isZero()) - return Base; + const SCEV *NewValSCEV = SE->getUnknown(Base); + + // Always emit the immediate into the same block as the user. + NewValSCEV = SE->getAddExpr(NewValSCEV, Imm); - // If we are inserting the base and imm values in the same block, make sure to - // adjust the IP position if insertion reused a result. - if (IP == BaseInsertPt) - IP = Rewriter.getInsertionPoint(); - - // Always emit the immediate (if non-zero) into the same block as the user. - SCEVHandle NewValSCEV = SE->getAddExpr(SE->getUnknown(Base), Imm); return Rewriter.expandCodeFor(NewValSCEV, Ty, IP); } @@ -666,10 +422,11 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, // value of NewBase in the case that it's a diffferent instruction from // the PHI that NewBase is computed from, or null otherwise. // -void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase, +void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase, Instruction *NewBasePt, SCEVExpander &Rewriter, Loop *L, Pass *P, - SmallVectorImpl &DeadInsts){ + LoopInfo &LI, + SmallVectorImpl &DeadInsts) { if (!isa(Inst)) { // By default, insert code at the user instruction. BasicBlock::iterator InsertPt = Inst; @@ -698,13 +455,14 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase, } Value *NewVal = InsertCodeForBaseAtPosition(NewBase, OperandValToReplace->getType(), - Rewriter, InsertPt, L); + Rewriter, InsertPt, L, LI); // Replace the use of the operand Value with the new Phi we just created. Inst->replaceUsesOfWith(OperandValToReplace, NewVal); - DOUT << " Replacing with "; - DEBUG(WriteAsOperand(*DOUT, NewVal, /*PrintType=*/false)); - DOUT << ", which has value " << *NewBase << " plus IMM " << *Imm << "\n"; + DEBUG(errs() << " Replacing with "); + DEBUG(WriteAsOperand(errs(), NewVal, /*PrintType=*/false)); + DEBUG(errs() << ", which has value " << *NewBase << " plus IMM " + << *Imm << "\n"); return; } @@ -725,43 +483,45 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase, // loop because multiple copies sometimes do useful sinking of code in // that case(?). Instruction *OldLoc = dyn_cast(OperandValToReplace); + BasicBlock *PHIPred = PN->getIncomingBlock(i); if (L->contains(OldLoc->getParent())) { // If this is a critical edge, split the edge so that we do not insert // the code on all predecessor/successor paths. We do this unless this // is the canonical backedge for this loop, as this can make some // inserted code be in an illegal position. - BasicBlock *PHIPred = PN->getIncomingBlock(i); if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 && (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) { // First step, split the critical edge. - SplitCriticalEdge(PHIPred, PN->getParent(), P, false); + BasicBlock *NewBB = SplitCriticalEdge(PHIPred, PN->getParent(), + P, false); // Next step: move the basic block. In particular, if the PHI node // is outside of the loop, and PredTI is in the loop, we want to // move the block to be immediately before the PHI block, not // immediately after PredTI. - if (L->contains(PHIPred) && !L->contains(PN->getParent())) { - BasicBlock *NewBB = PN->getIncomingBlock(i); + if (L->contains(PHIPred) && !L->contains(PN->getParent())) NewBB->moveBefore(PN->getParent()); - } // Splitting the edge can reduce the number of PHI entries we have. e = PN->getNumIncomingValues(); + PHIPred = NewBB; + i = PN->getBasicBlockIndex(PHIPred); } } - Value *&Code = InsertedCode[PN->getIncomingBlock(i)]; + Value *&Code = InsertedCode[PHIPred]; if (!Code) { // Insert the code into the end of the predecessor block. Instruction *InsertPt = (L->contains(OldLoc->getParent())) ? - PN->getIncomingBlock(i)->getTerminator() : + PHIPred->getTerminator() : OldLoc->getParent()->getTerminator(); Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(), - Rewriter, InsertPt, L); + Rewriter, InsertPt, L, LI); - DOUT << " Changing PHI use to "; - DEBUG(WriteAsOperand(*DOUT, Code, /*PrintType=*/false)); - DOUT << ", which has value " << *NewBase << " plus IMM " << *Imm << "\n"; + DEBUG(errs() << " Changing PHI use to "); + DEBUG(WriteAsOperand(errs(), Code, /*PrintType=*/false)); + DEBUG(errs() << ", which has value " << *NewBase << " plus IMM " + << *Imm << "\n"); } // Replace the use of the operand Value with the new Phi we just created. @@ -777,7 +537,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase, /// fitsInAddressMode - Return true if V can be subsumed within an addressing /// mode, and does not need to be put in a register first. -static bool fitsInAddressMode(const SCEVHandle &V, const Type *UseTy, +static bool fitsInAddressMode(const SCEV *const &V, const Type *AccessTy, const TargetLowering *TLI, bool HasBaseReg) { if (const SCEVConstant *SC = dyn_cast(V)) { int64_t VC = SC->getValue()->getSExtValue(); @@ -785,7 +545,7 @@ static bool fitsInAddressMode(const SCEVHandle &V, const Type *UseTy, TargetLowering::AddrMode AM; AM.BaseOffs = VC; AM.HasBaseReg = HasBaseReg; - return TLI->isLegalAddressingMode(AM, UseTy); + return TLI->isLegalAddressingMode(AM, AccessTy); } else { // Defaults to PPC. PPC allows a sign-extended 16-bit immediate field. return (VC > -(1 << 16) && VC < (1 << 16)-1); @@ -794,10 +554,14 @@ static bool fitsInAddressMode(const SCEVHandle &V, const Type *UseTy, if (const SCEVUnknown *SU = dyn_cast(V)) if (GlobalValue *GV = dyn_cast(SU->getValue())) { - TargetLowering::AddrMode AM; - AM.BaseGV = GV; - AM.HasBaseReg = HasBaseReg; - return TLI->isLegalAddressingMode(AM, UseTy); + if (TLI) { + TargetLowering::AddrMode AM; + AM.BaseGV = GV; + AM.HasBaseReg = HasBaseReg; + return TLI->isLegalAddressingMode(AM, AccessTy); + } else { + // Default: assume global addresses are not legal. + } } return false; @@ -805,12 +569,12 @@ static bool fitsInAddressMode(const SCEVHandle &V, const Type *UseTy, /// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are /// loop varying to the Imm operand. -static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm, - Loop *L, ScalarEvolution *SE) { +static void MoveLoopVariantsToImmediateField(const SCEV *&Val, const SCEV *&Imm, + Loop *L, ScalarEvolution *SE) { if (Val->isLoopInvariant(L)) return; // Nothing to do. if (const SCEVAddExpr *SAE = dyn_cast(Val)) { - std::vector NewOps; + SmallVector NewOps; NewOps.reserve(SAE->getNumOperands()); for (unsigned i = 0; i != SAE->getNumOperands(); ++i) @@ -828,10 +592,10 @@ static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm, Val = SE->getAddExpr(NewOps); } else if (const SCEVAddRecExpr *SARE = dyn_cast(Val)) { // Try to pull immediates out of the start value of nested addrec's. - SCEVHandle Start = SARE->getStart(); + const SCEV *Start = SARE->getStart(); MoveLoopVariantsToImmediateField(Start, Imm, L, SE); - std::vector Ops(SARE->op_begin(), SARE->op_end()); + SmallVector Ops(SARE->op_begin(), SARE->op_end()); Ops[0] = Start; Val = SE->getAddRecExpr(Ops, SARE->getLoop()); } else { @@ -846,17 +610,17 @@ static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm, /// that can fit into the immediate field of instructions in the target. /// Accumulate these immediate values into the Imm value. static void MoveImmediateValues(const TargetLowering *TLI, - const Type *UseTy, - SCEVHandle &Val, SCEVHandle &Imm, + const Type *AccessTy, + const SCEV *&Val, const SCEV *&Imm, bool isAddress, Loop *L, ScalarEvolution *SE) { if (const SCEVAddExpr *SAE = dyn_cast(Val)) { - std::vector NewOps; + SmallVector NewOps; NewOps.reserve(SAE->getNumOperands()); for (unsigned i = 0; i != SAE->getNumOperands(); ++i) { - SCEVHandle NewOp = SAE->getOperand(i); - MoveImmediateValues(TLI, UseTy, NewOp, Imm, isAddress, L, SE); + const SCEV *NewOp = SAE->getOperand(i); + MoveImmediateValues(TLI, AccessTy, NewOp, Imm, isAddress, L, SE); if (!NewOp->isLoopInvariant(L)) { // If this is a loop-variant expression, it must stay in the immediate @@ -874,23 +638,24 @@ static void MoveImmediateValues(const TargetLowering *TLI, return; } else if (const SCEVAddRecExpr *SARE = dyn_cast(Val)) { // Try to pull immediates out of the start value of nested addrec's. - SCEVHandle Start = SARE->getStart(); - MoveImmediateValues(TLI, UseTy, Start, Imm, isAddress, L, SE); + const SCEV *Start = SARE->getStart(); + MoveImmediateValues(TLI, AccessTy, Start, Imm, isAddress, L, SE); if (Start != SARE->getStart()) { - std::vector Ops(SARE->op_begin(), SARE->op_end()); + SmallVector Ops(SARE->op_begin(), SARE->op_end()); Ops[0] = Start; Val = SE->getAddRecExpr(Ops, SARE->getLoop()); } return; } else if (const SCEVMulExpr *SME = dyn_cast(Val)) { // Transform "8 * (4 + v)" -> "32 + 8*V" if "32" fits in the immed field. - if (isAddress && fitsInAddressMode(SME->getOperand(0), UseTy, TLI, false) && + if (isAddress && + fitsInAddressMode(SME->getOperand(0), AccessTy, TLI, false) && SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) { - SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType()); - SCEVHandle NewOp = SME->getOperand(1); - MoveImmediateValues(TLI, UseTy, NewOp, SubImm, isAddress, L, SE); + const SCEV *SubImm = SE->getIntegerSCEV(0, Val->getType()); + const SCEV *NewOp = SME->getOperand(1); + MoveImmediateValues(TLI, AccessTy, NewOp, SubImm, isAddress, L, SE); // If we extracted something out of the subexpressions, see if we can // simplify this! @@ -898,7 +663,7 @@ static void MoveImmediateValues(const TargetLowering *TLI, // Scale SubImm up by "8". If the result is a target constant, we are // good. SubImm = SE->getMulExpr(SubImm, SME->getOperand(0)); - if (fitsInAddressMode(SubImm, UseTy, TLI, false)) { + if (fitsInAddressMode(SubImm, AccessTy, TLI, false)) { // Accumulate the immediate. Imm = SE->getAddExpr(Imm, SubImm); @@ -912,7 +677,7 @@ static void MoveImmediateValues(const TargetLowering *TLI, // Loop-variant expressions must stay in the immediate field of the // expression. - if ((isAddress && fitsInAddressMode(Val, UseTy, TLI, false)) || + if ((isAddress && fitsInAddressMode(Val, AccessTy, TLI, false)) || !Val->isLoopInvariant(L)) { Imm = SE->getAddExpr(Imm, Val); Val = SE->getIntegerSCEV(0, Val->getType()); @@ -924,29 +689,29 @@ static void MoveImmediateValues(const TargetLowering *TLI, static void MoveImmediateValues(const TargetLowering *TLI, Instruction *User, - SCEVHandle &Val, SCEVHandle &Imm, + const SCEV *&Val, const SCEV *&Imm, bool isAddress, Loop *L, ScalarEvolution *SE) { - const Type *UseTy = getAccessType(User); - MoveImmediateValues(TLI, UseTy, Val, Imm, isAddress, L, SE); + const Type *AccessTy = getAccessType(User); + MoveImmediateValues(TLI, AccessTy, Val, Imm, isAddress, L, SE); } /// SeparateSubExprs - Decompose Expr into all of the subexpressions that are /// added together. This is used to reassociate common addition subexprs /// together for maximal sharing when rewriting bases. -static void SeparateSubExprs(std::vector &SubExprs, - SCEVHandle Expr, +static void SeparateSubExprs(SmallVector &SubExprs, + const SCEV *Expr, ScalarEvolution *SE) { if (const SCEVAddExpr *AE = dyn_cast(Expr)) { for (unsigned j = 0, e = AE->getNumOperands(); j != e; ++j) SeparateSubExprs(SubExprs, AE->getOperand(j), SE); } else if (const SCEVAddRecExpr *SARE = dyn_cast(Expr)) { - SCEVHandle Zero = SE->getIntegerSCEV(0, Expr->getType()); + const SCEV *Zero = SE->getIntegerSCEV(0, Expr->getType()); if (SARE->getOperand(0) == Zero) { SubExprs.push_back(Expr); } else { // Compute the addrec with zero as its base. - std::vector Ops(SARE->op_begin(), SARE->op_end()); + SmallVector Ops(SARE->op_begin(), SARE->op_end()); Ops[0] = Zero; // Start with zero base. SubExprs.push_back(SE->getAddRecExpr(Ops, SARE->getLoop())); @@ -970,7 +735,7 @@ struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; }; /// not remove anything. This looks for things like (a+b+c) and /// (a+c+d) and computes the common (a+c) subexpression. The common expression /// is *removed* from the Bases and returned. -static SCEVHandle +static const SCEV * RemoveCommonExpressionsFromUseBases(std::vector &Uses, ScalarEvolution *SE, Loop *L, const TargetLowering *TLI) { @@ -978,9 +743,9 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, // Only one use? This is a very common case, so we handle it specially and // cheaply. - SCEVHandle Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType()); - SCEVHandle Result = Zero; - SCEVHandle FreeResult = Zero; + const SCEV *Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType()); + const SCEV *Result = Zero; + const SCEV *FreeResult = Zero; if (NumUses == 1) { // If the use is inside the loop, use its base, regardless of what it is: // it is clearly shared across all the IV's. If the use is outside the loop @@ -996,13 +761,13 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, // Also track whether all uses of each expression can be moved into an // an addressing mode "for free"; such expressions are left within the loop. // struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; }; - std::map SubExpressionUseData; + std::map SubExpressionUseData; // UniqueSubExprs - Keep track of all of the subexpressions we see in the // order we see them. - std::vector UniqueSubExprs; + SmallVector UniqueSubExprs; - std::vector SubExprs; + SmallVector SubExprs; unsigned NumUsesInsideLoop = 0; for (unsigned i = 0; i != NumUses; ++i) { // If the user is outside the loop, just ignore it for base computation. @@ -1022,11 +787,11 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, // If this use is as an address we may be able to put CSEs in the addressing // mode rather than hoisting them. bool isAddrUse = isAddressUse(Uses[i].Inst, Uses[i].OperandValToReplace); - // We may need the UseTy below, but only when isAddrUse, so compute it + // We may need the AccessTy below, but only when isAddrUse, so compute it // only in that case. - const Type *UseTy = 0; + const Type *AccessTy = 0; if (isAddrUse) - UseTy = getAccessType(Uses[i].Inst); + AccessTy = getAccessType(Uses[i].Inst); // Split the expression into subexprs. SeparateSubExprs(SubExprs, Uses[i].Base, SE); @@ -1037,7 +802,7 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) { if (++SubExpressionUseData[SubExprs[j]].Count == 1) UniqueSubExprs.push_back(SubExprs[j]); - if (!isAddrUse || !fitsInAddressMode(SubExprs[j], UseTy, TLI, false)) + if (!isAddrUse || !fitsInAddressMode(SubExprs[j], AccessTy, TLI, false)) SubExpressionUseData[SubExprs[j]].notAllUsesAreFree = true; } SubExprs.clear(); @@ -1046,7 +811,7 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, // Now that we know how many times each is used, build Result. Iterate over // UniqueSubexprs so that we have a stable ordering. for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) { - std::map::iterator I = + std::map::iterator I = SubExpressionUseData.find(UniqueSubExprs[i]); assert(I != SubExpressionUseData.end() && "Entry not found?"); if (I->second.Count == NumUsesInsideLoop) { // Found CSE! @@ -1071,8 +836,8 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, continue; // We know this is an addressing mode use; if there are any uses that // are not, FreeResult would be Zero. - const Type *UseTy = getAccessType(Uses[i].Inst); - if (!fitsInAddressMode(FreeResult, UseTy, TLI, Result!=Zero)) { + const Type *AccessTy = getAccessType(Uses[i].Inst); + if (!fitsInAddressMode(FreeResult, AccessTy, TLI, Result!=Zero)) { // FIXME: could split up FreeResult into pieces here, some hoisted // and some not. There is no obvious advantage to this. Result = SE->getAddExpr(Result, FreeResult); @@ -1090,7 +855,7 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, if (FreeResult != Zero) { SeparateSubExprs(SubExprs, FreeResult, SE); for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) { - std::map::iterator I = + std::map::iterator I = SubExpressionUseData.find(SubExprs[j]); SubExpressionUseData.erase(I); } @@ -1129,18 +894,18 @@ RemoveCommonExpressionsFromUseBases(std::vector &Uses, return Result; } -/// ValidStride - Check whether the given Scale is valid for all loads and +/// ValidScale - Check whether the given Scale is valid for all loads and /// stores in UsersToProcess. /// -bool LoopStrengthReduce::ValidStride(bool HasBaseReg, - int64_t Scale, +bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale, const std::vector& UsersToProcess) { if (!TLI) return true; - for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) { + for (unsigned i = 0, e = UsersToProcess.size(); i!=e; ++i) { // If this is a load or other access, pass the type of the access in. - const Type *AccessTy = Type::VoidTy; + const Type *AccessTy = + Type::getVoidTy(UsersToProcess[i].Inst->getContext()); if (isAddressUse(UsersToProcess[i].Inst, UsersToProcess[i].OperandValToReplace)) AccessTy = getAccessType(UsersToProcess[i].Inst); @@ -1160,13 +925,49 @@ bool LoopStrengthReduce::ValidStride(bool HasBaseReg, return true; } +/// ValidOffset - Check whether the given Offset is valid for all loads and +/// stores in UsersToProcess. +/// +bool LoopStrengthReduce::ValidOffset(bool HasBaseReg, + int64_t Offset, + int64_t Scale, + const std::vector& UsersToProcess) { + if (!TLI) + return true; + + for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) { + // If this is a load or other access, pass the type of the access in. + const Type *AccessTy = + Type::getVoidTy(UsersToProcess[i].Inst->getContext()); + if (isAddressUse(UsersToProcess[i].Inst, + UsersToProcess[i].OperandValToReplace)) + AccessTy = getAccessType(UsersToProcess[i].Inst); + else if (isa(UsersToProcess[i].Inst)) + continue; + + TargetLowering::AddrMode AM; + if (const SCEVConstant *SC = dyn_cast(UsersToProcess[i].Imm)) + AM.BaseOffs = SC->getValue()->getSExtValue(); + AM.BaseOffs = (uint64_t)AM.BaseOffs + (uint64_t)Offset; + AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero(); + AM.Scale = Scale; + + // If load[imm+r*scale] is illegal, bail out. + if (!TLI->isLegalAddressingMode(AM, AccessTy)) + return false; + } + return true; +} + /// RequiresTypeConversion - Returns true if converting Ty1 to Ty2 is not /// a nop. bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1, const Type *Ty2) { if (Ty1 == Ty2) return false; - if (SE->getEffectiveSCEVType(Ty1) == SE->getEffectiveSCEVType(Ty2)) + Ty1 = SE->getEffectiveSCEVType(Ty1); + Ty2 = SE->getEffectiveSCEVType(Ty2); + if (Ty1 == Ty2) return false; if (Ty1->canLosslesslyBitCastTo(Ty2)) return false; @@ -1185,23 +986,27 @@ bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1, /// be folded into the addressing mode, nor even that the factor be constant; /// a multiply (executed once) outside the loop is better than another IV /// within. Well, usually. -SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, +const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, bool AllUsesAreAddresses, bool AllUsesAreOutsideLoop, - const SCEVHandle &Stride, + const SCEV *const &Stride, IVExpr &IV, const Type *Ty, const std::vector& UsersToProcess) { + if (StrideNoReuse.count(Stride)) + return SE->getIntegerSCEV(0, Stride->getType()); + if (const SCEVConstant *SC = dyn_cast(Stride)) { int64_t SInt = SC->getValue()->getSExtValue(); - for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e; - ++NewStride) { - std::map::iterator SI = - IVsByStride.find(StrideOrder[NewStride]); - if (SI == IVsByStride.end() || !isa(SI->first)) + for (unsigned NewStride = 0, e = IU->StrideOrder.size(); + NewStride != e; ++NewStride) { + std::map::iterator SI = + IVsByStride.find(IU->StrideOrder[NewStride]); + if (SI == IVsByStride.end() || !isa(SI->first) || + StrideNoReuse.count(SI->first)) continue; int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); if (SI->first != Stride && - (unsigned(abs(SInt)) < SSInt || (SInt % SSInt) != 0)) + (unsigned(abs64(SInt)) < SSInt || (SInt % SSInt) != 0)) continue; int64_t Scale = SInt / SSInt; // Check that this stride is valid for all the types used for loads and @@ -1211,24 +1016,44 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, // multiplications. if (Scale == 1 || (AllUsesAreAddresses && - ValidStride(HasBaseReg, Scale, UsersToProcess))) + ValidScale(HasBaseReg, Scale, UsersToProcess))) { + // Prefer to reuse an IV with a base of zero. for (std::vector::iterator II = SI->second.IVs.begin(), IE = SI->second.IVs.end(); II != IE; ++II) - // FIXME: Only handle base == 0 for now. - // Only reuse previous IV if it would not require a type conversion. + // Only reuse previous IV if it would not require a type conversion + // and if the base difference can be folded. if (II->Base->isZero() && !RequiresTypeConversion(II->Base->getType(), Ty)) { IV = *II; return SE->getIntegerSCEV(Scale, Stride->getType()); } + // Otherwise, settle for an IV with a foldable base. + if (AllUsesAreAddresses) + for (std::vector::iterator II = SI->second.IVs.begin(), + IE = SI->second.IVs.end(); II != IE; ++II) + // Only reuse previous IV if it would not require a type conversion + // and if the base difference can be folded. + if (SE->getEffectiveSCEVType(II->Base->getType()) == + SE->getEffectiveSCEVType(Ty) && + isa(II->Base)) { + int64_t Base = + cast(II->Base)->getValue()->getSExtValue(); + if (Base > INT32_MIN && Base <= INT32_MAX && + ValidOffset(HasBaseReg, -Base * Scale, + Scale, UsersToProcess)) { + IV = *II; + return SE->getIntegerSCEV(Scale, Stride->getType()); + } + } + } } } else if (AllUsesAreOutsideLoop) { // Accept nonconstant strides here; it is really really right to substitute // an existing IV if we can. - for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e; - ++NewStride) { - std::map::iterator SI = - IVsByStride.find(StrideOrder[NewStride]); + for (unsigned NewStride = 0, e = IU->StrideOrder.size(); + NewStride != e; ++NewStride) { + std::map::iterator SI = + IVsByStride.find(IU->StrideOrder[NewStride]); if (SI == IVsByStride.end() || !isa(SI->first)) continue; int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); @@ -1245,10 +1070,10 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, } // Special case, old IV is -1*x and this one is x. Can treat this one as // -1*old. - for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e; - ++NewStride) { - std::map::iterator SI = - IVsByStride.find(StrideOrder[NewStride]); + for (unsigned NewStride = 0, e = IU->StrideOrder.size(); + NewStride != e; ++NewStride) { + std::map::iterator SI = + IVsByStride.find(IU->StrideOrder[NewStride]); if (SI == IVsByStride.end()) continue; if (const SCEVMulExpr *ME = dyn_cast(SI->first)) @@ -1276,7 +1101,7 @@ static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) { /// isNonConstantNegative - Return true if the specified scev is negated, but /// not a constant. -static bool isNonConstantNegative(const SCEVHandle &Expr) { +static bool isNonConstantNegative(const SCEV *const &Expr) { const SCEVMulExpr *Mul = dyn_cast(Expr); if (!Mul) return false; @@ -1288,26 +1113,31 @@ static bool isNonConstantNegative(const SCEVHandle &Expr) { return SC->getValue()->getValue().isNegative(); } -// CollectIVUsers - Transform our list of users and offsets to a bit more -// complex table. In this new vector, each 'BasedUser' contains 'Base', the base -// of the strided accesses, as well as the old information from Uses. We -// progressively move information from the Base field to the Imm field, until -// we eventually have the full access expression to rewrite the use. -SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride, +/// CollectIVUsers - Transform our list of users and offsets to a bit more +/// complex table. In this new vector, each 'BasedUser' contains 'Base', the base +/// of the strided accesses, as well as the old information from Uses. We +/// progressively move information from the Base field to the Imm field, until +/// we eventually have the full access expression to rewrite the use. +const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *const &Stride, IVUsersOfOneStride &Uses, Loop *L, bool &AllUsesAreAddresses, bool &AllUsesAreOutsideLoop, std::vector &UsersToProcess) { + // FIXME: Generalize to non-affine IV's. + if (!Stride->isLoopInvariant(L)) + return SE->getIntegerSCEV(0, Stride->getType()); + UsersToProcess.reserve(Uses.Users.size()); - for (unsigned i = 0, e = Uses.Users.size(); i != e; ++i) { - UsersToProcess.push_back(BasedUser(Uses.Users[i], SE)); - + for (ilist::iterator I = Uses.Users.begin(), + E = Uses.Users.end(); I != E; ++I) { + UsersToProcess.push_back(BasedUser(*I, SE)); + // Move any loop variant operands from the offset field to the immediate // field of the use, so that we don't try to use something before it is // computed. MoveLoopVariantsToImmediateField(UsersToProcess.back().Base, - UsersToProcess.back().Imm, L, SE); + UsersToProcess.back().Imm, L, SE); assert(UsersToProcess.back().Base->isLoopInvariant(L) && "Base value is not loop invariant!"); } @@ -1319,7 +1149,7 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride, // for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find // "A+B"), emit it to the preheader, then remove the expression from the // UsersToProcess base values. - SCEVHandle CommonExprs = + const SCEV *CommonExprs = RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L, TLI); // Next, figure out what we can represent in the immediate fields of @@ -1385,7 +1215,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode( const std::vector &UsersToProcess, const Loop *L, bool AllUsesAreAddresses, - SCEVHandle Stride) { + const SCEV *Stride) { if (!EnableFullLSRMode) return false; @@ -1400,14 +1230,14 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode( // TODO: For now, don't do full strength reduction if there could // potentially be greater-stride multiples of the current stride // which could reuse the current stride IV. - if (StrideOrder.back() != Stride) + if (IU->StrideOrder.back() != Stride) return false; // Iterate through the uses to find conditions that automatically rule out // full-lsr mode. for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) { - SCEV *Base = UsersToProcess[i].Base; - SCEV *Imm = UsersToProcess[i].Imm; + const SCEV *Base = UsersToProcess[i].Base; + const SCEV *Imm = UsersToProcess[i].Imm; // If any users have a loop-variant component, they can't be fully // strength-reduced. if (Imm && !Imm->isLoopInvariant(L)) @@ -1416,16 +1246,16 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode( // the two Imm values can't be folded into the address, full // strength reduction would increase register pressure. do { - SCEV *CurImm = UsersToProcess[i].Imm; + const SCEV *CurImm = UsersToProcess[i].Imm; if ((CurImm || Imm) && CurImm != Imm) { if (!CurImm) CurImm = SE->getIntegerSCEV(0, Stride->getType()); if (!Imm) Imm = SE->getIntegerSCEV(0, Stride->getType()); const Instruction *Inst = UsersToProcess[i].Inst; - const Type *UseTy = getAccessType(Inst); - SCEVHandle Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm); + const Type *AccessTy = getAccessType(Inst); + const SCEV *Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm); if (!Diff->isZero() && (!AllUsesAreAddresses || - !fitsInAddressMode(Diff, UseTy, TLI, /*HasBaseReg=*/true))) + !fitsInAddressMode(Diff, AccessTy, TLI, /*HasBaseReg=*/true))) return false; } } while (++i != e && Base == UsersToProcess[i].Base); @@ -1456,7 +1286,8 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode( /// /// Return the created phi node. /// -static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step, +static PHINode *InsertAffinePhi(const SCEV *Start, const SCEV *Step, + Instruction *IVIncInsertPt, const Loop *L, SCEVExpander &Rewriter) { assert(Start->isLoopInvariant(L) && "New PHI start is not loop invariant!"); @@ -1475,21 +1306,22 @@ static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step, // If the stride is negative, insert a sub instead of an add for the // increment. bool isNegative = isNonConstantNegative(Step); - SCEVHandle IncAmount = Step; + const SCEV *IncAmount = Step; if (isNegative) IncAmount = Rewriter.SE.getNegativeSCEV(Step); // Insert an add instruction right before the terminator corresponding - // to the back-edge. + // to the back-edge or just before the only use. The location is determined + // by the caller and passed in as IVIncInsertPt. Value *StepV = Rewriter.expandCodeFor(IncAmount, Ty, Preheader->getTerminator()); Instruction *IncV; if (isNegative) { IncV = BinaryOperator::CreateSub(PN, StepV, "lsr.iv.next", - LatchBlock->getTerminator()); + IVIncInsertPt); } else { IncV = BinaryOperator::CreateAdd(PN, StepV, "lsr.iv.next", - LatchBlock->getTerminator()); + IVIncInsertPt); } if (!isa(StepV)) ++NumVariable; @@ -1513,13 +1345,13 @@ static void SortUsersToProcess(std::vector &UsersToProcess) { // loop before users outside of the loop with a particular base. // // We would like to use stable_sort here, but we can't. The problem is that - // SCEVHandle's don't have a deterministic ordering w.r.t to each other, so + // const SCEV *'s don't have a deterministic ordering w.r.t to each other, so // we don't have anything to do a '<' comparison on. Because we think the // number of uses is small, do a horrible bubble sort which just relies on // ==. for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) { // Get a base value. - SCEVHandle Base = UsersToProcess[i].Base; + const SCEV *Base = UsersToProcess[i].Base; // Compact everything with this base to be consecutive with this one. for (unsigned j = i+1; j != e; ++j) { @@ -1538,22 +1370,23 @@ static void SortUsersToProcess(std::vector &UsersToProcess) { void LoopStrengthReduce::PrepareToStrengthReduceFully( std::vector &UsersToProcess, - SCEVHandle Stride, - SCEVHandle CommonExprs, + const SCEV *Stride, + const SCEV *CommonExprs, const Loop *L, SCEVExpander &PreheaderRewriter) { - DOUT << " Fully reducing all users\n"; + DEBUG(errs() << " Fully reducing all users\n"); // Rewrite the UsersToProcess records, creating a separate PHI for each // unique Base value. + Instruction *IVIncInsertPt = L->getLoopLatch()->getTerminator(); for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) { // TODO: The uses are grouped by base, but not sorted. We arbitrarily // pick the first Imm value here to start with, and adjust it for the // other uses. - SCEVHandle Imm = UsersToProcess[i].Imm; - SCEVHandle Base = UsersToProcess[i].Base; - SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm); - PHINode *Phi = InsertAffinePhi(Start, Stride, L, + const SCEV *Imm = UsersToProcess[i].Imm; + const SCEV *Base = UsersToProcess[i].Base; + const SCEV *Start = SE->getAddExpr(CommonExprs, Base, Imm); + PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L, PreheaderRewriter); // Loop over all the users with the same base. do { @@ -1566,21 +1399,34 @@ LoopStrengthReduce::PrepareToStrengthReduceFully( } } +/// FindIVIncInsertPt - Return the location to insert the increment instruction. +/// If the only use if a use of postinc value, (must be the loop termination +/// condition), then insert it just before the use. +static Instruction *FindIVIncInsertPt(std::vector &UsersToProcess, + const Loop *L) { + if (UsersToProcess.size() == 1 && + UsersToProcess[0].isUseOfPostIncrementedValue && + L->contains(UsersToProcess[0].Inst->getParent())) + return UsersToProcess[0].Inst; + return L->getLoopLatch()->getTerminator(); +} + /// PrepareToStrengthReduceWithNewPhi - Insert a new induction variable for the /// given users to share. /// void LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi( std::vector &UsersToProcess, - SCEVHandle Stride, - SCEVHandle CommonExprs, + const SCEV *Stride, + const SCEV *CommonExprs, Value *CommonBaseV, + Instruction *IVIncInsertPt, const Loop *L, SCEVExpander &PreheaderRewriter) { - DOUT << " Inserting new PHI:\n"; + DEBUG(errs() << " Inserting new PHI:\n"); PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV), - Stride, L, + Stride, IVIncInsertPt, L, PreheaderRewriter); // Remember this in case a later stride is multiple of this. @@ -1590,13 +1436,13 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi( for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) UsersToProcess[i].Phi = Phi; - DOUT << " IV="; - DEBUG(WriteAsOperand(*DOUT, Phi, /*PrintType=*/false)); - DOUT << "\n"; + DEBUG(errs() << " IV="); + DEBUG(WriteAsOperand(errs(), Phi, /*PrintType=*/false)); + DEBUG(errs() << "\n"); } -/// PrepareToStrengthReduceWithNewPhi - Prepare for the given users to reuse -/// an induction variable with a stride that is a factor of the current +/// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to +/// reuse an induction variable with a stride that is a factor of the current /// induction variable. /// void @@ -1605,8 +1451,8 @@ LoopStrengthReduce::PrepareToStrengthReduceFromSmallerStride( Value *CommonBaseV, const IVExpr &ReuseIV, Instruction *PreInsertPt) { - DOUT << " Rewriting in terms of existing IV of STRIDE " << *ReuseIV.Stride - << " and BASE " << *ReuseIV.Base << "\n"; + DEBUG(errs() << " Rewriting in terms of existing IV of STRIDE " + << *ReuseIV.Stride << " and BASE " << *ReuseIV.Base << "\n"); // All the users will share the reused IV. for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) @@ -1648,7 +1494,7 @@ static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset, /// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single /// stride of IV. All of the users may have different starting values, and this /// may not be the only stride. -void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, +void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride, IVUsersOfOneStride &Uses, Loop *L) { // If all the users are moved to another stride, then there is nothing to do. @@ -1671,7 +1517,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, // move information from the Base field to the Imm field, until we eventually // have the full access expression to rewrite the use. std::vector UsersToProcess; - SCEVHandle CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses, + const SCEV *CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses, AllUsesAreOutsideLoop, UsersToProcess); @@ -1689,9 +1535,11 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, // If all uses are addresses, consider sinking the immediate part of the // common expression back into uses if they can fit in the immediate fields. if (TLI && HaveCommonExprs && AllUsesAreAddresses) { - SCEVHandle NewCommon = CommonExprs; - SCEVHandle Imm = SE->getIntegerSCEV(0, ReplacedTy); - MoveImmediateValues(TLI, Type::VoidTy, NewCommon, Imm, true, L, SE); + const SCEV *NewCommon = CommonExprs; + const SCEV *Imm = SE->getIntegerSCEV(0, ReplacedTy); + MoveImmediateValues(TLI, Type::getVoidTy( + L->getLoopPreheader()->getContext()), + NewCommon, Imm, true, L, SE); if (!Imm->isZero()) { bool DoSink = true; @@ -1706,11 +1554,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, if (GV || Offset) // Pass VoidTy as the AccessTy to be conservative, because // there could be multiple access types among all the uses. - DoSink = IsImmFoldedIntoAddrMode(GV, Offset, Type::VoidTy, + DoSink = IsImmFoldedIntoAddrMode(GV, Offset, + Type::getVoidTy(L->getLoopPreheader()->getContext()), UsersToProcess, TLI); if (DoSink) { - DOUT << " Sinking " << *Imm << " back down into uses\n"; + DEBUG(errs() << " Sinking " << *Imm << " back down into uses\n"); for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, Imm); CommonExprs = NewCommon; @@ -1722,22 +1571,25 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, // Now that we know what we need to do, insert the PHI node itself. // - DOUT << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE " - << *Stride << ":\n" - << " Common base: " << *CommonExprs << "\n"; + DEBUG(errs() << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE " + << *Stride << ":\n" + << " Common base: " << *CommonExprs << "\n"); - SCEVExpander Rewriter(*SE, *LI); - SCEVExpander PreheaderRewriter(*SE, *LI); + SCEVExpander Rewriter(*SE); + SCEVExpander PreheaderRewriter(*SE); BasicBlock *Preheader = L->getLoopPreheader(); Instruction *PreInsertPt = Preheader->getTerminator(); BasicBlock *LatchBlock = L->getLoopLatch(); + Instruction *IVIncInsertPt = LatchBlock->getTerminator(); Value *CommonBaseV = Constant::getNullValue(ReplacedTy); - SCEVHandle RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy); - IVExpr ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty), - SE->getIntegerSCEV(0, Type::Int32Ty), + const SCEV *RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy); + IVExpr ReuseIV(SE->getIntegerSCEV(0, + Type::getInt32Ty(Preheader->getContext())), + SE->getIntegerSCEV(0, + Type::getInt32Ty(Preheader->getContext())), 0); /// Choose a strength-reduction strategy and prepare for it by creating @@ -1751,47 +1603,49 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, CommonBaseV = PreheaderRewriter.expandCodeFor(CommonExprs, ReplacedTy, PreInsertPt); - // If all uses are addresses, check if it is possible to reuse an IV with a - // stride that is a factor of this stride. And that the multiple is a number - // that can be encoded in the scale field of the target addressing mode. And - // that we will have a valid instruction after this substition, including - // the immediate field, if any. + // If all uses are addresses, check if it is possible to reuse an IV. The + // new IV must have a stride that is a multiple of the old stride; the + // multiple must be a number that can be encoded in the scale field of the + // target addressing mode; and we must have a valid instruction after this + // substitution, including the immediate field, if any. RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses, AllUsesAreOutsideLoop, Stride, ReuseIV, ReplacedTy, UsersToProcess); - if (isa(RewriteFactor) && - cast(RewriteFactor)->isZero()) - PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs, - CommonBaseV, L, PreheaderRewriter); - else + if (!RewriteFactor->isZero()) PrepareToStrengthReduceFromSmallerStride(UsersToProcess, CommonBaseV, ReuseIV, PreInsertPt); + else { + IVIncInsertPt = FindIVIncInsertPt(UsersToProcess, L); + PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs, + CommonBaseV, IVIncInsertPt, + L, PreheaderRewriter); + } } // Process all the users now, replacing their strided uses with // strength-reduced forms. This outer loop handles all bases, the inner // loop handles all users of a particular base. while (!UsersToProcess.empty()) { - SCEVHandle Base = UsersToProcess.back().Base; + const SCEV *Base = UsersToProcess.back().Base; Instruction *Inst = UsersToProcess.back().Inst; // Emit the code for Base into the preheader. Value *BaseV = 0; if (!Base->isZero()) { - BaseV = PreheaderRewriter.expandCodeFor(Base, Base->getType(), - PreInsertPt); + BaseV = PreheaderRewriter.expandCodeFor(Base, 0, PreInsertPt); - DOUT << " INSERTING code for BASE = " << *Base << ":"; + DEBUG(errs() << " INSERTING code for BASE = " << *Base << ":"); if (BaseV->hasName()) - DOUT << " Result value name = %" << BaseV->getNameStr(); - DOUT << "\n"; + DEBUG(errs() << " Result value name = %" << BaseV->getName()); + DEBUG(errs() << "\n"); // If BaseV is a non-zero constant, make sure that it gets inserted into // the preheader, instead of being forward substituted into the uses. We // do this by forcing a BitCast (noop cast) to be inserted into the // preheader in this case. - if (!fitsInAddressMode(Base, getAccessType(Inst), TLI, false)) { + if (!fitsInAddressMode(Base, getAccessType(Inst), TLI, false) && + isa(BaseV)) { // We want this constant emitted into the preheader! This is just // using cast as a copy so BitCast (no-op cast) is appropriate BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert", @@ -1805,27 +1659,33 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, // FIXME: Use emitted users to emit other users. BasedUser &User = UsersToProcess.back(); - DOUT << " Examining use "; - DEBUG(WriteAsOperand(*DOUT, UsersToProcess.back().OperandValToReplace, + DEBUG(errs() << " Examining "); + if (User.isUseOfPostIncrementedValue) + DEBUG(errs() << "postinc"); + else + DEBUG(errs() << "preinc"); + DEBUG(errs() << " use "); + DEBUG(WriteAsOperand(errs(), UsersToProcess.back().OperandValToReplace, /*PrintType=*/false)); - DOUT << " in Inst: " << *Inst; + DEBUG(errs() << " in Inst: " << *User.Inst); // If this instruction wants to use the post-incremented value, move it // after the post-inc and use its value instead of the PHI. Value *RewriteOp = User.Phi; if (User.isUseOfPostIncrementedValue) { RewriteOp = User.Phi->getIncomingValueForBlock(LatchBlock); - // If this user is in the loop, make sure it is the last thing in the - // loop to ensure it is dominated by the increment. - if (L->contains(User.Inst->getParent())) - User.Inst->moveBefore(LatchBlock->getTerminator()); + // loop to ensure it is dominated by the increment. In case it's the + // only use of the iv, the increment instruction is already before the + // use. + if (L->contains(User.Inst->getParent()) && User.Inst != IVIncInsertPt) + User.Inst->moveBefore(IVIncInsertPt); } - SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp); + const SCEV *RewriteExpr = SE->getUnknown(RewriteOp); - if (SE->getTypeSizeInBits(RewriteOp->getType()) != - SE->getTypeSizeInBits(ReplacedTy)) { + if (SE->getEffectiveSCEVType(RewriteOp->getType()) != + SE->getEffectiveSCEVType(ReplacedTy)) { assert(SE->getTypeSizeInBits(RewriteOp->getType()) > SE->getTypeSizeInBits(ReplacedTy) && "Unexpected widening cast!"); @@ -1854,9 +1714,9 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, // The base has been used to initialize the PHI node but we don't want // it here. if (!ReuseIV.Base->isZero()) { - SCEVHandle typedBase = ReuseIV.Base; - if (SE->getTypeSizeInBits(RewriteExpr->getType()) != - SE->getTypeSizeInBits(ReuseIV.Base->getType())) { + const SCEV *typedBase = ReuseIV.Base; + if (SE->getEffectiveSCEVType(RewriteExpr->getType()) != + SE->getEffectiveSCEVType(ReuseIV.Base->getType())) { // It's possible the original IV is a larger type than the new IV, // in which case we have to truncate the Base. We checked in // RequiresTypeConversion that this is valid. @@ -1895,12 +1755,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(BaseV)); User.RewriteInstructionToUseNewBase(RewriteExpr, NewBasePt, - Rewriter, L, this, + Rewriter, L, this, *LI, DeadInsts); // Mark old value we replaced as possibly dead, so that it is eliminated // if we just replaced the last use of that value. - DeadInsts.push_back(cast(User.OperandValToReplace)); + DeadInsts.push_back(User.OperandValToReplace); UsersToProcess.pop_back(); ++NumReduced; @@ -1919,20 +1779,20 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride, /// set the IV user and stride information and return true, otherwise return /// false. bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, - const SCEVHandle *&CondStride) { - for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e && !CondUse; - ++Stride) { - std::map::iterator SI = - IVUsesByStride.find(StrideOrder[Stride]); - assert(SI != IVUsesByStride.end() && "Stride doesn't exist!"); - - for (std::vector::iterator UI = SI->second.Users.begin(), - E = SI->second.Users.end(); UI != E; ++UI) - if (UI->User == Cond) { + const SCEV *const * &CondStride) { + for (unsigned Stride = 0, e = IU->StrideOrder.size(); + Stride != e && !CondUse; ++Stride) { + std::map::iterator SI = + IU->IVUsesByStride.find(IU->StrideOrder[Stride]); + assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); + + for (ilist::iterator UI = SI->second->Users.begin(), + E = SI->second->Users.end(); UI != E; ++UI) + if (UI->getUser() == Cond) { // NOTE: we could handle setcc instructions with multiple uses here, but // InstCombine does it as well for simple uses, it's not clear that it // occurs enough in real life to handle. - CondUse = &*UI; + CondUse = UI; CondStride = &SI->first; return true; } @@ -1949,7 +1809,7 @@ namespace { const ScalarEvolution *SE; explicit StrideCompare(const ScalarEvolution *se) : SE(se) {} - bool operator()(const SCEVHandle &LHS, const SCEVHandle &RHS) { + bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) { const SCEVConstant *LHSC = dyn_cast(LHS); const SCEVConstant *RHSC = dyn_cast(RHS); if (LHSC && RHSC) { @@ -1992,10 +1852,19 @@ namespace { /// if (v1 < 30) goto loop ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, IVStrideUse* &CondUse, - const SCEVHandle* &CondStride) { - if (StrideOrder.size() < 2 || - IVUsesByStride[*CondStride].Users.size() != 1) + const SCEV *const* &CondStride) { + // If there's only one stride in the loop, there's nothing to do here. + if (IU->StrideOrder.size() < 2) + return Cond; + // If there are other users of the condition's stride, don't bother + // trying to change the condition because the stride will still + // remain. + std::map::iterator I = + IU->IVUsesByStride.find(*CondStride); + if (I == IU->IVUsesByStride.end() || + I->second->Users.size() != 1) return Cond; + // Only handle constant strides for now. const SCEVConstant *SC = dyn_cast(*CondStride); if (!SC) return Cond; @@ -2007,11 +1876,11 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, const Type *NewCmpTy = NULL; unsigned TyBits = SE->getTypeSizeInBits(CmpTy); unsigned NewTyBits = 0; - SCEVHandle *NewStride = NULL; + const SCEV **NewStride = NULL; Value *NewCmpLHS = NULL; Value *NewCmpRHS = NULL; int64_t Scale = 1; - SCEVHandle NewOffset = SE->getIntegerSCEV(0, CmpTy); + const SCEV *NewOffset = SE->getIntegerSCEV(0, CmpTy); if (ConstantInt *C = dyn_cast(Cond->getOperand(1))) { int64_t CmpVal = C->getValue().getSExtValue(); @@ -2022,22 +1891,26 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, return Cond; // Look for a suitable stride / iv as replacement. - for (unsigned i = 0, e = StrideOrder.size(); i != e; ++i) { - std::map::iterator SI = - IVUsesByStride.find(StrideOrder[i]); + for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) { + std::map::iterator SI = + IU->IVUsesByStride.find(IU->StrideOrder[i]); if (!isa(SI->first)) continue; int64_t SSInt = cast(SI->first)->getValue()->getSExtValue(); if (SSInt == CmpSSInt || - abs(SSInt) < abs(CmpSSInt) || + abs64(SSInt) < abs64(CmpSSInt) || (SSInt % CmpSSInt) != 0) continue; Scale = SSInt / CmpSSInt; int64_t NewCmpVal = CmpVal * Scale; - APInt Mul = APInt(BitWidth, NewCmpVal); + APInt Mul = APInt(BitWidth*2, CmpVal, true); + Mul = Mul * APInt(BitWidth*2, Scale, true); // Check for overflow. - if (Mul.getSExtValue() != NewCmpVal) + if (!Mul.isSignedIntN(BitWidth)) + continue; + // Check for overflow in the stride's type too. + if (!Mul.isSignedIntN(SE->getTypeSizeInBits(SI->first->getType()))) continue; // Watch out for overflow. @@ -2049,9 +1922,27 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, continue; // Pick the best iv to use trying to avoid a cast. NewCmpLHS = NULL; - for (std::vector::iterator UI = SI->second.Users.begin(), - E = SI->second.Users.end(); UI != E; ++UI) { - NewCmpLHS = UI->OperandValToReplace; + for (ilist::iterator UI = SI->second->Users.begin(), + E = SI->second->Users.end(); UI != E; ++UI) { + Value *Op = UI->getOperandValToReplace(); + + // If the IVStrideUse implies a cast, check for an actual cast which + // can be used to find the original IV expression. + if (SE->getEffectiveSCEVType(Op->getType()) != + SE->getEffectiveSCEVType(SI->first->getType())) { + CastInst *CI = dyn_cast(Op); + // If it's not a simple cast, it's complicated. + if (!CI) + continue; + // If it's a cast from a type other than the stride type, + // it's complicated. + if (CI->getOperand(0)->getType() != SI->first->getType()) + continue; + // Ok, we found the IV expression in the stride's type. + Op = CI->getOperand(0); + } + + NewCmpLHS = Op; if (NewCmpLHS->getType() == CmpTy) break; } @@ -2060,7 +1951,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, NewCmpTy = NewCmpLHS->getType(); NewTyBits = SE->getTypeSizeInBits(NewCmpTy); - const Type *NewCmpIntTy = IntegerType::get(NewTyBits); + const Type *NewCmpIntTy = IntegerType::get(Cond->getContext(), NewTyBits); if (RequiresTypeConversion(NewCmpTy, CmpTy)) { // Check if it is possible to rewrite it using // an iv / stride of a smaller integer type. @@ -2075,13 +1966,13 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, // Don't rewrite if use offset is non-constant and the new type is // of a different type. // FIXME: too conservative? - if (NewTyBits != TyBits && !isa(CondUse->Offset)) + if (NewTyBits != TyBits && !isa(CondUse->getOffset())) continue; bool AllUsesAreAddresses = true; bool AllUsesAreOutsideLoop = true; std::vector UsersToProcess; - SCEVHandle CommonExprs = CollectIVUsers(SI->first, SI->second, L, + const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L, AllUsesAreAddresses, AllUsesAreOutsideLoop, UsersToProcess); @@ -2089,7 +1980,13 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, // if it's likely the new stride uses will be rewritten using the // stride of the compare instruction. if (AllUsesAreAddresses && - ValidStride(!CommonExprs->isZero(), Scale, UsersToProcess)) + ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) + continue; + + // Avoid rewriting the compare instruction with an iv which has + // implicit extension or truncation built into it. + // TODO: This is over-conservative. + if (SE->getTypeSizeInBits(CondUse->getOffset()->getType()) != TyBits) continue; // If scale is negative, use swapped predicate unless it's testing @@ -2097,18 +1994,19 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, if (Scale < 0 && !Cond->isEquality()) Predicate = ICmpInst::getSwappedPredicate(Predicate); - NewStride = &StrideOrder[i]; + NewStride = &IU->StrideOrder[i]; if (!isa(NewCmpTy)) NewCmpRHS = ConstantInt::get(NewCmpTy, NewCmpVal); else { - ConstantInt *CI = ConstantInt::get(NewCmpIntTy, NewCmpVal); + Constant *CI = ConstantInt::get(NewCmpIntTy, NewCmpVal); NewCmpRHS = ConstantExpr::getIntToPtr(CI, NewCmpTy); } NewOffset = TyBits == NewTyBits - ? SE->getMulExpr(CondUse->Offset, - SE->getConstant(ConstantInt::get(CmpTy, Scale))) - : SE->getConstant(ConstantInt::get(NewCmpIntTy, - cast(CondUse->Offset)->getValue()->getSExtValue()*Scale)); + ? SE->getMulExpr(CondUse->getOffset(), + SE->getConstant(CmpTy, Scale)) + : SE->getConstant(NewCmpIntTy, + cast(CondUse->getOffset())->getValue() + ->getSExtValue()*Scale); break; } } @@ -2130,28 +2028,26 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, // Create a new compare instruction using new stride / iv. ICmpInst *OldCond = Cond; // Insert new compare instruction. - Cond = new ICmpInst(Predicate, NewCmpLHS, NewCmpRHS, - L->getHeader()->getName() + ".termcond", - OldCond); + Cond = new ICmpInst(OldCond, Predicate, NewCmpLHS, NewCmpRHS, + L->getHeader()->getName() + ".termcond"); // Remove the old compare instruction. The old indvar is probably dead too. - DeadInsts.push_back(cast(CondUse->OperandValToReplace)); - SE->deleteValueFromRecords(OldCond); + DeadInsts.push_back(CondUse->getOperandValToReplace()); OldCond->replaceAllUsesWith(Cond); OldCond->eraseFromParent(); - IVUsesByStride[*CondStride].Users.pop_back(); - IVUsesByStride[*NewStride].addUser(NewOffset, Cond, NewCmpLHS); - CondUse = &IVUsesByStride[*NewStride].Users.back(); + IU->IVUsesByStride[*NewStride]->addUser(NewOffset, Cond, NewCmpLHS); + CondUse = &IU->IVUsesByStride[*NewStride]->Users.back(); CondStride = NewStride; ++NumEliminated; + Changed = true; } return Cond; } -/// OptimizeSMax - Rewrite the loop's terminating condition if it uses -/// an smax computation. +/// OptimizeMax - Rewrite the loop's terminating condition if it uses +/// a max computation. /// /// This is a narrow solution to a specific, but acute, problem. For loops /// like this: @@ -2161,10 +2057,10 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, /// p[i] = 0.0; /// } while (++i < n); /// -/// where the comparison is signed, the trip count isn't just 'n', because -/// 'n' could be negative. And unfortunately this can come up even for loops -/// where the user didn't use a C do-while loop. For example, seemingly -/// well-behaved top-test loops will commonly be lowered like this: +/// the trip count isn't just 'n', because 'n' might not be positive. And +/// unfortunately this can come up even for loops where the user didn't use +/// a C do-while loop. For example, seemingly well-behaved top-test loops +/// will commonly be lowered like this: // /// if (n > 0) { /// i = 0; @@ -2177,14 +2073,14 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, /// test in such a way that indvars can't find it. /// /// When indvars can't find the if test in loops like this, it creates a -/// signed-max expression, which allows it to give the loop a canonical +/// max expression, which allows it to give the loop a canonical /// induction variable: /// /// i = 0; -/// smax = n < 1 ? 1 : n; +/// max = n < 1 ? 1 : n; /// do { /// p[i] = 0.0; -/// } while (++i != smax); +/// } while (++i != max); /// /// Canonical induction variables are necessary because the loop passes /// are designed around them. The most obvious example of this is the @@ -2200,8 +2096,8 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting /// the instructions for the maximum computation. /// -ICmpInst *LoopStrengthReduce::OptimizeSMax(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse) { +ICmpInst *LoopStrengthReduce::OptimizeMax(Loop *L, ICmpInst *Cond, + IVStrideUse* &CondUse) { // Check that the loop matches the pattern we're looking for. if (Cond->getPredicate() != CmpInst::ICMP_EQ && Cond->getPredicate() != CmpInst::ICMP_NE) @@ -2210,25 +2106,32 @@ ICmpInst *LoopStrengthReduce::OptimizeSMax(Loop *L, ICmpInst *Cond, SelectInst *Sel = dyn_cast(Cond->getOperand(1)); if (!Sel || !Sel->hasOneUse()) return Cond; - SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L); + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); if (isa(BackedgeTakenCount)) return Cond; - SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType()); + const SCEV *One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType()); // Add one to the backedge-taken count to get the trip count. - SCEVHandle IterationCount = SE->getAddExpr(BackedgeTakenCount, One); + const SCEV *IterationCount = SE->getAddExpr(BackedgeTakenCount, One); // Check for a max calculation that matches the pattern. - SCEVSMaxExpr *SMax = dyn_cast(IterationCount); - if (!SMax || SMax != SE->getSCEV(Sel)) return Cond; + if (!isa(IterationCount) && !isa(IterationCount)) + return Cond; + const SCEVNAryExpr *Max = cast(IterationCount); + if (Max != SE->getSCEV(Sel)) return Cond; + + // To handle a max with more than two operands, this optimization would + // require additional checking and setup. + if (Max->getNumOperands() != 2) + return Cond; - SCEVHandle SMaxLHS = SMax->getOperand(0); - SCEVHandle SMaxRHS = SMax->getOperand(1); - if (!SMaxLHS || SMaxLHS != One) return Cond; + const SCEV *MaxLHS = Max->getOperand(0); + const SCEV *MaxRHS = Max->getOperand(1); + if (!MaxLHS || MaxLHS != One) return Cond; // Check the relevant induction variable for conformance to // the pattern. - SCEVHandle IV = SE->getSCEV(Cond->getOperand(0)); + const SCEV *IV = SE->getSCEV(Cond->getOperand(0)); const SCEVAddRecExpr *AR = dyn_cast(IV); if (!AR || !AR->isAffine() || AR->getStart() != One || @@ -2241,32 +2144,32 @@ ICmpInst *LoopStrengthReduce::OptimizeSMax(Loop *L, ICmpInst *Cond, // Check the right operand of the select, and remember it, as it will // be used in the new comparison instruction. Value *NewRHS = 0; - if (SE->getSCEV(Sel->getOperand(1)) == SMaxRHS) + if (SE->getSCEV(Sel->getOperand(1)) == MaxRHS) NewRHS = Sel->getOperand(1); - else if (SE->getSCEV(Sel->getOperand(2)) == SMaxRHS) + else if (SE->getSCEV(Sel->getOperand(2)) == MaxRHS) NewRHS = Sel->getOperand(2); if (!NewRHS) return Cond; + // Determine the new comparison opcode. It may be signed or unsigned, + // and the original comparison may be either equality or inequality. + CmpInst::Predicate Pred = + isa(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; + if (Cond->getPredicate() == CmpInst::ICMP_EQ) + Pred = CmpInst::getInversePredicate(Pred); + // Ok, everything looks ok to change the condition into an SLT or SGE and // delete the max calculation. ICmpInst *NewCond = - new ICmpInst(Cond->getPredicate() == CmpInst::ICMP_NE ? - CmpInst::ICMP_SLT : - CmpInst::ICMP_SGE, - Cond->getOperand(0), NewRHS, "scmp", Cond); + new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); // Delete the max calculation instructions. - SE->deleteValueFromRecords(Cond); Cond->replaceAllUsesWith(NewCond); - Cond->eraseFromParent(); + CondUse->setUser(NewCond); Instruction *Cmp = cast(Sel->getOperand(0)); - SE->deleteValueFromRecords(Sel); + Cond->eraseFromParent(); Sel->eraseFromParent(); - if (Cmp->use_empty()) { - SE->deleteValueFromRecords(Cmp); + if (Cmp->use_empty()) Cmp->eraseFromParent(); - } - CondUse->User = NewCond; return NewCond; } @@ -2274,23 +2177,23 @@ ICmpInst *LoopStrengthReduce::OptimizeSMax(Loop *L, ICmpInst *Cond, /// inside the loop then try to eliminate the cast opeation. void LoopStrengthReduce::OptimizeShadowIV(Loop *L) { - SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L); + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); if (isa(BackedgeTakenCount)) return; - - for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; + + for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e; ++Stride) { - std::map::iterator SI = - IVUsesByStride.find(StrideOrder[Stride]); - assert(SI != IVUsesByStride.end() && "Stride doesn't exist!"); + std::map::iterator SI = + IU->IVUsesByStride.find(IU->StrideOrder[Stride]); + assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); if (!isa(SI->first)) continue; - for (std::vector::iterator UI = SI->second.Users.begin(), - E = SI->second.Users.end(); UI != E; /* empty */) { - std::vector::iterator CandidateUI = UI; + for (ilist::iterator UI = SI->second->Users.begin(), + E = SI->second->Users.end(); UI != E; /* empty */) { + ilist::iterator CandidateUI = UI; ++UI; - Instruction *ShadowUse = CandidateUI->User; + Instruction *ShadowUse = CandidateUI->getUser(); const Type *DestTy = NULL; /* If shadow use is a int->float cast then insert a second IV @@ -2305,16 +2208,16 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) { for (unsigned i = 0; i < n; ++i, ++d) foo(d); */ - if (UIToFPInst *UCast = dyn_cast(CandidateUI->User)) + if (UIToFPInst *UCast = dyn_cast(CandidateUI->getUser())) DestTy = UCast->getDestTy(); - else if (SIToFPInst *SCast = dyn_cast(CandidateUI->User)) + else if (SIToFPInst *SCast = dyn_cast(CandidateUI->getUser())) DestTy = SCast->getDestTy(); if (!DestTy) continue; if (TLI) { - /* If target does not support DestTy natively then do not apply - this transformation. */ - MVT DVT = TLI->getValueType(DestTy); + // If target does not support DestTy natively then do not apply + // this transformation. + EVT DVT = TLI->getValueType(DestTy); if (!TLI->isTypeLegal(DVT)) continue; } @@ -2339,7 +2242,7 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) { ConstantInt *Init = dyn_cast(PH->getIncomingValue(Entry)); if (!Init) continue; - ConstantFP *NewInit = ConstantFP::get(DestTy, Init->getZExtValue()); + Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue()); BinaryOperator *Incr = dyn_cast(PH->getIncomingValue(Latch)); @@ -2363,62 +2266,123 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) { PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH); /* create new increment. '++d' in above example. */ - ConstantFP *CFP = ConstantFP::get(DestTy, C->getZExtValue()); + Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); BinaryOperator *NewIncr = - BinaryOperator::Create(Incr->getOpcode(), + BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? + Instruction::FAdd : Instruction::FSub, NewPH, CFP, "IV.S.next.", Incr); NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); /* Remove cast operation */ - SE->deleteValueFromRecords(ShadowUse); ShadowUse->replaceAllUsesWith(NewPH); ShadowUse->eraseFromParent(); - SI->second.Users.erase(CandidateUI); NumShadow++; break; } } } -// OptimizeIndvars - Now that IVUsesByStride is set up with all of the indvar -// uses in the loop, look to see if we can eliminate some, in favor of using -// common indvars for the different uses. +/// OptimizeIndvars - Now that IVUsesByStride is set up with all of the indvar +/// uses in the loop, look to see if we can eliminate some, in favor of using +/// common indvars for the different uses. void LoopStrengthReduce::OptimizeIndvars(Loop *L) { // TODO: implement optzns here. OptimizeShadowIV(L); +} +/// OptimizeLoopTermCond - Change loop terminating condition to use the +/// postinc iv when possible. +void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) { // Finally, get the terminating condition for the loop if possible. If we // can, we want to change it to use a post-incremented version of its // induction variable, to allow coalescing the live ranges for the IV into // one register value. - PHINode *SomePHI = cast(L->getHeader()->begin()); - BasicBlock *Preheader = L->getLoopPreheader(); - BasicBlock *LatchBlock = - SomePHI->getIncomingBlock(SomePHI->getIncomingBlock(0) == Preheader); - BranchInst *TermBr = dyn_cast(LatchBlock->getTerminator()); - if (!TermBr || TermBr->isUnconditional() || - !isa(TermBr->getCondition())) + BasicBlock *LatchBlock = L->getLoopLatch(); + BasicBlock *ExitingBlock = L->getExitingBlock(); + + if (!ExitingBlock) + // Multiple exits, just look at the exit in the latch block if there is one. + ExitingBlock = LatchBlock; + BranchInst *TermBr = dyn_cast(ExitingBlock->getTerminator()); + if (!TermBr) + return; + if (TermBr->isUnconditional() || !isa(TermBr->getCondition())) return; - ICmpInst *Cond = cast(TermBr->getCondition()); // Search IVUsesByStride to find Cond's IVUse if there is one. IVStrideUse *CondUse = 0; - const SCEVHandle *CondStride = 0; - + const SCEV *const *CondStride = 0; + ICmpInst *Cond = cast(TermBr->getCondition()); if (!FindIVUserForCond(Cond, CondUse, CondStride)) return; // setcc doesn't use the IV. - // If the trip count is computed in terms of an smax (due to ScalarEvolution + if (ExitingBlock != LatchBlock) { + if (!Cond->hasOneUse()) + // See below, we don't want the condition to be cloned. + return; + + // If exiting block is the latch block, we know it's safe and profitable to + // transform the icmp to use post-inc iv. Otherwise do so only if it would + // not reuse another iv and its iv would be reused by other uses. We are + // optimizing for the case where the icmp is the only use of the iv. + IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[*CondStride]; + for (ilist::iterator I = StrideUses.Users.begin(), + E = StrideUses.Users.end(); I != E; ++I) { + if (I->getUser() == Cond) + continue; + if (!I->isUseOfPostIncrementedValue()) + return; + } + + // FIXME: This is expensive, and worse still ChangeCompareStride does a + // similar check. Can we perform all the icmp related transformations after + // StrengthReduceStridedIVUsers? + if (const SCEVConstant *SC = dyn_cast(*CondStride)) { + int64_t SInt = SC->getValue()->getSExtValue(); + for (unsigned NewStride = 0, ee = IU->StrideOrder.size(); NewStride != ee; + ++NewStride) { + std::map::iterator SI = + IU->IVUsesByStride.find(IU->StrideOrder[NewStride]); + if (!isa(SI->first) || SI->first == *CondStride) + continue; + int64_t SSInt = + cast(SI->first)->getValue()->getSExtValue(); + if (SSInt == SInt) + return; // This can definitely be reused. + if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0) + continue; + int64_t Scale = SSInt / SInt; + bool AllUsesAreAddresses = true; + bool AllUsesAreOutsideLoop = true; + std::vector UsersToProcess; + const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L, + AllUsesAreAddresses, + AllUsesAreOutsideLoop, + UsersToProcess); + // Avoid rewriting the compare instruction with an iv of new stride + // if it's likely the new stride uses will be rewritten using the + // stride of the compare instruction. + if (AllUsesAreAddresses && + ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) + return; + } + } + + StrideNoReuse.insert(*CondStride); + } + + // If the trip count is computed in terms of a max (due to ScalarEvolution // being unable to find a sufficient guard, for example), change the loop - // comparison to use SLT instead of NE. - Cond = OptimizeSMax(L, Cond, CondUse); + // comparison to use SLT or ULT instead of NE. + Cond = OptimizeMax(L, Cond, CondUse); // If possible, change stride and operands of the compare instruction to // eliminate one stride. - Cond = ChangeCompareStride(L, Cond, CondUse, CondStride); + if (ExitingBlock == LatchBlock) + Cond = ChangeCompareStride(L, Cond, CondUse, CondStride); // It's possible for the setcc instruction to be anywhere in the loop, and // possible for it to have multiple users. If it is not immediately before @@ -2433,43 +2397,141 @@ void LoopStrengthReduce::OptimizeIndvars(Loop *L) { LatchBlock->getInstList().insert(TermBr, Cond); // Clone the IVUse, as the old use still exists! - IVUsesByStride[*CondStride].addUser(CondUse->Offset, Cond, - CondUse->OperandValToReplace); - CondUse = &IVUsesByStride[*CondStride].Users.back(); + IU->IVUsesByStride[*CondStride]->addUser(CondUse->getOffset(), Cond, + CondUse->getOperandValToReplace()); + CondUse = &IU->IVUsesByStride[*CondStride]->Users.back(); } } // If we get to here, we know that we can transform the setcc instruction to // use the post-incremented version of the IV, allowing us to coalesce the // live ranges for the IV correctly. - CondUse->Offset = SE->getMinusSCEV(CondUse->Offset, *CondStride); - CondUse->isUseOfPostIncrementedValue = true; + CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), *CondStride)); + CondUse->setIsUseOfPostIncrementedValue(true); + Changed = true; + + ++NumLoopCond; +} + +/// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding +/// when to exit the loop is used only for that purpose, try to rearrange things +/// so it counts down to a test against zero. +void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) { + + // If the number of times the loop is executed isn't computable, give up. + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); + if (isa(BackedgeTakenCount)) + return; + + // Get the terminating condition for the loop if possible (this isn't + // necessarily in the latch, or a block that's a predecessor of the header). + if (!L->getExitBlock()) + return; // More than one loop exit blocks. + + // Okay, there is one exit block. Try to find the condition that causes the + // loop to be exited. + BasicBlock *ExitingBlock = L->getExitingBlock(); + if (!ExitingBlock) + return; // More than one block exiting! + + // Okay, we've computed the exiting block. See what condition causes us to + // exit. + // + // FIXME: we should be able to handle switch instructions (with a single exit) + BranchInst *TermBr = dyn_cast(ExitingBlock->getTerminator()); + if (TermBr == 0) return; + assert(TermBr->isConditional() && "If unconditional, it can't be in loop!"); + if (!isa(TermBr->getCondition())) + return; + ICmpInst *Cond = cast(TermBr->getCondition()); + + // Handle only tests for equality for the moment, and only stride 1. + if (Cond->getPredicate() != CmpInst::ICMP_EQ) + return; + const SCEV *IV = SE->getSCEV(Cond->getOperand(0)); + const SCEVAddRecExpr *AR = dyn_cast(IV); + const SCEV *One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType()); + if (!AR || !AR->isAffine() || AR->getStepRecurrence(*SE) != One) + return; + // If the RHS of the comparison is defined inside the loop, the rewrite + // cannot be done. + if (Instruction *CR = dyn_cast(Cond->getOperand(1))) + if (L->contains(CR->getParent())) + return; + + // Make sure the IV is only used for counting. Value may be preinc or + // postinc; 2 uses in either case. + if (!Cond->getOperand(0)->hasNUses(2)) + return; + PHINode *phi = dyn_cast(Cond->getOperand(0)); + Instruction *incr; + if (phi && phi->getParent()==L->getHeader()) { + // value tested is preinc. Find the increment. + // A CmpInst is not a BinaryOperator; we depend on this. + Instruction::use_iterator UI = phi->use_begin(); + incr = dyn_cast(UI); + if (!incr) + incr = dyn_cast(++UI); + // 1 use for postinc value, the phi. Unnecessarily conservative? + if (!incr || !incr->hasOneUse() || incr->getOpcode()!=Instruction::Add) + return; + } else { + // Value tested is postinc. Find the phi node. + incr = dyn_cast(Cond->getOperand(0)); + if (!incr || incr->getOpcode()!=Instruction::Add) + return; + + Instruction::use_iterator UI = Cond->getOperand(0)->use_begin(); + phi = dyn_cast(UI); + if (!phi) + phi = dyn_cast(++UI); + // 1 use for preinc value, the increment. + if (!phi || phi->getParent()!=L->getHeader() || !phi->hasOneUse()) + return; + } + + // Replace the increment with a decrement. + BinaryOperator *decr = + BinaryOperator::Create(Instruction::Sub, incr->getOperand(0), + incr->getOperand(1), "tmp", incr); + incr->replaceAllUsesWith(decr); + incr->eraseFromParent(); + + // Substitute endval-startval for the original startval, and 0 for the + // original endval. Since we're only testing for equality this is OK even + // if the computation wraps around. + BasicBlock *Preheader = L->getLoopPreheader(); + Instruction *PreInsertPt = Preheader->getTerminator(); + int inBlock = L->contains(phi->getIncomingBlock(0)) ? 1 : 0; + Value *startVal = phi->getIncomingValue(inBlock); + Value *endVal = Cond->getOperand(1); + // FIXME check for case where both are constant + Constant* Zero = ConstantInt::get(Cond->getOperand(1)->getType(), 0); + BinaryOperator *NewStartVal = + BinaryOperator::Create(Instruction::Sub, endVal, startVal, + "tmp", PreInsertPt); + phi->setIncomingValue(inBlock, NewStartVal); + Cond->setOperand(1, Zero); + Changed = true; } bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { + IU = &getAnalysis(); LI = &getAnalysis(); DT = &getAnalysis(); SE = &getAnalysis(); Changed = false; - // Find all uses of induction variables in this loop, and categorize - // them by stride. Start by finding all of the PHI nodes in the header for - // this loop. If they are induction variables, inspect their uses. - SmallPtrSet Processed; // Don't reprocess instructions. - for (BasicBlock::iterator I = L->getHeader()->begin(); isa(I); ++I) - AddUsersIfInteresting(I, L, Processed); - - if (!IVUsesByStride.empty()) { -#ifndef NDEBUG - DOUT << "\nLSR on \"" << L->getHeader()->getParent()->getNameStart() - << "\" "; - DEBUG(L->dump()); -#endif + if (!IU->IVUsesByStride.empty()) { + DEBUG(errs() << "\nLSR on \"" << L->getHeader()->getParent()->getName() + << "\" "; + L->dump()); // Sort the StrideOrder so we process larger strides first. - std::stable_sort(StrideOrder.begin(), StrideOrder.end(), StrideCompare(SE)); + std::stable_sort(IU->StrideOrder.begin(), IU->StrideOrder.end(), + StrideCompare(SE)); // Optimize induction variables. Some indvar uses can be transformed to use // strides that will be needed for other purposes. A common example of this @@ -2477,8 +2539,14 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { // computation of some other indvar to decide when to terminate the loop. OptimizeIndvars(L); - // FIXME: We can widen subreg IV's here for RISC targets. e.g. instead of - // doing computation in byte values, promote to 32-bit values if safe. + // Change loop terminating condition to use the postinc iv when possible + // and optimize loop terminating compare. FIXME: Move this after + // StrengthReduceStridedIVUsers? + OptimizeLoopTermCond(L); + + // FIXME: We can shrink overlarge IV's here. e.g. if the code has + // computation in i64 values and the target doesn't support i64, demote + // the computation to 32-bit if safe. // FIXME: Attempt to reuse values across multiple IV's. In particular, we // could have something like "for(i) { foo(i*8); bar(i*16) }", which should @@ -2494,58 +2562,33 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { // Also, note that we iterate over IVUsesByStride indirectly by using // StrideOrder. This extra layer of indirection makes the ordering of // strides deterministic - not dependent on map order. - for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) { - std::map::iterator SI = - IVUsesByStride.find(StrideOrder[Stride]); - assert(SI != IVUsesByStride.end() && "Stride doesn't exist!"); - StrengthReduceStridedIVUsers(SI->first, SI->second, L); + for (unsigned Stride = 0, e = IU->StrideOrder.size(); + Stride != e; ++Stride) { + std::map::iterator SI = + IU->IVUsesByStride.find(IU->StrideOrder[Stride]); + assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!"); + // FIXME: Generalize to non-affine IV's. + if (!SI->first->isLoopInvariant(L)) + continue; + StrengthReduceStridedIVUsers(SI->first, *SI->second, L); } } + // After all sharing is done, see if we can adjust the loop to test against + // zero instead of counting up to a maximum. This is usually faster. + OptimizeLoopCountIV(L); + // We're done analyzing this loop; release all the state we built up for it. - IVUsesByStride.clear(); IVsByStride.clear(); - StrideOrder.clear(); + StrideNoReuse.clear(); // Clean up after ourselves - if (!DeadInsts.empty()) { + if (!DeadInsts.empty()) DeleteTriviallyDeadInstructions(); - BasicBlock::iterator I = L->getHeader()->begin(); - while (PHINode *PN = dyn_cast(I++)) { - // At this point, we know that we have killed one or more IV users. - // It is worth checking to see if the cannonical indvar is also - // dead, so that we can remove it as well. - // - // We can remove a PHI if it is on a cycle in the def-use graph - // where each node in the cycle has degree one, i.e. only one use, - // and is an instruction with no side effects. - // - // FIXME: this needs to eliminate an induction variable even if it's being - // compared against some value to decide loop termination. - if (!PN->hasOneUse()) - continue; - - SmallPtrSet PHIs; - for (Instruction *J = dyn_cast(*PN->use_begin()); - J && J->hasOneUse() && !J->mayWriteToMemory(); - J = dyn_cast(*J->use_begin())) { - // If we find the original PHI, we've discovered a cycle. - if (J == PN) { - // Break the cycle and mark the PHI for deletion. - SE->deleteValueFromRecords(PN); - PN->replaceAllUsesWith(UndefValue::get(PN->getType())); - DeadInsts.push_back(PN); - Changed = true; - break; - } - // If we find a PHI more than once, we're on a cycle that - // won't prove fruitful. - if (isa(J) && !PHIs.insert(cast(J))) - break; - } - } - DeleteTriviallyDeadInstructions(); - } + // At this point, it is worth checking to see if any recurrence PHIs are also + // dead, so that we can remove them as well. + DeleteDeadPHIs(L->getHeader()); + return Changed; }